Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c	(revision 175201)
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c	(revision 175202)
@@ -1,220 +1,220 @@
 /*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/kthread.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/kobj.h>
 
 /*
  * Release a buffer by handing 'address' and 'size' straight to kmem_free().
  */
 void
 kobj_free(void *address, size_t size)
 {
 
 	kmem_free(address, size);
 }
 
 /*
  * Allocate 'size' bytes with kmem_alloc().  If KM_NOWAIT is set in 'flag'
  * the request is made with KM_NOSLEEP, otherwise with KM_SLEEP.
  * Returns whatever kmem_alloc() returns.
  */
 void *
 kobj_alloc(size_t size, int flag)
 {
 
 	return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP));
 }
 
 /*
  * Like kobj_alloc(), but the buffer is zero-filled when the allocation
  * succeeds.  A NULL result from kobj_alloc() is returned unchanged.
  */
 void *
 kobj_zalloc(size_t size, int flag)
 {
 	void *p;
 
 	if ((p = kobj_alloc(size, flag)) != NULL)
 		bzero(p, size);
 	return (p);
 }
 
 /*
  * Open 'file' read-only through the file system using the current thread's
  * credentials.  Returns the opened vnode (unlocked) as an opaque pointer,
  * or NULL if vn_open_cred() fails.
  */
 static void *
 kobj_open_file_vnode(const char *file)
 {
 	struct thread *td = curthread;
 	struct nameidata nd;
 	int error, flags;
 
 	/*
 	 * Fill in missing root/current directories of the calling process
 	 * with rootvnode before doing the lookup.
 	 * NOTE(review): presumably needed for threads that never set them;
 	 * no reference is taken on rootvnode here -- confirm that is intended.
 	 */
 	if (td->td_proc->p_fd->fd_rdir == NULL)
 		td->td_proc->p_fd->fd_rdir = rootvnode;
 	if (td->td_proc->p_fd->fd_cdir == NULL)
 		td->td_proc->p_fd->fd_cdir = rootvnode;
 
 	flags = FREAD;
 	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
 	error = vn_open_cred(&nd, &flags, 0, td->td_ucred, NULL);
 	/* The path buffer is released whether or not the open succeeded. */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0)
 		return (NULL);
 	/* We just unlock so we hold a reference. */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	return (nd.ni_vp);
 }
 
 /*
  * Look 'file' up among the preloaded objects by name.  Returns the result
  * of preload_search_by_name() unchanged; kobj_open_file() treats NULL as
  * "not found".
  */
 static void *
 kobj_open_file_loader(const char *file)
 {
 
 	return (preload_search_by_name(file));
 }
 
 /*
  * Open 'file' and return a newly allocated handle describing it.
  *
  * The handle records whether root was mounted at open time; the other
  * kobj_* routines use that flag to pick the vnode or the loader backend.
  *
  * On failure the handle is freed and (struct _buf *)-1 is returned -- note
  * that the failure value is NOT NULL.
  */
 struct _buf *
 kobj_open_file(const char *file)
 {
 	struct _buf *out;
 
 	out = kmem_alloc(sizeof(*out), KM_SLEEP);
 	out->mounted = root_mounted();
 	/*
 	 * If root is already mounted we read file using file system,
 	 * if not, we use loader.
 	 */
 	if (out->mounted)
 		out->ptr = kobj_open_file_vnode(file);
 	else
 		out->ptr = kobj_open_file_loader(file);
 	if (out->ptr == NULL) {
 		kmem_free(out, sizeof(*out));
 		return ((struct _buf *)-1);
 	}
 	return (out);
 }
 
 /*
  * Fetch the size of a vnode-backed file.  The vnode is locked shared around
  * VOP_GETATTR().  On success va_size is stored in *size and 0 is returned;
  * otherwise the VOP_GETATTR() error is returned and *size is left untouched.
  */
 static int
 kobj_get_filesize_vnode(struct _buf *file, uint64_t *size)
 {
 	struct vnode *vp = file->ptr;
 	struct thread *td = curthread;
 	struct vattr va;
 	int error;
 
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	if (error == 0)
 		*size = (uint64_t)va.va_size;
 	return (error);
 }
 
 /*
  * Fetch the size of a preloaded file from its MODINFO_SIZE record.
  * Returns ENOENT if the record is missing, otherwise stores the size in
  * *size and returns 0.
  */
 static int
 kobj_get_filesize_loader(struct _buf *file, uint64_t *size)
 {
 	void *ptr;
 
 	ptr = preload_search_info(file->ptr, MODINFO_SIZE);
 	if (ptr == NULL)
 		return (ENOENT);
 	/* The record is read as a size_t and widened to 64 bits. */
 	*size = (uint64_t)*(size_t *)ptr;
 	return (0);
 }
 
 /*
  * Return the size of an open file in *size, dispatching on the backend
  * recorded in file->mounted.  Returns 0 on success or an errno value.
  */
 int
 kobj_get_filesize(struct _buf *file, uint64_t *size)
 {
 
 	if (file->mounted)
 		return (kobj_get_filesize_vnode(file, size));
 	else
 		return (kobj_get_filesize_loader(file, size));
 }
 
 /*
  * Read up to 'size' bytes at offset 'off' from a vnode-backed file into the
  * kernel buffer 'buf'.
  *
  * Returns the number of bytes transferred (size minus the residual count),
  * or -1 if VOP_READ() reports an error.
  */
 int
 kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 	struct vnode *vp = file->ptr;
 	struct thread *td = curthread;
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	bzero(&aiov, sizeof(aiov));
 	bzero(&auio, sizeof(auio));
 
 	/* Single-segment read into a kernel-space buffer. */
 	aiov.iov_base = buf;
 	aiov.iov_len = size;
 
 	auio.uio_iov = &aiov;
 	auio.uio_offset = (off_t)off;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = size;
 	auio.uio_td = td;
 
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred);
 	VOP_UNLOCK(vp, 0, td);
 	return (error != 0 ? -1 : size - auio.uio_resid);
 }
 
 /*
  * Copy 'size' bytes at offset 'off' from a preloaded file image into 'buf'.
  *
  * Returns ENOENT if the MODINFO_ADDR record is missing and 0 after the copy.
  *
  * NOTE(review): this return convention differs from kobj_read_file_vnode(),
  * which returns a byte count on success and -1 on error; callers of
  * kobj_read_file() see either convention depending on the backend.
  * NOTE(review): 'off' and 'size' are not checked against the image size
  * here -- confirm that callers bound them.
  */
 int
 kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 	char *ptr;
 
 	ptr = preload_search_info(file->ptr, MODINFO_ADDR);
 	if (ptr == NULL)
 		return (ENOENT);
 	/* The record holds a pointer to the image; load it before copying. */
 	ptr = *(void **)ptr;
 	bcopy(ptr + off, buf, size);
 	return (0);
 }
 
 /*
  * Read from an open file, dispatching on the backend recorded in
  * file->mounted.  The return value is that of the selected backend.
  */
 int
 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 
 	if (file->mounted)
 		return (kobj_read_file_vnode(file, buf, size, off));
 	else
 		return (kobj_read_file_loader(file, buf, size, off));
 }
 
 /*
  * Close a handle obtained from kobj_open_file().  A vnode-backed file is
  * closed with vn_close() using the FREAD mode it was opened with; the
  * loader backend needs no per-file teardown here.  The handle itself is
  * always freed.
  */
 void
 kobj_close_file(struct _buf *file)
 {
 
 	if (file->mounted) {
 		struct vnode *vp = file->ptr;
 		struct thread *td = curthread;
 		int flags = FREAD;
 
 		vn_close(vp, flags, td->td_ucred, td);
 	}
 	kmem_free(file, sizeof(*file));
 }
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c	(revision 175201)
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c	(revision 175202)
@@ -1,280 +1,280 @@
 /*-
  * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/vfs.h>
 #include <sys/priv.h>
 #include <sys/libkern.h>
 
 /* Malloc type used for every allocation made in this file. */
 MALLOC_DECLARE(M_MOUNT);
 
 /*
  * Mount option list and its entries.  As filled in by vfs_setmntopt():
  * 'name' is a private copy of the option name, 'value' is a private copy
  * of the argument string (NULL when the option has no argument), and 'len'
  * is the size of 'value' including the terminating NUL (0 when NULL).
  */
 TAILQ_HEAD(vfsoptlist, vfsopt);
 struct vfsopt {
 	TAILQ_ENTRY(vfsopt) link;
 	char	*name;
 	void	*value;
 	int	len;
 };
 
 /*
  * Append option 'name' with optional string argument 'arg' to the mount's
  * mnt_opt list, allocating the list on first use.  Both strings are copied.
  * 'flags' is ignored.
  *
  * No check for an existing option with the same name is made here.
  */
 void
 vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
     int flags __unused)
 {
 	struct vfsopt *opt;
 	size_t namesize;
 
 	if (vfsp->mnt_opt == NULL) {
 		vfsp->mnt_opt = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
 		TAILQ_INIT(vfsp->mnt_opt);
 	}
 
 	opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK);
 
 	namesize = strlen(name) + 1;
 	opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
 	strlcpy(opt->name, name, namesize);
 
 	if (arg == NULL) {
 		opt->value = NULL;
 		opt->len = 0;
 	} else {
 		/* Length includes the terminating NUL. */
 		opt->len = strlen(arg) + 1;
 		opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 		bcopy(arg, opt->value, opt->len);
 	}
 	/* TODO: Locking. */
 	TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
 }
 
 /*
  * Remove the first option called 'name' from the mount's mnt_opt list and
  * free it.  Does nothing if the list does not exist or holds no such option.
  */
 void
 vfs_clearmntopt(vfs_t *vfsp, const char *name)
 {
 	struct vfsopt *opt;
 
 	if (vfsp->mnt_opt == NULL)
 		return;
 	/* TODO: Locking. */
 	TAILQ_FOREACH(opt, vfsp->mnt_opt, link) {
 		if (strcmp(opt->name, name) == 0)
 			break;
 	}
 	/* 'opt' is NULL here when the loop ran to the end without a match. */
 	if (opt != NULL) {
 		TAILQ_REMOVE(vfsp->mnt_opt, opt, link);
 		free(opt->name, M_MOUNT);
 		if (opt->value != NULL)
 			free(opt->value, M_MOUNT);
 		free(opt, M_MOUNT);
 	}
 }
 
 /*
  * Test whether option 'opt' is present on the mount.  Returns 1 when
  * vfs_getopt() finds it (passing 'argp' through to receive the value) and
  * 0 when the mount has no option list or the lookup fails.
  */
 int
 vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
 {
 	struct vfsoptlist *opts = vfsp->mnt_opt;
 	int error;
 
 	if (opts == NULL)
 		return (0);
 	error = vfs_getopt(opts, opt, (void **)argp, NULL);
 	return (error != 0 ? 0 : 1);
 }
 
 /*
  * Follow the chain of file systems mounted on *cvpp and replace *cvpp with
  * the root vnode of the topmost one.
  *
  * 'lktype' is the lock type passed to VFS_ROOT() for each root vnode.
  * Returns 0 on success.  If VFS_ROOT() fails its error is returned and
  * *cvpp is not updated, although the reference on the vnode being left has
  * already been dropped by then.
  */
 int
 traverse(vnode_t **cvpp, int lktype)
 {
 	kthread_t *td = curthread;
 	vnode_t *cvp;
 	vnode_t *tvp;
 	vfs_t *vfsp;
 	int error;
 
 	cvp = *cvpp;
 	tvp = NULL;
 
 	/*
 	 * If this vnode is mounted on, then we transparently indirect
 	 * to the vnode which is the root of the mounted file system.
 	 * Before we do this we must check that an unmount is not in
 	 * progress on this vnode.
 	 */
 
 	for (;;) {
 		/*
 		 * Reached the end of the mount chain?
 		 */
 		vfsp = vn_mountedvfs(cvp);
 		if (vfsp == NULL)
 			break;
 		/*
 		 * tvp is NULL for *cvpp vnode, which we can't unlock.
 		 */
 		if (tvp != NULL)
 			vput(cvp);
 		else
 			vrele(cvp);
 
 		/*
 		 * The read lock must be held across the call to VFS_ROOT() to
 		 * prevent a concurrent unmount from destroying the vfs.
 		 */
 		error = VFS_ROOT(vfsp, lktype, &tvp, td);
 		if (error != 0)
 			return (error);
 		cvp = tvp;
 	}
 
 	*cvpp = cvp;
 	return (0);
 }
 
 /*
  * Mount a file system of type 'fstype' on the directory vnode 'vp'.
  *
  *   td      - thread performing the mount
  *   vp      - vnode to be covered; must be a directory
  *   fstype  - file system type name
  *   fspath  - mount point path
  *   fspec   - source specification, stored as the "from" mount option
  *   fsflags - MNT_* flags for the new mount
  *
  * Returns 0 on success or an errno value: ENAMETOOLONG, ENODEV (any error
  * reported by vfs_byname_kld() is replaced by ENODEV), ENOTDIR, EBUSY when
  * 'vp' is already mounted on or being mounted on, or the error from
  * VFS_MOUNT() / vfs_allocate_syncvnode().
  */
 int
 domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
     char *fspec, int fsflags)
 {
 	struct mount *mp;
 	struct vfsconf *vfsp;
 	struct ucred *newcr, *oldcr;
 	int error;
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	if (vfsp == NULL)
 		return (ENODEV);
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 	/*
 	 * Claim the vnode for mounting: VI_MOUNT stays set until the
 	 * outcome of the mount is known below.
 	 */
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_MOUNT) != 0 ||
 	    vp->v_mountedhere != NULL) {
 		VI_UNLOCK(vp);
 		return (EBUSY);
 	}
 	vp->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp);
 
 	/*
 	 * Allocate and initialize the filesystem.
 	 */
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	mp = vfs_mount_alloc(vp, vfsp, fspath, td);
 	VOP_UNLOCK(vp, 0, td);
 
 	/*
 	 * vfs_setmntopt() builds its list in mnt_opt; move that list to
 	 * mnt_optnew and leave mnt_opt empty for the duration of VFS_MOUNT().
 	 */
 	mp->mnt_optnew = NULL;
 	vfs_setmntopt(mp, "from", fspec, 0);
 	mp->mnt_optnew = mp->mnt_opt;
 	mp->mnt_opt = NULL;
 
 	/*
 	 * Set the mount level flags.
 	 * crdup() can sleep, so do it before acquiring a mutex.
 	 */
 	newcr = crdup(kcred);
 	MNT_ILOCK(mp);
 	if (fsflags & MNT_RDONLY)
 		mp->mnt_flag |= MNT_RDONLY;
 	mp->mnt_flag &=~ MNT_UPDATEMASK;
 	mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
 	/*
 	 * Unprivileged user can trigger mounting a snapshot, but we don't want
 	 * him to unmount it, so we switch to privileged credentials.
 	 */
 	oldcr = mp->mnt_cred;
 	mp->mnt_cred = newcr;
 	mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid;
 	MNT_IUNLOCK(mp);
 	crfree(oldcr);
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error = VFS_MOUNT(mp, td);
 
 	if (!error) {
 		/* The mount succeeded: promote the new options to mnt_opt. */
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
 		mp->mnt_opt = mp->mnt_optnew;
 		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 	}
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * Put the new filesystem on the mount list after root.
 	 */
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 	if (!error) {
 		vnode_t *mvp;
 
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vp->v_mountedhere = mp;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 		mtx_unlock(&mountlist_mtx);
 		vfs_event_signal(NULL, VQ_MOUNT, 0);
 		if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp, td))
 			panic("mount: lost mount");
 		mountcheckdirs(vp, mvp);
 		vput(mvp);
 		VOP_UNLOCK(vp, 0, td);
 		/* A syncer vnode is only allocated for writable mounts. */
 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
 			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, td);
 		if (error)
 			vrele(vp);
 		else
 			vfs_mountedfrom(mp, fspec);
 	} else {
 		/* VFS_MOUNT() failed: release the vnode and tear the mount down. */
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		VOP_UNLOCK(vp, 0, td);
 		vfs_unbusy(mp, td);
 		vfs_mount_destroy(mp);
 	}
 	return (error);
 }
Index: head/sys/cddl/compat/opensolaris/sys/vnode.h
===================================================================
--- head/sys/cddl/compat/opensolaris/sys/vnode.h	(revision 175201)
+++ head/sys/cddl/compat/opensolaris/sys/vnode.h	(revision 175202)
@@ -1,268 +1,268 @@
 /*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _OPENSOLARIS_SYS_VNODE_H_
 #define	_OPENSOLARIS_SYS_VNODE_H_
 
 #include_next <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/fcntl.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/syscallsubr.h>
 
 typedef	struct vnode	vnode_t;
 typedef	struct vattr	vattr_t;
 typedef	void		caller_context_t;
 
 typedef	struct vop_vector	vnodeops_t;
 #define	vop_fid		vop_vptofh
 #define	vop_fid_args	vop_vptofh_args
 #define	a_fid		a_fhp
 
 #define	v_count	v_usecount
 
 /*
  * Returns non-zero (the MNT_RDONLY bit) when the vnode's mount is
  * read-only and 0 otherwise.  Dereferences vp->v_mount unconditionally.
  */
 static __inline int
 vn_is_readonly(vnode_t *vp)
 {
 	return (vp->v_mount->mnt_flag & MNT_RDONLY);
 }
 /* vn_vfswlock() always evaluates to 0 and vn_vfsunlock() does nothing. */
 #define	vn_vfswlock(vp)		(0)
 #define	vn_vfsunlock(vp)	do { } while (0)
 #define	vn_ismntpt(vp)		((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
 #define	vn_mountedvfs(vp)	((vp)->v_mountedhere)
 #define	vn_has_cached_data(vp)	((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0)
 
 /* Reference counting: hold, release, and unlock-and-release. */
 #define	VN_HOLD(v)	vref(v)
 #define	VN_RELE(v)	vrele(v)
 #define	VN_URELE(v)	vput(v)
 
 /* Stores vp itself in *vpp and evaluates to 0. */
 #define	VOP_REALVP(vp, vpp)	(*(vpp) = (vp), 0)
 
 /* The vnevent_*() notifications expand to empty statements. */
 #define	vnevent_remove(vp)	do { } while (0)
 #define	vnevent_rmdir(vp)	do { } while (0)
 #define	vnevent_rename_src(vp)	do { } while (0)
 #define	vnevent_rename_dest(vp)	do { } while (0)
 
 
 #define	IS_DEVVP(vp)	\
 	((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
 
 #define	MODEMASK	ALLPERMS
 
 /* specvp() takes a hold on vp and yields vp; rdev, type and cr are unused. */
 #define	specvp(vp, rdev, type, cr)	(VN_HOLD(vp), (vp))
 /* The mandatory-locking and lock-cleanup hooks below are stubs. */
 #define	MANDMODE(mode)	(0)
 #define	chklock(vp, op, offset, size, mode, ct)	(0)
 #define	cleanlocks(vp, pid, foo)	do { } while (0)
 #define	cleanshares(vp, pid)		do { } while (0)
 
 /*
  * We will use va_spare in place of Solaris' va_mask.
  * This field is initialized in zfs_setattr().
  */
 #define	va_mask		va_spare
 /* TODO: va_fileid is shorter than va_nodeid !!! */
 #define	va_nodeid	va_fileid
 /* TODO: This field needs conversion! */
 #define	va_nblocks	va_bytes
 #define	va_blksize	va_blocksize
 #define	va_seq		va_gen
 
 #define	MAXOFFSET_T	OFF_MAX
 #define	EXCL		0
 
 /* Attribute bits stored in va_mask, one per vattr field. */
 #define	AT_TYPE		0x0001
 #define	AT_MODE		0x0002
 #define	AT_UID		0x0004
 #define	AT_GID		0x0008
 #define	AT_FSID		0x0010
 #define	AT_NODEID	0x0020
 #define	AT_NLINK	0x0040
 #define	AT_SIZE		0x0080
 #define	AT_ATIME	0x0100
 #define	AT_MTIME	0x0200
 #define	AT_CTIME	0x0400
 #define	AT_RDEV		0x0800
 #define	AT_BLKSIZE	0x1000
 #define	AT_NBLOCKS	0x2000
 #define	AT_SEQ		0x4000
 /* NOTE(review): by its name, the set of attributes that cannot be set. */
 #define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
 			 AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
 
 /* Timestamp groups expressed as AT_* masks. */
 #define	ACCESSED		(AT_ATIME)
 #define	STATE_CHANGED		(AT_CTIME)
 #define	CONTENT_MODIFIED	(AT_MTIME | AT_CTIME)
 
 /*
  * Rebuild vap->va_mask from the fields of *vap: an AT_* bit is set for each
  * of type, uid, gid, size, atime, mtime and mode whose value differs from
  * its "unset" marker (VNON for the type, VNOVAL for the others).
  */
 static __inline void
 vattr_init_mask(vattr_t *vap)
 {
 
 	vap->va_mask = 0;
 
 	if (vap->va_type != VNON)
 		vap->va_mask |= AT_TYPE;
 	if (vap->va_uid != (uid_t)VNOVAL)
 		vap->va_mask |= AT_UID;
 	if (vap->va_gid != (gid_t)VNOVAL)
 		vap->va_mask |= AT_GID;
 	if (vap->va_size != (u_quad_t)VNOVAL)
 		vap->va_mask |= AT_SIZE;
 	/* Only tv_sec is examined for the two timestamps. */
 	if (vap->va_atime.tv_sec != VNOVAL)
 		vap->va_mask |= AT_ATIME;
 	if (vap->va_mtime.tv_sec != VNOVAL)
 		vap->va_mask |= AT_MTIME;
 	if (vap->va_mode != (u_short)VNOVAL)
 		vap->va_mask |= AT_MODE;
 }
 
 /*
  * Solaris file-mode flag names.  The three sync variants all map to FFSYNC
  * and FOFFMAX is defined as 0, so it adds no bits.
  */
 #define	FCREAT	O_CREAT
 #define	FTRUNC	O_TRUNC
 #define	FDSYNC	FFSYNC
 #define	FRSYNC	FFSYNC
 #define	FSYNC	FFSYNC
 #define	FOFFMAX	0x00
 
 /* The only 'crwhy' value accepted by zfs_vn_open(). */
 enum create	{ CRCREAT };
 
 /*
  * Replacement for the Solaris vn_open() interface, reached through the
  * vn_open() macro defined below.
  *
  * Only one calling pattern is supported, as the assertions state: a kernel
  * path (UIO_SYSSPACE), filemode FWRITE | FCREAT | FTRUNC | FOFFMAX, crwhy
  * CRCREAT and umask 0.
  *
  * On success stores the opened, unlocked vnode in *vpp and returns 0;
  * otherwise returns the vn_open_cred() error and leaves *vpp untouched.
  */
 static __inline int
 zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
     vnode_t **vpp, enum create crwhy, mode_t umask)
 {
 	struct thread *td = curthread;
 	struct nameidata nd;
 	int error;
 
 	ASSERT(seg == UIO_SYSSPACE);
 	ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
 	ASSERT(crwhy == CRCREAT);
 	ASSERT(umask == 0);
 
 	/*
 	 * Fill in missing root/current directories of the calling process
 	 * with rootvnode before doing the lookup.
 	 */
 	if (td->td_proc->p_fd->fd_rdir == NULL)
 		td->td_proc->p_fd->fd_rdir = rootvnode;
 	if (td->td_proc->p_fd->fd_cdir == NULL)
 		td->td_proc->p_fd->fd_cdir = rootvnode;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td);
 	error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == 0) {
 		/* We just unlock so we hold a reference. */
 		/*
 		 * NOTE(review): VN_HOLD() takes a reference in addition to
 		 * whatever vn_open_cred() returned -- confirm the extra hold
 		 * is balanced by the callers.
 		 */
 		VN_HOLD(nd.ni_vp);
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		*vpp = nd.ni_vp;
 	}
 	return (error);
 }
 #define	vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask)	\
 	zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask))
 
 #define	RLIM64_INFINITY	0
 /*
  * Replacement for the Solaris vn_rdwr() interface, reached through the
  * vn_rdwr() macro defined below.
  *
  * Only writes are supported, and the caller must pass ioflag 0 and ulimit
  * RLIM64_INFINITY (see the assertions).  The write is always issued with
  * IO_APPEND | IO_UNIT.
  *
  * Returns the error from the underlying vn_rdwr(); if 'residp' is not NULL
  * it receives the residual count.
  */
 static __inline int
 zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len,
     offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr,
     ssize_t *residp)
 {
 	struct thread *td = curthread;
 	int error, vfslocked, resid;
 
 	ASSERT(rw == UIO_WRITE);
 	ASSERT(ioflag == 0);
 	ASSERT(ulimit == RLIM64_INFINITY);
 
 	ioflag = IO_APPEND | IO_UNIT;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	/*
 	 * The vn_rdwr() macro is only defined after this function, so this
 	 * call reaches the underlying vn_rdwr(), not zfs_vn_rdwr().
 	 */
 	error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED,
 	    &resid, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (residp != NULL)
 		*residp = (ssize_t)resid;
 	return (error);
 }
 #define	vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \
 	zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp))
 
 /*
  * Replacement for the Solaris three-argument VOP_FSYNC(), reached through
  * the VOP_FSYNC() macro defined below.  'flag' must be FSYNC; 'cr' is not
  * used.
  *
  * Returns the error from vn_start_write() or from the underlying fsync.
  */
 static __inline int
 zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr)
 {
 	struct thread *td = curthread;
 	struct mount *mp;
 	int error, vfslocked;
 
 	ASSERT(flag == FSYNC);
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * The VOP_FSYNC() macro is only defined after this function, so
 	 * this call reaches the underlying VOP_FSYNC().
 	 */
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 #define	VOP_FSYNC(vp, flag, cr)	zfs_vop_fsync((vp), (flag), (cr))
 
 /*
  * Replacement for the Solaris VOP_CLOSE(), reached through the VOP_CLOSE()
  * macro defined below.  Only the pattern matching zfs_vn_open() is
  * supported: the same flag set, count 1 and offset 0.  'count' and 'offset'
  * are otherwise unused.  Returns the result of vn_close().
  */
 static __inline int
 zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 
 	ASSERT(flag == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
 	ASSERT(count == 1);
 	ASSERT(offset == 0);
 
 	return (vn_close(vp, flag, cr, curthread));
 }
 #define	VOP_CLOSE(vp, oflags, count, offset, cr)			\
 	zfs_vop_close((vp), (oflags), (count), (offset), (cr))
 
 /*
  * Rename 'from' to 'to' on behalf of the current thread via kern_rename().
  * Both paths must be kernel-space strings (UIO_SYSSPACE).
  */
 static __inline int
 vn_rename(char *from, char *to, enum uio_seg seg)
 {
 
 	ASSERT(seg == UIO_SYSSPACE);
 
 	return (kern_rename(curthread, from, to, seg));
 }
 
 /* The only 'dirflag' value accepted by vn_remove(). */
 enum rm	{ RMFILE };
 /*
  * Unlink 'fnamep' on behalf of the current thread via kern_unlink().
  * The path must be a kernel-space string and 'dirflag' must be RMFILE.
  */
 static __inline int
 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
 {
 
 	ASSERT(seg == UIO_SYSSPACE);
 	ASSERT(dirflag == RMFILE);
 
 	return (kern_unlink(curthread, fnamep, seg));
 }
 
 #endif	/* _OPENSOLARIS_SYS_VNODE_H_ */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c	(revision 175202)
@@ -1,884 +1,884 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /* Portions Copyright 2007 Shivakumar GN */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/cmn_err.h>
 #include <sys/debug.h>
 #include <sys/dirent.h>
 #include <sys/kmem.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/sysmacros.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/cred.h>
 #include <sys/kdb.h>
 
 #include <sys/gfs.h>
 
 /*
  * Generic pseudo-filesystem routines.
  *
  * There are significant similarities between the implementation of certain file
  * system entry points across different filesystems.  While one could attempt to
  * "choke up on the bat" and incorporate common functionality into a VOP
  * preamble or postamble, such an approach is limited in the benefit it can
  * provide.  In this file we instead define a toolkit of routines which can be
  * called from a filesystem (with in-kernel pseudo-filesystems being the focus
  * of the exercise) in a more component-like fashion.
  *
  * There are three basic classes of routines:
  *
  * 1) Lowlevel support routines
  *
  *    These routines are designed to play a support role for existing
  *    pseudo-filesystems (such as procfs).  They simplify common tasks,
  *    without enforcing the filesystem to hand over management to GFS.  The
  *    routines covered are:
  *
  *	gfs_readdir_init()
  *	gfs_readdir_emit()
  *	gfs_readdir_emitn()
  *	gfs_readdir_pred()
  *	gfs_readdir_fini()
  *	gfs_lookup_dot()
  *
  * 2) Complete GFS management
  *
  *    These routines take a more active role in management of the
  *    pseudo-filesystem.  They handle the relationship between vnode private
  *    data and VFS data, as well as the relationship between vnodes in the
  *    directory hierarchy.
  *
  *    In order to use these interfaces, the first member of every private
  *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
  *    to GFS.
  *
  * 	gfs_file_create()
  * 	gfs_dir_create()
  * 	gfs_root_create()
  *
  *	gfs_file_inactive()
  *	gfs_dir_inactive()
  *	gfs_dir_lookup()
  *	gfs_dir_readdir()
  *
  * 	gfs_vop_inactive()
  * 	gfs_vop_lookup()
  * 	gfs_vop_readdir()
  * 	gfs_vop_map()
  *
  * 3) Single File pseudo-filesystems
  *
  *    This routine creates a rooted file to be overlaid on top of another
  *    file in the physical filespace.
  *
  *    Note that the parent is NULL (actually the vfs), but there is nothing
  *    technically keeping such a file from utilizing the "Complete GFS
  *    management" set of routines.
  *
  * 	gfs_root_create_file()
  */
 
 /*
  * Low level directory routines
  *
  * These routines provide some simple abstractions for reading directories.
  * They are designed to be used by existing pseudo filesystems (namely procfs)
  * that already have a complicated management infrastructure.
  */
 
 /*
  * gfs_readdir_init: initiate a generic readdir
  *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
  *   name_max	- the directory's maximum file name length
  *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
  *   uiop	- the uiop passed to readdir
  *   parent	- the parent directory's inode
  *   self	- this directory's inode
  *
  * Returns 0 or a non-zero errno.
  *
  * Typical VOP_READDIR usage of gfs_readdir_*:
  *
  *	if ((error = gfs_readdir_init(...)) != 0)
  *		return (error);
  *	eof = 0;
  *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
  *		if (!consumer_entry_at(voffset))
  *			voffset = consumer_next_entry(voffset);
  *		if (consumer_eof(voffset)) {
  *			eof = 1;
  *			break;
  *		}
  *		if ((error = gfs_readdir_emit(..., voffset,
  *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
  *			break;
  *	}
  *	return (gfs_readdir_fini(..., error, eofp, eof));
  *
  * As you can see, a zero result from gfs_readdir_pred() or
  * gfs_readdir_emit() indicates that processing should continue,
  * whereas a non-zero result indicates that the loop should terminate.
  * Most consumers need do nothing more than let gfs_readdir_fini()
  * determine what the cause of failure was and return the appropriate
  * value.
  */
 int
 gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
     uio_t *uiop, ino64_t parent, ino64_t self)
 {
 	/*
 	 * Reject a negative offset, an empty buffer, or an offset that is
 	 * not a multiple of the record length.
 	 * NOTE(review): ureclen is used as a divisor without a zero check.
 	 */
 	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
 	    (uiop->uio_loffset % ureclen) != 0)
 		return (EINVAL);
 
 	st->grd_ureclen = ureclen;
 	/* Remember the starting resid so "nothing emitted yet" is detectable. */
 	st->grd_oresid = uiop->uio_resid;
 	st->grd_namlen = name_max;
 	/* Zeroed scratch dirent, released by gfs_readdir_fini(). */
 	st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
 	st->grd_parent = parent;
 	st->grd_self = self;
 
 	return (0);
 }
 
 /*
  * gfs_readdir_emit_int: internal routine to emit directory entry
  *
  *   st		- the current readdir state, which must have d_ino and d_name
  *                set
  *   uiop	- caller-supplied uio pointer
  *   next	- the offset of the next entry
  */
 static int
 gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
     int *ncookies, u_long **cookies)
 {
 	int reclen, namlen;
 
 	/* The record length is derived from the actual name, not grd_namlen. */
 	namlen = strlen(st->grd_dirent->d_name);
 	reclen = DIRENT64_RECLEN(namlen);
 
 	if (reclen > uiop->uio_resid) {
 		/*
 		 * Error if no entries were returned yet
 		 */
 		if (uiop->uio_resid == st->grd_oresid)
 			return (EINVAL);
 		/* Buffer full after at least one entry: end the loop. */
 		return (-1);
 	}
 
 	/* XXX: This can change in the future. */
 	st->grd_dirent->d_type = DT_DIR;
 	st->grd_dirent->d_reclen = (ushort_t)reclen;
 	st->grd_dirent->d_namlen = namlen;
 
 	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
 		return (EFAULT);
 
 	uiop->uio_loffset = next;
 	/*
 	 * When a cookie array is supplied (*cookies != NULL), record the
 	 * offset of the next entry and advance the array.  'cookies' itself
 	 * is dereferenced unconditionally and so must not be NULL.
 	 */
 	if (*cookies != NULL) {
 		**cookies = next;
 		(*cookies)++;
 		(*ncookies)--;
 		KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
 	}
 
 	return (0);
 }
 
 /*
  * gfs_readdir_emit: emit a directory entry
  *   voff       - the virtual offset (obtained from gfs_readdir_pred)
  *   ino        - the entry's inode
  *   name       - the entry's name
  *
  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
  * readdir loop should terminate.  A non-zero result (either errno or
  * -1) from this function is typically passed directly to
  * gfs_readdir_fini().
  */
 int
 gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
     ino64_t ino, const char *name, int *ncookies, u_long **cookies)
 {
 	/* Virtual offsets start after the "." and ".." slots, hence the +2. */
 	offset_t off = (voff + 2) * st->grd_ureclen;
 
 	st->grd_dirent->d_ino = ino;
 	/*
 	 * NOTE(review): strncpy() does not terminate a name of grd_namlen or
 	 * more bytes; termination then relies on the zeroed dirent buffer
 	 * having room beyond grd_namlen -- confirm via DIRENT64_RECLEN().
 	 */
 	(void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
 
 	/*
 	 * Inter-entry offsets are invalid, so we assume a record size of
 	 * grd_ureclen and explicitly set the offset appropriately.
 	 */
 	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies,
 	    cookies));
 }
 
 /*
  * gfs_readdir_pred: readdir loop predicate
  *   voffp - a pointer in which the next virtual offset should be stored
  *
  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
  * readdir loop should terminate.  A non-zero result (either errno or
  * -1) from this function is typically passed directly to
  * gfs_readdir_fini().
  */
 int
 gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp,
     int *ncookies, u_long **cookies)
 {
 	offset_t off, voff;
 	int error;
 
 top:
 	/* No room left in the caller's buffer: end the loop. */
 	if (uiop->uio_resid <= 0)
 		return (-1);
 
 	/*
 	 * Record slots 0 and 1 are "." and ".."; they are emitted here and
 	 * the position is re-examined.  Any later slot is handed back to the
 	 * caller as a virtual offset (slot number minus 2).
 	 */
 	off = uiop->uio_loffset / st->grd_ureclen;
 	voff = off - 2;
 	if (off == 0) {
 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
 		    ".", ncookies, cookies)) == 0)
 			goto top;
 	} else if (off == 1) {
 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
 		    "..", ncookies, cookies)) == 0)
 			goto top;
 	} else {
 		*voffp = voff;
 		return (0);
 	}
 
 	/* Emitting "." or ".." failed; *voffp has not been set. */
 	return (error);
 }
 
 /*
  * gfs_readdir_fini: generic readdir cleanup
  *   error	- if positive, an error to return
  *   eofp	- the eofp passed to readdir
  *   eof	- the eof value
  *
  * Returns a 0 on success, a non-zero errno on failure.  This result
  * should be returned from readdir.
  */
 int
 gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
 {
 	/* Release the scratch dirent allocated by gfs_readdir_init(). */
 	kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
 	if (error > 0)
 		return (error);
 	/*
 	 * A zero or negative 'error' (-1 is the "loop should terminate"
 	 * value) is reported to the caller as success.
 	 */
 	if (eofp)
 		*eofp = eof;
 	return (0);
 }
 
 /*
  * gfs_lookup_dot
  *
  * Performs a basic check for "." and ".." directory entries.
  */
 int
 gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
 {
 	/*
 	 * Returns 0 with a held vnode in *vpp when 'nm' is empty, "." or
 	 * "..", and -1 (leaving *vpp untouched) for any other name.
 	 */
 	if (*nm == '\0' || strcmp(nm, ".") == 0) {
 		/* The directory itself: held, but not locked by this routine. */
 		VN_HOLD(dvp);
 		*vpp = dvp;
 		return (0);
 	} else if (strcmp(nm, "..") == 0) {
 		if (pvp == NULL) {
 			/* No parent given: ".." resolves to the directory itself. */
 			ASSERT(dvp->v_flag & VROOT);
 			VN_HOLD(dvp);
 			*vpp = dvp;
 		} else {
 			VN_HOLD(pvp);
 			*vpp = pvp;
 		}
 		/* In the ".." case the result is also returned locked exclusive. */
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * gfs_file_create(): create a new GFS file
  *
  *   size	- size of private data structure (v_data)
  *   pvp	- parent vnode (GFS directory); may be NULL
  *   vfsp	- filesystem the new vnode is created on and inserted into
  *   ops	- vnode operations vector
  *
  * In order to use this interface, the parent vnode must have been created by
  * gfs_dir_create(), and the private data stored in v_data must have a
  * 'gfs_file_t' as its first field.
  *
  * Given these constraints, this routine will automatically:
  *
  * 	- Allocate zero-filled v_data for the vnode
  * 	- Initialize the gfs_file_t fields and link it to the vnode
  * 	- Insert the vnode into the mount's vnode list
  * 	- Hold the parent, when one was given
  */
 vnode_t *
 gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
 {
 	gfs_file_t *node;
 	vnode_t *newvp;
 	int error;
 
 	/* Private data first, then the vnode that will carry it. */
 	node = kmem_zalloc(size, KM_SLEEP);
 	error = getnewvnode("zfs", vfsp, ops, &newvp);
 	ASSERT(error == 0);
 
 	/* Cross-link the vnode and its private data. */
 	newvp->v_data = (caddr_t)node;
 	node->gfs_vnode = newvp;
 	node->gfs_type = GFS_FILE;
 	node->gfs_size = size;
 	node->gfs_parent = pvp;
 
 	error = insmntque(newvp, vfsp);
 	KASSERT(error == 0, ("insmntque() failed: error %d", error));
 
 	/* A node created without a parent has nothing to hold. */
 	if (pvp != NULL)
 		VN_HOLD(pvp);
 
 	return (newvp);
 }
 
 /*
  * gfs_dir_create: creates a new directory in the parent
  *
  *   struct_size - size of private data structure (v_data)
  *   pvp	- parent vnode (GFS directory)
  *   vfsp	- filesystem the directory vnode belongs to
  *   ops	- vnode operations vector
  *   entries	- NULL-terminated list of static entries (if any)
  *   inode_cb	- inode callback (see gfs_dir_readdir)
  *   maxlen	- maximum length of a directory entry
  *   readdir_cb	- readdir callback (see gfs_dir_readdir)
  *   lookup_cb	- lookup callback (see gfs_dir_lookup)
  *
  * The first member of the private vnode structure (v_data) must be a
  * gfs_dir_t.  A directory has static entries, fixed at creation time, and
  * dynamic entries, obtained through callbacks.
  *
  * A directory with static entries must supply an inode callback, which
  * computes the inode number from the parent and the index.  A directory with
  * dynamic entries must supply a readdir callback and a lookup callback.  If
  * a static lookup fails, the lookup callback (if any) is tried.
  *
  * The static entry array is copied, so the caller's array is not retained.
  * This function also performs the same initialization as gfs_file_create().
  */
 vnode_t *
 gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops,
     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
 {
 	gfs_dirent_t *de;
 	gfs_dir_t *dp;
 	vnode_t *vp;
 
 	/* Start from a plain GFS file and promote it to a directory. */
 	vp = gfs_file_create(struct_size, pvp, vfsp, ops);
 	vp->v_type = VDIR;
 
 	dp = vp->v_data;
 	dp->gfsd_file.gfs_type = GFS_DIR;
 	dp->gfsd_maxlen = maxlen;
 	dp->gfsd_inode = inode_cb;
 	dp->gfsd_readdir = readdir_cb;
 	dp->gfsd_lookup = lookup_cb;
 
 	if (entries != NULL) {
 		size_t nbytes;
 
 		/* Count up to, but not including, the NULL-name terminator. */
 		for (de = entries; de->gfse_name != NULL; de++)
 			dp->gfsd_nstatic++;
 
 		nbytes = dp->gfsd_nstatic * sizeof (gfs_dirent_t);
 		dp->gfsd_static = kmem_alloc(nbytes, KM_SLEEP);
 		bcopy(entries, dp->gfsd_static, nbytes);
 	}
 
 	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (vp);
 }
 
 /*
  * gfs_root_create(): create a root vnode for a GFS filesystem
  *
  * Works like gfs_dir_create(), except that the new directory has no parent
  * vnode: it takes a hold on the vfs_t instead, has its inode number set
  * directly from 'ino', and is flagged VROOT.
  */
 vnode_t *
 gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
 {
 	gfs_file_t *fp;
 	vnode_t *rootvp;
 
 	VFS_HOLD(vfsp);
 	rootvp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb,
 	    maxlen, readdir_cb, lookup_cb);
 
 	/* There is no parent to derive an inode from, so set it by hand. */
 	fp = (gfs_file_t *)rootvp->v_data;
 	fp->gfs_ino = ino;
 	rootvp->v_flag |= VROOT;
 
 	return (rootvp);
 }
 
 /*
  * gfs_file_inactive()
  *
  * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
  * remove the given vnode from the parent directory and clean up any references
  * in the VFS layer.
  *
  * If the vnode was not removed (due to a race with vget), then NULL is
  * returned.  Otherwise, a pointer to the private data is returned.
  *
  * NOTE(review): as written here, every path returns vp->v_data; no path
  * returns NULL, so the race case described above is not detected.
  */
 void *
 gfs_file_inactive(vnode_t *vp)
 {
 	int i;
 	gfs_dirent_t *ge = NULL;
 	gfs_file_t *fp = vp->v_data;
 	gfs_dir_t *dp = NULL;
 	void *data;
 
 	/* No parent directory: there is no static-entry cache to search. */
 	if (fp->gfs_parent == NULL)
 		goto found;
 
 	dp = fp->gfs_parent->v_data;
 
 	/*
 	 * First, see if this vnode is cached in the parent.
 	 */
 	gfs_dir_lock(dp);
 
 	/*
 	 * Find it in the set of static entries.
 	 */
 	for (i = 0; i < dp->gfsd_nstatic; i++)  {
 		ge = &dp->gfsd_static[i];
 
 		if (ge->gfse_vnode == vp)
 			goto found;
 	}
 
 	/*
 	 * If 'ge' is NULL, then it is a dynamic entry.
 	 * (The loop leaves 'ge' pointing at the last static entry when
 	 * nothing matched, hence the explicit reset.)
 	 */
 	ge = NULL;
 
 found:
 	/*
 	 * When there is a parent, its directory lock is still held here and
 	 * is only dropped in the release step at the bottom.
 	 */
 	VI_LOCK(vp);
 	ASSERT(vp->v_count < 2);
 	/*
 	 * Really remove this vnode
 	 */
 	data = vp->v_data;
 	if (ge != NULL) {
 		/*
 		 * If this was a statically cached entry, simply set the
 		 * cached vnode to NULL.
 		 */
 		ge->gfse_vnode = NULL;
 	}
 	/*
 	 * NOTE(review): vdropl() is called with the interlock held and is not
 	 * followed by VI_UNLOCK(), so it presumably releases the interlock
 	 * itself; the other branch unlocks explicitly.  Confirm.
 	 */
 	if (vp->v_count == 1) {
 		vp->v_usecount--;
 		vdropl(vp);
 	} else {
 		VI_UNLOCK(vp);
 	}
 
 	/*
 	 * Free vnode and release parent
 	 * (presumably undoing the VN_HOLD(pvp) from gfs_file_create(), or
 	 * the VFS_HOLD() from gfs_root_create() for a parentless node).
 	 */
 	if (fp->gfs_parent) {
 		gfs_dir_unlock(dp);
 		VI_LOCK(fp->gfs_parent);
 		fp->gfs_parent->v_usecount--;
 		VI_UNLOCK(fp->gfs_parent);
 	} else {
 		ASSERT(vp->v_vfsp != NULL);
 		VFS_RELE(vp->v_vfsp);
 	}
 
 	return (data);
 }
 
 /*
  * gfs_dir_inactive()
  *
  * Directory flavour of gfs_file_inactive().  On top of the file teardown it
  * destroys the directory mutex and frees the copied static entry array.
  * Returns whatever gfs_file_inactive() returned; the caller still owns (and
  * must free) that private data.
  */
 void *
 gfs_dir_inactive(vnode_t *vp)
 {
 	gfs_dir_t *dp;
 
 	ASSERT(vp->v_type == VDIR);
 
 	dp = gfs_file_inactive(vp);
 	if (dp == NULL)
 		return (NULL);
 
 	mutex_destroy(&dp->gfsd_lock);
 
 	/* The array only exists when static entries were counted. */
 	if (dp->gfsd_nstatic != 0) {
 		kmem_free(dp->gfsd_static,
 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
 	}
 
 	return (dp);
 }
 
 /*
  * gfs_dir_lookup()
  *
  * Looks up the given name in the directory and returns the corresponding vnode,
  * if found.
  *
  * "", "." and ".." are resolved first through gfs_lookup_dot().
  *
  * Next, we search statically defined entries, if any.  If a match is found,
  * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
  * existing vnode.  Otherwise, we call the static entry's callback routine,
  * caching the result if necessary.
  *
  * If no static entry is found, we invoke the lookup callback, if any.  The
  * arguments to this callback, as it is called below, are:
  *
  *	int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
  *	    ino64_t *inop);
  *
  *	pvp	- parent vnode
  *	nm	- name of entry
  *	vpp	- pointer to resulting vnode
  *	inop	- pointer to the inode number to record for the new vnode
  *
  * 	Returns 0 on success, non-zero on error.
  *
  * On failure *vpp is set to NULL.  With no match and no lookup callback the
  * result is ENOENT.
  */
 int
 gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
 {
 	int i;
 	gfs_dirent_t *ge;
 	vnode_t *vp;
 	gfs_dir_t *dp = dvp->v_data;
 	int ret = 0;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
 		return (0);
 
 	gfs_dir_lock(dp);
 
 	/*
 	 * Search static entries.
 	 */
 	for (i = 0; i < dp->gfsd_nstatic; i++) {
 		ge = &dp->gfsd_static[i];
 
 		if (strcmp(ge->gfse_name, nm) == 0) {
 			if (ge->gfse_vnode) {
 				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
 				vp = ge->gfse_vnode;
 				VN_HOLD(vp);
 				goto out;
 			}
 
 			/*
 			 * We drop the directory lock, as the constructor will
 			 * need to do KM_SLEEP allocations.  If we return from
 			 * the constructor only to find that a parallel
 			 * operation has completed, and GFS_CACHE_VNODE is set
 			 * for this entry, we discard the result in favor of the
 			 * cached vnode.
 			 */
 			gfs_dir_unlock(dp);
 			vp = ge->gfse_ctor(dvp);
 			gfs_dir_lock(dp);
 
 			/* Remember which static slot this vnode came from. */
 			((gfs_file_t *)vp->v_data)->gfs_index = i;
 
 			/* Set the inode according to the callback. */
 			((gfs_file_t *)vp->v_data)->gfs_ino =
 			    dp->gfsd_inode(dvp, i);
 
 			if (ge->gfse_flags & GFS_CACHE_VNODE) {
 				if (ge->gfse_vnode == NULL) {
 					ge->gfse_vnode = vp;
 				} else {
 					/*
 					 * A parallel constructor beat us to it;
 					 * return existing vnode.  We have to be
 					 * careful because we can't release the
 					 * current vnode while holding the
 					 * directory lock; its inactive routine
 					 * will try to lock this directory.
 					 */
 					vnode_t *oldvp = vp;
 					vp = ge->gfse_vnode;
 					VN_HOLD(vp);
 
 					gfs_dir_unlock(dp);
 					VN_RELE(oldvp);
 					gfs_dir_lock(dp);
 				}
 			}
 
 			goto out;
 		}
 	}
 
 	/*
 	 * See if there is a dynamic constructor.
 	 */
 	if (dp->gfsd_lookup) {
 		ino64_t ino;
 		gfs_file_t *fp;
 
 		/*
 		 * Once again, drop the directory lock, as the lookup routine
 		 * will need to allocate memory, or otherwise deadlock on this
 		 * directory.
 		 */
 		gfs_dir_unlock(dp);
 		ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
 		gfs_dir_lock(dp);
 		if (ret != 0)
 			goto out;
 
 		/* Dynamic entries have no static slot; mark with index -1. */
 		fp = (gfs_file_t *)vp->v_data;
 		fp->gfs_index = -1;
 		fp->gfs_ino = ino;
 	} else {
 		/*
 		 * No static entry found, and there is no lookup callback, so
 		 * return ENOENT.
 		 */
 		ret = ENOENT;
 	}
 
 out:
 	/* Every path reaching this label holds the directory lock. */
 	gfs_dir_unlock(dp);
 
 	if (ret == 0)
 		*vpp = vp;
 	else
 		*vpp = NULL;
 
 	return (ret);
 }
 
 /*
  * gfs_dir_readdir: does a readdir() on the given directory
  *
  *    dvp	- directory vnode
  *    uiop	- uio structure
  *    eofp	- eof pointer
  *    ncookies	- cookie count, forwarded unchanged to gfs_readdir_pred()
  *		  and the gfs_readdir_emit*() helpers
  *    cookies	- cookie array cursor, forwarded the same way
  *    data	- arbitrary data passed to readdir callback
  *
  * This routine does all the readdir() dirty work.  Even so, the caller must
  * supply two callbacks in order to get full compatibility.
  *
  * If the directory contains static entries, an inode callback must be
  * specified.  This avoids having to create every vnode and call VOP_GETATTR()
  * when reading the directory.  This function has the following arguments:
  *
  *	ino_t gfs_inode_cb(vnode_t *vp, int index);
  *
  * 	vp	- vnode for the directory
  * 	index	- index in original gfs_dirent_t array
  *
  * 	Returns the inode number for the given entry.
  *
  * For directories with dynamic entries, a readdir callback must be provided.
  * This is significantly more complex, thanks to the particulars of
  * VOP_READDIR().
  *
  *	int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
  *	    offset_t *off, offset_t *nextoff, void *data)
  *
  *	vp	- directory vnode
  *	dp	- directory entry, sized according to maxlen given to
  *		  gfs_dir_create().  callback must fill in d_name and
  *		  d_ino.
  *	eofp	- callback must set to 1 when EOF has been reached
  *	off	- on entry, the last offset read from the directory.  Callback
  *		  must set to the offset of the current entry, typically left
  *		  untouched.
  *	nextoff	- callback must set to offset of next entry.  Typically
  *		  (off + 1)
  *	data	- caller-supplied data
  *
  *	Return 0 on success, or error on failure.
  */
 int
 gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
     u_long **cookies, void *data)
 {
 	gfs_readdir_state_t gstate;
 	int error, eof = 0;
 	ino64_t ino, pino;
 	offset_t off, next;
 	gfs_dir_t *dp = dvp->v_data;
 
 	ino = dp->gfsd_file.gfs_ino;
 
 	/* Inode reported for "..": our own when there is no parent. */
 	if (dp->gfsd_file.gfs_parent == NULL)
 		pino = ino;		/* root of filesystem */
 	else
 		pino = ((gfs_file_t *)
 		    (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
 
 	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
 	    pino, ino)) != 0)
 		return (error);
 
 	/*
 	 * gfs_readdir_pred() emits "." and ".." itself and then yields, in
 	 * 'off', the offset with those two entries already subtracted.
 	 */
 	while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
 	    cookies)) == 0 && !eof) {
 
 		if (off >= 0 && off < dp->gfsd_nstatic) {
 			/* Static entry: name from the table, inode from cb. */
 			ino = dp->gfsd_inode(dvp, off);
 
 			if ((error = gfs_readdir_emit(&gstate, uiop,
 			    off, ino, dp->gfsd_static[off].gfse_name, ncookies,
 			    cookies)) != 0)
 				break;
 
 		} else if (dp->gfsd_readdir) {
 			/* Make the offset relative to the first dynamic entry. */
 			off -= dp->gfsd_nstatic;
 
 			if ((error = dp->gfsd_readdir(dvp,
 			    gstate.grd_dirent, &eof, &off, &next,
 			    data)) != 0 || eof)
 				break;
 
 			/* Convert back to absolute directory offsets. */
 			off += dp->gfsd_nstatic + 2;
 			next += dp->gfsd_nstatic + 2;
 
 			if ((error = gfs_readdir_emit_int(&gstate, uiop,
 			    next, ncookies, cookies)) != 0)
 				break;
 		} else {
 			/*
 			 * Offset is beyond the end of the static entries, and
 			 * we have no dynamic entries.  Set EOF.
 			 */
 			eof = 1;
 		}
 	}
 
 	/* Only a positive error is reported; see gfs_readdir_fini(). */
 	return (gfs_readdir_fini(&gstate, error, eofp, eof));
 }
 
 /*
  * gfs_vop_readdir: VOP_READDIR() entry point
  *
  * For use directly in vnode ops table.  Given a GFS directory, calls
  * gfs_dir_readdir() as necessary.
  */
 /* ARGSUSED */
 int
 gfs_vop_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	uio_t *uiop = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	int ncookies = 0;
 	u_long *cookies = NULL;
 	int error;
 
 	if (ap->a_ncookies) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
 		*ap->a_cookies = cookies;
 		*ap->a_ncookies = ncookies;
 	}
 
 	error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
 
 	if (error == 0) {
 		/* Subtract unused cookies */
 		if (ap->a_ncookies)
 			*ap->a_ncookies -= ncookies;
 	} else if (ap->a_ncookies) {
 		free(*ap->a_cookies, M_TEMP);
 		*ap->a_cookies = NULL;
 		*ap->a_ncookies = 0;
 	}
 
 	return (error);
 }
 
 /*
  * gfs_vop_inactive: VOP_INACTIVE() entry point
  *
  * Dispatches on the node's gfs_type to gfs_dir_inactive() or
  * gfs_file_inactive(), kmem_free()s the private data that comes back, and
  * clears v_data.  Always returns 0.  Only ap->a_vp is read.
  */
 /* ARGSUSED */
 int
 gfs_vop_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	gfs_file_t *fp = vp->v_data;
 	void *priv;
 
 	priv = (fp->gfs_type == GFS_DIR) ?
 	    gfs_dir_inactive(vp) : gfs_file_inactive(vp);
 
 	/* gfs_size is the allocation size recorded by gfs_file_create(). */
 	if (priv != NULL)
 		kmem_free(priv, fp->gfs_size);
 
 	vp->v_data = NULL;
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 175202)
@@ -1,1119 +1,1119 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/gfs.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/mount.h>
 
 /*
  * One entry in a snapshot directory's sd_snaps AVL tree, describing a
  * snapshot that has been looked up under .zfs/snapshot.
  */
 typedef struct {
 	char		*se_name;	/* snapshot name; the tree's sort key */
 	vnode_t		*se_root;	/* GFS vnode the snapshot mounts on */
 	avl_node_t	se_node;	/* linkage into sd_snaps */
 } zfs_snapentry_t;
 
 /*
  * Ordering function for zfs_snapentry_t: entries sort by se_name using
  * strcmp(), with the result clamped to exactly -1, 0 or 1.
  */
 static int
 snapentry_compare(const void *a, const void *b)
 {
 	const zfs_snapentry_t *lhs = a;
 	const zfs_snapentry_t *rhs = b;
 	int order = strcmp(lhs->se_name, rhs->se_name);
 
 	return ((order > 0) - (order < 0));
 }
 
 /*
  * Tentative definitions of the vnode operation vectors, one per kind of
  * node in the .zfs hierarchy, so they can be referenced before their
  * initializers appear later in the file.
  */
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 
 /* Node constructors used before their definitions. */
 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 
 /*
  * Private data (v_data) for a .zfs control node.  The gfs_dir_t is the first
  * member, as gfs_dir_create() requires of the private structure.
  */
 typedef struct zfsctl_node {
 	gfs_dir_t	zc_gfs_private;
 	uint64_t	zc_id;		/* object id encoded by zfsctl_common_fid() */
 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
 } zfsctl_node_t;
 
 /*
  * Private data for the 'snapshot' directory: a control node plus the set of
  * snapshots that have been looked up through it.
  */
 typedef struct zfsctl_snapdir {
 	zfsctl_node_t	sd_node;	/* first member, so it is also a gfs_dir_t */
 	kmutex_t	sd_lock;	/* held while sd_snaps is searched or changed */
 	avl_tree_t	sd_snaps;	/* zfs_snapentry_t entries, looked up by name */
 } zfsctl_snapdir_t;
 
 /*
  * Root directory elements.  We have only a single static entry, 'snapshot'.
  * Its vnode is built by zfsctl_mknode_snapdir() and, because of
  * GFS_CACHE_VNODE, cached in the entry once created.  The { NULL } element
  * terminates the list.
  */
 static gfs_dirent_t zfsctl_root_entries[] = {
 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
 	{ NULL }
 };
 
 /*
  * include . and .. in the calculation
  *
  * The array length already counts the NULL terminator, so adding 1 to it
  * gives (number of real entries) + 2, i.e. 3 with the table above.
  */
 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
     sizeof (gfs_dirent_t)) + 1)
 
 
 /*
  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
  * directories.  This is called from the ZFS init routine, and initializes the
  * vnode ops vectors that we'll be using.
  *
  * In this version the body is empty: the vop_vector tables in this file are
  * filled in by static initializers, so nothing is done at run time.
  */
 void
 zfsctl_init(void)
 {
 }
 
 /*
  * Counterpart of zfsctl_init().  Empty, as zfsctl_init() sets nothing up.
  */
 void
 zfsctl_fini(void)
 {
 }
 
 /*
  * Return the inode number associated with the 'snapshot' directory.
  *
  * Passed to gfs_root_create() as the inode callback for the static entries
  * of the .zfs root.  There is only one such entry, so index must be 0; vp is
  * not used.
  */
 /* ARGSUSED */
 static ino64_t
 zfsctl_root_inode_cb(vnode_t *vp, int index)
 {
 	ASSERT(index == 0);
 	return (ZFSCTL_INO_SNAPDIR);
 }
 
 /*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
  * therefore checks against a vfs_count of 2 instead of 1.  This reference
  * is removed when the ctldir is destroyed in the unmount.
  *
  * The new vnode is stored in zfsvfs->z_ctldir, which must be NULL on entry.
  * Its ctime/mtime are taken from the creation time of the filesystem's root
  * znode.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	vnode_t *vp, *rvp;
 	zfsctl_node_t *zcp;
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	/* No readdir/lookup callbacks: the root has static entries only. */
 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
 	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
 	zcp = vp->v_data;
 	zcp->zc_id = ZFSCTL_INO_ROOT;
 
 	/* Borrow the real root vnode just long enough to read its crtime. */
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
 	VN_URELE(rvp);
 
 	/*
 	 * We're only faking the fact that we have a root of a filesystem for
 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
 	 * for us.
 	 *
 	 * NOTE(review): gfs_root_create() sets v_flag |= VROOT while this
 	 * clears VV_ROOT in v_vflag; presumably the former are compat aliases
 	 * for the latter -- confirm.
 	 */
 	vp->v_vflag &= ~VV_ROOT;
 
 	zfsvfs->z_ctldir = vp;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  * There might still be more references if we were force unmounted, but only
  * new zfs_inactive() calls can occur and they don't reference .zfs
  *
  * Drops the reference held through zfsvfs->z_ctldir and clears the pointer.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	VN_RELE(zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 /*
  * Given a root znode, return the .zfs directory vnode of its filesystem
  * with a new hold on it.  The vnode is not locked here.
  */
 vnode_t *
 zfsctl_root(znode_t *zp)
 {
 	vnode_t *ctldir;
 
 	ASSERT(zfs_has_ctldir(zp));
 
 	ctldir = zp->z_zfsvfs->z_ctldir;
 	VN_HOLD(ctldir);
 
 	return (ctldir);
 }
 
 /*
  * Common open routine.  The control nodes are read-only: an open whose mode
  * includes FWRITE fails with EACCES, anything else succeeds.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	return ((ap->a_mode & FWRITE) ? EACCES : 0);
 }
 
 /*
  * Common close routine.  Nothing to do here.
  *
  * Always returns 0; ap is not examined.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 /*
  * Common access routine.  A request that includes VWRITE is refused with
  * EACCES; every other request is granted.  The credential and thread
  * arguments are not consulted.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	return ((ap->a_mode & VWRITE) ? EACCES : 0);
 }
 
 /*
  * Common getattr function.  Fills in the attributes that are the same for
  * every control node; the caller sets the remaining fields (for example
  * va_nodeid, va_nlink and va_size) itself.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	zfsctl_node_t	*zcp = vp->v_data;
 	timestruc_t	now;
 
 	/* Every node is a root-owned directory, readable and searchable. */
 	vap->va_type = VDIR;
 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 	    S_IROTH | S_IXOTH;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 	vap->va_seq = 0;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 
 	/*
 	 * atime is always the current time; the other three timestamps all
 	 * come from the node's single stored time.
 	 */
 	gethrestime(&now);
 	vap->va_atime = now;
 	vap->va_birthtime = zcp->zc_cmtime;
 	vap->va_ctime = vap->va_birthtime;
 	vap->va_mtime = vap->va_ctime;
 
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 }
 
 /*
  * Common fid routine.  Builds a short file identifier for a control node
  * from its zc_id: the object id is stored one byte at a time, least
  * significant byte first, and the generation bytes are all zero.
  */
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_node_t	*zcp = vp->v_data;
 	fid_t		*fidp = (void *)ap->a_fid;
 	zfid_short_t	*zfid = (zfid_short_t *)fidp;
 	uint64_t	object = zcp->zc_id;
 	int		i;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* The length is written through both views of the same buffer. */
 	fidp->fid_len = SHORT_FID_LEN;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++) {
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 	}
 
 	/* .zfs znodes always have a generation number of 0 */
 	for (i = 0; i < sizeof (zfid->zf_gen); i++) {
 		zfid->zf_gen[i] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Common reclaim routine.  Destroys the vnode's VM object and then clears
  * v_data under the vnode interlock.  The private data itself is not freed
  * here.  Always returns 0.
  */
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 	VI_LOCK(vp);
 	vp->v_data = NULL;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  *
  * ZFSCTL_INO_SNAP() is therefore the identity mapping on the snapshot's
  * object id.
  */
 
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 /*
  * Get root directory attributes.  Reports the fixed .zfs root inode number,
  * uses NROOT_ENTRIES for both the size and the link count, and leaves the
  * rest to zfsctl_common_getattr().
  */
 /* ARGSUSED */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	ZFS_ENTER(zfsvfs);
 
 	vap->va_nodeid = ZFSCTL_INO_ROOT;
 	vap->va_size = NROOT_ENTRIES;
 	vap->va_nlink = vap->va_size;
 	zfsctl_common_getattr(vp, vap);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Lookup in the .zfs root directory.  ".." is special: it resolves to the
  * root vnode of the filesystem, which VFS_ROOT() returns locked and which is
  * unlocked again here before being handed back.  Every other name goes
  * through gfs_dir_lookup().  pnp, flags, rdir and cr are not used.
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
     int flags, vnode_t *rdir, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (strcmp(nm, "..") != 0) {
 		err = gfs_dir_lookup(dvp, nm, vpp);
 	} else {
 		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
 		if (err == 0)
 			VOP_UNLOCK(*vpp, 0, curthread);
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (err);
 }
 
 /*
  * Special case the handling of "..".
  *
  * VOP_LOOKUP() entry point for the .zfs root.  Copies the component name
  * into a NUL-terminated local buffer, refuses RENAME/CREATE on the last
  * component with EOPNOTSUPP, and defers to zfsctl_root_lookup().  On success
  * the result is locked exclusively, except when the name is exactly ".".
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup_vop(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	char nm[NAME_MAX + 1];
 	int err;
 
 	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
 		return (EOPNOTSUPP);
 
 	/* strlcpy() copies at most cn_namelen bytes and NUL-terminates. */
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
 	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 
 	return (err);
 }
 
 /*
  * Vnode operations for the .zfs root directory.  Anything not listed falls
  * through to default_vnodeops; ioctl is rejected with VOP_EINVAL.
  */
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_root_lookup_vop,
 	.vop_inactive =	gfs_vop_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Build the full dataset name "<objset name>@<name>" for a snapshot of the
  * filesystem that vp lives on.
  *
  *   vp	- any vnode on the filesystem
  *   name	- snapshot component
  *   len	- size of the zname buffer
  *   zname	- output buffer
  *
  * Returns 0 on success, or ENAMETOOLONG when the combined name would not fit
  * in len bytes.  The objset name is written to zname before the length is
  * checked, so zname is modified even on failure.
  */
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	dmu_objset_name(zfsvfs->z_os, zname);
 
 	/* Room is needed for the '@' as well as the terminating NUL. */
 	if (strlen(zname) + 1 + strlen(name) >= len)
 		return (ENAMETOOLONG);
 
 	(void) strcat(zname, "@");
 	(void) strcat(zname, name);
 
 	return (0);
 }
 
 /*
  * Unmount one snapshot of a snapshot directory and forget its entry.
  *
  *   dvp	- the snapshot directory vnode; its sd_lock must be held
  *   name	- snapshot name to look up in sd_snaps
  *   force	- passed to dounmount() as its flags argument
  *   cr	- not used
  *
  * Returns ENOENT if no entry with that name exists, the error from
  * vn_vfswlock() or dounmount() if either fails (the entry is kept in that
  * case), and 0 once the snapshot is unmounted and its entry freed.
  */
 static int
 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	zfs_snapentry_t search, *sep;
 	struct vop_inactive_args ap;
 	avl_index_t where;
 	int err;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 
 	/* Only se_name of the search key is used by the comparator. */
 	search.se_name = (char *)name;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
 		return (ENOENT);
 
 	ASSERT(vn_ismntpt(sep->se_root));
 
 	/* this will be dropped by dounmount() */
 	if ((err = vn_vfswlock(sep->se_root)) != 0)
 		return (err);
 
 	err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
 	if (err)
 		return (err);
 	ASSERT(sep->se_root->v_count == 1);
 	/*
 	 * Tear the mount point vnode down by calling the inactive routine
 	 * directly.  Only a_vp is filled in, which is the only field
 	 * gfs_vop_inactive() reads.
 	 */
 	ap.a_vp = sep->se_root;
 	gfs_vop_inactive(&ap);
 
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	kmem_free(sep, sizeof (zfs_snapentry_t));
 
 	return (0);
 }
 
 #if 0
 /*
  * Compiled out.  Re-keys a mounted snapshot's entry in sd_snaps under the
  * new name 'nm' and rewrites the tail of the mount's mount point and
  * resource strings to match.  The caller must hold sd_lock.
  *
  * NOTE(review): strncpy(newpath, ..., sizeof (newpath)) does not guarantee
  * NUL termination, yet newpath is then passed to strrchr()/strcat().  Worth
  * fixing if this code is ever enabled.
  */
 static void
 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
 {
 	avl_index_t where;
 	vfs_t *vfsp;
 	refstr_t *pathref;
 	char newpath[MAXNAMELEN];
 	char *tail;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 	ASSERT(sep != NULL);
 
 	vfsp = vn_mountedvfs(sep->se_root);
 	ASSERT(vfsp != NULL);
 
 	vfs_lock_wait(vfsp);
 
 	/*
 	 * Change the name in the AVL tree.
 	 */
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	/*
 	 * Change the current mountpoint info:
 	 * 	- update the tail of the mntpoint path
 	 *	- update the tail of the resource path
 	 */
 	pathref = vfs_getmntpoint(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setmntpoint(vfsp, newpath);
 
 	pathref = vfs_getresource(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setresource(vfsp, newpath);
 
 	vfs_unlock(vfsp);
 }
 #endif
 
 #if 0
 /*
  * Compiled out.  Renames snapshot 'snm' to 'tnm' within the snapshot
  * directory.  Moving between directories is rejected with EINVAL, renaming
  * to the same name is a no-op, and a snapshot with no entry in sd_snaps
  * yields ENOENT.  The sd_snaps entry is only re-keyed when
  * dmu_objset_rename() succeeds.
  */
 static int
 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
     cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = sdvp->v_data;
 	zfs_snapentry_t search, *sep;
 	avl_index_t where;
 	char from[MAXNAMELEN], to[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(from, cr);
 	if (err)
 		return (err);
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
 	if (sdvp != tdvp)
 		return (EINVAL);
 
 	if (strcmp(snm, tnm) == 0)
 		return (0);
 
 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	search.se_name = (char *)snm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
 		mutex_exit(&sdp->sd_lock);
 		return (ENOENT);
 	}
 
 	err = dmu_objset_rename(from, to, B_FALSE);
 	if (err == 0)
 		zfsctl_rename_snap(sdp, sep, tnm);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 #if 0
 /*
  * Compiled out.  Removes snapshot 'name': after the write-permission check
  * it unmounts the snapshot (non-forced) under sd_lock and then destroys the
  * dataset.  An unmount failure is returned without destroying anything.
  * cwd is not used.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	char snapname[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(snapname, cr);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	err = zfsctl_unmount_snap(dvp, name, 0, cr);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		return (err);
 	}
 
 	err = dmu_objset_destroy(snapname);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  *
  * Outline:
  *	- "", "." and ".." are handled by gfs_lookup_dot(), with the .zfs
  *	  directory as the parent.
  *	- A name already present in sd_snaps returns its vnode held and
  *	  locked, or falls through to the mount step if nothing is mounted
  *	  on it any more.
  *	- Otherwise the snapshot is opened to confirm it exists, a new
  *	  mount point vnode and tree entry are created, and the snapshot is
  *	  mounted on the vnode.
  *
  * Returns 0 with *vpp held and locked, or an errno with *vpp set to NULL.
  */
 /* ARGSUSED */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	char nm[NAME_MAX + 1];
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	objset_t *snap;
 	char snapname[MAXNAMELEN];
 	char *mountpoint;
 	zfs_snapentry_t *sep, search;
 	size_t mountpoint_len;
 	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	/* Make a NUL-terminated copy of the component name. */
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
 		return (0);
 
 	*vpp = NULL;
 
 	/*
 	 * If we get a recursive call, that means we got called
 	 * from the domount() code while it was trying to look up the
 	 * spec (which looks like a local path for zfs).  We need to
 	 * add some flag to domount() to tell it not to do this lookup.
 	 */
 	if (MUTEX_HELD(&sdp->sd_lock))
 		return (ENOENT);
 
 	ZFS_ENTER(zfsvfs);
 
 	mutex_enter(&sdp->sd_lock);
 	search.se_name = (char *)nm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
 		*vpp = sep->se_root;
 		VN_HOLD(*vpp);
 		if ((*vpp)->v_mountedhere == NULL) {
 			/*
 			 * The snapshot was unmounted behind our backs,
 			 * try to remount it.
 			 *
 			 * NOTE(review): snapname has not been filled in on
 			 * this path (zfsctl_snapshot_zname() is only called
 			 * further down), yet domount() below is passed
 			 * snapname.  Looks like an uninitialized buffer is
 			 * used as the mount spec here -- confirm.
 			 */
 			goto domount;
 		}
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * The requested snapshot is not currently mounted, look it up.
 	 */
 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	/* Any failure to open the snapshot is reported as ENOENT. */
 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (ENOENT);
 	}
 
 	/* 'where' still holds the insertion point from the failed avl_find(). */
 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
 	VN_HOLD(*vpp);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	dmu_objset_close(snap);
 domount:
 	/* Mount point is "<mount point of dvp's fs>/.zfs/snapshot/<nm>". */
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
 	err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	/* FreeBSD: This line was moved from below to avoid a lock recursion. */
 	if (err == 0)
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	mutex_exit(&sdp->sd_lock);
 
 	/*
 	 * If we had an error, drop our hold on the vnode and
 	 * zfsctl_snapshot_inactive() will clean up.
 	 *
 	 * NOTE(review): every earlier return after ZFS_ENTER() is preceded
 	 * by ZFS_EXIT(), but this final return is not.  Confirm whether the
 	 * missing ZFS_EXIT(zfsvfs) here is intentional.
 	 */
 	if (err) {
 		VN_RELE(*vpp);
 		*vpp = NULL;
 	}
 	return (err);
 }
 
 /* ARGSUSED */
 /*
  * Readdir callback handed to gfs_dir_create() by zfsctl_mknode_snapdir().
  *
  * Asks dmu_snapshot_list_next() for the snapshot following the cookie in
  * *offp.  When it reports ENOENT the listing is finished and *eofp is
  * set to 1.  Otherwise the snapshot name and an inode number derived
  * from its id are stored in *dp and the advanced cookie is stored in
  * *nextp.  Always returns 0.
  */
 static int
 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
     offset_t *offp, offset_t *nextp, void *data)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	char snapname[MAXNAMELEN];
 	uint64_t id, cookie;
 
 	ZFS_ENTER(zfsvfs);
 
 	cookie = *offp;
 	/*
 	 * NOTE(review): only ENOENT is tested; any other non-zero return
 	 * falls through and uses snapname/id as if they were valid --
 	 * confirm which errors dmu_snapshot_list_next() can return.
 	 */
 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
 	    &cookie) == ENOENT) {
 		*eofp = 1;
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/* NOTE(review): unbounded copy; assumes d_name holds MAXNAMELEN bytes. */
 	(void) strcpy(dp->d_name, snapname);
 	dp->d_ino = ZFSCTL_INO_SNAP(id);
 	*nextp = cookie;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Create the vnode for the snapshot directory under parent pvp.
  *
  * The node is created by gfs_dir_create() with zfsctl_snapdir_t as its
  * private data, zfsctl_ops_snapdir as its operations vector and
  * zfsctl_snapdir_readdir_cb() as its readdir callback.  The private data
  * is then initialised: fixed node id, change/modify time copied from the
  * parent's node, the sd_lock mutex and the (empty) sd_snaps AVL tree.
  * Returns the new vnode.
  */
 vnode_t *
 zfsctl_mknode_snapdir(vnode_t *pvp)
 {
 	vnode_t *vp;
 	zfsctl_snapdir_t *sdp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
 	    zfsctl_snapdir_readdir_cb, NULL);
 	sdp = vp->v_data;
 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
 	/* Entries are zfs_snapentry_t, linked through se_node. */
 	avl_create(&sdp->sd_snaps, snapentry_compare,
 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
 	return (vp);
 }
 
 /* ARGSUSED */
 /*
  * Getattr handler for the snapshot directory.
  *
  * Starts from zfsctl_common_getattr(), then overrides the node id with
  * gfs_file_inode() and sets both the link count and the size to the
  * number of entries currently in sd_snaps plus 2.  Always returns 0.
  */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_snapdir_t *sdp = vp->v_data;
 
 	ZFS_ENTER(zfsvfs);
 	zfsctl_common_getattr(vp, vap);
 	vap->va_nodeid = gfs_file_inode(vp);
 	/* The tree size is read here without taking sd_lock. */
 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /* ARGSUSED */
 /*
  * Inactive handler for the snapshot directory.
  *
  * Delegates to gfs_dir_inactive(); when that returns a non-NULL private
  * pointer the node is being torn down, so the (expected empty) snapshot
  * tree and the sd_lock mutex are destroyed and the private data freed.
  * Always returns 0.
  */
 static int
 zfsctl_snapdir_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	/* Captured before gfs_dir_inactive() runs on the vnode. */
 	zfsctl_snapdir_t *sdp = vp->v_data;
 	void *private;
 
 	private = gfs_dir_inactive(vp);
 	if (private != NULL) {
 		/* presumably private == sdp here -- TODO confirm */
 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
 		mutex_destroy(&sdp->sd_lock);
 		avl_destroy(&sdp->sd_snaps);
 		kmem_free(private, sizeof (zfsctl_snapdir_t));
 	}
 	return (0);
 }
 
 /*
  * Vnode operations for the snapshot directory node created by
  * zfsctl_mknode_snapdir().  Operations not listed here are taken from
  * default_vnodeops through .vop_default; ioctl is rejected with
  * VOP_EINVAL.
  */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_inactive =	zfsctl_snapdir_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Create the node that serves as the mount point for one snapshot.
  *
  * The node is created under pvp by gfs_dir_create() with zfsctl_node_t
  * private data and the zfsctl_ops_snapshot vector; the objset id is
  * recorded in zc_id so zfsctl_lookup_objset() can match on it.
  * Returns the new vnode.
  */
 static vnode_t *
 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 {
 	vnode_t *vp;
 	zfsctl_node_t *zcp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
 	zcp = vp->v_data;
 	zcp->zc_id = objset;
 
 	return (vp);
 }
 
 /*
  * Inactive handler for a snapshot mount-point node.
  *
  * Finds the parent snapshot directory via "..", and, if this vnode has
  * no other users (v_count <= 1), removes the matching entry from the
  * parent's sd_snaps tree, frees it, and hands the vnode to
  * gfs_vop_inactive() for disposal.  sd_lock is taken only when the
  * caller does not already hold it.
  */
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	struct vop_inactive_args iap;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int locked;
 	vnode_t *dvp;
 
 	/* The parent comes back from the lookup and is unlocked right away. */
 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
 	sdp = dvp->v_data;
 	VOP_UNLOCK(dvp, 0, ap->a_td);
 
 	/* Remember whether sd_lock was already held so we only drop our own. */
 	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
 		mutex_enter(&sdp->sd_lock);
 
 	if (vp->v_count > 1) {
 		if (!locked)
 			mutex_exit(&sdp->sd_lock);
 		/*
 		 * NOTE(review): this early return does not VN_RELE(dvp),
 		 * whereas the normal path below does -- possible reference
 		 * leak on the parent; confirm.
 		 */
 		return (0);
 	}
 	ASSERT(!vn_ismntpt(vp));
 
 	/* Linear scan: entries are keyed by name, but we only have the vnode. */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		if (sep->se_root == vp) {
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 			break;
 		}
 		sep = next;
 	}
 	/* sep is only compared here, never dereferenced after the free. */
 	ASSERT(sep != NULL);
 
 	if (!locked)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
 	 * This is safe to do because once this entry has been removed
 	 * from the AVL tree, it can't be found again, so cannot become
 	 * "active".  If we lookup the same name again we will end up
 	 * creating a new vnode.
 	 */
 	/* Only a_vp is filled in; iap.a_td is left unset. */
 	iap.a_vp = vp;
 	return (gfs_vop_inactive(&iap));
 }
 
 /*
  * Take a hold on *vpp and step onto the filesystem mounted on it.
  *
  * Returns ENOENT (with the hold still in place) when nothing is mounted
  * on *vpp, otherwise the result of traverse(), which is given vpp and
  * the requested lock type.  Pair every call with zfsctl_traverse_end(),
  * which releases according to the returned error.  The td argument is
  * not used.
  */
 static int
 zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td)
 {
 
 	VN_HOLD(*vpp);
 	/* Snapshot should be already mounted, but just in case. */
 	if (vn_mountedvfs(*vpp) == NULL)
 		return (ENOENT);
 	return (traverse(vpp, lktype));
 }
 
 /*
  * Undo zfsctl_traverse_begin().
  *
  * err is the error from the begin call or from the operation performed
  * after it: on 0 the vnode is released with vput(), otherwise only the
  * hold is dropped with VN_RELE().
  */
 static void
 zfsctl_traverse_end(vnode_t *vp, int err)
 {
 
 	if (err == 0)
 		vput(vp);
 	else
 		VN_RELE(vp);
 }
 
 /*
  * Getattr handler for a snapshot mount-point node.
  *
  * Forwards the request to the vnode reached by traversing the mount on
  * top of this node (shared lock), and returns that call's result, or
  * the traversal error if the traversal failed.
  */
 static int
 zfsctl_snapshot_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	/* Local copy: zfsctl_traverse_begin() may point it at another vnode. */
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td);
 	if (err == 0)
 		err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 /*
  * Fid handler for a snapshot mount-point node.
  *
  * Forwards the request as VOP_VPTOFH() to the vnode reached by
  * traversing the mount on top of this node (shared lock), and returns
  * that call's result, or the traversal error if the traversal failed.
  */
 static int
 zfsctl_snapshot_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	/* Local copy: zfsctl_traverse_begin() may point it at another vnode. */
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread);
 	if (err == 0)
 		err = VOP_VPTOFH(vp, (void *)ap->a_fid);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  *
  * Vnode operations for the per-snapshot mount-point nodes created by
  * zfsctl_snapshot_mknode().  getattr and fid are forwarded to the
  * mounted filesystem; anything not listed comes from default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfsctl_snapshot_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_getattr =	zfsctl_snapshot_getattr,
 	.vop_fid =	zfsctl_snapshot_fid,
 };
 
 /*
  * Find the mounted snapshot whose objset id is objsetid and return its
  * zfsvfs_t through *zfsvfsp.
  *
  * Looks up the "snapshot" directory under the control directory, scans
  * its sd_snaps tree for an entry whose root node carries the requested
  * id, and traverses onto the filesystem mounted there.  Returns 0 on
  * success, the lookup or traverse error, or EINVAL when no entry matches
  * or the traversal stayed on the mount-point node itself.  *zfsvfsp is
  * written only on success.
  */
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *vp;
 	zfsctl_snapdir_t *sdp;
 	zfsctl_node_t *zcp;
 	zfs_snapentry_t *sep;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, kcred);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 	/* Linear scan; the loop leaves vp pointing at the matching root. */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		vp = sep->se_root;
 		zcp = vp->v_data;
 		if (zcp->zc_id == objsetid)
 			break;
 
 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
 	}
 
 	if (sep != NULL) {
 		VN_HOLD(vp);
 		error = traverse(&vp, LK_SHARED | LK_RETRY);
 		if (error == 0) {
 			/* Unchanged vp means nothing was mounted on the node. */
 			if (vp == sep->se_root)
 				error = EINVAL;
 			else
 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
 		}
 		mutex_exit(&sdp->sd_lock);
 		/* Success releases with VN_URELE(), failure with VN_RELE(). */
 		if (error == 0)
 			VN_URELE(vp);
 		else
 			VN_RELE(vp);
 	} else {
 		error = EINVAL;
 		mutex_exit(&sdp->sd_lock);
 	}
 
 	VN_RELE(dvp);
 
 	return (error);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  *
  * Walks the sd_snaps tree of the "snapshot" directory under sd_lock.
  * Each entry that is still a mount point is unmounted with dounmount()
  * (under Giant), removed from the tree and freed, and its vnode is
  * disposed of with gfs_vop_inactive().  Stops at the first failure and
  * returns that error; returns 0 when every mounted snapshot was
  * unmounted (or none were mounted).
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	struct vop_inactive_args ap;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *svp;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, cr);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		svp = sep->se_root;
 		/* Fetch the successor first: sep may be freed below. */
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		/*
 		 * If this snapshot is not mounted, then it must
 		 * have just been unmounted by somebody else, and
 		 * will be cleaned up by zfsctl_snapdir_inactive().
 		 */
 		/*
 		 * NOTE(review): the comment above (and the two below) name
 		 * zfsctl_snapdir_inactive(), but the inactive routine that
 		 * removes sd_snaps entries is zfsctl_snapshot_inactive() --
 		 * confirm which is meant.
 		 */
 		if (vn_ismntpt(svp)) {
 			if ((error = vn_vfswlock(svp)) != 0)
 				goto out;
 
 			/*
 			 * Increase usecount, so dounmount() won't vrele() it
 			 * to 0 and call zfsctl_snapdir_inactive().
 			 */
 			VN_HOLD(svp);
 			/* The vfsp parameter is reused for the snapshot's mount. */
 			vfsp = vn_mountedvfs(svp);
 			mtx_lock(&Giant);
 			error = dounmount(vfsp, fflags, curthread);
 			mtx_unlock(&Giant);
 			if (error != 0) {
 				VN_RELE(svp);
 				goto out;
 			}
 
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 
 			/*
 			 * We can't use VN_RELE(), as that will try to
 			 * invoke zfsctl_snapdir_inactive(), and that
 			 * would lead to an attempt to re-grab the sd_lock.
 			 */
 			ASSERT3U(svp->v_count, ==, 1);
 			/* Only a_vp is filled in; ap.a_td is left unset. */
 			ap.a_vp = svp;
 			gfs_vop_inactive(&ap);
 		}
 		sep = next;
 	}
 out:
 	mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	return (error);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	(revision 175202)
@@ -1,430 +1,430 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/spa.h>
 #include <sys/zil.h>
 #include <sys/byteorder.h>
 #include <sys/stat.h>
 #include <sys/acl.h>
 #include <sys/atomic.h>
 #include <sys/cred.h>
 #include <sys/namei.h>
 
 /*
  * Functions to replay ZFS intent log (ZIL) records
  * The functions are called through a function vector (zfs_replay_vector)
  * which is indexed by the transaction type.
  */
 
 /*
  * Fill *vap from the 64-bit fields carried in a log record.
  *
  * The structure is first reset with VATTR_NULL(); the type is derived
  * from the mode with IFTOVT(), the mode is masked with MODEMASK, and
  * rdev is converted with zfs_cmpldev().  The remaining values are
  * narrowed to the vattr field types.
  */
 static void
 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
 	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
 {
 	VATTR_NULL(vap);
 	vap->va_mask = (uint_t)mask;
 	vap->va_type = IFTOVT(mode);
 	vap->va_mode = mode & MODEMASK;
 	vap->va_uid = (uid_t)uid;
 	vap->va_gid = (gid_t)gid;
 	vap->va_rdev = zfs_cmpldev(rdev);
 	vap->va_nodeid = nodeid;
 }
 
 /* ARGSUSED */
 /*
  * Replay handler for slot 0 of zfs_replay_vector ("no such transaction
  * type"): ignores its arguments and always returns ENOTSUP.
  */
 static int
 zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
 {
 	return (ENOTSUP);
 }
 
 /*
  * Replay a TX_CREATE, TX_MKDIR, TX_MKXATTR or TX_SYMLINK record.
  *
  * The object name follows the lr_create_t in memory (and, for symlinks,
  * the link target follows the name).  The parent directory is fetched
  * with zfs_zget(), locked exclusively, and the matching create operation
  * is invoked on it.  Returns 0 on success, the zfs_zget()/VOP error, or
  * ENOTSUP for any other transaction type.
  */
 static int
 zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_create_t */
 	char *link;			/* symlink content follows name */
 	znode_t *dzp;
 	vnode_t *vp = NULL;
 	vattr_t va;
 	struct componentname cn;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
 	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
 
 	/*
 	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 	 * eventually end up in zfs_mknode(), which assigns the object's
 	 * creation time and generation number.  The generic VOP_CREATE()
 	 * doesn't have either concept, so we smuggle the values inside
 	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
 	 */
 	ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
 	va.va_nblocks = lr->lr_gen;
 
 	/*
 	 * NOTE(review): cn is not zeroed and cn_namelen, cn_nameiop and
 	 * cn_lkflags are never set here, unlike zfs_replay_remove() and
 	 * zfs_replay_rename() which bzero() the structure and fill in the
 	 * length -- confirm the create paths do not read those fields.
 	 */
 	cn.cn_nameptr = name;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
 	cn.cn_flags = SAVENAME;
 
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_CREATE:
 		error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
 		break;
 	case TX_MKDIR:
 		error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
 		break;
 	case TX_MKXATTR:
 		error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
 		break;
 	case TX_SYMLINK:
 		/* The link target is the string right after the name's NUL. */
 		link = name + strlen(name) + 1;
 		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
 		break;
 	default:
 		error = ENOTSUP;
 	}
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 
 	/* The new vnode is only unlocked and released; it is not returned. */
 	if (error == 0 && vp != NULL) {
 		VOP_UNLOCK(vp, 0, curthread);
 		VN_RELE(vp);
 	}
 
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_REMOVE or TX_RMDIR record.
  *
  * The name follows the lr_remove_t in memory.  The parent directory is
  * fetched and locked, the victim is looked up with a DELETE component
  * name, and VOP_REMOVE() or VOP_RMDIR() is applied.  Returns 0 on
  * success, the zfs_zget()/lookup/VOP error, or ENOTSUP for any other
  * transaction type.
  */
 static int
 zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
 	znode_t *dzp;
 	struct componentname cn;
 	vnode_t *vp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	/* Build a last-component DELETE lookup for 'name'. */
 	bzero(&cn, sizeof(cn));
 	cn.cn_nameptr = name;
 	cn.cn_namelen = strlen(name);
 	cn.cn_nameiop = DELETE;
 	cn.cn_flags = ISLASTCN | SAVENAME;
 	cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
 	if (error != 0) {
 		VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 		goto fail;
 	}
 
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_REMOVE:
 		error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
 		break;
 	case TX_RMDIR:
 		error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
 		break;
 	default:
 		error = ENOTSUP;
 	}
 	/* Release the victim first, then unlock the directory. */
 	vput(vp);
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 fail:
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_LINK record.
  *
  * The new link name follows the lr_link_t in memory.  Both the target
  * directory (lr_doid) and the object being linked (lr_link_obj) are
  * fetched, locked exclusively in that order, passed to VOP_LINK(), and
  * then unlocked in reverse order and released.  Returns 0 on success or
  * the zfs_zget()/VOP_LINK() error.
  */
 static int
 zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
 	znode_t *dzp, *zp;
 	struct componentname cn;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
 		VN_RELE(ZTOV(dzp));
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): cn is not zeroed and cn_namelen is never set here,
 	 * unlike zfs_replay_remove() and zfs_replay_rename() -- confirm the
 	 * link path does not read the unset fields.
 	 */
 	cn.cn_nameptr = name;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
 	cn.cn_flags = SAVENAME;
 
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
-	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
 	VOP_UNLOCK(ZTOV(zp), 0, curthread);
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 
 	VN_RELE(ZTOV(zp));
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_RENAME record.
  *
  * The source and target names follow the lr_rename_t in memory, one
  * after the other.  The source entry is looked up in the source
  * directory (DELETE) and left unlocked; the target name is looked up in
  * the target directory (RENAME), where EJUSTRETURN is accepted and
  * treated as "no existing target".  VOP_RENAME() is then called and its
  * result returned directly.  On an earlier failure the vnodes obtained
  * so far are released at the fail label.
  */
 static int
 zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
 {
 	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
 	char *tname = sname + strlen(sname) + 1;
 	znode_t *sdzp, *tdzp;
 	struct componentname scn, tcn;
 	vnode_t *svp, *tvp;
 	kthread_t *td = curthread;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
 		return (error);
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
 		VN_RELE(ZTOV(sdzp));
 		return (error);
 	}
 
 	/* NULL marks "not obtained yet" for the cleanup at fail. */
 	svp = tvp = NULL;
 
 	bzero(&scn, sizeof(scn));
 	scn.cn_nameptr = sname;
 	scn.cn_namelen = strlen(sname);
 	scn.cn_nameiop = DELETE;
 	scn.cn_flags = ISLASTCN | SAVENAME;
 	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	scn.cn_cred = kcred;
 	scn.cn_thread = td;
-	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
 	/* The source directory is unlocked again whether or not it succeeded. */
 	VOP_UNLOCK(ZTOV(sdzp), 0, td);
 	if (error != 0)
 		goto fail;
 	VOP_UNLOCK(svp, 0, td);
 
 	bzero(&tcn, sizeof(tcn));
 	tcn.cn_nameptr = tname;
 	tcn.cn_namelen = strlen(tname);
 	tcn.cn_nameiop = RENAME;
 	tcn.cn_flags = ISLASTCN | SAVENAME;
 	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	tcn.cn_cred = kcred;
 	tcn.cn_thread = td;
-	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
 	if (error == EJUSTRETURN)
 		tvp = NULL;
 	else if (error != 0) {
 		VOP_UNLOCK(ZTOV(tdzp), 0, td);
 		goto fail;
 	}
 
 	/*
 	 * Nothing is unlocked or released here after the call; presumably
 	 * VOP_RENAME() consumes the locks and references on all four
 	 * vnodes -- TODO confirm against the VOP_RENAME contract.
 	 */
 	error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
 	return (error);
 fail:
 	if (svp != NULL)
 		vrele(svp);
 	if (tvp != NULL)
 		vrele(tvp);
 	VN_RELE(ZTOV(tdzp));
 	VN_RELE(ZTOV(sdzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_WRITE record.
  *
  * The payload follows the lr_write_t in memory and is written to the
  * file identified by lr_foid at lr_offset for lr_length bytes using
  * vn_rdwr().  A missing file (ENOENT from zfs_zget()) is treated as
  * success.  Returns 0 or the zfs_zget()/vn_rdwr() error; the residual
  * count reported by vn_rdwr() is not examined.
  */
 static int
 zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 {
 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 		/*
 		 * As we can log writes out of order, it's possible the
 		 * file has been removed. In this case just drop the write
 		 * and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 		return (error);
 	}
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
 	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
 	VN_RELE(ZTOV(zp));
 
 	return (error);
 }
 
 /*
  * Replay handler for TX_TRUNCATE.  Not implemented: it logs a message
  * through ZFS_LOG() and returns EOPNOTSUPP without touching its
  * arguments.
  */
 static int
 zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
 {
 
 	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
 	return (EOPNOTSUPP);
 }
 
 /*
  * Replay a TX_SETATTR record.
  *
  * Builds a vattr from the record (mask, mode, uid, gid, size, atime,
  * mtime) and applies it to the file identified by lr_foid with
  * VOP_SETATTR() under an exclusive vnode lock.  A missing file (ENOENT
  * from zfs_zget()) is treated as success.  Returns 0 or the
  * zfs_zget()/VOP_SETATTR() error.
  */
 static int
 zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
 {
 	znode_t *zp;
 	vattr_t va;
 	vnode_t *vp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 		/*
 		 * As we can log setattrs out of order, it's possible the
 		 * file has been removed. In this case just drop the setattr
 		 * and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 		return (error);
 	}
 
 	/* rdev is passed as 0; the record supplies the other fields. */
 	zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
 	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
 
 	va.va_size = lr->lr_size;
 	ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
 	ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
 
 	vp = ZTOV(zp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_SETATTR(vp, &va, kcred, curthread);
 	VOP_UNLOCK(vp, 0, curthread);
 	VN_RELE(vp);
 
 	return (error);
 }
 
 /*
  * Replay a TX_ACL record.
  *
  * The ACE array follows the lr_acl_t in memory.  Unless TODO is defined
  * the ACL is not actually applied: after the optional byteswap and the
  * zfs_zget() of lr_foid the function releases the vnode and returns
  * EOPNOTSUPP.  A missing file (ENOENT from zfs_zget()) is treated as
  * success.
  */
 static int
 zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
 {
 	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
 #ifdef TODO
 	vsecattr_t vsa;
 #endif
 	znode_t *zp;
 	int error;
 
 	if (byteswap) {
 		/* lr must be swapped first: lr_aclcnt is read from it next. */
 		byteswap_uint64_array(lr, sizeof (*lr));
 		zfs_ace_byteswap(ace, lr->lr_aclcnt);
 	}
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 		/*
 		 * As we can log acls out of order, it's possible the
 		 * file has been removed. In this case just drop the acl
 		 * and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 		return (error);
 	}
 
 #ifdef TODO
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
 	vsa.vsa_aclentp = ace;
 
 	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
 #else
 	error = EOPNOTSUPP;
 #endif
 
 	VN_RELE(ZTOV(zp));
 
 	return (error);
 }
 
 /*
  * Callback vectors for replaying records
  *
  * One handler per transaction type, in the order given by the trailing
  * comments; the array has TX_MAX_TYPE slots.  Slot 0 is not a valid
  * type and maps to zfs_replay_error().  The four create-style types
  * share zfs_replay_create() and the two remove-style types share
  * zfs_replay_remove(); each handler switches on the record's txtype.
  */
 zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_error,	/* 0 no such transaction type */
 	zfs_replay_create,	/* TX_CREATE */
 	zfs_replay_create,	/* TX_MKDIR */
 	zfs_replay_create,	/* TX_MKXATTR */
 	zfs_replay_create,	/* TX_SYMLINK */
 	zfs_replay_remove,	/* TX_REMOVE */
 	zfs_replay_remove,	/* TX_RMDIR */
 	zfs_replay_link,	/* TX_LINK */
 	zfs_replay_rename,	/* TX_RENAME */
 	zfs_replay_write,	/* TX_WRITE */
 	zfs_replay_truncate,	/* TX_TRUNCATE */
 	zfs_replay_setattr,	/* TX_SETATTR */
 	zfs_replay_acl,		/* TX_ACL */
 };
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 175202)
@@ -1,1021 +1,1021 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 
 /*
  * Debugging support: a mutex named "zfs_debug" initialised through
  * MTX_SYSINIT, the vfs.zfs sysctl node, and an integer debug level
  * (default 0) that is settable both as the "vfs.zfs.debug" tunable and
  * as the read/write vfs.zfs.debug sysctl.
  */
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 int zfs_debug_level = 0;
 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
     "Debug level");
 
 /* Forward declarations for the VFS entry points and local helpers. */
 static int zfs_mount(vfs_t *vfsp, kthread_t *td);
 static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 /* VFS operations table for ZFS. */
 static struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 /* Register the table under the filesystem name "zfs" with VFCF_JAIL. */
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
 
 /*
  * We need to keep a count of active fs's.
  * This is necessary to prevent our module
  * from being unloaded after a umount -f.
  * The count starts at zero.
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*ARGSUSED*/
 /*
  * VFS sync entry point.
  *
  * Does nothing while the system is panicking.  With a specific vfsp it
  * first runs vfs_stdsync() and then either commits the intent log (when
  * the filesystem has one) or waits for the pool's transaction group to
  * sync.  With vfsp == NULL it syncs every pool.  Returns 0, or the
  * vfs_stdsync() error.
  */
 static int
 zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
 {
 
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	if (vfsp != NULL) {
 		/*
 		 * Sync a specific filesystem.
 		 */
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 		int error;
 
 		error = vfs_stdsync(vfsp, waitfor, td);
 		if (error != 0)
 			return (error);
 
 		ZFS_ENTER(zfsvfs);
 		/* No intent log: fall back to waiting for a full txg sync. */
 		if (zfsvfs->z_log != NULL)
 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 		else
 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 		ZFS_EXIT(zfsvfs);
 	} else {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 	}
 
 	return (0);
 }
 
 /*
  * Property callback for "atime" (registered in zfs_register_callbacks()).
  *
  * arg is the zfsvfs_t.  Mirrors the new value into z_atime, the
  * MNT_NOATIME bit of vfs_flag, and the atime/noatime mount options.
  * Any value other than TRUE is handled as "off".
  */
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		zfsvfs->z_atime = TRUE;
 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 	} else {
 		zfsvfs->z_atime = FALSE;
 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 	}
 }
 
 /*
  * Property callback for "xattr" (registered in zfs_register_callbacks()).
  *
  * arg is the zfsvfs_t.  Swaps the xattr/noxattr mount options to match
  * the new value.  The VFS_XATTR flag update is compiled only when TODO
  * is defined, so otherwise vfs_flag is left untouched.  Any value other
  * than TRUE is handled as "off".
  */
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 	}
 }
 
 /*
  * Property callback for "recordsize" (registered in
  * zfs_register_callbacks()).
  *
  * arg is the zfsvfs_t.  A value outside
  * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE], or one rejected by ISP2()
  * (presumably a power-of-two test -- TODO confirm), is replaced by
  * SPA_MAXBLOCKSIZE.  The result is stored in both z_max_blksz and the
  * vfs block size.
  */
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval < SPA_MINBLOCKSIZE ||
 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 		newval = SPA_MAXBLOCKSIZE;
 
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->vfs_bsize = newval;
 }
 
 /*
  * Property callback for "readonly" (registered in
  * zfs_register_callbacks(), also called directly to apply mount
  * options).
  *
  * arg is the zfsvfs_t.  Any non-zero value sets VFS_RDONLY and the "ro"
  * mount option; zero clears the flag and selects "rw".
  */
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 	}
 }
 
 /*
  * Property callback for "setuid" (registered in
  * zfs_register_callbacks(), also called directly to apply mount
  * options).
  *
  * arg is the zfsvfs_t.  FALSE sets VFS_NOSETUID and the nosetuid mount
  * option; any other value clears the flag and selects setuid.
  */
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 	}
 }
 
 /*
  * Property callback for "exec" (registered in zfs_register_callbacks(),
  * also called directly to apply mount options).
  *
  * arg is the zfsvfs_t.  FALSE sets VFS_NOEXEC and the noexec mount
  * option; any other value clears the flag and selects exec.
  */
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 	}
 }
 
 /*
  * Property callback for "snapdir" (registered in
  * zfs_register_callbacks()): stores the new value unmodified in
  * z_show_ctldir of the zfsvfs_t passed as arg.
  */
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_show_ctldir = newval;
 }
 
 /*
  * Property callback for "aclmode" (registered in
  * zfs_register_callbacks()): stores the new value unmodified in
  * z_acl_mode of the zfsvfs_t passed as arg.
  */
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_mode = newval;
 }
 
 /*
  * Property callback for "aclinherit" (registered in
  * zfs_register_callbacks()): stores the new value unmodified in
  * z_acl_inherit of the zfsvfs_t passed as arg.
  */
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_inherit = newval;
 }
 
 /*
  * Re-apply the mount options currently set on vfsp by invoking the
  * corresponding *_changed_cb() routines directly.
  *
  * Returns EROFS when "rw" is explicitly requested for a snapshot
  * objset, otherwise 0.  Options that are set in neither form leave the
  * corresponding setting unchanged, except read-only, which defaults to
  * read/write for non-snapshots.
  */
 static int
 zfs_refresh_properties(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 	/*
 	 * Remount operations default to "rw" unless "ro" is explicitly
 	 * specified.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 		readonly_changed_cb(zfsvfs, B_TRUE);
 	} else {
 		/* A snapshot is never switched to rw; asking for it is an error. */
 		if (!dmu_objset_is_snapshot(zfsvfs->z_os))
 			readonly_changed_cb(zfsvfs, B_FALSE);
 		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 			return (EROFS);
 	}
 
 	/* "nosuid" and "nosetuid" both disable setuid; "nosuid" is checked first. */
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid_changed_cb(zfsvfs, B_FALSE);
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 			setuid_changed_cb(zfsvfs, B_FALSE);
 		else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 			setuid_changed_cb(zfsvfs, B_TRUE);
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 		exec_changed_cb(zfsvfs, B_FALSE);
 	else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 		exec_changed_cb(zfsvfs, B_TRUE);
 
 	if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 		atime_changed_cb(zfsvfs, B_TRUE);
 	else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 		atime_changed_cb(zfsvfs, B_FALSE);
 
 	if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
 		xattr_changed_cb(zfsvfs, B_TRUE);
 	else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
 		xattr_changed_cb(zfsvfs, B_FALSE);
 
 	return (0);
 }
 
 /*
  * Register the dataset property callbacks for the filesystem mounted at
  * vfsp, preserving any explicitly given mount options.
  *
  * The readonly/setuid/exec/xattr mount options are sampled first, the
  * nine property callbacks are registered with dsl_prop_register()
  * (stopping at the first error), and the sampled options are then
  * re-applied through the callbacks.  Returns 0 on success.  On a
  * registration error every callback is unregistered again and that
  * error is returned.
  */
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	/*
 	 * Each value variable is assigned only together with its do_* flag
 	 * and is read only when that flag is set.
 	 */
 	int readonly, do_readonly = FALSE;
 	int setuid, do_setuid = FALSE;
 	int exec, do_exec = FALSE;
 	int xattr, do_xattr = FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid = B_FALSE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	/* The "error ? error : ..." chain skips every call after a failure. */
 	ds = dmu_objset_ds(os);
 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "xattr", xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "recordsize", blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "readonly", readonly_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "setuid", setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "exec", exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "snapdir", snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "aclmode", acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 
 	return (0);
 
 unregister:
 	/*
 	 * We may attempt to unregister some callbacks that are not
 	 * registered, but this is OK; it will simply return ENOMSG,
 	 * which we will ignore.
 	 */
 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 	    zfsvfs);
 	return (error);
 
 }
 
 static int
 zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
 {
 	cred_t *cr = td->td_ucred;
 	uint64_t recordsize, readonly;
 	int error = 0;
 	int mode;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp = NULL;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	/*
 	 * Initialize the zfs-specific filesystem structure.
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 	zfsvfs->z_vfs = vfsp;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_assign = TXG_NOWAIT;
 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
 
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 
 	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
 		goto out;
 
 	if (readonly)
 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 	else
 		mode = DS_MODE_PRIMARY;
 
 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
 	if (error == EROFS) {
 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
 		    &zfsvfs->z_os);
 	}
 
 	if (error)
 		goto out;
 
 	if (error = zfs_init_fs(zfsvfs, &zp, cr))
 		goto out;
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t xattr;
 
 		ASSERT(mode & DS_MODE_READONLY);
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
 			goto out;
 		xattr_changed_cb(zfsvfs, xattr);
 		zfsvfs->z_issnap = B_TRUE;
 	} else {
 		error = zfs_register_callbacks(vfsp);
 		if (error)
 			goto out;
 
 		zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 */
 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
 		    zfs_replay_vector);
 
 		if (!zil_disable)
 			zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 	}
 
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		if (zfsvfs->z_os)
 			dmu_objset_close(zfsvfs->z_os);
 		rw_destroy(&zfsvfs->z_um_lock);
 		mutex_destroy(&zfsvfs->z_znodes_lock);
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	} else {
 		atomic_add_32(&zfs_active_fs_count, 1);
 	}
 
 	return (error);
 
 }
 
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 	struct dsl_dataset *ds;
 
 	/*
 	 * Unregister properties.
 	 */
 	if (!dmu_objset_is_snapshot(os)) {
 		ds = dmu_objset_ds(os);
 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 		    zfsvfs) == 0);
 
 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
 		    acl_inherit_changed_cb, zfsvfs) == 0);
 	}
 }
 
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp, kthread_t *td)
 {
 	char *from;
 	int error;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT)
 		return (zfs_refresh_properties(vfsp));
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
 		return (EINVAL);
 
 	DROP_GIANT();
 	error = zfs_domount(vfsp, from, td);
 	PICKUP_GIANT();
 	return (error);
 }
 
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
 
 	/*
 	 * The underlying storage pool actually uses multiple block sizes.
 	 * We report the fragsize as the smallest block size we support,
 	 * and we report our blocksize as the filesystem's maximum blocksize.
 	 */
 	statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
 	statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
 
 	/*
 	 * The following report "total" blocks of various kinds in the
 	 * file system, but reported in terms of f_frsize - the
 	 * "fragment" size.
 	 */
 
 	statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
 	statp->f_bfree = availbytes / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree; /* no root reservation */
 
 	/*
 	 * statvfs() should really be called statufs(), because it assumes
 	 * static metadata.  ZFS doesn't preallocate files, so the best
 	 * we can do is report the max that could possibly fit in f_files,
 	 * and that minus the number actually used in f_ffree.
 	 * For f_ffree, report the smaller of the number of object available
 	 * and the number of blocks (each object will take at least a block).
 	 */
 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
 	statp->f_files = statp->f_ffree + usedobjs;
 
 	/*
 	 * We're a zfs filesystem.
 	 */
 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 
 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
 	statp->f_namemax = ZFS_MAXNAMELEN;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0) {
 		*vpp = ZTOV(rootzp);
-		error = vn_lock(*vpp, flags, td);
+		error = vn_lock(*vpp, flags);
 		(*vpp)->v_vflag |= VV_ROOT;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
 		return (ret);
 
 	(void) dnlc_purge_vfsp(vfsp, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 	if (fflag & MS_FORCE) {
 		MNT_ILOCK(vfsp);
 		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
 		MNT_IUNLOCK(vfsp);
 		zfsvfs->z_unmounted1 = B_TRUE;
 
 		/*
 		 * Wait for all zfs threads to leave zfs.
 		 * Grabbing a rwlock as reader in all vops and
 		 * as writer here doesn't work because it too easy to get
 		 * multiple reader enters as zfs can re-enter itself.
 		 * This can lead to deadlock if there is an intervening
 		 * rw_enter as writer.
 		 * So a file system threads ref count (z_op_cnt) is used.
 		 * A polling loop on z_op_cnt may seem inefficient, but
 		 * - this saves all threads on exit from having to grab a
 		 *   mutex in order to cv_signal
 		 * - only occurs on forced unmount in the rare case when
 		 *   there are outstanding threads within the file system.
 		 */
 		while (zfsvfs->z_op_cnt) {
 			delay(1);
 		}
 	}
 
 	zfs_objset_close(zfsvfs);
 	VFS_RELE(vfsp);
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int 		err;
 
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err != 0)
 		*vpp = NULL;
 	else {
 		*vpp = ZTOV(zp);
-		vn_lock(*vpp, flags, curthread);
+		vn_lock(*vpp, flags);
 	}
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
 {
 	kthread_t	*td = curthread;
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (EINVAL);
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/* A zero fid_gen means we are in the .zfs control directories */
 	if (fid_gen == 0 &&
 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		/* XXX: LK_RETRY? */
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		return (0);
 	}
 
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 	if (err = zfs_zget(zfsvfs, object, &zp)) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	zp_gen = zp->z_phys->zp_gen & gen_mask;
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	*vpp = ZTOV(zp);
 	/* XXX: LK_RETRY? */
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static void
 zfs_objset_close(zfsvfs_t *zfsvfs)
 {
 	znode_t		*zp, *nextzp;
 	objset_t	*os = zfsvfs->z_os;
 
 	/*
 	 * For forced unmount, at this point all vops except zfs_inactive
 	 * are erroring EIO. We need to now suspend zfs_inactive threads
 	 * while we are freeing dbufs before switching zfs_inactive
 	 * to use behaviour without a objset.
 	 */
 	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
 
 	/*
 	 * Release all holds on dbufs
 	 * Note, although we have stopped all other vop threads and
 	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
 	 * which can zfs_znode_free() the znode.
 	 * So we lock z_all_znodes; search the list for a held
 	 * dbuf; drop the lock (we know zp can't disappear if we hold
 	 * a dbuf lock; then regrab the lock and restart.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
 		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
 		if (zp->z_dbuf_held) {
 			/* dbufs should only be held when force unmounting */
 			zp->z_dbuf_held = 0;
 			mutex_exit(&zfsvfs->z_znodes_lock);
 			dmu_buf_rele(zp->z_dbuf, NULL);
 			/* Start again */
 			mutex_enter(&zfsvfs->z_znodes_lock);
 			nextzp = list_head(&zfsvfs->z_all_znodes);
 		}
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * Unregister properties.
 	 */
 	if (!dmu_objset_is_snapshot(os))
 		zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Switch zfs_inactive to behaviour without an objset.
 	 * It just tosses cached pages and frees the znode & vnode.
 	 * Then re-enable zfs_inactive threads in that new behaviour.
 	 */
 	zfsvfs->z_unmounted2 = B_TRUE;
 	rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
 
 	/*
 	 * Close the zil. Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	/*
 	 * Evict all dbufs so that cached znodes will be freed
 	 */
 	if (dmu_objset_evict_dbufs(os, 1)) {
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 		(void) dmu_objset_evict_dbufs(os, 0);
 	}
 
 	/*
 	 * Finally close the objset
 	 */
 	dmu_objset_close(os);
 }
 
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	int i;
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	rw_destroy(&zfsvfs->z_um_lock);
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 
 	atomic_add_32(&zfs_active_fs_count, -1);
 }
 
 #ifdef __i386__
 static int desiredvnodes_backup;
 #endif
 
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int val;
 
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * We calculate newdesiredvnodes the same way it is done in
 	 * vntblinit(). If it is equal to desiredvnodes, it means that
 	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
 	val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	if (desiredvnodes == val)
 		desiredvnodes = (3 * desiredvnodes) / 4;
 #endif
 }
 
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 }
 
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 int
 zfs_busy(void)
 {
 	return (zfs_active_fs_count != 0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 175202)
@@ -1,3599 +1,3599 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sf_buf.h>
 #include <sys/sched.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait the the intent log to commit if it's is a synchronous operation.
  * Morover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *	A ZFS_EXIT(zfsvfs) is needed before all returns.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
  *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
  *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
  *	This is critical because we don't want to block while holding locks.
  *	Note, in particular, that if a lock is sometimes acquired before
  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
  *	use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(*vpp);
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(vp);
 
 	/* Decrement the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	/*
 	 * Clean up any locks held by this process on the vp.
 	 */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());
 
 	return (0);
 }
 
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	uint64_t noff = (uint64_t)*off; /* new offset */
 	uint64_t file_sz;
 	int error;
 	boolean_t hole;
 
 	file_sz = zp->z_phys->zp_size;
 	if (noff >= file_sz)  {
 		return (ENXIO);
 	}
 
 	if (cmd == _FIO_SEEK_HOLE)
 		hole = B_TRUE;
 	else
 		hole = B_FALSE;
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 
 	/* end of file? */
 	if ((error == ESRCH) || (noff > file_sz)) {
 		/*
 		 * Handle the virtual hole at the end of file.
 		 */
 		if (hole) {
 			*off = file_sz;
 			return (0);
 		}
 		return (ENXIO);
 	}
 
 	if (noff < *off)
 		return (error);
 	*off = noff;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp)
 {
 	offset_t off;
 	int error;
 	zfsvfs_t *zfsvfs;
 
 	switch (com) {
 	    case _FIOFFS:
 		return (0);
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	    case _FIOGDIO:
 	    case _FIOSDIO:
 		return (0);
 
 	    case _FIO_SEEK_DATA:
 	    case _FIO_SEEK_HOLE:
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (EFAULT);
 
 		zfsvfs = VTOZ(vp)->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (EFAULT);
 		return (0);
 	}
 	return (ENOTTY);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	the file is memory mapped.
  */
 static int
 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	vm_page_t m;
 	struct sf_buf *sf;
 	int64_t start, off;
 	int len = nbytes;
 	int error = 0;
 	uint64_t dirbytes;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
 	dirbytes = 0;
 	VM_OBJECT_LOCK(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 		uint64_t fsize;
 
 again:
 		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 			uint64_t woff;
 			caddr_t va;
 
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
 				goto again;
 			fsize = obj->un_pager.vnp.vnp_size;
 			vm_page_busy(m);
 			vm_page_lock_queues();
 			vm_page_undirty(m);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(obj);
 			if (dirbytes > 0) {
 				error = dmu_write_uio(os, zp->z_id, uio,
 				    dirbytes, tx);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				woff = uio->uio_loffset - off;
 				error = uiomove(va + off, bytes, UIO_WRITE, uio);
 				/*
 				 * The uiomove() above could have been partially
 				 * successful, that's why we call dmu_write()
 				 * below unconditionally. The page was marked
 				 * non-dirty above and we would lose the changes
 				 * without doing so. If the uiomove() failed
 				 * entirely, well, we just write what we got
 				 * before one more time.
 				 */
 				dmu_write(os, zp->z_id, woff,
 				    MIN(PAGESIZE, fsize - woff), va, tx);
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 		} else {
 			dirbytes += bytes;
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	VM_OBJECT_UNLOCK(obj);
 	if (error == 0 && dirbytes > 0)
 		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	the file is memory mapped.
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	vm_page_t m;
 	struct sf_buf *sf;
 	int64_t start, off;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 	uint64_t dirbytes;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
 	dirbytes = 0;
 	VM_OBJECT_LOCK(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 again:
 		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 				goto again;
 			vm_page_busy(m);
 			VM_OBJECT_UNLOCK(obj);
 			if (dirbytes > 0) {
 				error = dmu_read_uio(os, zp->z_id, uio,
 				    dirbytes);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				error = uiomove(va + off, bytes, UIO_READ, uio);
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
 			/*
 			 * The code below is here to make sendfile(2) work
 			 * correctly with ZFS. As pointed out by ups@
 			 * sendfile(2) should be changed to use VOP_GETPAGES(),
 			 * but it pessimize performance of sendfile/UFS, that's
 			 * why I handle this special case in ZFS code.
 			 */
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 				goto again;
 			vm_page_busy(m);
 			VM_OBJECT_UNLOCK(obj);
 			if (dirbytes > 0) {
 				error = dmu_read_uio(os, zp->z_id, uio,
 				    dirbytes);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				error = dmu_read(os, zp->z_id, start + off,
 				    bytes, (void *)(va + off));
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 			if (error == 0)
 				uio->uio_resid -= bytes;
 		} else {
 			dirbytes += bytes;
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	VM_OBJECT_UNLOCK(obj);
 	if (error == 0 && dirbytes > 0)
 		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
 	return (error);
 }
 
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller (not referenced in this
  *			  function body).
  *		ct	- caller context, handed to chklock().
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 if success (including zero-length reads and reads that
  *		start at or beyond end-of-file)
  *		EINVAL if the starting offset is negative
  *		error code if failure
  *
  * Side Effects:
  *	ZFS_ACCESSTIME_STAMP() is invoked on every path that gets as far
  *	as taking the range lock, i.e. also for reads at or past EOF and
  *	for transfers that fail part way through.
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os = zfsvfs->z_os;
 	ssize_t		n, nbytes;
 	int		error;
 	rl_t		*rl;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
 		/*
 		 * Assign first, then test explicitly; this mirrors the
 		 * chklock() handling in zfs_write() and avoids an
 		 * assignment used directly as a truth value.
 		 */
 		error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid,
 		    uio->uio_fmode, ct);
 		if (error != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (ioflag & FRSYNC)
 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
 		error = 0;
 		goto out;
 	}
 
 	/*
 	 * Clamp the transfer to the bytes that exist in the file.  Both
 	 * operands are positive here (resid != 0, offset < size), so the
 	 * loop below runs at least once and always assigns 'error'.
 	 */
 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
 
 	while (n > 0) {
 		/* Never let a single chunk cross a chunk-size boundary. */
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		/*
 		 * Files with cached (mapped) pages go through
 		 * mappedread(); everything else is read straight from
 		 * the DMU.
 		 */
 		if (vn_has_cached_data(vp))
 			error = mappedread(vp, nbytes, uio);
 		else
 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 		if (error)
 			break;
 
 		n -= nbytes;
 	}
 
 out:
 	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Best-effort pre-fault of the user pages backing the first n bytes
  * described by uio.  One byte is read from every page of every iovec
  * segment, plus the final byte of each segment in case the segment
  * straddles a page boundary.  The uio itself is left untouched, and the
  * first failing fubyte() simply ends the walk.  Buffers that are not in
  * user space are ignored.  This is a copy of ufs_trans_touch().
  */
 static void
 zfs_prefault_write(ssize_t n, struct uio *uio)
 {
 	struct iovec *vec;
 	ulong_t seglen, step;
 	caddr_t addr;
 
 	if (uio->uio_segflg != UIO_USERSPACE)
 		return;
 
 	for (vec = uio->uio_iov; n; vec++) {
 		seglen = MIN(vec->iov_len, n);
 		if (seglen == 0)
 			continue;	/* empty iov entry */
 		n -= seglen;
 
 		/*
 		 * Step through the segment a page at a time; seglen is
 		 * known to be non-zero on entry.
 		 */
 		addr = vec->iov_base;
 		do {
 			if (fubyte(addr) == -1)
 				return;
 			step = MIN(seglen, PAGESIZE);
 			addr += step;
 			seglen -= step;
 		} while (seglen);
 
 		/*
 		 * addr is now one past the segment; probe the last byte
 		 * in case it lies on a page not touched above.
 		 */
 		addr--;
 		if (fubyte(addr) == -1)
 			return;
 	}
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- IO_APPEND flag set if in append mode; FSYNC or
  *			  FDSYNC cause a zil_commit() before returning.
  *		cr	- credentials of caller.
  *		ct	- caller context, handed to chklock().
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 if success (including a partial write)
  *		EINVAL if the starting offset is negative
  *		EFBIG if the starting offset is at or beyond the limit
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error;
 
 	/*
 	 * Fasttrack empty write
 	 * (returns before ZFS_ENTER(), so no ZFS_EXIT() is needed here).
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	/*
 	 * 'limit' is initialized to MAXOFFSET_T above and is not assigned
 	 * anywhere else in this function, so this only re-asserts that cap.
 	 */
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 */
 	zfs_prefault_write(n, uio);
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & IO_APPEND) {
 		/*
 		 * Range lock for a file append:
 		 * The value for the start of range will be determined by
 		 * zfs_range_lock() (to guarantee append semantics).
 		 * If this write will cause the block size to increase,
 		 * zfs_range_lock() will lock the entire file, so we must
 		 * later reduce the range after we grow the block size.
 		 */
 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 		if (rl->r_len == UINT64_MAX) {
 			/* overlocked, zp_size can't change */
 			woff = uio->uio_loffset = zp->z_phys->zp_size;
 		} else {
 			woff = uio->uio_loffset = rl->r_off;
 		}
 	} else {
 		woff = uio->uio_loffset;
 		/*
 		 * Validate file offset
 		 * (no range lock is held yet on this path).
 		 */
 		if (woff < 0) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		/*
 		 * If we need to grow the block size then zfs_range_lock()
 		 * will lock a wider range than we request here.
 		 * Later after growing the block size we reduce the range.
 		 */
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
 	/* From here on every exit path must drop the range lock. */
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	/* Truncate the request so that it ends at 'limit'. */
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	/* File size once this write completes in full. */
 	end_size = MAX(zp->z_phys->zp_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		/*
 		 * Start a transaction.
 		 */
 		woff = uio->uio_loffset;
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		error = dmu_tx_assign(tx, zfsvfs->z_assign);
 		if (error) {
 			/*
 			 * ERESTART under TXG_NOWAIT: wait, discard this tx
 			 * and retry the same chunk with a fresh one.
 			 */
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				continue;
 			}
 			dmu_tx_abort(tx);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		/* Tell the VM pager about the new size before extending. */
 		if (woff + nbytes > zp->z_phys->zp_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		rw_enter(&zp->z_map_lock, RW_READER);
 
 		/*
 		 * tx_bytes starts as the residual before the copy and is
 		 * turned into "bytes consumed by this chunk" below.
 		 * Note the asymmetry: z_map_lock is dropped before
 		 * mappedwrite() but held across dmu_write_uio().
 		 */
 		tx_bytes = uio->uio_resid;
 		if (vn_has_cached_data(vp)) {
 			rw_exit(&zp->z_map_lock);
 			error = mappedwrite(vp, nbytes, uio, tx);
 		} else {
 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
 			    uio, nbytes, tx);
 			rw_exit(&zp->z_map_lock);
 		}
 		tx_bytes -= uio->uio_resid;
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to do this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(cr,
 		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
 		    zp->z_phys->zp_uid == 0) != 0) {
 			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		/*
 		 * Update time stamp.  NOTE: This marks the bonus buffer as
 		 * dirty, so we don't have to do it again for zp_size.
 		 */
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 * (end_size is reused here as the compare-and-swap
 		 * snapshot of zp_size.)
 		 */
 		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
 			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
 			    uio->uio_loffset);
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		/* A partial chunk was logged and committed above. */
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 	}
 
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Synchronous write requested: push the intent log out now. */
 	if (ioflag & (FSYNC | FDSYNC))
 		zil_commit(zilog, zp->z_last_itx, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Completion callback that zfs_get_data() hands to dmu_sync().
  *
  *	db	- dbuf whose hold (tagged with vzgd) is released here.
  *	vzgd	- the zgd_t set up by zfs_get_data(); freed here.
  *
  * Under the VFS Giant wrapper this releases the dbuf, drops the range
  * lock recorded in the zgd, releases the vnode reference, registers the
  * written block's vdev with the ZIL and finally frees the zgd.
  */
 void
 zfs_get_done(dmu_buf_t *db, void *vzgd)
 {
 	zgd_t *gd = (zgd_t *)vzgd;
 	rl_t *range = gd->zgd_rl;
 	/* Look the vnode up before the range lock goes away. */
 	vnode_t *vnode = ZTOV(range->r_zp);
 	int giant_held;
 
 	giant_held = VFS_LOCK_GIANT(vnode->v_vfsp);
 
 	dmu_buf_rele(db, vzgd);
 	zfs_range_unlock(range);
 	VN_RELE(vnode);
 
 	zil_add_vdev(gd->zgd_zilog,
 	    DVA_GET_VDEV(BP_IDENTITY(gd->zgd_bp)));
 	kmem_free(gd, sizeof (zgd_t));
 
 	VFS_UNLOCK_GIANT(giant_held);
 }
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  *	arg	- the zfsvfs_t of the file system.
  *	lr	- write log record; lr_foid/lr_offset/lr_length select the
  *		  data, and lr_blkoff/lr_blkptr are filled in for indirect
  *		  writes.
  *	buf	- destination for an immediate write, or NULL to request
  *		  an indirect write.
  *	zio	- parent zio passed to dmu_sync(); must be non-NULL.
  *
  * Returns 0 on success (also when dmu_sync() reports EINPROGRESS, in
  * which case cleanup is deferred to zfs_get_done()), ENOENT if the file
  * is gone, unlinked, or truncated below lr_offset, or the error from
  * dmu_sync().
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t off = lr->lr_offset;
 	dmu_buf_t *db;
 	rl_t *rl;
 	zgd_t *zgd;
 	int dlen = lr->lr_length;		/* length of user data */
 	int error = 0;
 
 	ASSERT(zio);
 	ASSERT(dlen != 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
 		return (ENOENT);
 	if (zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		return (ENOENT);
 	}
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		rl = zfs_range_lock(zp, off, dlen, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
 			error = ENOENT;
 			goto out;
 		}
 		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
 	} else { /* indirect write */
 		uint64_t boff; /* block starting offset */
 
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			/*
 			 * Non-power-of-two block size: the block is
 			 * taken to start at offset 0.
 			 */
 			if (ISP2(zp->z_blksz)) {
 				boff = P2ALIGN_TYPED(off, zp->z_blksz,
 				    uint64_t);
 			} else {
 				boff = 0;
 			}
 			/* dlen now holds the block size we locked against. */
 			dlen = zp->z_blksz;
 			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
 			if (zp->z_blksz == dlen)
 				break;
 			zfs_range_unlock(rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
 			error = ENOENT;
 			goto out;
 		}
 		/*
 		 * The zgd carries what zfs_get_done() needs to finish up:
 		 * the range lock, the zilog and the block pointer.
 		 */
 		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
 		zgd->zgd_rl = rl;
 		zgd->zgd_zilog = zfsvfs->z_log;
 		zgd->zgd_bp = &lr->lr_blkptr;
 		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
 		ASSERT(boff == db->db_offset);
 		lr->lr_blkoff = off - boff;
 		error = dmu_sync(zio, db, &lr->lr_blkptr,
 		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
 		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
 		if (error == 0) {
 			zil_add_vdev(zfsvfs->z_log,
 			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
 		}
 		/*
 		 * If we get EINPROGRESS, then we need to wait for a
 		 * write IO initiated by dmu_sync() to complete before
 		 * we can release this dbuf.  We will finish everything
 		 * up in the zfs_get_done() callback.
 		 * (That is why the range lock, the vnode reference, the
 		 * dbuf hold and the zgd are all left in place here.)
 		 */
 		if (error == EINPROGRESS)
 			return (0);
 		dmu_buf_rele(db, zgd);
 		kmem_free(zgd, sizeof (zgd_t));
 	}
 out:
 	zfs_range_unlock(rl);
 	VN_RELE(ZTOV(zp));
 	return (error);
 }
 
 /*
  * Check whether the caller identified by cr may access vp with the
  * requested mode bits.  The decision is delegated to zfs_zaccess_rwx();
  * 'flags' is not consulted.  Returns that function's result.
  */
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
 {
 	znode_t *node = VTOZ(vp);
 	zfsvfs_t *fs = node->z_zfsvfs;
 	int rc;
 
 	ZFS_ENTER(fs);
 	rc = zfs_zaccess_rwx(node, mode, cr);
 	ZFS_EXIT(fs);
 
 	return (rc);
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- component name state; cn_flags is read (ISLASTCN,
  *			  ISDOTDOT, MAKEENTRY) and may gain SAVENAME,
  *			  cn_lkflags supplies the lock type for the result.
  *		nameiop	- operation the lookup is for (CREATE, RENAME,
  *			  DELETE, ...); drives error translation below.
  *		cr	- credentials of caller.
  *		td	- calling thread.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 if success
  *		EJUSTRETURN if the last component is missing during a
  *		CREATE or RENAME lookup
  *		error code if failure
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td)
 {
 
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error;
 
 	ZFS_ENTER(zfsvfs);
 
 	*vpp = NULL;
 
 	/*
 	 * NOTE(review): this section is compiled only when TODO is
 	 * defined, and it refers to 'flags', which is not a parameter of
 	 * this function.
 	 */
 #ifdef TODO
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
 			VN_RELE(*vpp);
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 #endif	/* TODO */
 
 	if (dvp->v_type != VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (ENOTDIR);
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
 
 		/*
 		 * Convert device special files
 		 */
 		if (IS_DEVVP(*vpp)) {
 			vnode_t	*svp;
 
 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 			VN_RELE(*vpp);
 			/*
 			 * NOTE(review): on the ENOSYS path *vpp still holds
 			 * the pointer that was just released.
 			 */
 			if (svp == NULL)
 				error = ENOSYS;
 			else
 				*vpp = svp;
 		}
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			/*
 			 * A missing final component is the expected case
 			 * for these operations.
 			 */
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 	/*
 	 * Lock the result vnode unless the name is exactly ".".  For a
 	 * ".." lookup the directory is unlocked first and re-locked with
 	 * its previous lock type once the result has been locked.
 	 */
 	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
 		int ltype = 0;
 
 		if (cnp->cn_flags & ISDOTDOT) {
 			ltype = VOP_ISLOCKED(dvp, td);
 			VOP_UNLOCK(dvp, 0, td);
 		}
-		error = vn_lock(*vpp, cnp->cn_lkflags, td);
+		error = vn_lock(*vpp, cnp->cn_lkflags);
 		if (cnp->cn_flags & ISDOTDOT)
-			vn_lock(dvp, ltype | LK_RETRY, td);
+			vn_lock(dvp, ltype | LK_RETRY);
 		if (error != 0) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 			return (error);
 		}
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(dvp, *vpp, cnp);
 	/*
 	 * Insert name into cache if appropriate.
 	 * (Not done for the last component of a DELETE or RENAME.)
 	 */
 	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the vp of the created or trunc'd file.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file; VSVTX may be cleared
  *			  from va_mode here.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		td	- calling thread.
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry, locked
  *			  exclusively on success.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	objset_t	*os = zfsvfs->z_os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	zoid;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after an ERESTART under TXG_NOWAIT. */
 top:
 	*vpp = NULL;
 
 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~VSVTX;
 
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		VN_HOLD(dvp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible VN_HOLD(zp) */
 		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
 			if (strcmp(name, "..") == 0)
 				error = EISDIR;
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	zoid = zp ? zp->z_id : -1ULL;
 
 	if (zp == NULL) {
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
 		    (vap->va_type != VREG)) {
 			error = EINVAL;
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		dmu_tx_hold_bonus(tx, dzp->z_id);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, SPA_MAXBLOCKSIZE);
 		error = dmu_tx_assign(tx, zfsvfs->z_assign);
 		if (error) {
 			/*
 			 * The directory entry lock is dropped before
 			 * waiting; it is re-acquired after 'goto top'.
 			 */
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 		ASSERT(zp->z_id == zoid);
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
 		dmu_tx_commit(tx);
 	} else {
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl == EXCL) {
 			error = EEXIST;
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
 			error = EISDIR;
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if ((ZTOV(zp)->v_type == VREG) &&
 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				/* NB: we already did dmu_tx_wait() */
 				zfs_dirent_unlock(dl);
 				VN_RELE(ZTOV(zp));
 				goto top;
 			}
 		}
 	}
 out:
 
 	/* On success hand the vnode back exclusively locked. */
 	if (error == 0) {
 		*vpp = ZTOV(zp);
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	/* dl is NULL when the empty name selected the directory itself. */
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			VN_RELE(ZTOV(zp));
 	} else {
 		/* Same value as already stored above on the success path. */
 		*vpp = ZTOV(zp);
 		/*
 		 * If vnode is for a device return a specfs vnode instead.
 		 */
 		if (IS_DEVVP(*vpp)) {
 			struct vnode *svp;
 
 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 			VN_RELE(*vpp);
 			if (svp == NULL) {
 				error = ENOSYS;
 			}
 			*vpp = svp;
 		}
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		EPERM if the entry is a directory (use rmdir instead)
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  *
  * Note: immediate deletion of the znode is switched off in this
  * version: may_delete_now is forced to FALSE and the delete_now
  * evaluation is guarded by a constant 0, so an unlinked znode is always
  * placed on the unlinked set.
  */
 static int
 zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	znode_t		*xzp = NULL;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	uint64_t	acl_obj, xattr_obj;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after an ERESTART under TXG_NOWAIT. */
 top:
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	error = zfs_zaccess_delete(dzp, zp, cr);
 	if (error != 0) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = EPERM;
 		goto out;
 	}
 
 	vnevent_remove(vp);
 
 	dnlc_remove(dvp, name);
 
 	may_delete_now = FALSE;
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	if (may_delete_now)
 		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
 
 	/* are there any extended attributes? */
 	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
 		/* XXX - do we need this if we are deleting? */
 		dmu_tx_hold_bonus(tx, xattr_obj);
 	}
 
 	/* are there any additional acls */
 	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
 	    may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/*
 		 * Drop the entry lock and the vnode reference before
 		 * waiting; both are re-acquired after 'goto top'.
 		 */
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	/* Deliberately disabled: the leading 0 keeps delete_now FALSE. */
 	if (0 && unlinked) {
 		VI_LOCK(vp);
 		delete_now = may_delete_now &&
 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
 		    zp->z_phys->zp_xattr == xattr_obj &&
 		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
 		VI_UNLOCK(vp);
 	}
 
 	if (delete_now) {
 		if (zp->z_phys->zp_xattr) {
 			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
 			dmu_buf_will_dirty(xzp->z_dbuf, tx);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = 1;
 			xzp->z_phys->zp_links = 0;
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
 		}
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		vp->v_count--;
 		ASSERT3U(vp->v_count, ==, 0);
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		zfs_znode_delete(zp, tx);
 		VFS_RELE(zfsvfs->z_vfs);
 	} else if (unlinked) {
 		zfs_unlinked_add(zp, tx);
 	}
 
 	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
 
 	dmu_tx_commit(tx);
 out:
 	zfs_dirent_unlock(dl);
 
 	if (!delete_now) {
 		VN_RELE(vp);
 	} else if (xzp) {
 		/* this rele delayed to prevent nesting transactions */
 		VN_RELE(ZTOV(xzp));
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dvp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dvp	- vnode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory; va_type must be VDIR.
  *		cr	- credentials of caller.
  *
  *	OUT:	vpp	- vnode of created directory, locked exclusively.
  *
  *	RETURN:	0 if success
  *		EINVAL if dvp is an extended attribute directory
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	uint64_t	zoid = 0;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ASSERT(vap->va_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Subdirectories are refused inside xattr directories. */
 	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 	/* Restart point after an ERESTART under TXG_NOWAIT. */
 top:
 	*vpp = NULL;
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, SPA_MAXBLOCKSIZE);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* The entry lock is re-acquired after 'goto top'. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
 	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
 	dmu_tx_commit(tx);
 
 	/* The new directory is returned exclusively locked. */
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 
 	zfs_dirent_unlock(dl);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- vnode of current working directory.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		ENOTDIR if the entry is not a directory
  *		EINVAL if the entry is the current working directory
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 static int
 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after an ERESTART under TXG_NOWAIT. */
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	error = zfs_zaccess_delete(dzp, zp, cr);
 	if (error != 0) {
 		goto out;
 	}
 
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 
 	if (vp == cwd) {
 		error = EINVAL;
 		goto out;
 	}
 
 	vnevent_rmdir(vp);
 
 	/*
 	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/*
 		 * Release everything in reverse order of acquisition
 		 * before waiting or bailing out.
 		 */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(dvp);
 #endif
 
 	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
 
 	/* Only log the removal when the entry was actually destroyed. */
 	if (error == 0)
 		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
 
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 out:
 	zfs_dirent_unlock(dl);
 
 	VN_RELE(vp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		ncooks;
 	u_long		*cooks = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			objnum = zp->z_phys->zp_parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = ENXIO;
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = EINVAL;
 				goto update;
 			}
 			break;
 		}
 		/*
 		 * Add this entry:
 		 */
 		odp->d_ino = objnum;
 		odp->d_reclen = reclen;
 		odp->d_namlen = strlen(zap.za_name);
 		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 		odp->d_type = type;
 		outcount += reclen;
 		odp = (dirent64_t *)((intptr_t)odp + reclen);
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = uio->uio_loffset;
 	}
 
 update:
 	zap_cursor_fini(&zc);
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /*
  * Commit this file's intent-log records by calling zil_commit() on the
  * file system's log with the znode's last itx and object id.
  * syncflag and cr are accepted but not consulted.  Returns 0 once the
  * file system has been entered.
  */
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
 {
 	znode_t		*znode;
 	zfsvfs_t	*zfsvfs;
 
 	znode = VTOZ(vp);
 	zfsvfs = znode->z_zfsvfs;
 
 	ZFS_ENTER(zfsvfs);
 	zil_commit(zfsvfs->z_log, znode->z_last_itx, znode->z_id);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Fill in the caller's vattr structure with the attributes of a file.
  * Every attribute is returned regardless of what va_mask requests.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- vattr structure to fill in.
  *		flags	- [UNUSED]
  *		cr	- credentials of caller.
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 if success
  *		error code from zfs_zaccess() when the
  *		ACE_READ_ATTRIBUTES check is made and fails
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	znode_phys_t *pzp = zp->z_phys;
 	uint32_t dbsize;
 	u_longlong_t nblk;
 	int	error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Answering every question is cheaper than working out which
 	 * ones were asked, so all fields are filled in under z_lock.
 	 */
 	mutex_enter(&zp->z_lock);
 
 	/* Type, mode and identity. */
 	vap->va_type = IFTOVT(pzp->zp_mode);
 	vap->va_mode = pzp->zp_mode & ~S_IFMT;
 	vap->va_nodeid = zp->z_id;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
 	vap->va_seq = zp->z_seq;
 
 	/* Ownership, link count and size. */
 	vap->va_uid = zp->z_phys->zp_uid;
 	vap->va_gid = zp->z_phys->zp_gid;
 	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
 	vap->va_size = pzp->zp_size;
 
 	/* FreeBSD: chflags(2) flags are always reported as clear. */
 	vap->va_flags = 0;
 
 	/* Timestamps. */
 	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
 	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
 	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
 
 	/*
 	 * The ACE_READ_ATTRIBUTES check is skipped for a trivial ACL and
 	 * for the owner, who may always read the basic attributes.
 	 */
 	if ((zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) == 0 &&
 	    zp->z_phys->zp_uid != crgetuid(cr)) {
 		error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr);
 		if (error != 0) {
 			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	dmu_object_size_from_db(zp->z_dbuf, &dbsize, &nblk);
 	vap->va_bytes = nblk << 9;	/* nblk * 512 */
 
 	/*
 	 * With no block size set on the znode yet, suggest maximal I/O
 	 * transfers instead of the size reported by the DMU.
 	 */
 	if (zp->z_blksz == 0)
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	else
 		vap->va_blksize = dbsize;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values.
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *		cr	- credentials of caller.
  *		ct	- caller context; not referenced in this function.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  *
  * The body from the "top" label onward is re-run from scratch when
  * dmu_tx_assign() returns ERESTART while z_assign is TXG_NOWAIT.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
 {
 	struct znode	*zp = VTOZ(vp);
 	znode_phys_t	*pzp = zp->z_phys;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err;
 
 	/*
 	 * Argument checks that need no file system state; these return
 	 * before ZFS_ENTER() and therefore without a matching ZFS_EXIT().
 	 */
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (EINVAL);
 
 	if (mask & AT_SIZE && vp->v_type == VDIR)
 		return (EISDIR);
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 
 top:
 	/* Restart point; attrzp holds no reference at this point. */
 	attrzp = NULL;
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (EROFS);
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		do {
 			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 			/* NB: we already did dmu_tx_wait() if necessary */
 		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	/*
 	 * A non-zero result from zfs_zaccess_v4_perm() defers the decision
 	 * on time changes to secpolicy_vnode_setattr() below.
 	 */
 	if (mask & (AT_ATIME|AT_MTIME))
 		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = pzp->zp_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	/*
 	 * Snapshot the current mode and ownership under z_lock; oldva is
 	 * what the secpolicy_*() calls below compare the request against.
 	 */
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = pzp->zp_mode;
 	oldva.va_uid = zp->z_phys->zp_uid;
 	oldva.va_gid = zp->z_phys->zp_gid;
 	mutex_exit(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
 		if (err) {
 			/* NB: va_mask is left trimmed on this error return. */
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		/* Put back the bits that were hidden from the policy check. */
 		if (trim_mask)
 			vap->va_mask |= saved_mask;
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = pzp->zp_mode;
 
 		/* Keep the existing file-type bits; take the rest from vap. */
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		/*
 		 * Hold the external ACL object when the znode has one,
 		 * otherwise reserve room for a new object of maximum ACL size.
 		 */
 		if (zp->z_phys->zp_acl.z_acl_extern_obj)
 			dmu_tx_hold_write(tx,
 			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
 		else
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
 	}
 
 	/*
 	 * An ownership change is mirrored onto the znode named by zp_xattr
 	 * (see the uid/gid assignments below), so hold that one as well.
 	 */
 	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
 		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
 		if (err) {
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		dmu_tx_hold_bonus(tx, attrzp->z_id);
 	}
 
 	err = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (err) {
 		/* Drop the attrzp hold first; a restart re-acquires it. */
 		if (attrzp)
 			VN_RELE(ZTOV(attrzp));
 		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	mutex_enter(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
 		ASSERT3U(err, ==, 0);
 	}
 
 	/* attrzp's lock is taken while zp's z_lock is already held. */
 	if (attrzp)
 		mutex_enter(&attrzp->z_lock);
 
 	if (mask & AT_UID) {
 		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
 		if (attrzp) {
 			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
 		}
 	}
 
 	if (mask & AT_GID) {
 		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
 		if (attrzp)
 			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
 	}
 
 	if (attrzp)
 		mutex_exit(&attrzp->z_lock);
 
 	if (mask & AT_ATIME)
 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
 
 	if (mask & AT_MTIME)
 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
 
 	if (mask & AT_SIZE)
 		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
 	else if (mask != 0)
 		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
 
 	mutex_exit(&zp->z_lock);
 
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 
 	dmu_tx_commit(tx);
 
 	ZFS_EXIT(zfsvfs);
 	/*
 	 * err is 0 from dmu_tx_assign() unless it was overwritten by the
 	 * result of zfs_acl_chmod_setattr() above.
 	 */
 	return (err);
 }
 
 /*
  * One entry in the singly linked list of locks built by zfs_rename_lock()
  * and torn down by zfs_rename_unlock().  New entries are pushed on the
  * head of the list.
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held */
 					/* (NULL when no znode hold is recorded) */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Undo the work of zfs_rename_lock(): for every entry on the list at
  * *zlpp, release the held vnode (when one was recorded), drop the
  * rwlock, unlink the entry and free it.  *zlpp is NULL on return.
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t *cur;
 
 	for (cur = *zlpp; cur != NULL; cur = *zlpp) {
 		if (cur->zl_znode != NULL)
 			VN_RELE(ZTOV(cur->zl_znode));
 		rw_exit(cur->zl_rwlock);
 		/* Unlink before freeing so the list head stays valid. */
 		*zlpp = cur->zl_next;
 		kmem_free(cur, sizeof (*cur));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  *
  *	IN:	szp	- znode being renamed.
  *		tdzp	- target directory; the walk starts here.
  *		sdzp	- source directory; the walk stops on reaching it.
  *
  *	OUT:	zlpp	- list of locks (and znode holds) acquired.  Entries
  *			  remain on the list on every return, including the
  *			  error returns, so the caller has to pass it to
  *			  zfs_rename_unlock().
  *
  *	RETURN:	0 if the root or sdzp was reached
  *		EINVAL if tdzp is szp or lies beneath it
  *		error code from zfs_zget() if a parent cannot be obtained
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = zp->z_zfsvfs->z_root;
 	uint64_t	*oidp = &zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 */
 				/*
 				 * zl is the most recently pushed entry here,
 				 * i.e. the current list head, so this frees
 				 * the whole list; the state below is reset to
 				 * its first-pass values.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = &zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (*oidp == szp->z_id)		/* We're a descendant of szp */
 			return (EINVAL);
 
 		if (*oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			/* Step up to the parent and remember the hold on it. */
 			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
 			if (error)
 				return (error);
 			zl->zl_znode = zp;
 		}
 		/* The next pass read-locks zp and inspects its parent id. */
 		oidp = &zp->z_phys->zp_parent;
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  *
  * The body from the "top" label onward is re-run from scratch when
  * dmu_tx_assign() returns ERESTART while z_assign is TXG_NOWAIT.
  */
 static int
 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
 {
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = VTOZ(sdvp);
 	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr, error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Make sure we have the real vp for the target directory.
 	 */
 	if (VOP_REALVP(tdvp, &realvp) == 0)
 		tdvp = realvp;
 
 	if (tdvp->v_vfsp != sdvp->v_vfsp) {
 		ZFS_EXIT(zfsvfs);
 		return (EXDEV);
 	}
 
 	tdzp = VTOZ(tdvp);
 top:
 	/* Restart point: nothing is locked or held at this point. */
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
 	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		cmp = strcmp(snm, tnm);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 	}
 	/* cmp < 0: source entry first; otherwise target entry first. */
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
 		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
 	} else {
 		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				VN_RELE(ZTOV(tzp));
 		}
 		/* "." and ".." as a source are reported as EINVAL instead. */
 		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (serr);
 	}
 	if (terr) {
 		zfs_dirent_unlock(sdl);
 		VN_RELE(ZTOV(szp));
 		/* ".." as a target is reported as EINVAL instead. */
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (terr);
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 
 	/*
 	 * From here on both dirent locks and the szp hold exist, so
 	 * failures leave through the shared cleanup at "out".
 	 */
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
 		goto out;
 
 	if (ZTOV(szp)->v_type == VDIR) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if (ZTOV(szp)->v_type == VDIR) {
 			if (ZTOV(tzp)->v_type != VDIR) {
 				error = ENOTDIR;
 				goto out;
 			}
 		} else {
 			if (ZTOV(tzp)->v_type == VDIR) {
 				error = EISDIR;
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	}
 
 	vnevent_rename_src(ZTOV(szp));
 	if (tzp)
 		vnevent_rename_dest(ZTOV(tzp));
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
 	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp)
 		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
 	if (tzp)
 		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/*
 		 * Release everything acquired since "top" before either
 		 * waiting and restarting or giving up.
 		 */
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* error is 0 here, so the chain below runs when there is no target. */
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 			ASSERT(error == 0);
 			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
 			    sdl->dl_name, tdzp, tdl->dl_name, szp);
 		}
 #ifdef FREEBSD_NAMECACHE
 		if (error == 0) {
 			cache_purge(sdvp);
 			cache_purge(tdvp);
 		}
 #endif
 	}
 
 	/* The transaction is committed whether or not the steps above failed. */
 	dmu_tx_commit(tx);
 out:
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	VN_RELE(ZTOV(szp));
 	if (tzp)
 		VN_RELE(ZTOV(tzp));
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		name	- Name for new symlink entry.
  *		vap	- Attributes of new entry.
  *		link	- Target path of new symlink (the link contents).
  *		cr	- credentials of caller.
  *		td	- calling thread.
  *
  *	OUT:	vpp	- vnode of the new symlink, returned locked
  *			  (LK_EXCLUSIVE); set only on success.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *
  * The body from the "top" label onward is re-run from scratch when
  * dmu_tx_assign() returns ERESTART while z_assign is TXG_NOWAIT.
  */
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	uint64_t	zoid;
 	int		len = strlen(link);
 	int		error;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 top:
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* len excludes the terminating NUL (it comes from strlen()). */
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (ENAMETOOLONG);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_bonus(tx, dzp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* Drop the dirent lock; a restart takes it again. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	dmu_buf_will_dirty(dzp->z_dbuf, tx);
 
 	/*
 	 * Create a new object for the symlink.
 	 * Put the link content into bonus buffer if it will fit;
 	 * otherwise, store it just like any other file data.
 	 */
 	zoid = 0;
 	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
 		/* The content is placed directly after the znode_phys_t. */
 		if (len != 0)
 			bcopy(link, zp->z_phys + 1, len);
 	} else {
 		dmu_buf_t *dbp;
 
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 
 		/*
 		 * Nothing can access the znode yet so no locking needed
 		 * for growing the znode's blocksize.
 		 */
 		zfs_grow_blocksize(zp, len, tx);
 
 		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
 		dmu_buf_will_dirty(dbp, tx);
 
 		ASSERT3U(len, <=, dbp->db_size);
 		bcopy(link, dbp->db_data, len);
 		dmu_buf_rele(dbp, FTAG);
 	}
 	/* Record the link length as the file size; zfs_readlink() uses it. */
 	zp->z_phys->zp_size = len;
 
 	/*
 	 * Insert the new object into the directory.
 	 */
 	/* The result is deliberately discarded; error keeps its value of 0. */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 out:
 	/*
 	 * No goto in this function targets "out"; error is 0 on arrival,
 	 * left over from the successful dmu_tx_assign().
 	 */
 	if (error == 0) {
 		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
 		*vpp = ZTOV(zp);
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Copy the path stored in a symbolic link into the caller's uio.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to receive the link path.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- the link path; at most uio_resid bytes are moved.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	size_t		linklen;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	linklen = (size_t)zp->z_phys->zp_size;
 	if (linklen + sizeof (znode_phys_t) > zp->z_dbuf->db_size) {
 		/*
 		 * The path does not fit behind the znode_phys_t in z_dbuf,
 		 * so read it from the object's first data buffer.
 		 */
 		dmu_buf_t *dbp;
 
 		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
 		if (error != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		error = uiomove(dbp->db_data,
 		    MIN((size_t)linklen, uio->uio_resid), UIO_READ, uio);
 		dmu_buf_rele(dbp, FTAG);
 	} else {
 		/* The path sits directly after the znode_phys_t. */
 		error = uiomove(zp->z_phys + 1,
 		    MIN((size_t)linklen, uio->uio_resid), UIO_READ, uio);
 	}
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Insert a new entry into directory tdvp referencing svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  *
  * The body from the "top" label onward is re-run from scratch when
  * dmu_tx_assign() returns ERESTART while z_assign is TXG_NOWAIT.
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
 	int		error;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Use the real vnode behind svp when VOP_REALVP() supplies one. */
 	if (VOP_REALVP(svp, &realvp) == 0)
 		svp = realvp;
 
 	if (svp->v_vfsp != tdvp->v_vfsp) {
 		ZFS_EXIT(zfsvfs);
 		return (EXDEV);
 	}
 
 	szp = VTOZ(svp);
 top:
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
 	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	/*
 	 * A caller who is not the file's owner needs
 	 * secpolicy_basic_link() to approve the link.
 	 */
 	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
 	    secpolicy_basic_link(cr) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* Drop the dirent lock; a restart takes it again. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	/* The link is logged only when it was actually created. */
 	if (error == 0)
 		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
 
 	/* The transaction is committed whether or not the create succeeded. */
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Inactive processing for a ZFS vnode.
  *
  * When the file system has its z_unmounted2 flag set, the vnode's count
  * is forced to zero, the znode is freed if it no longer has a dbuf, and
  * the hold on the vfs is released.  Otherwise a dirty access time is
  * pushed into a transaction (skipped for unlinked files) and the znode
  * is handed to zfs_zinactive().  All of this runs with z_um_lock held as
  * reader.  cr is not used.
  */
 void
 zfs_inactive(vnode_t *vp, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	rw_enter(&zfsvfs->z_um_lock, RW_READER);
 	if (zfsvfs->z_unmounted2) {
 		ASSERT(zp->z_dbuf_held == 0);
 
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		vp->v_count = 0; /* count arrives as 1 */
 		VI_UNLOCK(vp);
 		/* z_lock is dropped before the znode is freed. */
 		if (zp->z_dbuf == NULL) {
 			mutex_exit(&zp->z_lock);
 			zfs_znode_free(zp);
 		} else {
 			mutex_exit(&zp->z_lock);
 		}
 		rw_exit(&zfsvfs->z_um_lock);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			/*
 			 * Best effort: on failure the transaction is
 			 * aborted and z_atime_dirty stays set.
 			 */
 			dmu_tx_abort(tx);
 		} else {
 			dmu_buf_will_dirty(zp->z_dbuf, tx);
 			mutex_enter(&zp->z_lock);
 			zp->z_atime_dirty = 0;
 			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_um_lock);
 }
 
 /*
  * Compile-time checks that neither ZFS file-id layout is larger than a
  * struct fid.  zfs_fid() writes both layouts through casts of its fid_t
  * argument (presumably a struct fid -- confirm against the fid_t typedef).
  */
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*
  * Encode a file identifier for vp into *fidp.
  *
  * The short form carries the object number and generation, one byte at
  * a time with the least significant byte first.  The long form is used
  * when z_parent differs from the file system itself; it adds the objset
  * id in the same byte order and a zeroed set generation.
  *
  * Returns 0 once the file system has been entered.
  */
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint32_t	gennum = (uint32_t)zp->z_phys->zp_gen;
 	uint64_t	objnum = zp->z_id;
 	zfid_short_t	*sfid;
 	int		fidlen, idx;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (zfsvfs->z_parent != zfsvfs)
 		fidlen = LONG_FID_LEN;
 	else
 		fidlen = SHORT_FID_LEN;
 	fidp->fid_len = fidlen;
 
 	sfid = (zfid_short_t *)fidp;
 
 	sfid->zf_len = fidlen;
 
 	for (idx = 0; idx < sizeof (sfid->zf_object); idx++)
 		sfid->zf_object[idx] = (uint8_t)(objnum >> (8 * idx));
 
 	/* A zero generation is reserved for telling .zfs apart; use 1. */
 	if (gennum == 0)
 		gennum = 1;
 	for (idx = 0; idx < sizeof (sfid->zf_gen); idx++)
 		sfid->zf_gen[idx] = (uint8_t)(gennum >> (8 * idx));
 
 	if (fidlen == LONG_FID_LEN) {
 		zfid_long_t	*lfid = (zfid_long_t *)fidp;
 		uint64_t	setid = dmu_objset_id(zfsvfs->z_os);
 
 		for (idx = 0; idx < sizeof (lfid->zf_setid); idx++)
 			lfid->zf_setid[idx] = (uint8_t)(setid >> (8 * idx));
 
 		/* XXX - this should be the generation number for the objset */
 		for (idx = 0; idx < sizeof (lfid->zf_setgen); idx++)
 			lfid->zf_setgen[idx] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Answer a pathconf query for vp.
  *
  * The value for a supported cmd is stored in *valp and 0 is returned;
  * any other cmd yields EOPNOTSUPP with *valp untouched.  The
  * _PC_XATTR_EXISTS case is compiled out; the locals other than the
  * arguments are only referenced from that disabled code.
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	zfs_dirlock_t	*dl;
 	int		error;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = INT_MAX;
 		break;
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		break;
 
 #if 0
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == 0) {
 			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			VN_RELE(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 #endif
 
 	case _PC_ACL_EXTENDED:
 		*valp = 0;	/* TODO */
 		break;
 
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		break;
 
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	/* Every supported query that reaches this point succeeded. */
 	return (0);
 }
 
 #ifdef TODO
 /*
  * Fetch the ACL of vp into *vsecp by calling zfs_getacl() inside a
  * ZFS_ENTER()/ZFS_EXIT() pair and return its result.  flag is unused.
  */
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t *zp;
 	zfsvfs_t *zfsvfs;
 	int rc;
 
 	zp = VTOZ(vp);
 	zfsvfs = zp->z_zfsvfs;
 
 	ZFS_ENTER(zfsvfs);
 	rc = zfs_getacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 
 	return (rc);
 }
 #endif	/* TODO */
 
 #ifdef TODO
 /*
  * Apply the ACL in *vsecp to vp by calling zfs_setacl() inside a
  * ZFS_ENTER()/ZFS_EXIT() pair and return its result.  flag is unused.
  */
 /*ARGSUSED*/
 static int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t *zp;
 	zfsvfs_t *zfsvfs;
 	int rc;
 
 	zp = VTOZ(vp);
 	zfsvfs = zp->z_zfsvfs;
 
 	ZFS_ENTER(zfsvfs);
 	rc = zfs_setacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 
 	return (rc);
 }
 #endif	/* TODO */
 
 /*
  * FreeBSD VOP_OPEN entry point: open the vnode through zfs_open() and,
  * on success, create its VM object sized from the znode's zp_size.
  */
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);		/* taken before zfs_open() sees &vp */
 	int rc;
 
 	rc = zfs_open(&vp, ap->a_mode, ap->a_cred);
 	if (rc != 0)
 		return (rc);
 
 	vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
 	return (0);
 }
 
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int mode;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 
 	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 
 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vop_stdfsync(ap);
 	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
 }
 
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	/* No support for FreeBSD's chflags(2). */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
 	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
 
 	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
 	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
 
 	if (tdvp == tvp)
 		VN_RELE(tdvp);
 	else
 		VN_URELE(tdvp);
 	if (tvp)
 		VN_URELE(tvp);
 	VN_RELE(fdvp);
 	VN_RELE(fvp);
 
 	return (error);
 }
 
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
 }
 
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	zfs_inactive(vp, ap->a_td->td_ucred);
 	return (0);
 }
 
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs;
 	int rele = 1;
 
 	ASSERT(zp != NULL);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	mutex_enter(&zp->z_lock);
 	ASSERT(zp->z_phys);
 	ASSERT(zp->z_dbuf_held);
 	zfsvfs = zp->z_zfsvfs;
 	if (!zp->z_unlinked) {
 		zp->z_dbuf_held = 0;
 		ZTOV(zp) = NULL;
 		mutex_exit(&zp->z_lock);
 		dmu_buf_rele(zp->z_dbuf, NULL);
 	} else {
 		mutex_exit(&zp->z_lock);
 	}
 	VI_LOCK(vp);
 	if (vp->v_count > 0)
 		rele = 0;
 	vp->v_data = NULL;
 	ASSERT(vp->v_holdcnt >= 1);
 	VI_UNLOCK(vp);
 	if (!zp->z_unlinked && rele)
 		VFS_RELE(zfsvfs->z_vfs);
 	return (0);
 }
 
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 
 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
 }
 
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
 	if (error == 0)
 		*ap->a_retval = val;
 	else if (error == EOPNOTSUPP)
 		error = vop_stdpathconf(ap);
 	return (error);
 }
 
 /*
  * Advisory record locking support
  */
 static int
 zfs_freebsd_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 	znode_t	*zp = VTOZ(ap->a_vp);
 
 	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
 }
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 
 struct vop_vector zfs_vnodeops = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfs_freebsd_inactive,
 	.vop_reclaim =	zfs_freebsd_reclaim,
 	.vop_access =	zfs_freebsd_access,
 #ifdef FREEBSD_NAMECACHE
 	.vop_lookup =	vfs_cache_lookup,
 	.vop_cachedlookup = zfs_freebsd_lookup,
 #else
 	.vop_lookup =	zfs_freebsd_lookup,
 #endif
 	.vop_getattr =	zfs_freebsd_getattr,
 	.vop_setattr =	zfs_freebsd_setattr,
 	.vop_create =	zfs_freebsd_create,
 	.vop_mknod =	zfs_freebsd_create,
 	.vop_mkdir =	zfs_freebsd_mkdir,
 	.vop_readdir =	zfs_freebsd_readdir,
 	.vop_fsync =	zfs_freebsd_fsync,
 	.vop_open =	zfs_freebsd_open,
 	.vop_close =	zfs_freebsd_close,
 	.vop_rmdir =	zfs_freebsd_rmdir,
 	.vop_ioctl =	zfs_freebsd_ioctl,
 	.vop_link =	zfs_freebsd_link,
 	.vop_symlink =	zfs_freebsd_symlink,
 	.vop_readlink =	zfs_freebsd_readlink,
 	.vop_read =	zfs_freebsd_read,
 	.vop_write =	zfs_freebsd_write,
 	.vop_remove =	zfs_freebsd_remove,
 	.vop_rename =	zfs_freebsd_rename,
 	.vop_advlock =	zfs_freebsd_advlock,
 	.vop_pathconf =	zfs_freebsd_pathconf,
 	.vop_bmap =	VOP_EOPNOTSUPP,
 	.vop_fid =	zfs_freebsd_fid,
 };
 
 struct vop_vector zfs_fifoops = {
 	.vop_default =	&fifo_specops,
 	.vop_fsync =	VOP_PANIC,
 	.vop_access =	zfs_freebsd_access,
 	.vop_getattr =	zfs_freebsd_getattr,
 	.vop_inactive =	zfs_freebsd_inactive,
 	.vop_read =	VOP_PANIC,
 	.vop_reclaim =	zfs_freebsd_reclaim,
 	.vop_setattr =	zfs_freebsd_setattr,
 	.vop_write =	VOP_PANIC,
 	.vop_fid =	zfs_freebsd_fid,
 };
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 175201)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 175202)
@@ -1,1072 +1,1072 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #ifdef _KERNEL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/mntent.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/atomic.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/refcount.h>
 
 /* Used by fstat(1). */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
     "sizeof(znode_t)");
 
 /*
  * Functions needed for userland (ie: libzpool) are not put under
  * #ifdef_KERNEL; the rest of the functions have dependencies
  * (such as VFS logic) that will not compile easily in userland.
  */
 #ifdef _KERNEL
 struct kmem_cache *znode_cache = NULL;
 
 /*ARGSUSED*/
 static void
 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
 {
 	znode_t *zp = user_ptr;
 	vnode_t *vp;
 
 	mutex_enter(&zp->z_lock);
 	vp = ZTOV(zp);
 	if (vp == NULL) {
 		mutex_exit(&zp->z_lock);
 		zfs_znode_free(zp);
 	} else if (vp->v_count == 0) {
 		ZTOV(zp) = NULL;
 		vhold(vp);
 		mutex_exit(&zp->z_lock);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vrecycle(vp, curthread);
 		VOP_UNLOCK(vp, 0, curthread);
 		vdrop(vp);
 		zfs_znode_free(zp);
 	} else {
 		/* signal force unmount that this znode can be freed */
 		zp->z_dbuf = NULL;
 		mutex_exit(&zp->z_lock);
 	}
 }
 
 extern struct vop_vector zfs_vnodeops;
 extern struct vop_vector zfs_fifoops;
 
 /*
  * XXX: We cannot use this function as a cache constructor, because
  *      there is one global cache for all file systems and we need
  *      to pass vfsp here, which is not possible, because argument
  *      'cdrarg' is defined at kmem_cache_create() time.
  */
 static int
 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
 {
 	znode_t *zp = buf;
 	vnode_t *vp;
 	vfs_t *vfsp = cdrarg;
 	int error;
 
 	if (cdrarg != NULL) {
 		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
 		ASSERT(error == 0);
 		zp->z_vnode = vp;
 		vp->v_data = (caddr_t)zp;
 		vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 		vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 	} else {
 		zp->z_vnode = NULL;
 	}
 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zp->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
 	zp->z_dbuf_held = 0;
 	zp->z_dirlocks = 0;
 	zp->z_lockf = NULL;
 	return (0);
 }
 
 /*ARGSUSED*/
 static void
 zfs_znode_cache_destructor(void *buf, void *cdarg)
 {
 	znode_t *zp = buf;
 
 	ASSERT(zp->z_dirlocks == 0);
 	mutex_destroy(&zp->z_lock);
 	rw_destroy(&zp->z_map_lock);
 	rw_destroy(&zp->z_parent_lock);
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	mutex_destroy(&zp->z_range_lock);
 	avl_destroy(&zp->z_range_avl);
 
 	ASSERT(zp->z_dbuf_held == 0);
 }
 
 void
 zfs_znode_init(void)
 {
 	/*
 	 * Initialize zcache
 	 */
 	ASSERT(znode_cache == NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 void
 zfs_znode_fini(void)
 {
 	/*
 	 * Cleanup zcache
 	 */
 	if (znode_cache)
 		kmem_cache_destroy(znode_cache);
 	znode_cache = NULL;
 }
 
 /*
  * zfs_init_fs - Initialize the zfsvfs struct and the file system
  *	incore "master" object.  Verify version compatibility.
  */
 int
 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
 {
 	objset_t	*os = zfsvfs->z_os;
 	uint64_t	version = ZPL_VERSION;
 	int		i, error;
 	dmu_object_info_t doi;
 	uint64_t fsid_guid;
 
 	*zpp = NULL;
 
 	/*
 	 * XXX - hack to auto-create the pool root filesystem at
 	 * the first attempted mount.
 	 */
 	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		ASSERT3U(error, ==, 0);
 		zfs_create_fs(os, cr, tx);
 		dmu_tx_commit(tx);
 	}
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
 	    &version);
 	if (error) {
 		return (error);
 	} else if (version != ZPL_VERSION) {
 		(void) printf("Mismatched versions:  File system "
 		    "is version %lld on-disk format, which is "
 		    "incompatible with this software version %lld!",
 		    (u_longlong_t)version, ZPL_VERSION);
 		return (ENOTSUP);
 	}
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error)
 		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	/*
 	 * Create the per mount vop tables.
 	 */
 
 	/*
 	 * Initialize zget mutex's
 	 */
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
 	if (error)
 		return (error);
 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error)
 		return (error);
 
 	return (0);
 }
 
 /*
  * define a couple of values we need available
  * for both 64 and 32 bit environments.
  */
 #ifndef NBITSMINOR64
 #define	NBITSMINOR64	32
 #endif
 #ifndef MAXMAJ64
 #define	MAXMAJ64	0xffffffffUL
 #endif
 #ifndef	MAXMIN64
 #define	MAXMIN64	0xffffffffUL
 #endif
 #ifndef major
 #define	major(x)	((int)(((u_int)(x) >> 8)&0xff))	/* major number */
 #endif
 #ifndef minor
 #define	minor(x)	((int)((x)&0xffff00ff))		/* minor number */
 #endif
 
 /*
  * Create special expldev for ZFS private use.
  * Can't use standard expldev since it doesn't do
  * what we want.  The standard expldev() takes a
  * dev32_t in LP64 and expands it to a long dev_t.
  * We need an interface that takes a dev32_t in ILP32
  * and expands it to a long dev_t.
  */
 static uint64_t
 zfs_expldev(dev_t dev)
 {
 	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
 }
 /*
  * Special cmpldev for ZFS private use.
  * Can't use standard cmpldev since it takes
  * a long dev_t and compresses it to dev32_t in
  * LP64.  We need to do a compaction of a long dev_t
  * to a dev32_t in ILP32.
  */
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
 	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
 }
 
 /*
  * Construct a new znode/vnode and intialize.
  *
  * This does not do a call to dmu_set_user() that is
  * up to the caller to do, in case you don't want to
  * return the znode
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
 {
 	znode_t	*zp;
 	vnode_t *vp;
 	int error;
 
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
 
 	ASSERT(zp->z_dirlocks == NULL);
 
 	zp->z_phys = db->db_data;
 	zp->z_zfsvfs = zfsvfs;
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;
 	zp->z_dbuf_held = 0;
 	zp->z_mapcnt = 0;
 	zp->z_last_itx = 0;
 	zp->z_dbuf = db;
 	zp->z_id = obj_num;
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	vp = ZTOV(zp);
 	if (vp == NULL)
 		return (zp);
 
 	error = insmntque(vp, zfsvfs->z_vfs);
 	KASSERT(error == 0, ("insmntque() failed: error %d", error));
 
 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
 	switch (vp->v_type) {
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
 		break;
 	case VFIFO:
 		vp->v_op = &zfs_fifoops;
 		break;
 	}
 
 	return (zp);
 }
 
 static void
 zfs_znode_dmu_init(znode_t *zp)
 {
 	znode_t		*nzp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_buf_t	*db = zp->z_dbuf;
 
 	mutex_enter(&zp->z_lock);
 
 	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
 
 	/*
 	 * there should be no
 	 * concurrent zgets on this object.
 	 */
 	ASSERT3P(nzp, ==, NULL);
 
 	/*
 	 * Slap on VROOT if we are the root znode
 	 */
 	if (zp->z_id == zfsvfs->z_root) {
 		ZTOV(zp)->v_flag |= VROOT;
 	}
 
 	ASSERT(zp->z_dbuf_held == 0);
 	zp->z_dbuf_held = 1;
 	VFS_HOLD(zfsvfs->z_vfs);
 	mutex_exit(&zp->z_lock);
 }
 
 /*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode
  *		vap	- file attributes for new znode
  *		tx	- dmu transaction id for zap operations
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_XATTR	- new object is an attribute
  *			  IS_REPLAY	- intent log replay
  *
  *	OUT:	oid	- ID of created object
  *
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
 	uint_t flag, znode_t **zpp, int bonuslen)
 {
 	dmu_buf_t	*dbp;
 	znode_phys_t	*pzp;
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	timestruc_t	now;
 	uint64_t	gen;
 	int		err;
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
 		*oid = vap->va_nodeid;
 		flag |= IS_REPLAY;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
 		*oid = 0;
 		gethrestime(&now);
 		gen = dmu_tx_get_txg(tx);
 	}
 
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
 	 * be to needed allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
 		if (flag & IS_REPLAY) {
 			err = zap_create_claim(zfsvfs->z_os, *oid,
 			    DMU_OT_DIRECTORY_CONTENTS,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 			ASSERT3U(err, ==, 0);
 		} else {
 			*oid = zap_create(zfsvfs->z_os,
 			    DMU_OT_DIRECTORY_CONTENTS,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 		}
 	} else {
 		if (flag & IS_REPLAY) {
 			err = dmu_object_claim(zfsvfs->z_os, *oid,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 			ASSERT3U(err, ==, 0);
 		} else {
 			*oid = dmu_object_alloc(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 		}
 	}
 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
 	dmu_buf_will_dirty(dbp, tx);
 
 	/*
 	 * Initialize the znode physical data to zero.
 	 */
 	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
 	bzero(dbp->db_data, dbp->db_size);
 	pzp = dbp->db_data;
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
 		dzp->z_phys = pzp;
 		dzp->z_id = *oid;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
 		flag |= IS_XATTR;
 
 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
 	}
 
 	if (vap->va_type == VDIR) {
 		pzp->zp_size = 2;		/* contents ("." and "..") */
 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
 	}
 
 	pzp->zp_parent = dzp->z_id;
 	if (flag & IS_XATTR)
 		pzp->zp_flags |= ZFS_XATTR;
 
 	pzp->zp_gen = gen;
 
 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
 
 	if (vap->va_mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
 	} else {
 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
 	}
 
 	if (vap->va_mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
 	} else {
 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
 	}
 
 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
 
 	zfs_perm_init(zp, dzp, flag, vap, tx, cr);
 
 	if (zpp) {
 		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
 
 		mutex_enter(hash_mtx);
 		zfs_znode_dmu_init(zp);
 		mutex_exit(hash_mtx);
 
 		*zpp = zp;
 	} else {
 		if (ZTOV(zp) != NULL)
 			ZTOV(zp)->v_count = 0;
 		dmu_buf_rele(dbp, NULL);
 		zfs_znode_free(zp);
 	}
 }
 
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
 	vnode_t		*vp;
 	int err;
 
 	*zpp = NULL;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (err);
 	}
 
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
 		dmu_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (EINVAL);
 	}
 
 	ASSERT(db->db_object == obj_num);
 	ASSERT(db->db_offset == -1);
 	ASSERT(db->db_data != NULL);
 
 	zp = dmu_buf_get_user(db);
 
 	if (zp != NULL) {
 		mutex_enter(&zp->z_lock);
 
 		ASSERT3U(zp->z_id, ==, obj_num);
 		if (zp->z_unlinked) {
 			dmu_buf_rele(db, NULL);
 			mutex_exit(&zp->z_lock);
 			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 			return (ENOENT);
 		} else if (zp->z_dbuf_held) {
 			dmu_buf_rele(db, NULL);
 		} else {
 			zp->z_dbuf_held = 1;
 			VFS_HOLD(zfsvfs->z_vfs);
 		}
 
 		if (ZTOV(zp) != NULL)
 			VN_HOLD(ZTOV(zp));
 		else {
 			err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
 			    &zp->z_vnode);
 			ASSERT(err == 0);
 			vp = ZTOV(zp);
 			vp->v_data = (caddr_t)zp;
 			vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 			vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 			vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
 			if (vp->v_type == VDIR)
 				zp->z_zn_prefetch = B_TRUE;	/* z_prefetch default is enabled */
 			err = insmntque(vp, zfsvfs->z_vfs);
 			KASSERT(err == 0, ("insmntque() failed: error %d", err));
 		}
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		*zpp = zp;
 		return (0);
 	}
 
 	/*
 	 * Not found create new znode/vnode
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
 	ASSERT3U(zp->z_id, ==, obj_num);
 	zfs_znode_dmu_init(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 	*zpp = zp;
 	return (0);
 }
 
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
 	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
 		error = dmu_object_free(zfsvfs->z_os,
 		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
 		ASSERT3U(error, ==, 0);
 	}
 	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
 	ASSERT3U(error, ==, 0);
 	zp->z_dbuf_held = 0;
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
 	dmu_buf_rele(zp->z_dbuf, NULL);
 }
 
 void
 zfs_zinactive(znode_t *zp)
 {
 	vnode_t	*vp = ZTOV(zp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;
 
 	ASSERT(zp->z_dbuf_held && zp->z_phys);
 
 	/*
 	 * Don't allow a zfs_zget() while were trying to release this znode
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
 	mutex_enter(&zp->z_lock);
 	VI_LOCK(vp);
 	if (vp->v_count > 0) {
 		/*
 		 * If the hold count is greater than zero, somebody has
 		 * obtained a new reference on this znode while we were
 		 * processing it here, so we are done.
 		 */
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		return;
 	}
 	VI_UNLOCK(vp);
 
 	/*
 	 * If this was the last reference to a file with no links,
 	 * remove the file from the file system.
 	 */
 	if (zp->z_unlinked) {
 		ZTOV(zp) = NULL;
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		ASSERT(vp->v_count == 0);
 		vrecycle(vp, curthread);
 		zfs_rmnode(zp);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
 	ASSERT(zp->z_phys);
 	ASSERT(zp->z_dbuf_held);
 	mutex_exit(&zp->z_lock);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 }
 
 void
 zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_remove(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	kmem_cache_free(znode_cache, zp);
 }
 
 void
 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
 {
 	timestruc_t	now;
 
 	ASSERT(MUTEX_HELD(&zp->z_lock));
 
 	gethrestime(&now);
 
 	if (tx) {
 		dmu_buf_will_dirty(zp->z_dbuf, tx);
 		zp->z_atime_dirty = 0;
 		zp->z_seq++;
 	} else {
 		zp->z_atime_dirty = 1;
 	}
 
 	if (flag & AT_ATIME)
 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
 
 	if (flag & AT_MTIME)
 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
 
 	if (flag & AT_CTIME)
 		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
 }
 
 /*
  * Update the requested znode timestamps with the current time.
  * If we are in a transaction, then go ahead and mark the znode
  * dirty in the transaction so the timestamps will go to disk.
  * Otherwise, we will get pushed next time the znode is updated
  * in a transaction, or when this znode eventually goes inactive.
  *
  * Why is this OK?
  *  1 - Only the ACCESS time is ever updated outside of a transaction.
  *  2 - Multiple consecutive updates will be collapsed into a single
  *	znode update by the transaction grouping semantics of the DMU.
  */
 void
 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
 {
 	mutex_enter(&zp->z_lock);
 	zfs_time_stamper_locked(zp, flag, tx);
 	mutex_exit(&zp->z_lock);
 }
 
 /*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of file to free data in.
  *		size	- requested block size
  *		tx	- open transaction.
  *
  * NOTE: this function assumes that the znode is write locked.
  */
 void
 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
 {
 	int		error;
 	u_longlong_t	dummy;
 
 	if (size <= zp->z_blksz)
 		return;
 	/*
 	 * If the file size is already greater than the current blocksize,
 	 * we will not grow.  If there is more than one block in a file,
 	 * the blocksize cannot change.
 	 */
 	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
 		return;
 
 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
 	    size, 0, tx);
 	if (error == ENOTSUP)
 		return;
 	ASSERT3U(error, ==, 0);
 
 	/* What blocksize did we actually get? */
 	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
 }
 
 /*
  * Free space in a file.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of section to free.
  *		len	- length of section to free (0 => to EOF).
  *		flag	- current file open mode flags.
  *
  * 	RETURN:	0 if success
  *		error code if failure
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 {
 	vnode_t *vp = ZTOV(zp);
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog = zfsvfs->z_log;
 	rl_t *rl;
 	uint64_t end = off + len;
 	uint64_t size, new_blksz;
 	int error;
 
 	if (ZTOV(zp)->v_type == VFIFO)
 		return (0);
 
 	/*
 	 * If we will change zp_size then lock the whole file,
 	 * otherwise just lock the range being freed.
 	 */
 	if (len == 0 || off + len > zp->z_phys->zp_size) {
 		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 	} else {
 		rl = zfs_range_lock(zp, off, len, RL_WRITER);
 		/* recheck, in case zp_size changed */
 		if (off + len > zp->z_phys->zp_size) {
 			/* lost race: file size changed, lock whole file */
 			zfs_range_unlock(rl);
 			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 		}
 	}
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	size = zp->z_phys->zp_size;
 	if (len == 0 && size == off && off != 0) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	new_blksz = 0;
 	if (end > size &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
 			ASSERT(!ISP2(zp->z_blksz));
 			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
 		} else {
 			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
 		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
 	} else if (off < size) {
 		/*
 		 * If len == 0, we are truncating the file.
 		 */
 		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
 	}
 
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
 			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
 
 	if (new_blksz)
 		zfs_grow_blocksize(zp, new_blksz, tx);
 
 	if (end > size || len == 0)
 		zp->z_phys->zp_size = end;
 
 	if (off < size) {
 		objset_t *os = zfsvfs->z_os;
 		uint64_t rlen = len;
 
 		if (len == 0)
 			rlen = -1;
 		else if (end > size)
 			rlen = size - off;
 		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
 	}
 
 	if (log) {
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 	}
 
 	zfs_range_unlock(rl);
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
 	rw_enter(&zp->z_map_lock, RW_WRITER);
 	if (end > size)
 		vnode_pager_setsize(vp, end);
 	else if (len == 0) {
 #if 0
 		error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
 #else
 		error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
 		vnode_pager_setsize(vp, end);
 #endif
 	}
 	rw_exit(&zp->z_map_lock);
 
 	return (0);
 }
 
 void
 zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
 {
 	zfsvfs_t	zfsvfs;
 	uint64_t	moid, doid, roid = 0;
 	uint64_t	version = ZPL_VERSION;
 	int		error;
 	znode_t		*rootzp = NULL;
 	vattr_t		vattr;
 
 	/*
 	 * First attempt to create master node.
 	 */
 	/*
 	 * In an empty objset, there are no blocks to read and thus
 	 * there can be no i/o errors (which we assert below).
 	 */
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Set starting attributes.
 	 */
 
 	error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create a delete queue.
 	 */
 	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
 	 */
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = UID_ROOT;
 	vattr.va_gid = GID_WHEEL;
 
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	zfs_znode_cache_constructor(rootzp, NULL, 0);
 	rootzp->z_zfsvfs = &zfsvfs;
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_dbuf_held = 0;
 
 	bzero(&zfsvfs, sizeof (zfsvfs_t));
 
 	zfsvfs.z_os = os;
 	zfsvfs.z_assign = TXG_NOWAIT;
 	zfsvfs.z_parent = &zfsvfs;
 
 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
 	ASSERT3U(rootzp->z_id, ==, roid);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
 	ASSERT(error == 0);
 
 	mutex_destroy(&zfsvfs.z_znodes_lock);
 	kmem_cache_free(znode_cache, rootzp);
 }
 #endif /* _KERNEL */
 
 /*
  * Given an object number, return its parent object number and whether
  * or not the object is an extended attribute directory.
  */
 static int
 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
 {
 	dmu_buf_t *db;
 	dmu_object_info_t doi;
 	znode_phys_t *zp;
 	int error;
 
 	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
 		return (error);
 
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
 		dmu_buf_rele(db, FTAG);
 		return (EINVAL);
 	}
 
 	zp = db->db_data;
 	*pobjp = zp->zp_parent;
 	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
 	    S_ISDIR(zp->zp_mode);
 	dmu_buf_rele(db, FTAG);
 
 	return (0);
 }
 
 int
 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 {
 	char *path = buf + len - 1;
 	int error;
 
 	*path = '\0';
 
 	for (;;) {
 		uint64_t pobj;
 		char component[MAXNAMELEN + 2];
 		size_t complen;
 		int is_xattrdir;
 
 		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
 		    &is_xattrdir)) != 0)
 			break;
 
 		if (pobj == obj) {
 			if (path[0] != '/')
 				*--path = '/';
 			break;
 		}
 
 		component[0] = '/';
 		if (is_xattrdir) {
 			(void) sprintf(component + 1, "<xattrdir>");
 		} else {
 			error = zap_value_search(osp, pobj, obj, component + 1);
 			if (error != 0)
 				break;
 		}
 
 		complen = strlen(component);
 		path -= complen;
 		ASSERT(path >= buf);
 		bcopy(component, path, complen);
 		obj = pobj;
 	}
 
 	if (error == 0)
 		(void) memmove(buf, path, buf + len - path);
 	return (error);
 }
Index: head/sys/compat/linprocfs/linprocfs.c
===================================================================
--- head/sys/compat/linprocfs/linprocfs.c	(revision 175201)
+++ head/sys/compat/linprocfs/linprocfs.c	(revision 175202)
@@ -1,1275 +1,1275 @@
 /*-
  * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
  * Copyright (c) 1999 Pierre Beyssac
  * Copyright (c) 1993 Jan-Simon Pendry
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)procfs_status.c	8.4 (Berkeley) 6/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/blist.h>
 #include <sys/conf.h>
 #include <sys/exec.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/msg.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/sem.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/tty.h>
 #include <sys/user.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <net/if.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/swap_pager.h>
 
 #include <machine/clock.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #endif /* __i386__ || __amd64__ */
 
 #include "opt_compat.h"
 #ifdef COMPAT_LINUX32				/* XXX */
 #include <machine/../linux32/linux.h>
 #else
 #include <machine/../linux/linux.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_util.h>
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/procfs/procfs.h>
 
 /*
  * Various conversion macros
  *
  * T2J and T2S divide by stathz when it is non-zero and by hz otherwise.
  * T2J multiplies by 100UL first, so its result is in hundredths of a
  * second and has type unsigned long.  The B2x / P2x macros are plain
  * shifts by 10 and / or PAGE_SHIFT.  Every macro expands its argument
  * exactly once.
  */
 #define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz))	/* ticks to jiffies */
 #define T2S(x) ((x) / (stathz ? stathz : hz))		/* ticks to seconds */
 #define B2K(x) ((x) >> 10)				/* bytes to kbytes */
 #define B2P(x) ((x) >> PAGE_SHIFT)			/* bytes to pages */
 #define P2B(x) ((x) << PAGE_SHIFT)			/* pages to bytes */
 #define P2K(x) ((x) << (PAGE_SHIFT - 10))		/* pages to kbytes */
 
 /**
  * @brief Mapping of ki_stat in struct kinfo_proc to the linux state
  *
  * The linux procfs state field displays one of the characters RSDZTW to
  * denote running, sleeping in an interruptible wait, waiting in an
  * uninterruptible disk sleep, a zombie process, process is being traced
  * or stopped, or process is paging respectively.
  *
  * Our struct kinfo_proc contains the variable ki_stat which contains a
  * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK.
  *
  * This character array is indexed with ki_stat - 1 and tries to map our
  * states to suitable linux states.
  *
  * Note that the array holds seven state characters plus the terminating
  * NUL, so sizeof(linux_state) is 8, not 7.
  */
 static char linux_state[] = "RRSTZDD";
 
 /*
  * Filler function for proc/meminfo
  */
 static int
 linprocfs_domeminfo(PFS_FILL_ARGS)
 {
 	unsigned long memtotal;		/* total memory in bytes */
 	unsigned long memused;		/* used memory in bytes */
 	unsigned long memfree;		/* free memory in bytes */
 	unsigned long memshared;	/* shared memory ??? */
 	unsigned long buffers, cached;	/* buffer / cache memory ??? */
 	unsigned long long swaptotal;	/* total swap space in bytes */
 	unsigned long long swapused;	/* used swap space in bytes */
 	unsigned long long swapfree;	/* free swap space in bytes */
 	vm_object_t object;
 	int i, j;
 
 	memtotal = physmem * PAGE_SIZE;
 	/*
 	 * The correct thing here would be:
 	 *
 	memfree = cnt.v_free_count * PAGE_SIZE;
 	memused = memtotal - memfree;
 	 *
 	 * but it might mislead linux binaries into thinking there
 	 * is very little memory left, so we cheat and tell them that
 	 * all memory that isn't wired down is free.
 	 */
 	memused = cnt.v_wire_count * PAGE_SIZE;
 	memfree = memtotal - memused;
 	swap_pager_status(&i, &j);
 	swaptotal = (unsigned long long)i * PAGE_SIZE;
 	swapused = (unsigned long long)j * PAGE_SIZE;
 	swapfree = swaptotal - swapused;
 	memshared = 0;
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list)
 		if (object->shadow_count > 1)
 			memshared += object->resident_page_count;
 	mtx_unlock(&vm_object_list_mtx);
 	memshared *= PAGE_SIZE;
 	/*
 	 * We'd love to be able to write:
 	 *
 	buffers = bufspace;
 	 *
 	 * but bufspace is internal to vfs_bio.c and we don't feel
 	 * like unstaticizing it just for linprocfs's sake.
 	 */
 	buffers = 0;
 	cached = cnt.v_cache_count * PAGE_SIZE;
 
 	sbuf_printf(sb,
 	    "	     total:    used:	free:  shared: buffers:	 cached:\n"
 	    "Mem:  %lu %lu %lu %lu %lu %lu\n"
 	    "Swap: %llu %llu %llu\n"
 	    "MemTotal: %9lu kB\n"
 	    "MemFree:  %9lu kB\n"
 	    "MemShared:%9lu kB\n"
 	    "Buffers:  %9lu kB\n"
 	    "Cached:   %9lu kB\n"
 	    "SwapTotal:%9llu kB\n"
 	    "SwapFree: %9llu kB\n",
 	    memtotal, memused, memfree, memshared, buffers, cached,
 	    swaptotal, swapused, swapfree,
 	    B2K(memtotal), B2K(memfree),
 	    B2K(memshared), B2K(buffers), B2K(cached),
 	    B2K(swaptotal), B2K(swapfree));
 
 	return (0);
 }
 
 #if defined(__i386__) || defined(__amd64__)
 /*
  * Filler function for proc/cpuinfo (i386 & amd64 version)
  *
  * Prints one processor / vendor / family / model / stepping stanza per
  * CPU (all CPUs are reported with the same values), followed by a single
  * "flags" line built from cpu_feature and, for class >= 5, the TSC
  * frequency as both "cpu MHz" and "bogomips".
  */
 static int
 linprocfs_docpuinfo(PFS_FILL_ARGS)
 {
 	int hw_model[2];
 	char model[128];
 	size_t size;
 	int class, fqmhz, fqkhz;
 	int i;
 
 	/*
 	 * We default the flags to include all non-conflicting flags,
 	 * and the Intel versions of conflicting flags.
 	 *
 	 * The table is static and is patched in place below for AMD and
 	 * Cyrix CPUs; one entry per bit of cpu_feature (32 entries).
 	 */
 	static char *flags[] = {
 		"fpu",	    "vme",     "de",	   "pse",      "tsc",
 		"msr",	    "pae",     "mce",	   "cx8",      "apic",
 		"sep",	    "sep",     "mtrr",	   "pge",      "mca",
 		"cmov",	    "pat",     "pse36",	   "pn",       "b19",
 		"b20",	    "b21",     "mmxext",   "mmx",      "fxsr",
 		"xmm",	    "b26",     "b27",	   "b28",      "b29",
 		"3dnowext", "3dnow"
 	};
 
 	switch (cpu_class) {
 #ifdef __i386__
 	case CPUCLASS_286:
 		class = 2;
 		break;
 	case CPUCLASS_386:
 		class = 3;
 		break;
 	case CPUCLASS_486:
 		class = 4;
 		break;
 	case CPUCLASS_586:
 		class = 5;
 		break;
 	case CPUCLASS_686:
 		class = 6;
 		break;
 	default:
 		class = 0;
 		break;
 #else /* __amd64__ */
 	default:
 		class = 15;
 		break;
 #endif
 	}
 
 	/* Ask sysctl hw.model for the model name; fall back to "unknown". */
 	hw_model[0] = CTL_HW;
 	hw_model[1] = HW_MODEL;
 	model[0] = '\0';
 	size = sizeof(model);
 	if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0)
 		strcpy(model, "unknown");
 	for (i = 0; i < mp_ncpus; ++i) {
 		sbuf_printf(sb,
 		    "processor\t: %d\n"
 		    "vendor_id\t: %.20s\n"
 		    "cpu family\t: %d\n"
 		    "model\t\t: %d\n"
 		    "model name\t: %s\n"
 		    "stepping\t: %d\n",
 		    i, cpu_vendor, class, cpu, model, cpu_id & 0xf);
 		/* XXX per-cpu vendor / class / model / id? */
 	}
 
 	sbuf_cat(sb,
 	    "flags\t\t:");
 
 	if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) {
 		flags[16] = "fcmov";
 	} else if (!strcmp(cpu_vendor, "CyrixInstead")) {
 		flags[24] = "cxmmx";
 	}
 
 	/*
 	 * Use an unsigned constant for the mask: (1 << 31) on a signed int
 	 * shifts into the sign bit, which is undefined behaviour.
 	 */
 	for (i = 0; i < 32; i++)
 		if (cpu_feature & (1U << i))
 			sbuf_printf(sb, " %s", flags[i]);
 	sbuf_cat(sb, "\n");
 	if (class >= 5) {
 		fqmhz = (tsc_freq + 4999) / 1000000;
 		fqkhz = ((tsc_freq + 4999) / 10000) % 100;
 		sbuf_printf(sb,
 		    "cpu MHz\t\t: %d.%02d\n"
 		    "bogomips\t: %d.%02d\n",
 		    fqmhz, fqkhz, fqmhz, fqkhz);
 	}
 
 	return (0);
 }
 #endif /* __i386__ || __amd64__ */
 
 /*
  * Filler function for proc/mtab
  *
  * This file doesn't exist in Linux' procfs, but is included here so
  * users can symlink /compat/linux/etc/mtab to /proc/mtab
  *
  * One line is emitted per entry of the mount list.  Mount points that
  * live below the (resolved) Linux emulation prefix have that prefix
  * stripped; this file system itself is shown as "proc", native procfs
  * mounts are hidden and linsysfs is shown as sysfs mounted from /sys.
  */
 static int
 linprocfs_domtab(PFS_FILL_ARGS)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	const char *lep;
 	char *dlep, *flep, *mntto, *mntfrom, *fstype;
 	size_t lep_len;
 	int error;
 
 	/* resolve symlinks etc. in the emulation tree prefix */
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, linux_emul_path, td);
 	flep = NULL;
 	lep = linux_emul_path;
 	error = namei(&nd);
 	if (error == 0) {
 		if (vn_fullpath(td, nd.ni_vp, &dlep, &flep) == 0)
 			lep = dlep;
 		/*
 		 * A successful namei() hands back a referenced vnode in
 		 * ni_vp.  Only its path is needed, so drop the reference
 		 * here; previously it was leaked on every read.
 		 */
 		vrele(nd.ni_vp);
 	}
 	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
 	lep_len = strlen(lep);
 
 	mtx_lock(&mountlist_mtx);
 	error = 0;
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		/* determine device name */
 		mntfrom = mp->mnt_stat.f_mntfromname;
 
 		/* determine mount point */
 		mntto = mp->mnt_stat.f_mntonname;
 		if (strncmp(mntto, lep, lep_len) == 0 &&
 		    mntto[lep_len] == '/')
 			mntto += lep_len;
 
 		/* determine fs type */
 		fstype = mp->mnt_stat.f_fstypename;
 		if (strcmp(fstype, pn->pn_info->pi_name) == 0)
 			mntfrom = fstype = "proc";
 		else if (strcmp(fstype, "procfs") == 0)
 			continue;
 
 		if (strcmp(fstype, "linsysfs") == 0) {
 			sbuf_printf(sb, "/sys %s sysfs %s", mntto,
 			    mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
 		} else {
 			sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype,
 			    mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
 		}
 #define ADD_OPTION(opt, name) \
 	if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name);
 		ADD_OPTION(MNT_SYNCHRONOUS,	"sync");
 		ADD_OPTION(MNT_NOEXEC,		"noexec");
 		ADD_OPTION(MNT_NOSUID,		"nosuid");
 		ADD_OPTION(MNT_UNION,		"union");
 		ADD_OPTION(MNT_ASYNC,		"async");
 		ADD_OPTION(MNT_SUIDDIR,		"suiddir");
 		ADD_OPTION(MNT_NOSYMFOLLOW,	"nosymfollow");
 		ADD_OPTION(MNT_NOATIME,		"noatime");
 #undef ADD_OPTION
 		/* a real Linux mtab will also show NFS options */
 		sbuf_printf(sb, " 0 0\n");
 	}
 	mtx_unlock(&mountlist_mtx);
 	/* flep is the buffer vn_fullpath() allocated for dlep, if any */
 	if (flep != NULL)
 		free(flep, M_TEMP);
 	return (error);
 }
 
 /*
  * Filler function for proc/stat
  *
  * Writes the aggregate "cpu" line, one "cpuN" line for every CPU that
  * is present, and then the disk / page / swap / intr / ctxt / btime
  * lines taken from the global vmmeter and boottime.
  */
 static int
 linprocfs_dostat(PFS_FILL_ARGS)
 {
 	long total[CPUSTATES];
 	struct pcpu *pc;
 	long *percpu;
 	int cpuid;
 
 	/* System-wide totals first (interrupt time is not folded in). */
 	read_cpu_time(total);
 	sbuf_printf(sb, "cpu %ld %ld %ld %ld\n",
 	    T2J(total[CP_USER]),
 	    T2J(total[CP_NICE]),
 	    T2J(total[CP_SYS]),
 	    T2J(total[CP_IDLE]));
 
 	/* Then the same four columns for each CPU that exists. */
 	for (cpuid = 0; cpuid <= mp_maxid; ++cpuid) {
 		if (!CPU_ABSENT(cpuid)) {
 			pc = pcpu_find(cpuid);
 			percpu = pc->pc_cp_time;
 			sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", cpuid,
 			    T2J(percpu[CP_USER]),
 			    T2J(percpu[CP_NICE]),
 			    T2J(percpu[CP_SYS]),
 			    T2J(percpu[CP_IDLE]));
 		}
 	}
 
 	sbuf_printf(sb,
 	    "disk 0 0 0 0\n"
 	    "page %u %u\n"
 	    "swap %u %u\n"
 	    "intr %u\n"
 	    "ctxt %u\n"
 	    "btime %lld\n",
 	    cnt.v_vnodepgsin,
 	    cnt.v_vnodepgsout,
 	    cnt.v_swappgsin,
 	    cnt.v_swappgsout,
 	    cnt.v_intr,
 	    cnt.v_swtch,
 	    (long long)boottime.tv_sec);
 	return (0);
 }
 
 /*
  * Filler function for proc/uptime
  */
 static int
 linprocfs_douptime(PFS_FILL_ARGS)
 {
 	long cp_time[CPUSTATES];
 	struct timeval tv;
 
 	getmicrouptime(&tv);
 	read_cpu_time(cp_time);
 	sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n",
 	    (long long)tv.tv_sec, tv.tv_usec / 10000,
 	    T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100);
 	return (0);
 }
 
 /*
  * Get OS build date
  *
  * Appends a build-number / build-date string to sb.  The code that would
  * extract this from the kernel `version' string is disabled with #if 0,
  * so as compiled the function always appends the fixed string below.
  * Note that the live sbuf_cat() is the body of the disabled `else'; it
  * must stay directly after the #endif.  td is not used.
  */
 static void
 linprocfs_osbuild(struct thread *td, struct sbuf *sb)
 {
 #if 0
 	char osbuild[256];
 	char *cp1, *cp2;
 
 	strncpy(osbuild, version, 256);
 	osbuild[255] = '\0';
 	cp1 = strstr(osbuild, "\n");
 	cp2 = strstr(osbuild, ":");
 	if (cp1 && cp2) {
 		*cp1 = *cp2 = '\0';
 		cp1 = strstr(osbuild, "#");
 	} else
 		cp1 = NULL;
 	if (cp1)
 		sbuf_printf(sb, "%s%s", cp1, cp2 + 1);
 	else
 #endif
 		sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977");
 }
 
 /*
  * Get OS builder
  *
  * Appends the identity of the kernel builder to sb.  The code that would
  * parse it out of the kernel `version' string is disabled with #if 0, so
  * as compiled the function always appends the fixed address below.  The
  * live sbuf_cat() is the body of the disabled `else' and must stay
  * directly after the #endif.  td is not used.
  */
 static void
 linprocfs_osbuilder(struct thread *td, struct sbuf *sb)
 {
 #if 0
 	char builder[256];
 	char *cp;
 
 	cp = strstr(version, "\n    ");
 	if (cp) {
 		strncpy(builder, cp + 5, 256);
 		builder[255] = '\0';
 		cp = strstr(builder, ":");
 		if (cp)
 			*cp = '\0';
 	}
 	if (cp)
 		sbuf_cat(sb, builder);
 	else
 #endif
 		sbuf_cat(sb, "des@freebsd.org");
 }
 
 /*
  * Filler function for proc/version
  *
  * Output has the shape
  *   <osname> version <osrelease> (<builder>) (gcc version <ver>) <build>
  * where name and release are whatever the Linux emulation layer reports
  * for the calling thread.
  */
 static int
 linprocfs_doversion(PFS_FILL_ARGS)
 {
 	char release[LINUX_MAX_UTSNAME];
 	char sysname[LINUX_MAX_UTSNAME];
 
 	linux_get_osname(td, sysname);
 	linux_get_osrelease(td, release);
 
 	/* "<name> version <release> (" ... builder ... ")" */
 	sbuf_printf(sb, "%s version %s (", sysname, release);
 	linprocfs_osbuilder(td, sb);
 
 	/* compiler banner, then the build stamp and the final newline */
 	sbuf_cat(sb, ") (gcc version " __VERSION__ ") ");
 	linprocfs_osbuild(td, sb);
 	sbuf_cat(sb, "\n");
 
 	return (0);
 }
 
 /*
  * Filler function for proc/loadavg
  *
  * Prints the three load averages from averunnable with two decimals
  * each, then "running/total" task counts (running is hard-wired to 1)
  * and the most recently allocated pid.
  */
 static int
 linprocfs_doloadavg(PFS_FILL_ARGS)
 {
 	int whole[3], hundredths[3];
 	int k;
 
 	/* Split each fixed-point average into integer and fraction parts. */
 	for (k = 0; k < 3; k++) {
 		whole[k] = (int)(averunnable.ldavg[k] / averunnable.fscale);
 		hundredths[k] =
 		    (int)(averunnable.ldavg[k] * 100 / averunnable.fscale % 100);
 	}
 
 	sbuf_printf(sb,
 	    "%d.%02d %d.%02d %d.%02d %d/%d %d\n",
 	    whole[0], hundredths[0],
 	    whole[1], hundredths[1],
 	    whole[2], hundredths[2],
 	    1,				/* number of running tasks */
 	    nprocs,			/* number of tasks */
 	    lastpid			/* the last pid */
 	);
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/stat
  *
  * Emits the single-line, space-separated Linux stat record for process
  * p.  Fields that have no FreeBSD equivalent here are printed as 0 and
  * marked XXX.  The first argument of PS_ADD is only a label for the
  * reader; it is not part of the output.
  */
 static int
 linprocfs_doprocstat(PFS_FILL_ARGS)
 {
 	struct kinfo_proc kp;
 	char state;
 	static int ratelimit = 0;
 
 	PROC_LOCK(p);
 	fill_kinfo_proc(p, &kp);
 	sbuf_printf(sb, "%d", p->p_pid);
 #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg)
 	PS_ADD("comm",		"(%s)",	p->p_comm);
 	/*
 	 * linux_state is indexed with ki_stat - 1, so the valid range for
 	 * ki_stat is 1 .. strlen(linux_state).  sizeof() also counts the
 	 * terminating NUL, hence the "- 1"; the lower bound keeps a zero or
 	 * negative state from indexing in front of the array.
 	 */
 	if (kp.ki_stat < 1 ||
 	    (size_t)kp.ki_stat > sizeof(linux_state) - 1) {
 		state = 'R';
 
 		/* complain once only */
 		if (ratelimit == 0) {
 			printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n",
 			    kp.ki_stat, sizeof(linux_state));
 			++ratelimit;
 		}
 	} else
 		state = linux_state[kp.ki_stat - 1];
 	PS_ADD("state",		"%c",	state);
 	PS_ADD("ppid",		"%d",	p->p_pptr ? p->p_pptr->p_pid : 0);
 	PS_ADD("pgrp",		"%d",	p->p_pgid);
 	PS_ADD("session",	"%d",	p->p_session->s_sid);
 	/* everything below only uses the kinfo_proc snapshot */
 	PROC_UNLOCK(p);
 	PS_ADD("tty",		"%d",	0); /* XXX */
 	PS_ADD("tpgid",		"%d",	kp.ki_tpgid);
 	PS_ADD("flags",		"%u",	0); /* XXX */
 	PS_ADD("minflt",	"%lu",	kp.ki_rusage.ru_minflt);
 	PS_ADD("cminflt",	"%lu",	kp.ki_rusage_ch.ru_minflt);
 	PS_ADD("majflt",	"%lu",	kp.ki_rusage.ru_majflt);
 	PS_ADD("cmajflt",	"%lu",	kp.ki_rusage_ch.ru_majflt);
 	PS_ADD("utime",		"%ld",	T2J(tvtohz(&kp.ki_rusage.ru_utime)));
 	PS_ADD("stime",		"%ld",	T2J(tvtohz(&kp.ki_rusage.ru_stime)));
 	PS_ADD("cutime",	"%ld",	T2J(tvtohz(&kp.ki_rusage_ch.ru_utime)));
 	PS_ADD("cstime",	"%ld",	T2J(tvtohz(&kp.ki_rusage_ch.ru_stime)));
 	PS_ADD("priority",	"%d",	kp.ki_pri.pri_user);
 	PS_ADD("nice",		"%d",	kp.ki_nice); /* 19 (nicest) to -19 */
 	PS_ADD("0",		"%d",	0); /* removed field */
 	PS_ADD("itrealvalue",	"%d",	0); /* XXX */
 	/* XXX: starttime is not right, it is the _same_ for _every_ process.
 	   It should be the number of jiffies between system boot and process
 	   start. */
 	PS_ADD("starttime",	"%lu",	T2J(tvtohz(&kp.ki_start)));
 	PS_ADD("vsize",		"%ju",	P2K((uintmax_t)kp.ki_size));
 	PS_ADD("rss",		"%ju",	(uintmax_t)kp.ki_rssize);
 	PS_ADD("rlim",		"%lu",	kp.ki_rusage.ru_maxrss);
 	PS_ADD("startcode",	"%u",	(unsigned)0);
 	PS_ADD("endcode",	"%u",	0); /* XXX */
 	PS_ADD("startstack",	"%u",	0); /* XXX */
 	PS_ADD("kstkesp",	"%u",	0); /* XXX */
 	PS_ADD("kstkeip",	"%u",	0); /* XXX */
 	PS_ADD("signal",	"%u",	0); /* XXX */
 	PS_ADD("blocked",	"%u",	0); /* XXX */
 	PS_ADD("sigignore",	"%u",	0); /* XXX */
 	PS_ADD("sigcatch",	"%u",	0); /* XXX */
 	PS_ADD("wchan",		"%u",	0); /* XXX */
 	PS_ADD("nswap",		"%lu",	kp.ki_rusage.ru_nswap);
 	PS_ADD("cnswap",	"%lu",	kp.ki_rusage_ch.ru_nswap);
 	PS_ADD("exitsignal",	"%d",	0); /* XXX */
 	PS_ADD("processor",	"%u",	kp.ki_lastcpu);
 	PS_ADD("rt_priority",	"%u",	0); /* XXX */ /* >= 2.5.19 */
 	PS_ADD("policy",	"%u",	kp.ki_pri.pri_class); /* >= 2.5.19 */
 #undef PS_ADD
 	sbuf_putc(sb, '\n');
 
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/statm
  */
 static int
 linprocfs_doprocstatm(PFS_FILL_ARGS)
 {
 	struct kinfo_proc kp;
 	segsz_t lsize;
 
 	PROC_LOCK(p);
 	fill_kinfo_proc(p, &kp);
 	PROC_UNLOCK(p);
 
 	/*
 	 * See comments in linprocfs_doprocstatus() regarding the
 	 * computation of lsize.
 	 */
 	/* size resident share trs drs lrs dt */
 	sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size));
 	sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize);
 	sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */
 	sbuf_printf(sb, "%ju ",	(uintmax_t)kp.ki_tsize);
 	sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize));
 	lsize = B2P(kp.ki_size) - kp.ki_dsize -
 	    kp.ki_ssize - kp.ki_tsize - 1;
 	sbuf_printf(sb, "%ju ", (uintmax_t)lsize);
 	sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */
 
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/status
  */
 static int
 linprocfs_doprocstatus(PFS_FILL_ARGS)
 {
 	struct kinfo_proc kp;
 	char *state;
 	segsz_t lsize;
 	struct thread *td2;
 	struct sigacts *ps;
 	int i;
 
 	PROC_LOCK(p);
 	td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
 
 	if (P_SHOULDSTOP(p)) {
 		state = "T (stopped)";
 	} else {
 		PROC_SLOCK(p);
 		switch(p->p_state) {
 		case PRS_NEW:
 			state = "I (idle)";
 			break;
 		case PRS_NORMAL:
 			if (p->p_flag & P_WEXIT) {
 				state = "X (exiting)";
 				break;
 			}
 			switch(td2->td_state) {
 			case TDS_INHIBITED:
 				state = "S (sleeping)";
 				break;
 			case TDS_RUNQ:
 			case TDS_RUNNING:
 				state = "R (running)";
 				break;
 			default:
 				state = "? (unknown)";
 				break;
 			}
 			break;
 		case PRS_ZOMBIE:
 			state = "Z (zombie)";
 			break;
 		default:
 			state = "? (unknown)";
 			break;
 		}
 		PROC_SUNLOCK(p);
 	}
 
 	fill_kinfo_proc(p, &kp);
 	sbuf_printf(sb, "Name:\t%s\n",		p->p_comm); /* XXX escape */
 	sbuf_printf(sb, "State:\t%s\n",		state);
 
 	/*
 	 * Credentials
 	 */
 	sbuf_printf(sb, "Pid:\t%d\n",		p->p_pid);
 	sbuf_printf(sb, "PPid:\t%d\n",		p->p_pptr ?
 						p->p_pptr->p_pid : 0);
 	sbuf_printf(sb, "Uid:\t%d %d %d %d\n",	p->p_ucred->cr_ruid,
 						p->p_ucred->cr_uid,
 						p->p_ucred->cr_svuid,
 						/* FreeBSD doesn't have fsuid */
 						p->p_ucred->cr_uid);
 	sbuf_printf(sb, "Gid:\t%d %d %d %d\n",	p->p_ucred->cr_rgid,
 						p->p_ucred->cr_gid,
 						p->p_ucred->cr_svgid,
 						/* FreeBSD doesn't have fsgid */
 						p->p_ucred->cr_gid);
 	sbuf_cat(sb, "Groups:\t");
 	for (i = 0; i < p->p_ucred->cr_ngroups; i++)
 		sbuf_printf(sb, "%d ",		p->p_ucred->cr_groups[i]);
 	PROC_UNLOCK(p);
 	sbuf_putc(sb, '\n');
 
 	/*
 	 * Memory
 	 *
 	 * While our approximation of VmLib may not be accurate (I
 	 * don't know of a simple way to verify it, and I'm not sure
 	 * it has much meaning anyway), I believe it's good enough.
 	 *
 	 * The same code that could (I think) accurately compute VmLib
 	 * could also compute VmLck, but I don't really care enough to
 	 * implement it. Submissions are welcome.
 	 */
 	sbuf_printf(sb, "VmSize:\t%8ju kB\n",	B2K((uintmax_t)kp.ki_size));
 	sbuf_printf(sb, "VmLck:\t%8u kB\n",	P2K(0)); /* XXX */
 	sbuf_printf(sb, "VmRss:\t%8ju kB\n",	P2K((uintmax_t)kp.ki_rssize));
 	sbuf_printf(sb, "VmData:\t%8ju kB\n",	P2K((uintmax_t)kp.ki_dsize));
 	sbuf_printf(sb, "VmStk:\t%8ju kB\n",	P2K((uintmax_t)kp.ki_ssize));
 	sbuf_printf(sb, "VmExe:\t%8ju kB\n",	P2K((uintmax_t)kp.ki_tsize));
 	lsize = B2P(kp.ki_size) - kp.ki_dsize -
 	    kp.ki_ssize - kp.ki_tsize - 1;
 	sbuf_printf(sb, "VmLib:\t%8ju kB\n",	P2K((uintmax_t)lsize));
 
 	/*
 	 * Signal masks
 	 *
 	 * We support up to 128 signals, while Linux supports 32,
 	 * but we only define 32 (the same 32 as Linux, to boot), so
 	 * just show the lower 32 bits of each mask. XXX hack.
 	 *
 	 * NB: on certain platforms (Sparc at least) Linux actually
 	 * supports 64 signals, but this code is a long way from
 	 * running on anything but i386, so ignore that for now.
 	 */
 	PROC_LOCK(p);
 	sbuf_printf(sb, "SigPnd:\t%08x\n",	p->p_siglist.__bits[0]);
 	/*
 	 * I can't seem to find out where the signal mask is in
 	 * relation to struct proc, so SigBlk is left unimplemented.
 	 */
 	sbuf_printf(sb, "SigBlk:\t%08x\n",	0); /* XXX */
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	sbuf_printf(sb, "SigIgn:\t%08x\n",	ps->ps_sigignore.__bits[0]);
 	sbuf_printf(sb, "SigCgt:\t%08x\n",	ps->ps_sigcatch.__bits[0]);
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Linux also prints the capability masks, but we don't have
 	 * capabilities yet, and when we do get them they're likely to
 	 * be meaningless to Linux programs, so we lie. XXX
 	 */
 	sbuf_printf(sb, "CapInh:\t%016x\n",	0);
 	sbuf_printf(sb, "CapPrm:\t%016x\n",	0);
 	sbuf_printf(sb, "CapEff:\t%016x\n",	0);
 
 	return (0);
 }
 
 
 /*
  * Filler function for proc/pid/cwd
  *
  * Writes the path of the process' current directory, or "unknown" when
  * vn_fullpath() leaves the default untouched.
  */
 static int
 linprocfs_doproccwd(PFS_FILL_ARGS)
 {
 	char *path, *tofree;
 
 	path = "unknown";
 	tofree = NULL;
 	vn_fullpath(td, p->p_fd->fd_cdir, &path, &tofree);
 	sbuf_printf(sb, "%s", path);
 
 	/* tofree is only set when vn_fullpath() handed out a buffer */
 	if (tofree != NULL)
 		free(tofree, M_TEMP);
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/root
  *
  * Writes the path of the process' root directory (its jail directory
  * when the process is jailed), or "unknown" when vn_fullpath() leaves
  * the default untouched.
  */
 static int
 linprocfs_doprocroot(PFS_FILL_ARGS)
 {
 	struct vnode *rootvp;
 	char *path, *tofree;
 
 	if (jailed(p->p_ucred))
 		rootvp = p->p_fd->fd_jdir;
 	else
 		rootvp = p->p_fd->fd_rdir;
 
 	path = "unknown";
 	tofree = NULL;
 	vn_fullpath(td, rootvp, &path, &tofree);
 	sbuf_printf(sb, "%s", path);
 
 	/* tofree is only set when vn_fullpath() handed out a buffer */
 	if (tofree != NULL)
 		free(tofree, M_TEMP);
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/cmdline
  *
  * Three cases, tried in order:
  *  1. The process has cached arguments (p_args) and the caller may see
  *     the process: copy the cached argument block.
  *  2. Otherwise, for a process other than the caller's own: print
  *     p_comm only.
  *  3. Otherwise (the caller's own process): read ps_strings and the
  *     argv vector from user space and emit every argument followed by
  *     a NUL byte.
  *
  * Note that if the argv is no longer available, we deliberately don't
  * fall back on p->p_comm or return an error: the authentic Linux
  * behaviour is to return zero-length in this case.
  */
 static int
 linprocfs_doproccmdline(PFS_FILL_ARGS)
 {
 	struct ps_strings pstr;
 	char **argv;
 	int error, n;
 
 	/* Case 1: cached argument block. */
 	PROC_LOCK(p);
 	if (p->p_args && p_cansee(td, p) == 0) {
 		sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	PROC_UNLOCK(p);
 
 	/* Case 2: somebody else's process, only the command name. */
 	if (p != td->td_proc) {
 		sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
 		return (0);
 	}
 
 	/* Case 3: our own process, fetch argv from user space. */
 	error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
 	    sizeof(pstr));
 	if (error)
 		return (error);
 	if (pstr.ps_nargvstr > ARG_MAX)
 		return (E2BIG);
 
 	argv = malloc(pstr.ps_nargvstr * sizeof(char *),
 	    M_TEMP, M_WAITOK);
 	error = copyin((void *)pstr.ps_argvstr, argv,
 	    pstr.ps_nargvstr * sizeof(char *));
 	if (error == 0) {
 		for (n = 0; n < pstr.ps_nargvstr; n++) {
 			sbuf_copyin(sb, argv[n], 0);
 			sbuf_printf(sb, "%c", '\0');
 		}
 	}
 	free(argv, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Filler function for proc/pid/environ
  *
  * Placeholder: the real environment is not exported; a fixed marker
  * line followed by a NUL byte is written instead.
  */
 static int
 linprocfs_doprocenviron(PFS_FILL_ARGS)
 {
 	const char terminator = '\0';
 
 	sbuf_printf(sb, "doprocenviron\n%c", terminator);
 	return (0);
 }
 
 /*
  * Filler function for proc/pid/maps
  *
  * Writes one line per VM map entry of process p directly to the uio
  * (not to the sbuf).  Only reads starting at offset 0 produce output;
  * the caller must pass p_candebug() for the target process.
  *
  * Returns 0 on success, the p_candebug() error, EOPNOTSUPP for anything
  * but a read, or the error from uiomove().
  */
 static int
 linprocfs_doprocmaps(PFS_FILL_ARGS)
 {
 	char mebuffer[512];
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry, tmp_entry;
 	vm_object_t obj, tobj, lobj;
 	vm_offset_t saved_end;
 	vm_ooffset_t off = 0;
 	char *name = "", *freename = NULL;
 	size_t len;
 	ino_t ino;
 	unsigned int last_timestamp;
 	int ref_count, shadow_count, flags;
 	int error;
 	struct vnode *vp;
 	struct vattr vat;
 	int locked;
 
 	PROC_LOCK(p);
 	error = p_candebug(td, p);
 	PROC_UNLOCK(p);
 	if (error)
 		return (error);
 
 	if (uio->uio_rw != UIO_READ)
 		return (EOPNOTSUPP);
 
 	/* everything is produced in one go; later offsets read as EOF */
 	if (uio->uio_offset != 0)
 		return (0);
 
 	error = 0;
 	vm_map_lock_read(map);
 	for (entry = map->header.next;
 	    ((uio->uio_resid > 0) && (entry != &map->header));
 	    entry = entry->next) {
 		name = "";
 		freename = NULL;
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 		/* remembered so the entry can be found again after unlocking */
 		saved_end = entry->end;
 		obj = entry->object.vm_object;
 		/*
 		 * Walk to the bottom of the backing object chain, locking
 		 * each object before releasing the previous one.  obj itself
 		 * stays locked throughout; lobj ends up as the last object.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			VM_OBJECT_LOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 			lobj = tobj;
 		}
 		ino = 0;
 		if (lobj) {
 			off = IDX_TO_OFF(lobj->size);
 			if (lobj->type == OBJT_VNODE) {
 				vp = lobj->handle;
 				/* keep the vnode alive past the object unlock */
 				if (vp)
 					vref(vp);
 			}
 			else
 				vp = NULL;
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 			flags = obj->flags;
 			ref_count = obj->ref_count;
 			shadow_count = obj->shadow_count;
 			VM_OBJECT_UNLOCK(obj);
 			if (vp) {
 				vn_fullpath(td, vp, &name, &freename);
 				locked = VFS_LOCK_GIANT(vp->v_mount);
-				vn_lock(vp, LK_SHARED | LK_RETRY, td);
+				vn_lock(vp, LK_SHARED | LK_RETRY);
 				VOP_GETATTR(vp, &vat, td->td_ucred, td);
 				ino = vat.va_fileid;
 				/* vput() both unlocks and releases the vref() above */
 				vput(vp);
 				VFS_UNLOCK_GIANT(locked);
 			}
 		} else {
 			flags = 0;
 			ref_count = 0;
 			shadow_count = 0;
 		}
 
 		/*
 		 * format:
 		 *  start, end, access, offset, major, minor, inode, name.
 		 */
 		snprintf(mebuffer, sizeof mebuffer,
 		    "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n",
 		    (u_long)entry->start, (u_long)entry->end,
 		    (entry->protection & VM_PROT_READ)?"r":"-",
 		    (entry->protection & VM_PROT_WRITE)?"w":"-",
 		    (entry->protection & VM_PROT_EXECUTE)?"x":"-",
 		    "p",
 		    (u_long)off,
 		    0,
 		    0,
 		    (u_long)ino,
 		    *name ? "     " : "",
 		    name
 		    );
 		if (freename)
 			free(freename, M_TEMP);
 		len = strlen(mebuffer);
 		if (len > uio->uio_resid)
 			len = uio->uio_resid; /*
 					       * XXX We should probably return
 					       * EFBIG here, as in procfs.
 					       */
 		/* the map lock is dropped around the copy-out */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		error = uiomove(mebuffer, len, uio);
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		if (last_timestamp + 1 != map->timestamp) {
 			/*
 			 * Look again for the entry because the map was
 			 * modified while it was unlocked.  Specifically,
 			 * the entry may have been clipped, merged, or deleted.
 			 */
 			vm_map_lookup_entry(map, saved_end - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 
 	return (error);
 }
 
 /*
  * Filler function for proc/net/dev
  *
  * Prints the two header lines and then one line per network interface,
  * using the Linux-style interface name.  All sixteen counters are
  * reported as zero.
  */
 static int
 linprocfs_donetdev(PFS_FILL_ARGS)
 {
 	struct ifnet *ifp;
 	char lname[16]; /* XXX LINUX_IFNAMSIZ */
 
 	sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n",
 	    "Inter-", "   Receive", "  Transmit", " face",
 	    "bytes    packets errs drop fifo frame compressed",
 	    "bytes    packets errs drop fifo frame compressed");
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
 		linux_ifname(ifp, lname, sizeof lname);
 		sbuf_printf(sb, "%6.6s:", lname);
 		/* receive columns */
 		sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ",
 		    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
 		/* transmit columns */
 		sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
 		    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/osrelease
  *
  * Writes the emulated Linux release string for the calling thread,
  * followed by a newline.
  */
 static int
 linprocfs_doosrelease(PFS_FILL_ARGS)
 {
 	char release[LINUX_MAX_UTSNAME];
 
 	linux_get_osrelease(td, release);
 	sbuf_printf(sb, "%s\n", release);
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/ostype
  *
  * Writes the emulated Linux OS name for the calling thread, followed
  * by a newline.
  */
 static int
 linprocfs_doostype(PFS_FILL_ARGS)
 {
 	char sysname[LINUX_MAX_UTSNAME];
 
 	linux_get_osname(td, sysname);
 	sbuf_printf(sb, "%s\n", sysname);
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/version
  *
  * Writes the build stamp produced by linprocfs_osbuild() and a
  * terminating newline.
  */
 static int
 linprocfs_doosbuild(PFS_FILL_ARGS)
 {
 
 	/* build stamp first, then end the line */
 	linprocfs_osbuild(td, sb);
 	sbuf_cat(sb, "\n");
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/msgmni
  *
  * Writes msginfo.msgmni as a decimal number on a line of its own.
  */
 static int
 linprocfs_domsgmni(PFS_FILL_ARGS)
 {
 
 	sbuf_printf(sb, "%d\n",
 	    msginfo.msgmni);
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/pid_max
  *
  * Writes the compile-time constant PID_MAX on a line of its own.
  */
 static int
 linprocfs_dopid_max(PFS_FILL_ARGS)
 {
 
 	sbuf_printf(sb, "%i\n",
 	    PID_MAX);
 
 	return (0);
 }
 
 /*
  * Filler function for proc/sys/kernel/sem
  *
  * Writes four space-separated limits from seminfo, in this order:
  * semmsl, semmns, semopm, semmni.
  */
 static int
 linprocfs_dosem(PFS_FILL_ARGS)
 {
 
 	sbuf_printf(sb, "%d %d %d %d\n",
 	    seminfo.semmsl,
 	    seminfo.semmns,
 	    seminfo.semopm,
 	    seminfo.semmni);
 
 	return (0);
 }
 
 /*
  * Filler function for proc/scsi/device_info
  *
  * Stub: writes nothing, so the file reads as empty, and reports success.
  */
 static int
 linprocfs_doscsidevinfo(PFS_FILL_ARGS)
 {
 
 	return (0);
 }
 
 /*
  * Filler function for proc/scsi/scsi
  *
  * Stub: writes nothing, so the file reads as empty, and reports success.
  */
 static int
 linprocfs_doscsiscsi(PFS_FILL_ARGS)
 {
 
 	return (0);
 }
 
 extern struct cdevsw *cdevsw[];
 
 /*
  * Filler function for proc/devices
  *
  * Prints the "Character devices:" section using the list supplied by
  * linux_get_char_devices(), followed by an empty "Block devices:"
  * section.
  */
 static int
 linprocfs_dodevices(PFS_FILL_ARGS)
 {
 	char *chardevs;
 
 	sbuf_printf(sb, "Character devices:\n");
 
 	/* the list must be handed back via linux_free_get_char_devices() */
 	chardevs = linux_get_char_devices();
 	sbuf_printf(sb, "%s", chardevs);
 	linux_free_get_char_devices(chardevs);
 
 	sbuf_printf(sb, "\nBlock devices:\n");
 
 	return (0);
 }
 
 /*
  * Filler function for proc/cmdline
  *
  * Fakes a Linux kernel command line: the running kernel's name as
  * BOOT_IMAGE, followed by a fixed " ro root=302" tail.
  */
 static int
 linprocfs_docmdline(PFS_FILL_ARGS)
 {
 
 	/* boot image part */
 	sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname);
 
 	/* fixed options part, ends the line */
 	sbuf_printf(sb, " ro root=302\n");
 
 	return (0);
 }
 
 #if 0
 /*
  * Filler function for proc/modules
  *
  * Currently compiled out (enclosing #if 0), as is its registration in
  * linprocfs_init().  Would print name, size and reference count of
  * every entry on the linker_files list.
  */
 static int
 linprocfs_domodules(PFS_FILL_ARGS)
 {
 	struct linker_file *lf;
 
 	TAILQ_FOREACH(lf, &linker_files, link) {
 		sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename,
 		    (unsigned long)lf->size, lf->refs);
 	}
 	return (0);
 }
 #endif
 
 /*
  * Constructor
  *
  * Populates the pseudofs tree below pi->pi_root with the files, links
  * and directories linprocfs provides.  Nodes are created in the same
  * order as they are listed here; always returns 0.
  */
 static int
 linprocfs_init(PFS_INIT_ARGS)
 {
 	struct pfs_node *root;
 	struct pfs_node *netdir, *piddir, *scsidir, *sysdir, *kerndir;
 
 	root = pi->pi_root;
 
 	/* /proc/... */
 	pfs_create_file(root, "cmdline", &linprocfs_docmdline,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "devices", &linprocfs_dodevices,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "loadavg", &linprocfs_doloadavg,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "meminfo", &linprocfs_domeminfo,
 	    NULL, NULL, NULL, PFS_RD);
 #if 0
 	pfs_create_file(root, "modules", &linprocfs_domodules,
 	    NULL, NULL, NULL, PFS_RD);
 #endif
 	/* "mounts" and "mtab" share one filler */
 	pfs_create_file(root, "mounts", &linprocfs_domtab,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "mtab", &linprocfs_domtab,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_link(root, "self", &procfs_docurproc,
 	    NULL, NULL, NULL, 0);
 	pfs_create_file(root, "stat", &linprocfs_dostat,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "uptime", &linprocfs_douptime,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(root, "version", &linprocfs_doversion,
 	    NULL, NULL, NULL, PFS_RD);
 
 	/* /proc/net/... */
 	netdir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0);
 	pfs_create_file(netdir, "dev", &linprocfs_donetdev,
 	    NULL, NULL, NULL, PFS_RD);
 
 	/* /proc/<pid>/... */
 	piddir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP);
 	pfs_create_file(piddir, "cmdline", &linprocfs_doproccmdline,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_link(piddir, "cwd", &linprocfs_doproccwd,
 	    NULL, NULL, NULL, 0);
 	pfs_create_file(piddir, "environ", &linprocfs_doprocenviron,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_link(piddir, "exe", &procfs_doprocfile,
 	    NULL, &procfs_notsystem, NULL, 0);
 	pfs_create_file(piddir, "maps", &linprocfs_doprocmaps,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(piddir, "mem", &procfs_doprocmem,
 	    &procfs_attr, &procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
 	pfs_create_link(piddir, "root", &linprocfs_doprocroot,
 	    NULL, NULL, NULL, 0);
 	pfs_create_file(piddir, "stat", &linprocfs_doprocstat,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(piddir, "statm", &linprocfs_doprocstatm,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(piddir, "status", &linprocfs_doprocstatus,
 	    NULL, NULL, NULL, PFS_RD);
 
 	/* /proc/scsi/... */
 	scsidir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0);
 	pfs_create_file(scsidir, "device_info", &linprocfs_doscsidevinfo,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(scsidir, "scsi", &linprocfs_doscsiscsi,
 	    NULL, NULL, NULL, PFS_RD);
 
 	/* /proc/sys/... */
 	sysdir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0);
 	/* /proc/sys/kernel/... */
 	kerndir = pfs_create_dir(sysdir, "kernel", NULL, NULL, NULL, 0);
 	pfs_create_file(kerndir, "osrelease", &linprocfs_doosrelease,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(kerndir, "ostype", &linprocfs_doostype,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(kerndir, "version", &linprocfs_doosbuild,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(kerndir, "msgmni", &linprocfs_domsgmni,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(kerndir, "pid_max", &linprocfs_dopid_max,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(kerndir, "sem", &linprocfs_dosem,
 	    NULL, NULL, NULL, PFS_RD);
 
 	return (0);
 }
 
 /*
  * Destructor
  *
  * Counterpart of linprocfs_init().  It performs no work of its own and
  * always returns 0.
  */
 static int
 linprocfs_uninit(PFS_INIT_ARGS)
 {
 
 	/* nothing to do, pseudofs will GC */
 	return (0);
 }
 
 PSEUDOFS(linprocfs, 1);
 MODULE_DEPEND(linprocfs, linux, 1, 1, 1);
 MODULE_DEPEND(linprocfs, procfs, 1, 1, 1);
 MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1);
 MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1);
Index: head/sys/compat/linux/linux_file.c
===================================================================
--- head/sys/compat/linux/linux_file.c	(revision 175201)
+++ head/sys/compat/linux/linux_file.c	(revision 175202)
@@ -1,1353 +1,1353 @@
 /*-
  * Copyright (c) 1994-1995 S�ren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/tty.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_util.h>
 
 int
 linux_creat(struct thread *td, struct linux_creat_args *args)
 {
     char *path;
     int error;
 
     LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(creat))
 		printf(ARGS(creat, "%s, %d"), path, args->mode);
 #endif
     error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC,
 	args->mode);
     LFREEPATH(path);
     return (error);
 }
 
 
 static int
 linux_common_open(struct thread *td, char *path, int l_flags, int mode, int openat)
 {
     struct proc *p = td->td_proc;
     struct file *fp;
     int fd;
     int bsd_flags, error;
 
     bsd_flags = 0;
     switch (l_flags & LINUX_O_ACCMODE) {
     case LINUX_O_WRONLY:
 	bsd_flags |= O_WRONLY;
 	break;
     case LINUX_O_RDWR:
 	bsd_flags |= O_RDWR;
 	break;
     default:
 	bsd_flags |= O_RDONLY;
     }
     if (l_flags & LINUX_O_NDELAY)
 	bsd_flags |= O_NONBLOCK;
     if (l_flags & LINUX_O_APPEND)
 	bsd_flags |= O_APPEND;
     if (l_flags & LINUX_O_SYNC)
 	bsd_flags |= O_FSYNC;
     if (l_flags & LINUX_O_NONBLOCK)
 	bsd_flags |= O_NONBLOCK;
     if (l_flags & LINUX_FASYNC)
 	bsd_flags |= O_ASYNC;
     if (l_flags & LINUX_O_CREAT)
 	bsd_flags |= O_CREAT;
     if (l_flags & LINUX_O_TRUNC)
 	bsd_flags |= O_TRUNC;
     if (l_flags & LINUX_O_EXCL)
 	bsd_flags |= O_EXCL;
     if (l_flags & LINUX_O_NOCTTY)
 	bsd_flags |= O_NOCTTY;
     if (l_flags & LINUX_O_DIRECT)
 	bsd_flags |= O_DIRECT;
     if (l_flags & LINUX_O_NOFOLLOW)
 	bsd_flags |= O_NOFOLLOW;
     /* XXX LINUX_O_NOATIME: unable to be easily implemented. */
 
     error = kern_open(td, path, UIO_SYSSPACE, bsd_flags, mode);
     if (!error) {
 	    fd = td->td_retval[0];
 	    /*
 	     * XXX In between kern_open() and fget(), another process
 	     * having the same filedesc could use that fd without
 	     * checking below.
 	     */
 	    error = fget(td, fd, &fp);
 	    if (!error) {
 		    sx_slock(&proctree_lock);
 		    PROC_LOCK(p);
 		    if (!(bsd_flags & O_NOCTTY) &&
 			SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
 			    PROC_UNLOCK(p);
 			    sx_unlock(&proctree_lock);
 			    if (fp->f_type == DTYPE_VNODE)
 				    (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
 					     td->td_ucred, td);
 		    } else {
 			    PROC_UNLOCK(p);
 			    sx_sunlock(&proctree_lock);
 		    }
 		    if (l_flags & LINUX_O_DIRECTORY) {
 			    if (fp->f_type != DTYPE_VNODE ||
 				fp->f_vnode->v_type != VDIR) {
 				    error = ENOTDIR;
 			    }
 		    }
 		    fdrop(fp, td);
 		    /*
 		     * XXX as above, fdrop()/kern_close() pair is racy.
 		     */
 		    if (error)
 			    kern_close(td, fd);
 	    }
     }
 
 #ifdef DEBUG
     if (ldebug(open))
 	    printf(LMSG("open returns error %d"), error);
 #endif
     if (!openat)
 	LFREEPATH(path);
     return error;
 }
 
 /*
  * common code for linux *at set of syscalls
  *
  * works like this:
  * if filename is absolute 
  *    ignore dirfd
  * else
  *    if dirfd == AT_FDCWD 
  *       return CWD/filename
  *    else
  *       return DIRFD/filename
  */
 static int
 linux_at(struct thread *td, int dirfd, char *filename, char **newpath, char **freebuf)
 {
    	struct file *fp;
 	int error = 0, vfslocked;
 	struct vnode *dvp;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	char *fullpath = "unknown";
 	char *freepath = NULL;
 
 	/* don't do anything if the pathname is absolute */
 	if (*filename == '/') {
 	   	*newpath= filename;
 	   	return (0);
 	}
 
 	/* check for AT_FDWCD */
 	if (dirfd == LINUX_AT_FDCWD) {
 	   	FILEDESC_SLOCK(fdp);
 		dvp = fdp->fd_cdir;
 		vref(dvp);
 	   	FILEDESC_SUNLOCK(fdp);
 	} else {
 	   	error = fget(td, dirfd, &fp);
 		if (error)
 		   	return (error);
 		dvp = fp->f_vnode;
 		/* only a dir can be dfd */
 		if (dvp->v_type != VDIR) {
 		   	fdrop(fp, td);
 			return (ENOTDIR);
 		}
 		vref(dvp);
 		fdrop(fp, td);
 	}
 
 	/*
 	 * XXXRW: This is bogus, as vn_fullpath() returns only an advisory
 	 * file path, and may fail in several common situations, including
 	 * for file systmes that don't use the name cache, and if the entry
 	 * for the file falls out of the name cache.  We should implement
 	 * openat() in the FreeBSD native system call layer properly (using a
 	 * requested starting directory), and have Linux and other ABIs wrap
 	 * the native implementation.
 	 */
 	error = vn_fullpath(td, dvp, &fullpath, &freepath);
 	if (!error) {
 	   	*newpath = malloc(strlen(fullpath) + strlen(filename) + 2, M_TEMP, M_WAITOK | M_ZERO);
 		*freebuf = freepath;
 		sprintf(*newpath, "%s/%s", fullpath, filename);
 	} else {
 		*newpath = NULL;
 	}
 	vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
 	vrele(dvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 linux_openat(struct thread *td, struct linux_openat_args *args)
 {
 	char *newpath, *oldpath, *freebuf, *path;
 	int error;
 
 	oldpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = copyinstr(args->filename, oldpath, MAXPATHLEN, NULL);
 	if (error) {
 		free(oldpath, M_TEMP);
 		return (error);
 	}
 #ifdef DEBUG
 	if (ldebug(openat))
 		printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd,
 		    oldpath, args->flags, args->mode);
 #endif
 	newpath = freebuf = NULL;
 	error = linux_at(td, args->dfd, oldpath, &newpath, &freebuf);
 	if (error == 0) {
 #ifdef DEBUG
 		if (ldebug(openat))
 			printf(LMSG("newpath: %s"), newpath);
 #endif
 		if (args->flags & LINUX_O_CREAT)
 			LCONVPATH_SEG(td, newpath, &path, 1, UIO_SYSSPACE);
 		else
 			LCONVPATH_SEG(td, newpath, &path, 0, UIO_SYSSPACE);
 	}
 	if (freebuf)
 	   	free(freebuf, M_TEMP);
 	if (*oldpath != '/')
    	   	free(newpath, M_TEMP);
 	if (error == 0) {
 		error = linux_common_open(td, path, args->flags,
 		    args->mode, 1);
 		LFREEPATH(path);
 	}
 	free(oldpath, M_TEMP);
 	return (error);
 }
 
 /*
  * open(2) for Linux binaries.  The path is converted with the "create"
  * variant when LINUX_O_CREAT is requested and with the "exist" variant
  * otherwise; the actual work is done by linux_common_open().
  */
 int
 linux_open(struct thread *td, struct linux_open_args *args)
 {
 	char *path;
 
 	if ((args->flags & LINUX_O_CREAT) == 0)
 		LCONVPATHEXIST(td, args->path, &path);
 	else
 		LCONVPATHCREAT(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(open))
 		printf(ARGS(open, "%s, 0x%x, 0x%x"),
 		    path, args->flags, args->mode);
 #endif
 
 	/* openat == 0, so linux_common_open() releases path. */
 	return (linux_common_open(td, path, args->flags, args->mode, 0));
 }
 
 int
 linux_lseek(struct thread *td, struct linux_lseek_args *args)
 {
 
     struct lseek_args /* {
 	int fd;
 	int pad;
 	off_t offset;
 	int whence;
     } */ tmp_args;
     int error;
 
 #ifdef DEBUG
 	if (ldebug(lseek))
 		printf(ARGS(lseek, "%d, %ld, %d"),
 		    args->fdes, (long)args->off, args->whence);
 #endif
     tmp_args.fd = args->fdes;
     tmp_args.offset = (off_t)args->off;
     tmp_args.whence = args->whence;
     error = lseek(td, &tmp_args);
     return error;
 }
 
 int
 linux_llseek(struct thread *td, struct linux_llseek_args *args)
 {
 	struct lseek_args bsd_args;
 	int error;
 	off_t off;
 
 #ifdef DEBUG
 	if (ldebug(llseek))
 		printf(ARGS(llseek, "%d, %d:%d, %d"),
 		    args->fd, args->ohigh, args->olow, args->whence);
 #endif
 	off = (args->olow) | (((off_t) args->ohigh) << 32);
 
 	bsd_args.fd = args->fd;
 	bsd_args.offset = off;
 	bsd_args.whence = args->whence;
 
 	if ((error = lseek(td, &bsd_args)))
 		return error;
 
 	if ((error = copyout(td->td_retval, args->res, sizeof (off_t))))
 		return error;
 
 	td->td_retval[0] = 0;
 	return 0;
 }
 
 int
 linux_readdir(struct thread *td, struct linux_readdir_args *args)
 {
 	struct linux_getdents_args lda;
 
 	lda.fd = args->fd;
 	lda.dent = args->dent;
 	lda.count = 1;
 	return linux_getdents(td, &lda);
 }
 
 /*
  * Note that linux_getdents(2) and linux_getdents64(2) have the same
  * arguments. They only differ in the definition of struct dirent they
  * operate on. We use this to common the code, with the exception of
  * accessing struct dirent. Note that linux_readdir(2) is implemented
  * by means of linux_getdents(2). In this case we never operate on
  * struct dirent64 and thus don't need to handle it...
  */
 
 /*
  * Linux directory record filled in by getdents_common() for
  * getdents(2) and readdir(2).
  */
 struct l_dirent {
 	l_long		d_ino;		/* taken from the native d_fileno */
 	l_off_t		d_off;		/* next cookie/offset; record length for readdir */
 	l_ushort	d_reclen;	/* record length; name length for readdir */
 	char		d_name[LINUX_NAME_MAX + 1];	/* NUL-terminated name */
 };
 
 /*
  * Linux directory record filled in by getdents_common() for
  * getdents64(2); unlike struct l_dirent it also carries the entry type.
  */
 struct l_dirent64 {
 	uint64_t	d_ino;		/* taken from the native d_fileno */
 	int64_t		d_off;		/* next cookie/offset */
 	l_ushort	d_reclen;	/* length of this record */
 	u_char		d_type;		/* copied from the native d_type */
 	char		d_name[LINUX_NAME_MAX + 1];	/* NUL-terminated name */
 };
 
 /*
  * Size of the Linux directory record `de' when it holds a name of
  * `namlen' bytes: the offset of d_name within the record plus the name
  * and its terminating NUL, rounded up by ALIGN().
  */
 #define LINUX_RECLEN(de,namlen) \
     ALIGN((((char *)&(de)->d_name - (char *)(de)) + (namlen) + 1))
 
 /* Lower bound for the size of the buffer used to read native entries. */
 #define	LINUX_DIRBLKSIZ		512
 
 /*
  * Common implementation of linux_getdents() and linux_getdents64(), and
  * through linux_getdents() also of linux_readdir().
  *
  * Reads native directory entries from the directory open on args->fd,
  * converts each one to a struct l_dirent (or a struct l_dirent64 when
  * is64bit is non-zero) and copies it out to args->dirent.  A count of 1
  * selects the readdir(2) case, in which a single struct l_dirent is
  * produced.
  *
  * Returns 0 with the number of bytes produced in td->td_retval[0], or
  * an errno value.
  */
 static int
 getdents_common(struct thread *td, struct linux_getdents64_args *args,
     int is64bit)
 {
 	struct dirent *bdp;
 	struct vnode *vp;
 	caddr_t inp, buf;		/* BSD-format */
 	int len, reclen;		/* BSD-format */
 	caddr_t outp;			/* Linux-format */
 	int resid, linuxreclen=0;	/* Linux-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	off_t off;
 	struct l_dirent linux_dirent;
 	struct l_dirent64 linux_dirent64;
 	int buflen, error, eofflag, nbytes, justone;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies, vfslocked;
 
 	nbytes = args->count;
 	if (nbytes == 1) {
 		/* readdir(2) case. Always struct dirent. */
 		if (is64bit)
 			return (EINVAL);
 		nbytes = sizeof(linux_dirent);
 		justone = 1;
 	} else
 		justone = 0;
 
 	if ((error = getvnode(td->td_proc->p_fd, args->fd, &fp)) != 0)
 		return (error);
 
 	/* The descriptor must have been opened for reading. */
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	off = fp->f_offset;
 
 	/* Read buffer: at least LINUX_DIRBLKSIZ, at most MAXBSIZE bytes. */
 	buflen = max(LINUX_DIRBLKSIZ, nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 again:
 	/* Set up a kernel-space read of one buffer starting at `off'. */
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	/* Discard the cookies of a previous pass before reading again. */
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	/*
 	 * Do directory search MAC check using non-cached credentials.
 	 */
 	if ((error = mac_vnode_check_readdir(td->td_ucred, vp)))
 		goto out;
 #endif /* MAC */
 	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
 		 &cookies)))
 		goto out;
 
 	inp = buf;
 	outp = (caddr_t)args->dirent;
 	resid = nbytes;
 	/* Nothing was read: report what has been produced (nothing) so far. */
 	if ((len = buflen - auio.uio_resid) <= 0)
 		goto eof;
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			bdp = (struct dirent *) inp;
 			len -= bdp->d_reclen;
 			inp += bdp->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	/* Convert the native entries one by one. */
 	while (len > 0) {
 		if (cookiep && ncookies == 0)
 			break;
 		bdp = (struct dirent *) inp;
 		reclen = bdp->d_reclen;
 		/* Native record lengths must be a multiple of four. */
 		if (reclen & 3) {
 			error = EFAULT;
 			goto out;
 		}
 
 		/* Entries with a zero file number are skipped, not copied. */
 		if (bdp->d_fileno == 0) {
 			inp += reclen;
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 
 			len -= reclen;
 			continue;
 		}
 
 		linuxreclen = (is64bit)
 		    ? LINUX_RECLEN(&linux_dirent64, bdp->d_namlen)
 		    : LINUX_RECLEN(&linux_dirent, bdp->d_namlen);
 
 		/*
 		 * Stop when the native record is truncated or the Linux
 		 * record no longer fits.  NOTE(review): the outp++ appears
 		 * to be there so that the "nothing copied" test after the
 		 * loop does not trigger another read -- confirm.
 		 */
 		if (reclen > len || resid < linuxreclen) {
 			outp++;
 			break;
 		}
 
 		if (justone) {
 			/* readdir(2) case. */
 			/* Here d_off holds the record length, d_reclen the name length. */
 			linux_dirent.d_ino = (l_long)bdp->d_fileno;
 			linux_dirent.d_off = (l_off_t)linuxreclen;
 			linux_dirent.d_reclen = (l_ushort)bdp->d_namlen;
 			strcpy(linux_dirent.d_name, bdp->d_name);
 			error = copyout(&linux_dirent, outp, linuxreclen);
 		} else {
 			/* d_off is the next cookie, or the running offset without cookies. */
 			if (is64bit) {
 				linux_dirent64.d_ino = bdp->d_fileno;
 				linux_dirent64.d_off = (cookiep)
 				    ? (l_off_t)*cookiep
 				    : (l_off_t)(off + reclen);
 				linux_dirent64.d_reclen =
 				    (l_ushort)linuxreclen;
 				linux_dirent64.d_type = bdp->d_type;
 				strcpy(linux_dirent64.d_name, bdp->d_name);
 				error = copyout(&linux_dirent64, outp,
 				    linuxreclen);
 			} else {
 				linux_dirent.d_ino = bdp->d_fileno;
 				linux_dirent.d_off = (cookiep)
 				    ? (l_off_t)*cookiep
 				    : (l_off_t)(off + reclen);
 				linux_dirent.d_reclen = (l_ushort)linuxreclen;
 				strcpy(linux_dirent.d_name, bdp->d_name);
 				error = copyout(&linux_dirent, outp,
 				    linuxreclen);
 			}
 		}
 		if (error)
 			goto out;
 
 		/* Advance past the entry on both the native and Linux side. */
 		inp += reclen;
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 
 		outp += linuxreclen;
 		resid -= linuxreclen;
 		len -= reclen;
 		if (justone)
 			break;
 	}
 
 	/* outp never moved: read again from the updated offset. */
 	if (outp == (caddr_t)args->dirent)
 		goto again;
 
 	fp->f_offset = off;
 	/* readdir: make the result below equal the one record's length. */
 	if (justone)
 		nbytes = resid + linuxreclen;
 
 eof:
 	td->td_retval[0] = nbytes - resid;
 
 out:
 	/* Common exit: release cookies, vnode lock, file and buffer. */
 	if (cookies)
 		free(cookies, M_TEMP);
 
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * getdents(2) for Linux binaries; uses the struct l_dirent flavour of
  * getdents_common().
  */
 int
 linux_getdents(struct thread *td, struct linux_getdents_args *args)
 {
 	struct linux_getdents64_args *args64;
 
 #ifdef DEBUG
 	if (ldebug(getdents))
 		printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count);
 #endif
 
 	/* Both syscalls take the same arguments, so the cast is enough. */
 	args64 = (struct linux_getdents64_args*)args;
 	return (getdents_common(td, args64, 0));
 }
 
 /*
  * getdents64(2) for Linux binaries; uses the struct l_dirent64 flavour
  * of getdents_common().
  */
 int
 linux_getdents64(struct thread *td, struct linux_getdents64_args *args)
 {
 	const int is64bit = 1;
 
 #ifdef DEBUG
 	if (ldebug(getdents64))
 		printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count);
 #endif
 
 	return (getdents_common(td, args, is64bit));
 }
 
 /*
  * These exist mainly for hooks for doing /compat/linux translation.
  */
 
 /*
  * access(2) for Linux binaries.  Flags outside F_OK/X_OK/W_OK/R_OK are
  * rejected with EINVAL before the path is even looked at.
  */
 int
 linux_access(struct thread *td, struct linux_access_args *args)
 {
 	char *path;
 	int error;
 
 	/* linux convention */
 	if ((args->flags & ~(F_OK | X_OK | W_OK | R_OK)) != 0)
 		return (EINVAL);
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(access))
 		printf(ARGS(access, "%s, %d"), path, args->flags);
 #endif
 	error = kern_access(td, path, UIO_SYSSPACE, args->flags);
 	LFREEPATH(path);
 	return (error);
 }
 
 int
 linux_unlink(struct thread *td, struct linux_unlink_args *args)
 {
 	char *path;
 	int error;
 	struct stat st;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(unlink))
 		printf(ARGS(unlink, "%s"), path);
 #endif
 
 	error = kern_unlink(td, path, UIO_SYSSPACE);
 	if (error == EPERM)
 		/* Introduce POSIX noncompliant behaviour of Linux */
 		if (kern_stat(td, path, UIO_SYSSPACE, &st) == 0)
 			if (S_ISDIR(st.st_mode))
 				error = EISDIR;
 	LFREEPATH(path);
 	return (error);
 }
 
 /*
  * chdir(2) for Linux binaries: convert the path and hand it to
  * kern_chdir().
  */
 int
 linux_chdir(struct thread *td, struct linux_chdir_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(chdir))
 		printf(ARGS(chdir, "%s"), path);
 #endif
 	error = kern_chdir(td, path, UIO_SYSSPACE);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 /*
  * chmod(2) for Linux binaries: convert the path and hand it, together
  * with the requested mode, to kern_chmod().
  */
 int
 linux_chmod(struct thread *td, struct linux_chmod_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(chmod))
 		printf(ARGS(chmod, "%s, %d"), path, args->mode);
 #endif
 	error = kern_chmod(td, path, UIO_SYSSPACE, args->mode);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 /*
  * mkdir(2) for Linux binaries.  The path is converted with the
  * "create" variant, then passed to kern_mkdir().
  */
 int
 linux_mkdir(struct thread *td, struct linux_mkdir_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHCREAT(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(mkdir))
 		printf(ARGS(mkdir, "%s, %d"), path, args->mode);
 #endif
 	error = kern_mkdir(td, path, UIO_SYSSPACE, args->mode);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 /*
  * rmdir(2) for Linux binaries: convert the path and hand it to
  * kern_rmdir().
  */
 int
 linux_rmdir(struct thread *td, struct linux_rmdir_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(rmdir))
 		printf(ARGS(rmdir, "%s"), path);
 #endif
 	error = kern_rmdir(td, path, UIO_SYSSPACE);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 int
 linux_rename(struct thread *td, struct linux_rename_args *args)
 {
 	char *from, *to;
 	int error;
 
 	LCONVPATHEXIST(td, args->from, &from);
 	/* Expand LCONVPATHCREATE so that `from' can be freed on errors */
 	error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1);
 	if (to == NULL) {
 		LFREEPATH(from);
 		return (error);
 	}
 
 #ifdef DEBUG
 	if (ldebug(rename))
 		printf(ARGS(rename, "%s, %s"), from, to);
 #endif
 	error = kern_rename(td, from, to, UIO_SYSSPACE);
 	LFREEPATH(from);
 	LFREEPATH(to);
 	return (error);
 }
 
 int
 linux_symlink(struct thread *td, struct linux_symlink_args *args)
 {
 	char *path, *to;
 	int error;
 
 	LCONVPATHEXIST(td, args->path, &path);
 	/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
 	error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1);
 	if (to == NULL) {
 		LFREEPATH(path);
 		return (error);
 	}
 
 #ifdef DEBUG
 	if (ldebug(symlink))
 		printf(ARGS(symlink, "%s, %s"), path, to);
 #endif
 	error = kern_symlink(td, path, to, UIO_SYSSPACE);
 	LFREEPATH(path);
 	LFREEPATH(to);
 	return (error);
 }
 
 int
 linux_readlink(struct thread *td, struct linux_readlink_args *args)
 {
 	char *name;
 	int error;
 
 	LCONVPATHEXIST(td, args->name, &name);
 
 #ifdef DEBUG
 	if (ldebug(readlink))
 		printf(ARGS(readlink, "%s, %p, %d"), name, (void *)args->buf,
 		    args->count);
 #endif
 	error = kern_readlink(td, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE,
 	    args->count);
 	LFREEPATH(name);
 	return (error);
 }
 
 /*
  * truncate(2) for Linux binaries: convert the path and hand it, with
  * the requested length, to kern_truncate().
  */
 int
 linux_truncate(struct thread *td, struct linux_truncate_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(truncate))
 		printf(ARGS(truncate, "%s, %ld"), path, (long)args->length);
 #endif
 
 	error = kern_truncate(td, path, UIO_SYSSPACE, args->length);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 int
 linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args)
 {
 	struct ftruncate_args /* {
 		int fd;
 		int pad;
 		off_t length;
 		} */ nuap;
 	   
 	nuap.fd = args->fd;
 	nuap.length = args->length;
 	return (ftruncate(td, &nuap));
 }
 
 int
 linux_link(struct thread *td, struct linux_link_args *args)
 {
 	char *path, *to;
 	int error;
 
 	LCONVPATHEXIST(td, args->path, &path);
 	/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
 	error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1);
 	if (to == NULL) {
 		LFREEPATH(path);
 		return (error);
 	}
 
 #ifdef DEBUG
 	if (ldebug(link))
 		printf(ARGS(link, "%s, %s"), path, to);
 #endif
 	error = kern_link(td, path, to, UIO_SYSSPACE);
 	LFREEPATH(path);
 	LFREEPATH(to);
 	return (error);
 }
 
 /*
  * fdatasync(2) for Linux binaries, implemented by forwarding the
  * descriptor to the native fsync().
  */
 int
 linux_fdatasync(struct thread *td, struct linux_fdatasync_args *uap)
 {
 	struct fsync_args bsd;
 
 	bsd.fd = uap->fd;
 	return (fsync(td, &bsd));
 }
 
 int
 linux_pread(td, uap)
 	struct thread *td;
 	struct linux_pread_args *uap;
 {
 	struct pread_args bsd;
 	struct vnode *vp;
 	int error;
 
 	bsd.fd = uap->fd;
 	bsd.buf = uap->buf;
 	bsd.nbyte = uap->nbyte;
 	bsd.offset = uap->offset;
 
 	error = pread(td, &bsd);
 
 	if (error == 0) {
    	   	/* This seems to violate POSIX but linux does it */
    	   	if ((error = fgetvp(td, uap->fd, &vp)) != 0)
    		   	return (error);
 		if (vp->v_type == VDIR) {
    		   	vrele(vp);
 			return (EISDIR);
 		}
 		vrele(vp);
 	}
 
 	return (error);
 }
 
 /*
  * pwrite(2) for Linux binaries: repack the arguments into a native
  * struct pwrite_args and forward to pwrite().
  */
 int
 linux_pwrite(struct thread *td, struct linux_pwrite_args *uap)
 {
 	struct pwrite_args bsd;
 
 	bsd.fd = uap->fd;
 	bsd.buf = uap->buf;
 	bsd.nbyte = uap->nbyte;
 	bsd.offset = uap->offset;
 	return (pwrite(td, &bsd));
 }
 
 /*
  * mount(2) for Linux binaries.
  *
  * Only the "ext2" and "proc" file system types are recognised (mapped
  * to "ext2fs" and "linprocfs"); anything else yields ENODEV.  Of those,
  * only linprocfs is actually mounted, through kernel_vmount(); an ext2
  * request ends in EOPNOTSUPP.
  */
 int
 linux_mount(struct thread *td, struct linux_mount_args *args)
 {
 	struct ufs_args ufs;
 	char fstypename[MFSNAMELEN];
 	char mntonname[MNAMELEN], mntfromname[MNAMELEN];
 	int error;
 	int fsflags;
 	void *fsdata;
 
 	/* Fetch the three strings from user space. */
 	error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1,
 	    NULL);
 	if (error != 0)
 		return (error);
 	error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL);
 	if (error != 0)
 		return (error);
 	error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL);
 	if (error != 0)
 		return (error);
 
 #ifdef DEBUG
 	if (ldebug(mount))
 		printf(ARGS(mount, "%s, %s, %s"),
 		    fstypename, mntfromname, mntonname);
 #endif
 
 	/* Map the Linux file system name onto the native one. */
 	if (strcmp(fstypename, "ext2") == 0) {
 		strcpy(fstypename, "ext2fs");
 		fsdata = &ufs;
 		ufs.fspec = mntfromname;
 #define DEFAULT_ROOTID		-2
 		ufs.export.ex_root = DEFAULT_ROOTID;
 		ufs.export.ex_flags =
 		    args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0;
 	} else if (strcmp(fstypename, "proc") == 0) {
 		strcpy(fstypename, "linprocfs");
 		fsdata = NULL;
 	} else {
 		return (ENODEV);
 	}
 
 	/* The flag bits are honoured only with 0xc0ed in the upper half. */
 	fsflags = 0;
 	if ((args->rwflag & 0xffff0000) == 0xc0ed0000) {
 		/*
 		 * Linux SYNC flag is not included; the closest equivalent
 		 * FreeBSD has is !ASYNC, which is our default.
 		 */
 		if (args->rwflag & LINUX_MS_RDONLY)
 			fsflags |= MNT_RDONLY;
 		if (args->rwflag & LINUX_MS_NOSUID)
 			fsflags |= MNT_NOSUID;
 		if (args->rwflag & LINUX_MS_NOEXEC)
 			fsflags |= MNT_NOEXEC;
 		if (args->rwflag & LINUX_MS_REMOUNT)
 			fsflags |= MNT_UPDATE;
 	}
 
 	if (strcmp(fstypename, "linprocfs") != 0)
 		return (EOPNOTSUPP);
 
 	error = kernel_vmount(fsflags,
 		"fstype", fstypename,
 		"fspath", mntonname,
 		NULL);
 	return (error);
 }
 
 int
 linux_oldumount(struct thread *td, struct linux_oldumount_args *args)
 {
 	struct linux_umount_args args2;
 
 	args2.path = args->path;
 	args2.flags = 0;
 	return (linux_umount(td, &args2));
 }
 
 int
 linux_umount(struct thread *td, struct linux_umount_args *args)
 {
 	struct unmount_args bsd;
 
 	bsd.path = args->path;
 	bsd.flags = args->flags;	/* XXX correct? */
 	return (unmount(td, &bsd));
 }
 
 /*
  * fcntl family of syscalls
  */
 
 /*
  * Linux-side record lock description, copied in and out by
  * fcntl_common() for LINUX_F_GETLK, LINUX_F_SETLK and LINUX_F_SETLKW.
  * The structure is packed when built for 32-bit Linux emulation on
  * amd64.
  */
 struct l_flock {
 	l_short		l_type;		/* LINUX_F_RDLCK, LINUX_F_WRLCK or LINUX_F_UNLCK */
 	l_short		l_whence;	/* passed through to the native l_whence */
 	l_off_t		l_start;	/* passed through to the native l_start */
 	l_off_t		l_len;		/* passed through to the native l_len */
 	l_pid_t		l_pid;		/* passed through to the native l_pid */
 }
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 __packed
 #endif
 ;
 
 /*
  * Convert a Linux struct l_flock into a native struct flock.  An
  * unrecognised lock type is translated to -1.
  */
 static void
 linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock)
 {
 	/* Everything except the type is carried over with a cast at most. */
 	bsd_flock->l_whence = linux_flock->l_whence;
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
 
 	if (linux_flock->l_type == LINUX_F_RDLCK)
 		bsd_flock->l_type = F_RDLCK;
 	else if (linux_flock->l_type == LINUX_F_WRLCK)
 		bsd_flock->l_type = F_WRLCK;
 	else if (linux_flock->l_type == LINUX_F_UNLCK)
 		bsd_flock->l_type = F_UNLCK;
 	else
 		bsd_flock->l_type = -1;
 }
 
 /*
  * Convert a native struct flock into a Linux struct l_flock.  When the
  * native lock type is none of F_RDLCK, F_WRLCK or F_UNLCK the Linux
  * l_type is left as it was.
  */
 static void
 bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock)
 {
 	if (bsd_flock->l_type == F_RDLCK)
 		linux_flock->l_type = LINUX_F_RDLCK;
 	else if (bsd_flock->l_type == F_WRLCK)
 		linux_flock->l_type = LINUX_F_WRLCK;
 	else if (bsd_flock->l_type == F_UNLCK)
 		linux_flock->l_type = LINUX_F_UNLCK;
 
 	/* The remaining fields are carried over with a cast at most. */
 	linux_flock->l_whence = bsd_flock->l_whence;
 	linux_flock->l_start = (l_off_t)bsd_flock->l_start;
 	linux_flock->l_len = (l_off_t)bsd_flock->l_len;
 	linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Linux-side record lock description with l_loff_t offsets, copied in
  * and out by linux_fcntl64() for the LINUX_F_*LK64 commands.  The
  * structure is packed when built for 32-bit Linux emulation on amd64.
  */
 struct l_flock64 {
 	l_short		l_type;		/* LINUX_F_RDLCK, LINUX_F_WRLCK or LINUX_F_UNLCK */
 	l_short		l_whence;	/* passed through to the native l_whence */
 	l_loff_t	l_start;	/* passed through to the native l_start */
 	l_loff_t	l_len;		/* passed through to the native l_len */
 	l_pid_t		l_pid;		/* passed through to the native l_pid */
 }
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 __packed
 #endif
 ;
 
 /*
  * Convert a Linux struct l_flock64 into a native struct flock.  An
  * unrecognised lock type is translated to -1.
  */
 static void
 linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock)
 {
 	/* Everything except the type is carried over with a cast at most. */
 	bsd_flock->l_whence = linux_flock->l_whence;
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
 
 	if (linux_flock->l_type == LINUX_F_RDLCK)
 		bsd_flock->l_type = F_RDLCK;
 	else if (linux_flock->l_type == LINUX_F_WRLCK)
 		bsd_flock->l_type = F_WRLCK;
 	else if (linux_flock->l_type == LINUX_F_UNLCK)
 		bsd_flock->l_type = F_UNLCK;
 	else
 		bsd_flock->l_type = -1;
 }
 
 /*
  * Convert a native struct flock into a Linux struct l_flock64.  When
  * the native lock type is none of F_RDLCK, F_WRLCK or F_UNLCK the Linux
  * l_type is left as it was.
  */
 static void
 bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock)
 {
 	if (bsd_flock->l_type == F_RDLCK)
 		linux_flock->l_type = LINUX_F_RDLCK;
 	else if (bsd_flock->l_type == F_WRLCK)
 		linux_flock->l_type = LINUX_F_WRLCK;
 	else if (bsd_flock->l_type == F_UNLCK)
 		linux_flock->l_type = LINUX_F_UNLCK;
 
 	/* The remaining fields are carried over with a cast at most. */
 	linux_flock->l_whence = bsd_flock->l_whence;
 	linux_flock->l_start = (l_loff_t)bsd_flock->l_start;
 	linux_flock->l_len = (l_loff_t)bsd_flock->l_len;
 	linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 static int
 fcntl_common(struct thread *td, struct linux_fcntl64_args *args)
 {
 	struct l_flock linux_flock;
 	struct flock bsd_flock;
 	struct file *fp;
 	long arg;
 	int error, result;
 
 	switch (args->cmd) {
 	case LINUX_F_DUPFD:
 		return (kern_fcntl(td, args->fd, F_DUPFD, args->arg));
 
 	case LINUX_F_GETFD:
 		return (kern_fcntl(td, args->fd, F_GETFD, 0));
 
 	case LINUX_F_SETFD:
 		return (kern_fcntl(td, args->fd, F_SETFD, args->arg));
 
 	case LINUX_F_GETFL:
 		error = kern_fcntl(td, args->fd, F_GETFL, 0);
 		result = td->td_retval[0];
 		td->td_retval[0] = 0;
 		if (result & O_RDONLY)
 			td->td_retval[0] |= LINUX_O_RDONLY;
 		if (result & O_WRONLY)
 			td->td_retval[0] |= LINUX_O_WRONLY;
 		if (result & O_RDWR)
 			td->td_retval[0] |= LINUX_O_RDWR;
 		if (result & O_NDELAY)
 			td->td_retval[0] |= LINUX_O_NONBLOCK;
 		if (result & O_APPEND)
 			td->td_retval[0] |= LINUX_O_APPEND;
 		if (result & O_FSYNC)
 			td->td_retval[0] |= LINUX_O_SYNC;
 		if (result & O_ASYNC)
 			td->td_retval[0] |= LINUX_FASYNC;
 #ifdef LINUX_O_NOFOLLOW
 		if (result & O_NOFOLLOW)
 			td->td_retval[0] |= LINUX_O_NOFOLLOW;
 #endif
 #ifdef LINUX_O_DIRECT
 		if (result & O_DIRECT)
 			td->td_retval[0] |= LINUX_O_DIRECT;
 #endif
 		return (error);
 
 	case LINUX_F_SETFL:
 		arg = 0;
 		if (args->arg & LINUX_O_NDELAY)
 			arg |= O_NONBLOCK;
 		if (args->arg & LINUX_O_APPEND)
 			arg |= O_APPEND;
 		if (args->arg & LINUX_O_SYNC)
 			arg |= O_FSYNC;
 		if (args->arg & LINUX_FASYNC)
 			arg |= O_ASYNC;
 #ifdef LINUX_O_NOFOLLOW
 		if (args->arg & LINUX_O_NOFOLLOW)
 			arg |= O_NOFOLLOW;
 #endif
 #ifdef LINUX_O_DIRECT
 		if (args->arg & LINUX_O_DIRECT)
 			arg |= O_DIRECT;
 #endif
 		return (kern_fcntl(td, args->fd, F_SETFL, arg));
 
 	case LINUX_F_GETLK:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
 		if (error)
 			return (error);
 		bsd_to_linux_flock(&bsd_flock, &linux_flock);
 		return (copyout(&linux_flock, (void *)args->arg,
 		    sizeof(linux_flock)));
 
 	case LINUX_F_SETLK:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLK,
 		    (intptr_t)&bsd_flock));
 
 	case LINUX_F_SETLKW:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLKW,
 		     (intptr_t)&bsd_flock));
 
 	case LINUX_F_GETOWN:
 		return (kern_fcntl(td, args->fd, F_GETOWN, 0));
 
 	case LINUX_F_SETOWN:
 		/*
 		 * XXX some Linux applications depend on F_SETOWN having no
 		 * significant effect for pipes (SIGIO is not delivered for
 		 * pipes under Linux-2.2.35 at least).
 		 */
 		error = fget(td, args->fd, &fp);
 		if (error)
 			return (error);
 		if (fp->f_type == DTYPE_PIPE) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		fdrop(fp, td);
 
 		return (kern_fcntl(td, args->fd, F_SETOWN, args->arg));
 	}
 
 	return (EINVAL);
 }
 
 int
 linux_fcntl(struct thread *td, struct linux_fcntl_args *args)
 {
 	struct linux_fcntl64_args args64;
 
 #ifdef DEBUG
 	if (ldebug(fcntl))
 		printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd);
 #endif
 
 	args64.fd = args->fd;
 	args64.cmd = args->cmd;
 	args64.arg = args->arg;
 	return (fcntl_common(td, &args64));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args)
 {
 	struct l_flock64 linux_flock;
 	struct flock bsd_flock;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(fcntl64))
 		printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd);
 #endif
 
 	switch (args->cmd) {
 	case LINUX_F_GETLK64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
 		if (error)
 			return (error);
 		bsd_to_linux_flock64(&bsd_flock, &linux_flock);
 		return (copyout(&linux_flock, (void *)args->arg,
 			    sizeof(linux_flock)));
 
 	case LINUX_F_SETLK64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLK,
 		    (intptr_t)&bsd_flock));
 
 	case LINUX_F_SETLKW64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLKW,
 		    (intptr_t)&bsd_flock));
 	}
 
 	return (fcntl_common(td, args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * chown(2) for Linux binaries: convert the path and hand it, with the
  * requested uid and gid, to kern_chown().
  */
 int
 linux_chown(struct thread *td, struct linux_chown_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(chown))
 		printf(ARGS(chown, "%s, %d, %d"), path, args->uid, args->gid);
 #endif
 	error = kern_chown(td, path, UIO_SYSSPACE, args->uid, args->gid);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
 
 /*
  * lchown(2) for Linux binaries: convert the path and hand it, with the
  * requested uid and gid, to kern_lchown().
  */
 int
 linux_lchown(struct thread *td, struct linux_lchown_args *args)
 {
 	int error;
 	char *path;
 
 	LCONVPATHEXIST(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(lchown))
 		printf(ARGS(lchown, "%s, %d, %d"), path, args->uid, args->gid);
 #endif
 	error = kern_lchown(td, path, UIO_SYSSPACE, args->uid, args->gid);
 	/* The converted path is ours to release. */
 	LFREEPATH(path);
 
 	return (error);
 }
Index: head/sys/compat/linux/linux_getcwd.c
===================================================================
--- head/sys/compat/linux/linux_getcwd.c	(revision 175201)
+++ head/sys/compat/linux/linux_getcwd.c	(revision 175202)
@@ -1,477 +1,477 @@
 /* $OpenBSD: linux_getcwd.c,v 1.2 2001/05/16 12:50:21 ho Exp $ */
 /* $NetBSD: vfs_getcwd.c,v 1.3.2.3 1999/07/11 10:24:09 sommerfeld Exp $ */
 /*-
  * Copyright (c) 1999 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Bill Sommerfeld.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *        This product includes software developed by the NetBSD
  *        Foundation, Inc. and its contributors.
  * 4. Neither the name of The NetBSD Foundation nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <ufs/ufs/dir.h>	/* XXX only for DIRBLKSIZ */
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_util.h>
 
 #include <security/mac/mac_framework.h>
 
 static int
 linux_getcwd_scandir(struct vnode **, struct vnode **,
     char **, char *, struct thread *);
 static int
 linux_getcwd_common(struct vnode *, struct vnode *,
 		   char **, char *, int, int, struct thread *);
 
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
 /*
  * Vnode variable naming conventions in this file:
  *
  * rvp: the current root we're aiming towards.
  * lvp, *lvpp: the "lower" vnode
  * uvp, *uvpp: the "upper" vnode.
  *
  * Since all the vnodes we're dealing with are directories, and the
  * lookups are going *up* in the filesystem rather than *down*, the
  * usual "pvp" (parent) or "dvp" (directory) naming conventions are
  * too confusing.
  */
 
 /*
  * XXX Will infinite loop in certain cases if a directory read reliably
  *	returns EINVAL on last block.
  * XXX is EINVAL the right thing to return if a directory is malformed?
  */
 
 /*
  * XXX Untested vs. mount -o union; probably does the wrong thing.
  */
 
 /*
  * Find parent vnode of *lvpp, return in *uvpp
  *
  * If we care about the name, scan it looking for name of directory
  * entry pointing at lvp.
  *
  * Place the name in the buffer which starts at bufp, immediately
  * before *bpp, and move bpp backwards to point at the start of it.
  *
  * On entry, *lvpp is a locked vnode reference; on exit, it is vput and NULL'ed
  * On exit, *uvpp is either NULL or is a locked vnode reference.
  */
 static int
 linux_getcwd_scandir(lvpp, uvpp, bpp, bufp, td)
 	struct vnode **lvpp;
 	struct vnode **uvpp;
 	char **bpp;
 	char *bufp;
 	struct thread *td;
 {
 	int     error = 0;
 	int     eofflag;
 	off_t   off;
 	int     tries;
 	struct uio uio;
 	struct iovec iov;
 	char   *dirbuf = NULL;
 	int	dirbuflen;
 	ino_t   fileno;
 	struct vattr va;
 	struct vnode *uvp = NULL;
 	struct vnode *lvp = *lvpp;	
 	struct componentname cn;
 	int len, reclen;
 	tries = 0;
 
 	/*
 	 * If we want the filename, get some info we need while the
 	 * current directory is still locked.
 	 */
 	if (bufp != NULL) {
 		error = VOP_GETATTR(lvp, &va, td->td_ucred, td);
 		if (error) {
 			vput(lvp);
 			*lvpp = NULL;
 			*uvpp = NULL;
 			return error;
 		}
 	}
 
 	/*
 	 * Ok, we have to do it the hard way..
 	 * Next, get parent vnode using lookup of ..
 	 */
 	cn.cn_nameiop = LOOKUP;
 	cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY;
 	cn.cn_thread = td;
 	cn.cn_cred = td->td_ucred;
 	cn.cn_pnbuf = NULL;
 	cn.cn_nameptr = "..";
 	cn.cn_namelen = 2;
 	cn.cn_consume = 0;
 	cn.cn_lkflags = LK_EXCLUSIVE;
 	
 	/*
 	 * At this point, lvp is locked and will be unlocked by the lookup.
 	 * On successful return, *uvpp will be locked
 	 */
 #ifdef MAC
 	error = mac_vnode_check_lookup(td->td_ucred, lvp, &cn);
 	if (error == 0)
 #endif
 		error = VOP_LOOKUP(lvp, uvpp, &cn);
 	if (error) {
 		vput(lvp);
 		*lvpp = NULL;
 		*uvpp = NULL;
 		return error;
 	}
 	uvp = *uvpp;
 
 	/* If we don't care about the pathname, we're done */
 	if (bufp == NULL) {
 		vput(lvp);
 		*lvpp = NULL;
 		return 0;
 	}
 	
 	fileno = va.va_fileid;
 
 	dirbuflen = DIRBLKSIZ;
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
 	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 #if 0
 unionread:
 #endif
 	off = 0;
 	do {
 		/* call VOP_READDIR of parent */
 		iov.iov_base = dirbuf;
 		iov.iov_len = dirbuflen;
 
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = off;
 		uio.uio_resid = dirbuflen;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_READ;
 		uio.uio_td = td;
 
 		eofflag = 0;
 
 #ifdef MAC
 		error = mac_vnode_check_readdir(td->td_ucred, uvp);
 		if (error == 0)
 #endif /* MAC */
 			error = VOP_READDIR(uvp, &uio, td->td_ucred, &eofflag,
 			    0, 0);
 
 		off = uio.uio_offset;
 
 		/*
 		 * Try again if NFS tosses its cookies.
 		 * XXX this can still loop forever if the directory is busted
 		 * such that the second or subsequent page of it always
 		 * returns EINVAL
 		 */
 		if ((error == EINVAL) && (tries < 3)) {
 			off = 0;
 			tries++;
 			continue;	/* once more, with feeling */
 		}
 
 		if (!error) {
 			char   *cpos;
 			struct dirent *dp;
 			
 			cpos = dirbuf;
 			tries = 0;
 				
 			/* scan directory page looking for matching vnode */ 
 			for (len = (dirbuflen - uio.uio_resid); len > 0; len -= reclen) {
 				dp = (struct dirent *) cpos;
 				reclen = dp->d_reclen;
 
 				/* check for malformed directory.. */
 				if (reclen < DIRENT_MINSIZE) {
 					error = EINVAL;
 					goto out;
 				}
 				/*
 				 * XXX should perhaps do VOP_LOOKUP to
 				 * check that we got back to the right place,
 				 * but getting the locking games for that
 				 * right would be heinous.
 				 */
 				if ((dp->d_type != DT_WHT) &&
 				    (dp->d_fileno == fileno)) {
 					char *bp = *bpp;
 					bp -= dp->d_namlen;
 					
 					if (bp <= bufp) {
 						error = ERANGE;
 						goto out;
 					}
 					bcopy(dp->d_name, bp, dp->d_namlen);
 					error = 0;
 					*bpp = bp;
 					goto out;
 				}
 				cpos += reclen;
 			}
 		}
 	} while (!eofflag);
 	error = ENOENT;
 		
 out:
 	vput(lvp);
 	*lvpp = NULL;
 	free(dirbuf, M_TEMP);
 	return error;
 }
 
 
 /*
  * common routine shared by sys___getcwd() and linux_vn_isunder()
  */
 
 #define GETCWD_CHECK_ACCESS 0x0001
 
 static int
 linux_getcwd_common (lvp, rvp, bpp, bufp, limit, flags, td)
 	struct vnode *lvp;
 	struct vnode *rvp;
 	char **bpp;
 	char *bufp;
 	int limit;
 	int flags;
 	struct thread *td;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct vnode *uvp = NULL;
 	char *bp = NULL;
 	int error;
 	int perms = VEXEC;
 
 	if (rvp == NULL) {
 		rvp = fdp->fd_rdir;
 		if (rvp == NULL)
 			rvp = rootvnode;
 	}
 	
 	VREF(rvp);
 	VREF(lvp);
 
 	/*
 	 * Error handling invariant:
 	 * Before a `goto out':
 	 *	lvp is either NULL, or locked and held.
 	 *	uvp is either NULL, or locked and held.
 	 */
 
-	error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td);
+	error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
 	if (error != 0)
 		panic("vn_lock LK_RETRY returned error %d", error);
 	if (bufp)
 		bp = *bpp;
 	/*
 	 * this loop will terminate when one of the following happens:
 	 *	- we hit the root
 	 *	- getdirentries or lookup fails
 	 *	- we run out of space in the buffer.
 	 */
 	if (lvp == rvp) {
 		if (bp)
 			*(--bp) = '/';
 		goto out;
 	}
 	do {
 		if (lvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		}
 		
 		/*
 		 * access check here is optional, depending on
 		 * whether or not caller cares.
 		 */
 		if (flags & GETCWD_CHECK_ACCESS) {
 			error = VOP_ACCESS(lvp, perms, td->td_ucred, td);
 			if (error)
 				goto out;
 			perms = VEXEC|VREAD;
 		}
 		
 		/*
 		 * step up if we're a covered vnode..
 		 */
 		while (lvp->v_vflag & VV_ROOT) {
 			struct vnode *tvp;
 
 			if (lvp == rvp)
 				goto out;
 			
 			tvp = lvp;
 			lvp = lvp->v_mount->mnt_vnodecovered;
 			vput(tvp);
 			/*
 			 * hodie natus est radici frater
 			 */
 			if (lvp == NULL) {
 				error = ENOENT;
 				goto out;
 			}
 			VREF(lvp);
-			error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td);
+			error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
 			if (error != 0)
 				panic("vn_lock LK_RETRY returned %d", error);
 		}
 		error = linux_getcwd_scandir(&lvp, &uvp, &bp, bufp, td);
 		if (error)
 			goto out;
 #ifdef DIAGNOSTIC		
 		if (lvp != NULL)
 			panic("getcwd: oops, forgot to null lvp");
 		if (bufp && (bp <= bufp)) {
 			panic("getcwd: oops, went back too far");
 		}
 #endif		
 		if (bp) 
 			*(--bp) = '/';
 		lvp = uvp;
 		uvp = NULL;
 		limit--;
 	} while ((lvp != rvp) && (limit > 0)); 
 
 out:
 	if (bpp)
 		*bpp = bp;
 	if (uvp)
 		vput(uvp);
 	if (lvp)
 		vput(lvp);
 	vrele(rvp);
 	return error;
 }
 
 
 /*
  * Find pathname of process's current directory.
  *
  * Use vfs vnode-to-name reverse cache; if that fails, fall back
  * to reading directory contents.
  */
 
 int
 linux_getcwd(struct thread *td, struct linux_getcwd_args *args)
 {
 	caddr_t bp, bend, path;
 	int error, len, lenused;
 
 #ifdef DEBUG
 	if (ldebug(getcwd))
 		printf(ARGS(getcwd, "%p, %ld"), args->buf, (long)args->bufsize);
 #endif
 
 	len = args->bufsize;
 
 	if (len > MAXPATHLEN*4)
 		len = MAXPATHLEN*4;
 	else if (len < 2)
 		return ERANGE;
 
 	path = (char *)malloc(len, M_TEMP, M_WAITOK);
 
 	error = kern___getcwd(td, path, UIO_SYSSPACE, len);
 	if (!error) {
 		lenused = strlen(path) + 1;
 		if (lenused <= args->bufsize) {
 			td->td_retval[0] = lenused;
 			error = copyout(path, args->buf, lenused);
 		}
 		else
 			error = ERANGE;
 	} else {
 		bp = &path[len];
 		bend = bp;
 		*(--bp) = '\0';
 
 		/*
 		 * 5th argument here is "max number of vnodes to traverse".
 		 * Since each entry takes up at least 2 bytes in the output buffer,
 		 * limit it to N/2 vnodes for an N byte buffer.
 		 */
 
 		mtx_lock(&Giant);
 		error = linux_getcwd_common (td->td_proc->p_fd->fd_cdir, NULL,
 		    &bp, path, len/2, GETCWD_CHECK_ACCESS, td);
 		mtx_unlock(&Giant);
 
 		if (error)
 			goto out;
 		lenused = bend - bp;
 		td->td_retval[0] = lenused;
 		/* put the result into user buffer */
 		error = copyout(bp, args->buf, lenused);
 	}
 out:
 	free(path, M_TEMP);
 	return (error);
 }
 
Index: head/sys/compat/opensolaris/kern/opensolaris_kobj.c
===================================================================
--- head/sys/compat/opensolaris/kern/opensolaris_kobj.c	(revision 175201)
+++ head/sys/compat/opensolaris/kern/opensolaris_kobj.c	(revision 175202)
@@ -1,220 +1,220 @@
 /*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/kthread.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/kobj.h>
 
 void
 kobj_free(void *address, size_t size)
 {
 
 	kmem_free(address, size);
 }
 
 void *
 kobj_alloc(size_t size, int flag)
 {
 
 	return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP));
 }
 
 void *
 kobj_zalloc(size_t size, int flag)
 {
 	void *p;
 
 	if ((p = kobj_alloc(size, flag)) != NULL)
 		bzero(p, size);
 	return (p);
 }
 
 static void *
 kobj_open_file_vnode(const char *file)
 {
 	struct thread *td = curthread;
 	struct nameidata nd;
 	int error, flags;
 
 	if (td->td_proc->p_fd->fd_rdir == NULL)
 		td->td_proc->p_fd->fd_rdir = rootvnode;
 	if (td->td_proc->p_fd->fd_cdir == NULL)
 		td->td_proc->p_fd->fd_cdir = rootvnode;
 
 	flags = FREAD;
 	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
 	error = vn_open_cred(&nd, &flags, 0, td->td_ucred, NULL);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0)
 		return (NULL);
 	/* We just unlock so we hold a reference. */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	return (nd.ni_vp);
 }
 
 static void *
 kobj_open_file_loader(const char *file)
 {
 
 	return (preload_search_by_name(file));
 }
 
 struct _buf *
 kobj_open_file(const char *file)
 {
 	struct _buf *out;
 
 	out = kmem_alloc(sizeof(*out), KM_SLEEP);
 	out->mounted = root_mounted();
 	/*
 	 * If root is already mounted we read file using file system,
 	 * if not, we use loader.
 	 */
 	if (out->mounted)
 		out->ptr = kobj_open_file_vnode(file);
 	else
 		out->ptr = kobj_open_file_loader(file);
 	if (out->ptr == NULL) {
 		kmem_free(out, sizeof(*out));
 		return ((struct _buf *)-1);
 	}
 	return (out);
 }
 
 static int
 kobj_get_filesize_vnode(struct _buf *file, uint64_t *size)
 {
 	struct vnode *vp = file->ptr;
 	struct thread *td = curthread;
 	struct vattr va;
 	int error;
 
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	if (error == 0)
 		*size = (uint64_t)va.va_size;
 	return (error);
 }
 
 static int
 kobj_get_filesize_loader(struct _buf *file, uint64_t *size)
 {
 	void *ptr;
 
 	ptr = preload_search_info(file->ptr, MODINFO_SIZE);
 	if (ptr == NULL)
 		return (ENOENT);
 	*size = (uint64_t)*(size_t *)ptr;
 	return (0);
 }
 
 int
 kobj_get_filesize(struct _buf *file, uint64_t *size)
 {
 
 	if (file->mounted)
 		return (kobj_get_filesize_vnode(file, size));
 	else
 		return (kobj_get_filesize_loader(file, size));
 }
 
 int
 kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 	struct vnode *vp = file->ptr;
 	struct thread *td = curthread;
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	bzero(&aiov, sizeof(aiov));
 	bzero(&auio, sizeof(auio));
 
 	aiov.iov_base = buf;
 	aiov.iov_len = size;
 
 	auio.uio_iov = &aiov;
 	auio.uio_offset = (off_t)off;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = size;
 	auio.uio_td = td;
 
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred);
 	VOP_UNLOCK(vp, 0, td);
 	return (error != 0 ? -1 : size - auio.uio_resid);
 }
 
 int
 kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 	char *ptr;
 
 	ptr = preload_search_info(file->ptr, MODINFO_ADDR);
 	if (ptr == NULL)
 		return (ENOENT);
 	ptr = *(void **)ptr;
 	bcopy(ptr + off, buf, size);
 	return (0);
 }
 
 int
 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
 {
 
 	if (file->mounted)
 		return (kobj_read_file_vnode(file, buf, size, off));
 	else
 		return (kobj_read_file_loader(file, buf, size, off));
 }
 
 void
 kobj_close_file(struct _buf *file)
 {
 
 	if (file->mounted) {
 		struct vnode *vp = file->ptr;
 		struct thread *td = curthread;
 		int flags = FREAD;
 
 		vn_close(vp, flags, td->td_ucred, td);
 	}
 	kmem_free(file, sizeof(*file));
 }
Index: head/sys/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- head/sys/compat/opensolaris/kern/opensolaris_vfs.c	(revision 175201)
+++ head/sys/compat/opensolaris/kern/opensolaris_vfs.c	(revision 175202)
@@ -1,280 +1,280 @@
 /*-
  * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/vfs.h>
 #include <sys/priv.h>
 #include <sys/libkern.h>
 
 MALLOC_DECLARE(M_MOUNT);
 
 TAILQ_HEAD(vfsoptlist, vfsopt);
 struct vfsopt {
 	TAILQ_ENTRY(vfsopt) link;
 	char	*name;
 	void	*value;
 	int	len;
 };
 
 void
 vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
     int flags __unused)
 {
 	struct vfsopt *opt;
 	size_t namesize;
 
 	if (vfsp->mnt_opt == NULL) {
 		vfsp->mnt_opt = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
 		TAILQ_INIT(vfsp->mnt_opt);
 	}
 
 	opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK);
 
 	namesize = strlen(name) + 1;
 	opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
 	strlcpy(opt->name, name, namesize);
 
 	if (arg == NULL) {
 		opt->value = NULL;
 		opt->len = 0;
 	} else {
 		opt->len = strlen(arg) + 1;
 		opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 		bcopy(arg, opt->value, opt->len);
 	}
 	/* TODO: Locking. */
 	TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
 }
 
 void
 vfs_clearmntopt(vfs_t *vfsp, const char *name)
 {
 	struct vfsopt *opt;
 
 	if (vfsp->mnt_opt == NULL)
 		return;
 	/* TODO: Locking. */
 	TAILQ_FOREACH(opt, vfsp->mnt_opt, link) {
 		if (strcmp(opt->name, name) == 0)
 			break;
 	}
 	if (opt != NULL) {
 		TAILQ_REMOVE(vfsp->mnt_opt, opt, link);
 		free(opt->name, M_MOUNT);
 		if (opt->value != NULL)
 			free(opt->value, M_MOUNT);
 		free(opt, M_MOUNT);
 	}
 }
 
 int
 vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
 {
 	struct vfsoptlist *opts = vfsp->mnt_opt;
 	int error;
 
 	if (opts == NULL)
 		return (0);
 	error = vfs_getopt(opts, opt, (void **)argp, NULL);
 	return (error != 0 ? 0 : 1);
 }
 
 int
 traverse(vnode_t **cvpp, int lktype)
 {
 	kthread_t *td = curthread;
 	vnode_t *cvp;
 	vnode_t *tvp;
 	vfs_t *vfsp;
 	int error;
 
 	cvp = *cvpp;
 	tvp = NULL;
 
 	/*
 	 * If this vnode is mounted on, then we transparently indirect
 	 * to the vnode which is the root of the mounted file system.
 	 * Before we do this we must check that an unmount is not in
 	 * progress on this vnode.
 	 */
 
 	for (;;) {
 		/*
 		 * Reached the end of the mount chain?
 		 */
 		vfsp = vn_mountedvfs(cvp);
 		if (vfsp == NULL)
 			break;
 		/*
 		 * tvp is NULL for *cvpp vnode, which we can't unlock.
 		 */
 		if (tvp != NULL)
 			vput(cvp);
 		else
 			vrele(cvp);
 
 		/*
 		 * The read lock must be held across the call to VFS_ROOT() to
 		 * prevent a concurrent unmount from destroying the vfs.
 		 */
 		error = VFS_ROOT(vfsp, lktype, &tvp, td);
 		if (error != 0)
 			return (error);
 		cvp = tvp;
 	}
 
 	*cvpp = cvp;
 	return (0);
 }
 
 int
 domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
     char *fspec, int fsflags)
 {
 	struct mount *mp;
 	struct vfsconf *vfsp;
 	struct ucred *newcr, *oldcr;
 	int error;
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	if (vfsp == NULL)
 		return (ENODEV);
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_MOUNT) != 0 ||
 	    vp->v_mountedhere != NULL) {
 		VI_UNLOCK(vp);
 		return (EBUSY);
 	}
 	vp->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp);
 
 	/*
 	 * Allocate and initialize the filesystem.
 	 */
-	vn_lock(vp, LK_SHARED | LK_RETRY, td);
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 	mp = vfs_mount_alloc(vp, vfsp, fspath, td);
 	VOP_UNLOCK(vp, 0, td);
 
 	mp->mnt_optnew = NULL;
 	vfs_setmntopt(mp, "from", fspec, 0);
 	mp->mnt_optnew = mp->mnt_opt;
 	mp->mnt_opt = NULL;
 
 	/*
 	 * Set the mount level flags.
 	 * crdup() can sleep, so do it before acquiring a mutex.
 	 */
 	newcr = crdup(kcred);
 	MNT_ILOCK(mp);
 	if (fsflags & MNT_RDONLY)
 		mp->mnt_flag |= MNT_RDONLY;
 	mp->mnt_flag &=~ MNT_UPDATEMASK;
 	mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
 	/*
 	 * Unprivileged user can trigger mounting a snapshot, but we don't want
 	 * him to unmount it, so we switch to privileged credentials.
 	 */
 	oldcr = mp->mnt_cred;
 	mp->mnt_cred = newcr;
 	mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid;
 	MNT_IUNLOCK(mp);
 	crfree(oldcr);
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error = VFS_MOUNT(mp, td);
 
 	if (!error) {
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
 		mp->mnt_opt = mp->mnt_optnew;
 		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 	}
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	*/
 	mp->mnt_optnew = NULL;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * Put the new filesystem on the mount list after root.
 	 */
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 	if (!error) {
 		vnode_t *mvp;
 
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vp->v_mountedhere = mp;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 		mtx_unlock(&mountlist_mtx);
 		vfs_event_signal(NULL, VQ_MOUNT, 0);
 		if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp, td))
 			panic("mount: lost mount");
 		mountcheckdirs(vp, mvp);
 		vput(mvp);
 		VOP_UNLOCK(vp, 0, td);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
 			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, td);
 		if (error)
 			vrele(vp);
 		else
 			vfs_mountedfrom(mp, fspec);
 	} else {
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		VOP_UNLOCK(vp, 0, td);
 		vfs_unbusy(mp, td);
 		vfs_mount_destroy(mp);
 	}
 	return (error);
 }
Index: head/sys/compat/opensolaris/sys/vnode.h
===================================================================
--- head/sys/compat/opensolaris/sys/vnode.h	(revision 175201)
+++ head/sys/compat/opensolaris/sys/vnode.h	(revision 175202)
@@ -1,268 +1,268 @@
 /*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _OPENSOLARIS_SYS_VNODE_H_
 #define	_OPENSOLARIS_SYS_VNODE_H_
 
 #include_next <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/cred.h>
 #include <sys/fcntl.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/syscallsubr.h>
 
 typedef	struct vnode	vnode_t;
 typedef	struct vattr	vattr_t;
 typedef	void		caller_context_t;
 
 typedef	struct vop_vector	vnodeops_t;
 #define	vop_fid		vop_vptofh
 #define	vop_fid_args	vop_vptofh_args
 #define	a_fid		a_fhp
 
 #define	v_count	v_usecount
 
 static __inline int
 vn_is_readonly(vnode_t *vp)
 {
 	return (vp->v_mount->mnt_flag & MNT_RDONLY);
 }
 #define	vn_vfswlock(vp)		(0)
 #define	vn_vfsunlock(vp)	do { } while (0)
 #define	vn_ismntpt(vp)		((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
 #define	vn_mountedvfs(vp)	((vp)->v_mountedhere)
 #define	vn_has_cached_data(vp)	((vp)->v_object != NULL && (vp)->v_object->resident_page_count > 0)
 
 #define	VN_HOLD(v)	vref(v)
 #define	VN_RELE(v)	vrele(v)
 #define	VN_URELE(v)	vput(v)
 
 #define	VOP_REALVP(vp, vpp)	(*(vpp) = (vp), 0)
 
 #define	vnevent_remove(vp)	do { } while (0)
 #define	vnevent_rmdir(vp)	do { } while (0)
 #define	vnevent_rename_src(vp)	do { } while (0)
 #define	vnevent_rename_dest(vp)	do { } while (0)
 
 
 #define	IS_DEVVP(vp)	\
 	((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
 
 #define	MODEMASK	ALLPERMS
 
 #define	specvp(vp, rdev, type, cr)	(VN_HOLD(vp), (vp))
 #define	MANDMODE(mode)	(0)
 #define	chklock(vp, op, offset, size, mode, ct)	(0)
 #define	cleanlocks(vp, pid, foo)	do { } while (0)
 #define	cleanshares(vp, pid)		do { } while (0)
 
 /*
  * We will use va_spare is place of Solaris' va_mask.
  * This field is initialized in zfs_setattr().
  */
 #define	va_mask		va_spare
 /* TODO: va_fileid is shorter than va_nodeid !!! */
 #define	va_nodeid	va_fileid
 /* TODO: This field needs conversion! */
 #define	va_nblocks	va_bytes
 #define	va_blksize	va_blocksize
 #define	va_seq		va_gen
 
 #define	MAXOFFSET_T	OFF_MAX
 #define	EXCL		0
 
 #define	AT_TYPE		0x0001
 #define	AT_MODE		0x0002
 #define	AT_UID		0x0004
 #define	AT_GID		0x0008
 #define	AT_FSID		0x0010
 #define	AT_NODEID	0x0020
 #define	AT_NLINK	0x0040
 #define	AT_SIZE		0x0080
 #define	AT_ATIME	0x0100
 #define	AT_MTIME	0x0200
 #define	AT_CTIME	0x0400
 #define	AT_RDEV		0x0800
 #define	AT_BLKSIZE	0x1000
 #define	AT_NBLOCKS	0x2000
 #define	AT_SEQ		0x4000
 #define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
 			 AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
 
 #define	ACCESSED		(AT_ATIME)
 #define	STATE_CHANGED		(AT_CTIME)
 #define	CONTENT_MODIFIED	(AT_MTIME | AT_CTIME)
 
 static __inline void
 vattr_init_mask(vattr_t *vap)
 {
 
 	vap->va_mask = 0;
 
 	if (vap->va_type != VNON)
 		vap->va_mask |= AT_TYPE;
 	if (vap->va_uid != (uid_t)VNOVAL)
 		vap->va_mask |= AT_UID;
 	if (vap->va_gid != (gid_t)VNOVAL)
 		vap->va_mask |= AT_GID;
 	if (vap->va_size != (u_quad_t)VNOVAL)
 		vap->va_mask |= AT_SIZE;
 	if (vap->va_atime.tv_sec != VNOVAL)
 		vap->va_mask |= AT_ATIME;
 	if (vap->va_mtime.tv_sec != VNOVAL)
 		vap->va_mask |= AT_MTIME;
 	if (vap->va_mode != (u_short)VNOVAL)
 		vap->va_mask |= AT_MODE;
 }
 
 #define	FCREAT	O_CREAT
 #define	FTRUNC	O_TRUNC
 #define	FDSYNC	FFSYNC
 #define	FRSYNC	FFSYNC
 #define	FSYNC	FFSYNC
 #define	FOFFMAX	0x00
 
 enum create	{ CRCREAT };
 
 static __inline int
 zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode,
     vnode_t **vpp, enum create crwhy, mode_t umask)
 {
 	struct thread *td = curthread;
 	struct nameidata nd;
 	int error;
 
 	ASSERT(seg == UIO_SYSSPACE);
 	ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
 	ASSERT(crwhy == CRCREAT);
 	ASSERT(umask == 0);
 
 	if (td->td_proc->p_fd->fd_rdir == NULL)
 		td->td_proc->p_fd->fd_rdir = rootvnode;
 	if (td->td_proc->p_fd->fd_cdir == NULL)
 		td->td_proc->p_fd->fd_cdir = rootvnode;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pnamep, td);
 	error = vn_open_cred(&nd, &filemode, createmode, td->td_ucred, NULL);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == 0) {
 		/* We just unlock so we hold a reference. */
 		VN_HOLD(nd.ni_vp);
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		*vpp = nd.ni_vp;
 	}
 	return (error);
 }
 #define	vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask)	\
 	zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask))
 
 #define	RLIM64_INFINITY	0
 static __inline int
 zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len,
     offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr,
     ssize_t *residp)
 {
 	struct thread *td = curthread;
 	int error, vfslocked, resid;
 
 	ASSERT(rw == UIO_WRITE);
 	ASSERT(ioflag == 0);
 	ASSERT(ulimit == RLIM64_INFINITY);
 
 	ioflag = IO_APPEND | IO_UNIT;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED,
 	    &resid, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (residp != NULL)
 		*residp = (ssize_t)resid;
 	return (error);
 }
 #define	vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \
 	zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp))
 
 static __inline int
 zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr)
 {
 	struct thread *td = curthread;
 	struct mount *mp;
 	int error, vfslocked;
 
 	ASSERT(flag == FSYNC);
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 #define	VOP_FSYNC(vp, flag, cr)	zfs_vop_fsync((vp), (flag), (cr))
 
 static __inline int
 zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 
 	ASSERT(flag == (FWRITE | FCREAT | FTRUNC | FOFFMAX));
 	ASSERT(count == 1);
 	ASSERT(offset == 0);
 
 	return (vn_close(vp, flag, cr, curthread));
 }
 #define	VOP_CLOSE(vp, oflags, count, offset, cr)			\
 	zfs_vop_close((vp), (oflags), (count), (offset), (cr))
 
 static __inline int
 vn_rename(char *from, char *to, enum uio_seg seg)
 {
 
 	ASSERT(seg == UIO_SYSSPACE);
 
 	return (kern_rename(curthread, from, to, seg));
 }
 
 enum rm	{ RMFILE };
 static __inline int
 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
 {
 
 	ASSERT(seg == UIO_SYSSPACE);
 	ASSERT(dirflag == RMFILE);
 
 	return (kern_unlink(curthread, fnamep, seg));
 }
 
 #endif	/* _OPENSOLARIS_SYS_VNODE_H_ */
Index: head/sys/compat/pecoff/imgact_pecoff.c
===================================================================
--- head/sys/compat/pecoff/imgact_pecoff.c	(revision 175201)
+++ head/sys/compat/pecoff/imgact_pecoff.c	(revision 175202)
@@ -1,606 +1,606 @@
 /*-
  * Copyright (c) 2000 Masaru OKI
  * Copyright (c) 1994, 1995, 1998 Scott Bartram
  * Copyright (c) 1994 Adam Glass
  * Copyright (c) 1993, 1994 Christopher G. Demetriou
  *
  * originally from NetBSD kern/exec_ecoff.c
  *
  * Copyright (c) 2000 Takanori Watanabe
  * Copyright (c) 2000 KUROSAWA Takahiro
  * Copyright (c) 1995-1996 Sen Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * originally from FreeBSD kern/imgact_elf.c
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Masaru OKI.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <machine/reg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <sys/exec.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <machine/cpu.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <machine/md_var.h>
 #include <machine/pecoff_machdep.h>
 #include <compat/pecoff/imgact_pecoff.h>
 
 #include "opt_pecoff.h"
 
 #define PECOFF_PE_SIGNATURE "PE\0\0"
 static int      pecoff_fixup(register_t **, struct image_params *);
 #ifndef PECOFF_DEBUG
 #define DPRINTF(a)
 #else
 #define DPRINTF(a) printf a
 #endif
 static struct sysentvec pecoff_sysvec = {
 	SYS_MAXSYSCALL,
 	sysent,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	pecoff_fixup,
 	sendsig,
 	sigcode,
 	&szsigcode,
 	0,
 	"FreeBSD PECoff",
 	NULL,
 	NULL,
 	MINSIGSTKSZ,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	VM_MAXUSER_ADDRESS,
 	USRSTACK,
 	PS_STRINGS,
 	VM_PROT_ALL,
 	exec_copyout_strings,
 	exec_setregs,
 	NULL
 	
 };
 
 static const char signature[] = PECOFF_PE_SIGNATURE;
 
 static int 
 exec_pecoff_coff_prep_omagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 static int 
 exec_pecoff_coff_prep_nmagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 static int 
 exec_pecoff_coff_prep_zmagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 
 static int 
 exec_pecoff_coff_makecmds(struct image_params *,
 			  struct coff_filehdr *, int);
 
 static int      pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *);
 static int      pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int);
 static int 
 pecoff_load_section(struct thread * td,
 		    struct vmspace * vmspace, struct vnode * vp,
 	     vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
 		    vm_prot_t prot);
 
 static int 
 pecoff_fixup(register_t ** stack_base, struct image_params * imgp)
 {
 	int             len = sizeof(struct pecoff_args);
 	struct pecoff_imghdr *ap;
 	register_t     *pos;
 
 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
 	ap = (struct pecoff_imghdr *) imgp->auxargs;
 	if (copyout(ap, pos, len)) {
 		return 0;
 	}
 	free(ap, M_TEMP);
 	imgp->auxargs = NULL;
 	(*stack_base)--;
 	suword(*stack_base, (long) imgp->args->argc);
 	return 0;
 }
 
 static int 
 pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
 {
 	size_t          map_len;
 	vm_offset_t     map_addr;
 	int             error, rv;
 	size_t          copy_len;
 	size_t          copy_map_len;
 	size_t          copy_start;
 	vm_object_t     object;
 	vm_offset_t     copy_map_offset;
 	vm_offset_t     file_addr;
 	vm_offset_t     data_buf = 0;
 
 	object = vp->v_object;
 	error = 0;
 
 	map_addr = trunc_page((vm_offset_t) vmaddr);
 	file_addr = trunc_page(offset);
 	DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz));
 	if (file_addr != offset) {
 		/*
 		 * The section is not on page  boundary. We can't use
 		 * vm_map_insert(). Use copyin instead.
 		 */
 		map_len = round_page(memsz);
 		copy_len = filsz;
 		copy_map_offset = file_addr;
 		copy_map_len = round_page(offset + filsz) - file_addr;
 		copy_start = offset - file_addr;
 
 		DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n",
 			 offset, (long)vmaddr, filsz, memsz));
 		DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x"
 			 " copy_map_len=%x copy_start=%x\n",
 			 map_len, copy_len, copy_map_offset,
 			 copy_map_len, copy_start));
 	} else {
 
 		map_len = trunc_page(filsz);
 
 		if (map_len != 0) {
 			vm_object_reference(object);
 			vm_map_lock(&vmspace->vm_map);
 			rv = vm_map_insert(&vmspace->vm_map,
 					   object,
 					   file_addr,	/* file offset */
 					   map_addr,	/* virtual start */
 					   map_addr + map_len,	/* virtual end */
 					   prot,
 					   VM_PROT_ALL,
 					   MAP_COPY_ON_WRITE | MAP_PREFAULT);
 
 			vm_map_unlock(&vmspace->vm_map);
 			if (rv != KERN_SUCCESS) {
 				vm_object_deallocate(object);
 				return EINVAL;
 			}
 			/* we can stop now if we've covered it all */
 			if (memsz == filsz)
 				return 0;
 
 		}
 		copy_map_offset = trunc_page(offset + filsz);
 		copy_map_len = PAGE_SIZE;
 		copy_start = 0;
 		copy_len = (offset + filsz) - trunc_page(offset + filsz);
 		map_addr = trunc_page((vm_offset_t) vmaddr + filsz);
 		map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr;
 
 	}
 
 	if (map_len != 0) {
 		vm_map_lock(&vmspace->vm_map);
 		rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
 				   map_addr, map_addr + map_len,
 				   VM_PROT_ALL, VM_PROT_ALL, 0);
 		vm_map_unlock(&vmspace->vm_map);
 		DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len));
 		if (rv != KERN_SUCCESS) {
 			return EINVAL;
 		}
 	}
 	DPRINTF(("COPYARG %x %x\n", map_addr, copy_len));
 	if (copy_len != 0) {
 		vm_object_reference(object);
 		rv = vm_map_find(exec_map,
 				 object,
 				 copy_map_offset,
 				 &data_buf,
 				 copy_map_len,
 				 TRUE,
 				 VM_PROT_READ,
 				 VM_PROT_ALL,
 				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
 		if (rv != KERN_SUCCESS) {
 			vm_object_deallocate(object);
 			return EINVAL;
 		}
 		/* send the page fragment to user space */
 
 		error = copyout((caddr_t) data_buf + copy_start,
 				(caddr_t) map_addr, copy_len);
 		vm_map_remove(exec_map, data_buf, data_buf + copy_map_len);
 		DPRINTF(("%d\n", error));
 		if (error)
 			return (error);
 	}
 	/*
 	 * set it to the specified protection
 	 */
 	vm_map_protect(&vmspace->vm_map, map_addr,
 		       map_addr + map_len, prot,
 		       FALSE);
 	return error;
 
 }
 static int 
 pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport)
 {
 
 	struct nameidata nd;
 	struct pecoff_dos_filehdr dh;
 	struct coff_filehdr *fp = 0;
 	struct coff_aouthdr *ap;
 	struct pecoff_opthdr *wp;
 	struct coff_scnhdr *sh = 0;
 	struct vmspace *vmspace = td->td_proc->p_vmspace;
 	struct vattr    attr;
 	struct image_params image_params, *imgp;
 	int             peofs;
 	int             error, i, scnsiz;
 
 	imgp = &image_params;
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = td->td_proc;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->firstpage = NULL;
 
 	NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td);
 
 	if ((error = namei(&nd)) != 0) {
 		nd.ni_vp = NULL;
 		goto fail;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd.ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	if (error)
 		goto fail;
 	if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0)
 		goto fail;
 	if ((error = pecoff_signature(td, imgp->vp, &dh) != 0))
 		goto fail;
 	fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
 	peofs = dh.d_peofs + sizeof(signature) - 1;
 	if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE) != 0))
 		goto fail;
 	if (COFF_BADMAG(fp)) {
 		error = ENOEXEC;
 		goto fail;
 	}
 	ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
 	wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
 	/* read section header */
 	scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
 	sh = malloc(scnsiz, M_TEMP, M_WAITOK);
 	if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE,
 				      (caddr_t) sh, scnsiz)) != 0)
 		goto fail;
 
 	/*
 	 * Read Section infomation and map sections.
 	 */
 
 	for (i = 0; i < fp->f_nscns; i++) {
 		int             prot = 0;
 
 		if (sh[i].s_flags & COFF_STYP_DISCARD)
 			continue;
 		/* XXX ? */
 		if ((sh[i].s_flags & COFF_STYP_TEXT) &&
 		    (sh[i].s_flags & COFF_STYP_EXEC) == 0)
 			continue;
 		if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0)
 			continue;
 
 		prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
 
 		sh[i].s_vaddr += wp->w_base;	/* RVA --> VA */
 		if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr
 						 ,(caddr_t) sh[i].s_vaddr,
 						 sh[i].s_paddr, sh[i].s_size
 						 ,prot)) != 0)
 			goto fail;
 
 	}
 	*entry = wp->w_base + ap->a_entry;
 	*addr = wp->w_base;
 	*ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base;
 fail:
 	if (fp)
 		free(fp, M_TEMP);
 	if (sh)
 		free(sh, M_TEMP);
 	if (nd.ni_vp)
 		vrele(nd.ni_vp);
 
 	return error;
 }
 static int
 exec_pecoff_coff_prep_omagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	return ENOEXEC;
 }
 static int
 exec_pecoff_coff_prep_nmagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	return ENOEXEC;
 }
 static int
 exec_pecoff_coff_prep_zmagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	int             scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
 	int             error = ENOEXEC, i;
 	int             prot;
 	u_long          text_size = 0, data_size = 0, dsize;
 	u_long          text_addr = 0, data_addr = VM_MAXUSER_ADDRESS;
 	u_long          ldexport = 0, ldbase = 0;
 	struct pecoff_opthdr *wp;
 	struct coff_scnhdr *sh;
 	struct vmspace *vmspace;
 	struct pecoff_args *argp = NULL;
 
 	sh = malloc(scnsiz, M_TEMP, M_WAITOK);
 
 	wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
 	error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp,
 	    peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz);
 	if (error)
 		return (error);
 	error = exec_new_vmspace(imgp, &pecoff_sysvec);
 	if (error)
 		return (error);
 	vmspace = imgp->proc->p_vmspace;
 	for (i = 0; i < fp->f_nscns; i++) {
 		prot = VM_PROT_WRITE;	/* XXX for relocation? */
 		prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
 		sh[i].s_vaddr += wp->w_base;
 		if (sh[i].s_flags & COFF_STYP_DISCARD)
 			continue;
 		if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) {
 
 			error = pecoff_load_section(
 			    FIRST_THREAD_IN_PROC(imgp->proc),
 			    vmspace, imgp->vp, sh[i].s_scnptr,
 			    (caddr_t) sh[i].s_vaddr, sh[i].s_paddr,
 			    sh[i].s_size ,prot);
 			DPRINTF(("ERROR%d\n", error));
 			if (error)
 				goto fail;
 			text_addr = trunc_page(sh[i].s_vaddr);
 			text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr);
 
 		}
 		if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) {
 			if (pecoff_load_section(
 			    FIRST_THREAD_IN_PROC(imgp->proc), vmspace,
 			    imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr,
 			    sh[i].s_paddr, sh[i].s_size, prot) != 0)
 				goto fail;
 			data_addr = min(trunc_page(sh[i].s_vaddr), data_addr);
 			dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr)
 				- data_addr;
 			data_size = max(dsize, data_size);
 
 		}
 	}
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr;
 	argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK);
 	if (argp == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	argp->a_base = wp->w_base;
 	argp->a_entry = wp->w_base + ap->a_entry;
 	argp->a_end = data_addr + data_size;
 	argp->a_subsystem = wp->w_subvers;
 	error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc),
 	    "/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport);
 	if (error)
 		goto fail;
 
 	argp->a_ldbase = ldbase;
 	argp->a_ldexport = ldexport;
 	memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16);
 	for (i = 0; i < 16; i++) {
 		argp->a_imghdr[i].i_vaddr += wp->w_base;
 	}
 	imgp->proc->p_sysent = &pecoff_sysvec;
 	imgp->auxargs = argp;
 	imgp->auxarg_size = sizeof(struct pecoff_args);
 	imgp->interpreted = 0;
 
 	if (sh != NULL)
 		free(sh, M_TEMP);
 	return 0;
 fail:
 	error = (error) ? error : ENOEXEC;
 	if (sh != NULL)
 		free(sh, M_TEMP);
 	if (argp != NULL)
 		free(argp, M_TEMP);
 
 	return error;
 }
 
 int
 exec_pecoff_coff_makecmds(struct image_params * imgp,
 			  struct coff_filehdr * fp, int peofs)
 {
 	struct coff_aouthdr *ap;
 	int             error;
 
 	if (COFF_BADMAG(fp)) {
 		return ENOEXEC;
 	}
 	ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
 	switch (ap->a_magic) {
 	case COFF_OMAGIC:
 		error = exec_pecoff_coff_prep_omagic(imgp, fp, ap, peofs);
 		break;
 	case COFF_NMAGIC:
 		error = exec_pecoff_coff_prep_nmagic(imgp, fp, ap, peofs);
 		break;
 	case COFF_ZMAGIC:
 		error = exec_pecoff_coff_prep_zmagic(imgp, fp, ap, peofs);
 		break;
 	default:
 		return ENOEXEC;
 	}
 
 	return error;
 }
 
 static int
 pecoff_signature(td, vp, dp)
 	struct thread  *td;
 	struct vnode   *vp;
 	const struct pecoff_dos_filehdr *dp;
 {
 	int             error;
 	char            buf[512];
 	char           *pesig;
 	if (DOS_BADMAG(dp)) {
 		return ENOEXEC;
 	}
 	error = pecoff_read_from(td, vp, dp->d_peofs, buf, sizeof(buf));
 	if (error) {
 		return error;
 	}
 	pesig = buf;
 	if (memcmp(pesig, signature, sizeof(signature) - 1) == 0) {
 		return 0;
 	}
 	return EFTYPE;
 }
 int
 pecoff_read_from(td, vp, pos, buf, siz)
 	struct thread  *td;
 	struct vnode   *vp;
 	int             pos;
 	caddr_t         buf;
 	int             siz;
 {
 	int             error;
 	size_t          resid;
 
 	error = vn_rdwr(UIO_READ, vp, buf, siz, pos,
 			UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 			&resid, td);
 	if (error)
 		return error;
 
 	if (resid != 0) {
 		return ENOEXEC;
 	}
 	return 0;
 }
 
 static int 
 imgact_pecoff(struct image_params * imgp)
 {
 	const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *)
 	imgp->image_header;
 	struct coff_filehdr *fp;
 	int             error, peofs;
 	struct thread *td = curthread;
 
 	error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc),
 	    imgp->vp, dp);
 	if (error) {
 		return -1;
 	}
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	peofs = dp->d_peofs + sizeof(signature) - 1;
 	fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
 	error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc),
 	     imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE);
 	if (error)
 		goto fail;
 
 	error = exec_pecoff_coff_makecmds(imgp, fp, peofs);
 fail:   
 	free(fp, M_TEMP);
-        vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+        vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	return error;
 }
 
 static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"};
 EXEC_SET(pecoff, pecoff_execsw);
Index: head/sys/compat/svr4/imgact_svr4.c
===================================================================
--- head/sys/compat/svr4/imgact_svr4.c	(revision 175201)
+++ head/sys/compat/svr4/imgact_svr4.c	(revision 175202)
@@ -1,243 +1,243 @@
 /*-
  * Copyright (c) 1998 Mark Newton
  * Copyright (c) 1994-1996 S�ren Schmidt
  * All rights reserved.
  *
  * Based heavily on /sys/kern/imgact_aout.c which is:
  * Copyright (c) 1993, David Greenman
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer 
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <compat/svr4/svr4.h>
 
 static int	exec_svr4_imgact(struct image_params *iparams);
 
 static int
 exec_svr4_imgact(imgp)
     struct image_params *imgp;
 {
     const struct exec *a_out = (const struct exec *) imgp->image_header;
     struct vmspace *vmspace;
     vm_offset_t vmaddr;
     unsigned long virtual_offset, file_offset;
     vm_offset_t buffer;
     unsigned long bss_size;
     int error;
     struct thread *td = curthread;
 
     if (((a_out->a_magic >> 16) & 0xff) != 0x64)
 	return -1;
 
     /*
      * Set file/virtual offset based on a.out variant.
      */
     switch ((int)(a_out->a_magic & 0xffff)) {
     case 0413:
 	virtual_offset = 0;
 	file_offset = 1024;
 	break;
     case 0314:
 	virtual_offset = 4096;
 	file_offset = 0;
 	break;
     default:
 	return (-1);
     }
     bss_size = round_page(a_out->a_bss);
 #ifdef DEBUG
     printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
 #endif
 
     /*
      * Check various fields in header for validity/bounds.
      */
     if (a_out->a_entry < virtual_offset ||
 	a_out->a_entry >= virtual_offset + a_out->a_text ||
 	a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 	return (-1);
 
     /* text + data can't exceed file size */
     if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 	return (EFAULT);
     /*
      * text/data/bss must not exceed limits
      */
     PROC_LOCK(imgp->proc);
     if (a_out->a_text > maxtsiz ||
 	a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
     	PROC_UNLOCK(imgp->proc);
 	return (ENOMEM);
     }
     PROC_UNLOCK(imgp->proc);
 
     VOP_UNLOCK(imgp->vp, 0, td);
 
     /*
      * Destroy old process VM and create a new one (with a new stack)
      */
     error = exec_new_vmspace(imgp, &svr4_sysvec);
     if (error)
 	    goto fail;
     vmspace = imgp->proc->p_vmspace;
 
     /*
      * Check if file_offset page aligned,.
      * Currently we cannot handle misalinged file offsets,
      * and so we read in the entire image (what a waste).
      */
     if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 	printf("imgact: Non page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data+bss read/write/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 		    	    a_out->a_text + a_out->a_data + bss_size, FALSE,
 			    VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error)
 	    goto fail;
 
 	error = vm_mmap(kernel_map, &buffer,
 			round_page(a_out->a_text + a_out->a_data + file_offset),
 			VM_PROT_READ, VM_PROT_READ, 0,
 			OBJT_VNODE, imgp->vp, trunc_page(file_offset));
 	if (error)
 	    goto fail;
 
 	error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr, 
 			a_out->a_text + a_out->a_data);
 
 	vm_map_remove(kernel_map, buffer,
 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
 
 	if (error)
 	    goto fail;
 
 	/*
 	 * remove write enable on the 'text' part
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr,
 		   	       vmaddr + a_out->a_text,
 		   	       VM_PROT_EXECUTE|VM_PROT_READ,
 		   	       TRUE);
 	if (error)
 	    goto fail;
     }
     else {
 #ifdef DEBUG
 	printf("imgact: Page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data read/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_mmap(&vmspace->vm_map, &vmaddr,
 			a_out->a_text + a_out->a_data,
 	    		VM_PROT_READ | VM_PROT_EXECUTE,
 	    		VM_PROT_ALL,
 	    		MAP_PRIVATE | MAP_FIXED,
 			OBJT_VNODE, imgp->vp, file_offset);
 	if (error)
 	    goto fail;
     
 #ifdef DEBUG
 	printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr,
 	    (u_long)a_out->a_text + a_out->a_data);
 #endif
 	/*
 	 * allow read/write of data
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr + a_out->a_text,
 			       vmaddr + a_out->a_text + a_out->a_data,
 			       VM_PROT_ALL,
 			       FALSE);
 	if (error)
 	    goto fail;
     
 	/*
 	 * Allocate anon demand-zeroed area for uninitialized data
 	 */
 	if (bss_size != 0) {
 	    vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
 	    error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, 
 				bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 	    if (error)
 		goto fail;
 #ifdef DEBUG
 	    printf("imgact: bssaddr=%08lx, length=%08lx\n",
 	        (u_long)vmaddr, bss_size);
 #endif
 
 	}
     }
     /* Fill in process VM information */
     vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
     vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
     vmspace->vm_taddr = (caddr_t)virtual_offset;
     vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text;
 
     /* Fill in image_params */
     imgp->interpreted = 0;
     imgp->entry_addr = a_out->a_entry;
     
     imgp->proc->p_sysent = &svr4_sysvec;
 fail:
-    vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+    vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
     return (error);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" };
 EXEC_SET(execsw_set, svr4_execsw);
 
Index: head/sys/compat/svr4/svr4_fcntl.c
===================================================================
--- head/sys/compat/svr4/svr4_fcntl.c	(revision 175201)
+++ head/sys/compat/svr4/svr4_fcntl.c	(revision 175202)
@@ -1,725 +1,725 @@
 /*-
  * Copyright (c) 1998 Mark Newton
  * Copyright (c) 1994, 1997 Christos Zoulas.  
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Christos Zoulas.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 /*#include <sys/ioctl.h>*/
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <sys/sysproto.h>
 
 #include <compat/svr4/svr4.h>
 #include <compat/svr4/svr4_types.h>
 #include <compat/svr4/svr4_signal.h>
 #include <compat/svr4/svr4_proto.h>
 #include <compat/svr4/svr4_util.h>
 #include <compat/svr4/svr4_fcntl.h>
 
 #include <security/mac/mac_framework.h>
 
 static int svr4_to_bsd_flags(int);
 static u_long svr4_to_bsd_cmd(u_long);
 static int fd_revoke(struct thread *, int);
 static int fd_truncate(struct thread *, int, struct flock *);
 static int bsd_to_svr4_flags(int);
 static void bsd_to_svr4_flock(struct flock *, struct svr4_flock *);
 static void svr4_to_bsd_flock(struct svr4_flock *, struct flock *);
 static void bsd_to_svr4_flock64(struct flock *, struct svr4_flock64 *);
 static void svr4_to_bsd_flock64(struct svr4_flock64 *, struct flock *);
 
 static u_long
 svr4_to_bsd_cmd(cmd)
 	u_long	cmd;
 {
 	switch (cmd) {
 	case SVR4_F_DUPFD:
 		return F_DUPFD;
 	case SVR4_F_GETFD:
 		return F_GETFD;
 	case SVR4_F_SETFD:
 		return F_SETFD;
 	case SVR4_F_GETFL:
 		return F_GETFL;
 	case SVR4_F_SETFL:
 		return F_SETFL;
 	case SVR4_F_GETLK:
 		return F_GETLK;
 	case SVR4_F_SETLK:
 		return F_SETLK;
 	case SVR4_F_SETLKW:
 		return F_SETLKW;
 	default:
 		return -1;
 	}
 }
 
 static int
 svr4_to_bsd_flags(l)
 	int	l;
 {
 	int	r = 0;
 	r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0;
 	r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0;
 	r |= (l & SVR4_O_RDWR) ? O_RDWR : 0;
 	r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0;
 	r |= (l & SVR4_O_APPEND) ? O_APPEND : 0;
 	r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0;
 	r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0;
 	r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0;
 	r |= (l & SVR4_O_CREAT) ? O_CREAT : 0;
 	r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0;
 	r |= (l & SVR4_O_EXCL) ? O_EXCL : 0;
 	r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0;
 	return r;
 }
 
 static int
 bsd_to_svr4_flags(l)
 	int	l;
 {
 	int	r = 0;
 	r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0;
 	r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0;
 	r |= (l & O_RDWR) ? SVR4_O_RDWR : 0;
 	r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0;
 	r |= (l & O_APPEND) ? SVR4_O_APPEND : 0;
 	r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0;
 	r |= (l & O_NONBLOCK) ? SVR4_O_NONBLOCK : 0;
 	r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0;
 	r |= (l & O_CREAT) ? SVR4_O_CREAT : 0;
 	r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0;
 	r |= (l & O_EXCL) ? SVR4_O_EXCL : 0;
 	r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0;
 	return r;
 }
 
 
 static void
 bsd_to_svr4_flock(iflp, oflp)
 	struct flock		*iflp;
 	struct svr4_flock	*oflp;
 {
 	switch (iflp->l_type) {
 	case F_RDLCK:
 		oflp->l_type = SVR4_F_RDLCK;
 		break;
 	case F_WRLCK:
 		oflp->l_type = SVR4_F_WRLCK;
 		break;
 	case F_UNLCK:
 		oflp->l_type = SVR4_F_UNLCK;
 		break;
 	default:
 		oflp->l_type = -1;
 		break;
 	}
 
 	oflp->l_whence = (short) iflp->l_whence;
 	oflp->l_start = (svr4_off_t) iflp->l_start;
 	oflp->l_len = (svr4_off_t) iflp->l_len;
 	oflp->l_sysid = 0;
 	oflp->l_pid = (svr4_pid_t) iflp->l_pid;
 }
 
 
 static void
 svr4_to_bsd_flock(iflp, oflp)
 	struct svr4_flock	*iflp;
 	struct flock		*oflp;
 {
 	switch (iflp->l_type) {
 	case SVR4_F_RDLCK:
 		oflp->l_type = F_RDLCK;
 		break;
 	case SVR4_F_WRLCK:
 		oflp->l_type = F_WRLCK;
 		break;
 	case SVR4_F_UNLCK:
 		oflp->l_type = F_UNLCK;
 		break;
 	default:
 		oflp->l_type = -1;
 		break;
 	}
 
 	oflp->l_whence = iflp->l_whence;
 	oflp->l_start = (off_t) iflp->l_start;
 	oflp->l_len = (off_t) iflp->l_len;
 	oflp->l_pid = (pid_t) iflp->l_pid;
 
 }
 
 static void
 bsd_to_svr4_flock64(iflp, oflp)
 	struct flock		*iflp;
 	struct svr4_flock64	*oflp;
 {
 	switch (iflp->l_type) {
 	case F_RDLCK:
 		oflp->l_type = SVR4_F_RDLCK;
 		break;
 	case F_WRLCK:
 		oflp->l_type = SVR4_F_WRLCK;
 		break;
 	case F_UNLCK:
 		oflp->l_type = SVR4_F_UNLCK;
 		break;
 	default:
 		oflp->l_type = -1;
 		break;
 	}
 
 	oflp->l_whence = (short) iflp->l_whence;
 	oflp->l_start = (svr4_off64_t) iflp->l_start;
 	oflp->l_len = (svr4_off64_t) iflp->l_len;
 	oflp->l_sysid = 0;
 	oflp->l_pid = (svr4_pid_t) iflp->l_pid;
 }
 
 
 static void
 svr4_to_bsd_flock64(iflp, oflp)
 	struct svr4_flock64	*iflp;
 	struct flock		*oflp;
 {
 	switch (iflp->l_type) {
 	case SVR4_F_RDLCK:
 		oflp->l_type = F_RDLCK;
 		break;
 	case SVR4_F_WRLCK:
 		oflp->l_type = F_WRLCK;
 		break;
 	case SVR4_F_UNLCK:
 		oflp->l_type = F_UNLCK;
 		break;
 	default:
 		oflp->l_type = -1;
 		break;
 	}
 
 	oflp->l_whence = iflp->l_whence;
 	oflp->l_start = (off_t) iflp->l_start;
 	oflp->l_len = (off_t) iflp->l_len;
 	oflp->l_pid = (pid_t) iflp->l_pid;
 
 }
 
 
 /*
  * Back end for SVR4_F_REVOKE: revoke access to the device vnode that
  * descriptor fd refers to.
  *
  * Only VCHR and VBLK vnodes are accepted (EINVAL otherwise).  The caller
  * must either own the vnode (va_uid equals the credential's cr_uid) or
  * pass the PRIV_VFS_ADMIN privilege check.  VOP_REVOKE() is issued only
  * when vcount(vp) > 1.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 fd_revoke(td, fd)
 	struct thread *td;
 	int fd;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	int error, *retval;
 
 	/* NOTE(review): retval is set here but never read in this function. */
 	retval = td->td_retval;
 	if ((error = fgetvp(td, fd, &vp)) != 0)
 		return (error);
 
 	/* Revocation only makes sense for device nodes. */
 	if (vp->v_type != VCHR && vp->v_type != VBLK) {
 		error = EINVAL;
 		goto out;
 	}
 
 #ifdef MAC
 	/* The MAC check is made with the vnode exclusively locked. */
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
 	VOP_UNLOCK(vp, 0, td);
 	if (error)
 		goto out;
 #endif
 
 	if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred, td)) != 0)
 		goto out;
 
 	/* Non-owners need PRIV_VFS_ADMIN. */
 	if (td->td_ucred->cr_uid != vattr.va_uid &&
 	    (error = priv_check(td, PRIV_VFS_ADMIN)) != 0)
 		goto out;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto out;
 	/* Nothing to revoke unless vcount() reports more than one user. */
 	if (vcount(vp) > 1)
 		VOP_REVOKE(vp, REVOKEALL);
 	vn_finished_write(mp);
 out:
 	/* Pairs with the successful fgetvp() above. */
 	vrele(vp);
 	return error;
 }
 
 
 static int
 fd_truncate(td, fd, flp)
 	struct thread *td;
 	int fd;
 	struct flock *flp;
 {
 	off_t start, length;
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
 	int error, *retval;
 	struct ftruncate_args ft;
 
 	retval = td->td_retval;
 
 	/*
 	 * We only support truncating the file.
 	 */
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
 
 	vp = fp->f_vnode;
 
 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
 		fdrop(fp, td);
 		return ESPIPE;
 	}
 
 	if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred, td)) != 0) {
 		fdrop(fp, td);
 		return error;
 	}
 
 	length = vattr.va_size;
 
 	switch (flp->l_whence) {
 	case SEEK_CUR:
 		start = fp->f_offset + flp->l_start;
 		break;
 
 	case SEEK_END:
 		start = flp->l_start + length;
 		break;
 
 	case SEEK_SET:
 		start = flp->l_start;
 		break;
 
 	default:
 		fdrop(fp, td);
 		return EINVAL;
 	}
 
 	if (start + flp->l_len < length) {
 		/* We don't support free'ing in the middle of the file */
 		fdrop(fp, td);
 		return EINVAL;
 	}
 
 	ft.fd = fd;
 	ft.length = start;
 
 	error = ftruncate(td, &ft);
 
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * SVR4 open(2): translate the SVR4 open flags with svr4_to_bsd_flags()
  * and open the (possibly alternate-root) path with kern_open().
  *
  * On success the new descriptor is in td->td_retval[0], as left there by
  * kern_open().  The controlling-terminal handling is only compiled in
  * when NOTYET is defined; otherwise the process lock is taken and
  * released around a test that has no body.
  *
  * Returns 0 or an errno value.
  */
 int
 svr4_sys_open(td, uap)
 	register struct thread *td;
 	struct svr4_sys_open_args *uap;
 {
 	struct proc *p = td->td_proc;
 	char *newpath;
 	int bsd_flags, error, retval;
 
 	/* Resolve uap->path into a kernel-space path (macro defined elsewhere). */
 	CHECKALTEXIST(td, uap->path, &newpath);
 
 	bsd_flags = svr4_to_bsd_flags(uap->flags);
 	error = kern_open(td, newpath, UIO_SYSSPACE, bsd_flags, uap->mode);
 	free(newpath, M_TEMP);
 
 	if (error) {
 	  /*	        uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path,
 			uap->flags, uap->mode, error);*/
 		return error;
 	}
 
 	/* Descriptor number just returned by kern_open(). */
 	retval = td->td_retval[0];
 
 	PROC_LOCK(p);
 	/*
 	 * A session leader without a controlling terminal that did not ask
 	 * for O_NOCTTY is a candidate for acquiring one.
 	 */
 	if (!(bsd_flags & O_NOCTTY) && SESS_LEADER(p) &&
 	    !(p->p_flag & P_CONTROLT)) {
 #if defined(NOTYET)
 		struct file	*fp;
 
 		error = fget(td, retval, &fp);
 		PROC_UNLOCK(p);
 		/*
 		 * We may have lost a race between the above open() and
 		 * another thread issuing a close().
 		 */
 		if (error) 
 			return (EBADF);	/* XXX: correct errno? */
 		/* ignore any error, just give it a try */
 		if (fp->f_type == DTYPE_VNODE)
 			fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred,
 			    td);
 		fdrop(fp, td);
 	} else {
 		PROC_UNLOCK(p);
 	}
 #else
 	}
 	PROC_UNLOCK(p);
 #endif
 	return error;
 }
 
 /*
  * Large-file variant of open(2).  The argument structure is handed to
  * svr4_sys_open() unchanged, reinterpreted through a pointer cast.
  */
 int
 svr4_sys_open64(td, uap)
 	register struct thread *td;
 	struct svr4_sys_open64_args *uap;
 {
 	struct svr4_sys_open_args *open_args;
 
 	open_args = (struct svr4_sys_open_args *)uap;
 	return (svr4_sys_open(td, open_args));
 }
 
 /*
  * SVR4 creat(2): implemented as kern_open() with
  * O_WRONLY | O_CREAT | O_TRUNC on the resolved path.
  * Returns the result of kern_open().
  */
 int
 svr4_sys_creat(td, uap)
 	register struct thread *td;
 	struct svr4_sys_creat_args *uap;
 {
 	char *altpath;
 	int rc;
 
 	CHECKALTEXIST(td, uap->path, &altpath);
 
 	rc = kern_open(td, altpath, UIO_SYSSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode);
 	/* The resolved path buffer is ours to release. */
 	free(altpath, M_TEMP);
 	return (rc);
 }
 
 /*
  * Large-file variant of creat(2).  The argument structure is handed to
  * svr4_sys_creat() unchanged, reinterpreted through a pointer cast.
  */
 int
 svr4_sys_creat64(td, uap)
 	register struct thread *td;
 	struct svr4_sys_creat64_args *uap;
 {
 	struct svr4_sys_creat_args *creat_args;
 
 	creat_args = (struct svr4_sys_creat_args *)uap;
 	return (svr4_sys_creat(td, creat_args));
 }
 
 int
 svr4_sys_llseek(td, uap)
 	register struct thread *td;
 	struct svr4_sys_llseek_args *uap;
 {
 	struct lseek_args ap;
 
 	ap.fd = uap->fd;
 
 #if BYTE_ORDER == BIG_ENDIAN
 	ap.offset = (((u_int64_t) uap->offset1) << 32) | 
 		uap->offset2;
 #else
 	ap.offset = (((u_int64_t) uap->offset2) << 32) | 
 		uap->offset1;
 #endif
 	ap.whence = uap->whence;
 
 	return lseek(td, &ap);
 }
 
 /*
  * SVR4 access(2): resolve the path and defer to kern_access() with the
  * caller's flags.  Returns the result of kern_access().
  */
 int
 svr4_sys_access(td, uap)
 	register struct thread *td;
 	struct svr4_sys_access_args *uap;
 {
 	char *altpath;
 	int rc;
 
 	CHECKALTEXIST(td, uap->path, &altpath);
 
 	rc = kern_access(td, altpath, UIO_SYSSPACE, uap->flags);
 	free(altpath, M_TEMP);
 
 	return (rc);
 }
 
 #if defined(NOTYET)
 /*
  * SVR4 pread(2) (only built when NOTYET is defined): repack the
  * arguments into a struct pread_args and call pread().
  */
 int
 svr4_sys_pread(td, uap)
 	register struct thread *td;
 	struct svr4_sys_pread_args *uap;
 {
 	struct pread_args native;
 
 	/* Field-for-field copy; the comment in the original code notes the
 	 * native offset type is 64-bit. */
 	native.offset = uap->off;
 	native.nbyte = uap->nbyte;
 	native.buf = uap->buf;
 	native.fd = uap->fd;
 
 	return (pread(td, &native));
 }
 #endif
 
 #if defined(NOTYET)
 /*
  * Large-file pread (only built when NOTYET is defined): repack the
  * arguments passed through v into a struct sys_pread_args and call
  * sys_pread() with the caller's retval pointer.
  */
 int
 svr4_sys_pread64(td, v, retval)
 	register struct thread *td;
 	void *v; 
 	register_t *retval;
 {
 	struct svr4_sys_pread64_args *uap;
 	struct sys_pread_args native;
 
 	uap = v;
 
 	/* Field-for-field copy into the native argument structure. */
 	native.offset = uap->off;
 	native.nbyte = uap->nbyte;
 	native.buf = uap->buf;
 	native.fd = uap->fd;
 
 	return (sys_pread(td, &native, retval));
 }
 #endif /* NOTYET */
 
 #if defined(NOTYET)
 /*
  * SVR4 pwrite(2) (only built when NOTYET is defined): repack the
  * arguments into a struct pwrite_args and call pwrite().
  */
 int
 svr4_sys_pwrite(td, uap)
 	register struct thread *td;
 	struct svr4_sys_pwrite_args *uap;
 {
 	struct pwrite_args native;
 
 	/* Field-for-field copy into the native argument structure. */
 	native.offset = uap->off;
 	native.nbyte = uap->nbyte;
 	native.buf = uap->buf;
 	native.fd = uap->fd;
 
 	return (pwrite(td, &native));
 }
 #endif
 
 #if defined(NOTYET)
 /*
  * Large-file pwrite (only built when NOTYET is defined): repack the
  * arguments passed through v into a struct sys_pwrite_args and call
  * sys_pwrite() with the caller's retval pointer.
  */
 int
 svr4_sys_pwrite64(td, v, retval)
 	register struct thread *td;
 	void *v; 
 	register_t *retval;
 {
 	struct svr4_sys_pwrite64_args *uap;
 	struct sys_pwrite_args native;
 
 	uap = v;
 
 	/* Field-for-field copy into the native argument structure. */
 	native.offset = uap->off;
 	native.nbyte = uap->nbyte;
 	native.buf = uap->buf;
 	native.fd = uap->fd;
 
 	return (sys_pwrite(td, &native, retval));
 }
 #endif /* NOTYET */
 
 /*
  * SVR4 fcntl(2).
  *
  * uap->cmd is first translated with svr4_to_bsd_cmd().  Commands with a
  * native equivalent are forwarded to kern_fcntl(), converting flags and
  * flock structures on the way in and out.  Commands for which the
  * translation yields -1 are dispatched on the raw SVR4 command number:
  * SVR4_F_DUP2FD, SVR4_F_FREESP(64), the 64-bit lock commands and
  * SVR4_F_REVOKE.
  *
  * Returns 0 or an errno value; ENOSYS for anything unrecognised.
  */
 int
 svr4_sys_fcntl(td, uap)
 	register struct thread *td;
 	struct svr4_sys_fcntl_args *uap;
 {
 	int cmd, error, *retval;
 
 	retval = td->td_retval;
 
 	cmd = svr4_to_bsd_cmd(uap->cmd);
 
 	switch (cmd) {
 	case F_DUPFD:
 	case F_GETFD:
 	case F_SETFD:
 		/* Same meaning on both sides; pass straight through. */
 		return (kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg));
 
 	case F_GETFL:
 		error = kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg);
 		if (error)
 			return (error);
 		/* Report the flags in SVR4 encoding. */
 		*retval = bsd_to_svr4_flags(*retval);
 		return (error);
 
 	case F_SETFL:
 		{
 			/*
 			 * we must save the O_ASYNC flag, as that is
 			 * handled by ioctl(_, I_SETSIG, _) emulation.
 			 */
 			int flags;
 
 			DPRINTF(("Setting flags %p\n", uap->arg));
 
 			/* Fetch the current flags; kern_fcntl leaves them in *retval. */
 			error = kern_fcntl(td, uap->fd, F_GETFL, 0);
 			if (error)
 				return (error);
 			flags = *retval;
 			/* Keep only O_ASYNC from the old flags, then add the new ones. */
 			flags &= O_ASYNC;
 			flags |= svr4_to_bsd_flags((u_long) uap->arg);
 			return (kern_fcntl(td, uap->fd, F_SETFL, flags));
 		}
 
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 		{
 			struct svr4_flock	ifl;
 			struct flock		fl;
 
 			error = copyin(uap->arg, &ifl, sizeof (ifl));
 			if (error)
 				return (error);
 
 			svr4_to_bsd_flock(&ifl, &fl);
 
 			error = kern_fcntl(td, uap->fd, cmd, (intptr_t)&fl);
 			/* Only F_GETLK hands a lock description back to the caller. */
 			if (error || cmd != F_GETLK)
 				return (error);
 
 			bsd_to_svr4_flock(&fl, &ifl);
 
 			return (copyout(&ifl, uap->arg, sizeof (ifl)));
 		}
 	case -1:
 		/* No native command number; dispatch on the SVR4 one. */
 		switch (uap->cmd) {
 		case SVR4_F_DUP2FD:
 			{
 				struct dup2_args du;
 
 				/* Here arg carries the target descriptor number. */
 				du.from = uap->fd;
 				du.to = (int)uap->arg;
 				error = dup2(td, &du);
 				if (error)
 					return error;
 				*retval = du.to;
 				return 0;
 			}
 
 		case SVR4_F_FREESP:
 			{
 				struct svr4_flock	 ifl;
 				struct flock		 fl;
 
 				error = copyin(uap->arg, &ifl,
 				    sizeof ifl);
 				if (error)
 					return error;
 				svr4_to_bsd_flock(&ifl, &fl);
 				return fd_truncate(td, uap->fd, &fl);
 			}
 
 		case SVR4_F_GETLK64:
 		case SVR4_F_SETLK64:
 		case SVR4_F_SETLKW64:
 			{
 				struct svr4_flock64	ifl;
 				struct flock		fl;
 
 				/* Pick the native command matching the 64-bit variant. */
 				switch (uap->cmd) {
 				case SVR4_F_GETLK64:
 					cmd = F_GETLK;
 					break;
 				case SVR4_F_SETLK64:
 					cmd = F_SETLK;
 					break;					
 				case SVR4_F_SETLKW64:
 					cmd = F_SETLKW;
 					break;
 				}
 				error = copyin(uap->arg, &ifl,
 				    sizeof (ifl));
 				if (error)
 					return (error);
 
 				svr4_to_bsd_flock64(&ifl, &fl);
 
 				error = kern_fcntl(td, uap->fd, cmd,
 				    (intptr_t)&fl);
 				/* As above, only F_GETLK copies a result out. */
 				if (error || cmd != F_GETLK)
 					return (error);
 
 				bsd_to_svr4_flock64(&fl, &ifl);
 
 				return (copyout(&ifl, uap->arg,
 				    sizeof (ifl)));
 			}
 
 		case SVR4_F_FREESP64:
 			{
 				struct svr4_flock64	 ifl;
 				struct flock		 fl;
 
 				error = copyin(uap->arg, &ifl,
 				    sizeof ifl);
 				if (error)
 					return error;
 				svr4_to_bsd_flock64(&ifl, &fl);
 				return fd_truncate(td, uap->fd, &fl);
 			}
 
 		case SVR4_F_REVOKE:
 			return fd_revoke(td, uap->fd);
 
 		default:
 			return ENOSYS;
 		}
 
 	default:
 		return ENOSYS;
 	}
 }
Index: head/sys/compat/svr4/svr4_misc.c
===================================================================
--- head/sys/compat/svr4/svr4_misc.c	(revision 175201)
+++ head/sys/compat/svr4/svr4_misc.c	(revision 175202)
@@ -1,1635 +1,1635 @@
 /*-
  * Copyright (c) 1998 Mark Newton
  * Copyright (c) 1994 Christos Zoulas
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*
  * SVR4 compatibility module.
  *
  * SVR4 system calls that are implemented differently in BSD are
  * handled here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/msg.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sem.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/time.h>
 #include <sys/times.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 
 #include <compat/svr4/svr4.h>
 #include <compat/svr4/svr4_types.h>
 #include <compat/svr4/svr4_signal.h>
 #include <compat/svr4/svr4_proto.h>
 #include <compat/svr4/svr4_util.h>
 #include <compat/svr4/svr4_sysconfig.h>
 #include <compat/svr4/svr4_dirent.h>
 #include <compat/svr4/svr4_acl.h>
 #include <compat/svr4/svr4_ulimit.h>
 #include <compat/svr4/svr4_statvfs.h>
 #include <compat/svr4/svr4_hrt.h>
 #include <compat/svr4/svr4_mman.h>
 #include <compat/svr4/svr4_wait.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <machine/vmparam.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_map.h>
 #if defined(__FreeBSD__)
 #include <vm/uma.h>
 #include <vm/vm_extern.h>
 #endif
 
 #if defined(NetBSD)
 # if defined(UVM)
 #  include <uvm/uvm_extern.h>
 # endif
 #endif
 
 #define	BSD_DIRENT(cp)		((struct dirent *)(cp))
 
 static int svr4_mknod(struct thread *, register_t *, char *,
     svr4_mode_t, svr4_dev_t);
 
 static __inline clock_t timeval_to_clock_t(struct timeval *);
 static int svr4_setinfo	(pid_t , struct rusage *, int, svr4_siginfo_t *);
 
 struct svr4_hrtcntl_args;
 static int svr4_hrtcntl	(struct thread *, struct svr4_hrtcntl_args *,
     register_t *);
 static void bsd_statfs_to_svr4_statvfs(const struct statfs *,
     struct svr4_statvfs *);
 static void bsd_statfs_to_svr4_statvfs64(const struct statfs *,
     struct svr4_statvfs64 *);
 static struct proc *svr4_pfind(pid_t pid);
 
 /* BOGUS noop */
 #if defined(BOGUS)
 /*
  * Placeholder setitimer (only built when BOGUS is defined): reports 0 in
  * td_retval[0] and succeeds without doing anything else.
  */
 int
 svr4_sys_setitimer(td, uap)
 	register struct thread *td;
 	struct svr4_sys_setitimer_args *uap;
 {
 	td->td_retval[0] = 0;
 
 	return (0);
 }
 #endif
 
 int
 svr4_sys_wait(td, uap)
 	struct thread *td;
 	struct svr4_sys_wait_args *uap;
 {
 	int error, st, sig;
 
 	error = kern_wait(td, WAIT_ANY, &st, 0, NULL);
 	if (error)
 		return (error);
       
 	if (WIFSIGNALED(st)) {
 		sig = WTERMSIG(st);
 		if (sig >= 0 && sig < NSIG)
 			st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig);
 	} else if (WIFSTOPPED(st)) {
 		sig = WSTOPSIG(st);
 		if (sig >= 0 && sig < NSIG)
 			st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8);
 	}
 
 	/*
 	 * It looks like wait(2) on svr4/solaris/2.4 returns
 	 * the status in retval[1], and the pid on retval[0].
 	 */
 	td->td_retval[1] = st;
 
 	if (uap->status)
 		error = copyout(&st, uap->status, sizeof(st));
 
 	return (error);
 }
 
 int
 svr4_sys_execv(td, uap)
 	struct thread *td;
 	struct svr4_sys_execv_args *uap;
 {
 	struct image_args eargs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
 	free(path, M_TEMP);
 	if (error == 0)
 		error = kern_execve(td, &eargs, NULL);
 	return (error);
 }
 
 int
 svr4_sys_execve(td, uap)
 	struct thread *td;
 	struct svr4_sys_execve_args *uap;
 {
 	struct image_args eargs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
 	    uap->envp);
 	free(path, M_TEMP);
 	if (error == 0)
 		error = kern_execve(td, &eargs, NULL);
 	return (error);
 }
 
 int
 svr4_sys_time(td, v)
 	struct thread *td;
 	struct svr4_sys_time_args *v;
 {
 	struct svr4_sys_time_args *uap = v;
 	int error = 0;
 	struct timeval tv;
 
 	microtime(&tv);
 	if (uap->t)
 		error = copyout(&tv.tv_sec, uap->t,
 				sizeof(*(uap->t)));
 	td->td_retval[0] = (int) tv.tv_sec;
 
 	return error;
 }
 
 
 /*
  * Read SVR4-style directory entries.  We suck them into kernel space so
  * that they can be massaged before being copied out to user code.  
  *
  * This code is ported from the Linux emulator:  Changes to the VFS interface
  * between FreeBSD and NetBSD have made it simpler to port it from there than
  * to adapt the NetBSD version.
  */
 /*
  * SVR4 getdents64(2): read native directory entries from uap->fd into a
  * kernel buffer with VOP_READDIR(), convert each one to a struct
  * svr4_dirent64 and copy it out to uap->dp.
  *
  * A request size of exactly 1 selects the old single-entry readdir
  * convention ("justone").  On success td_retval[0] holds the number of
  * bytes produced and the file offset is advanced.
  *
  * Returns 0 or an errno value (EBADF if the file is not open for
  * reading, EINVAL if it is not a directory, EFAULT on a misaligned
  * record length, or an error from the VFS / copyout).
  *
  * NOTE(review): unlike svr4_sys_getdents(), a negative uap->nbytes is not
  * rejected up front here -- confirm whether that is intentional.
  */
 int
 svr4_sys_getdents64(td, uap)
 	struct thread *td;
 	struct svr4_sys_getdents64_args *uap;
 {
 	register struct dirent *bdp;
 	struct vnode *vp;
 	caddr_t inp, buf;		/* BSD-format */
 	int len, reclen;		/* BSD-format */
 	caddr_t outp;			/* SVR4-format */
 	int resid, svr4reclen=0;	/* SVR4-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	off_t off;
 	struct svr4_dirent64 svr4_dirent;
 	int buflen, error, eofflag, nbytes, justone, vfslocked;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies;
 
 	DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n",
 		uap->fd, uap->nbytes));
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) {
 		return (error);
 	}
 
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	/* nbytes == 1 requests a single entry, old readdir style. */
 	nbytes = uap->nbytes;
 	if (nbytes == 1) {
 		nbytes = sizeof (struct svr4_dirent64);
 		justone = 1;
 	}
 	else
 		justone = 0;
 
 	off = fp->f_offset;
 #define	DIRBLKSIZ	512		/* XXX we used to use ufs's DIRBLKSIZ */
 	/* Kernel buffer size: at least DIRBLKSIZ, at most MAXBSIZE. */
 	buflen = max(DIRBLKSIZ, nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 again:
 	/* (Re)build the uio describing the kernel buffer, starting at off. */
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	/* Drop cookies left over from a previous pass through "again". */
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 						&ncookies, &cookies);
 	if (error) {
 		goto out;
 	}
 
 	inp = buf;
 	outp = (caddr_t) uap->dp;
 	resid = nbytes;
 	/* Nothing was read: report what we have (zero bytes). */
 	if ((len = buflen - auio.uio_resid) <= 0) {
 		goto eof;
 	}
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			bdp = (struct dirent *) inp;
 			len -= bdp->d_reclen;
 			inp += bdp->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	/* Convert and copy out one entry per iteration. */
 	while (len > 0) {
 		if (cookiep && ncookies == 0)
 			break;
 		bdp = (struct dirent *) inp;
 		reclen = bdp->d_reclen;
 		/* Record lengths must be a multiple of four. */
 		if (reclen & 3) {
 			DPRINTF(("svr4_readdir: reclen=%d\n", reclen));
 			error = EFAULT;
 			goto out;
 		}
   
 		/* An entry with inode number 0 is skipped. */
 		if (bdp->d_fileno == 0) {
 	    		inp += reclen;
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 			len -= reclen;
 			continue;
 		}
 		svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen);
 		if (reclen > len || resid < svr4reclen) {
 			/*
 			 * Entry does not fit.  Bump outp so the
 			 * "outp == uap->dp" test below does not trigger
 			 * another read.
 			 */
 			outp++;
 			break;
 		}
 		svr4_dirent.d_ino = (long) bdp->d_fileno;
 		if (justone) {
 			/*
 			 * old svr4-style readdir usage.
 			 */
 			svr4_dirent.d_off = (svr4_off_t) svr4reclen;
 			svr4_dirent.d_reclen = (u_short) bdp->d_namlen;
 		} else {
 			svr4_dirent.d_off = (svr4_off_t)(off + reclen);
 			svr4_dirent.d_reclen = (u_short) svr4reclen;
 		}
 		/* NOTE(review): assumes d_name in svr4_dirent64 can hold any native name -- confirm. */
 		strcpy(svr4_dirent.d_name, bdp->d_name);
 		if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen)))
 			goto out;
 		inp += reclen;
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 		outp += svr4reclen;
 		resid -= svr4reclen;
 		len -= reclen;
 		if (justone)
 			break;
     	}
 
 	/* Nothing was emitted from this buffer: read the next one. */
 	if (outp == (caddr_t) uap->dp)
 		goto again;
 	fp->f_offset = off;
 
 	if (justone)
 		nbytes = resid + svr4reclen;
 
 eof:
 	td->td_retval[0] = nbytes - resid;
 out:
 	/* Common exit: unlock, drop the file reference, free buffers. */
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	if (cookies)
 		free(cookies, M_TEMP);
 	free(buf, M_TEMP);
 	return error;
 }
 
 
 /*
  * SVR4 getdents(2): read native directory entries from uap->fd into a
  * kernel buffer with VOP_READDIR(), convert each one to a struct
  * svr4_dirent and copy it out to uap->buf.
  *
  * On success *retval (td_retval[0]) holds the number of bytes produced
  * and the file offset is advanced to the cookie of the last entry
  * examined.
  *
  * Returns 0 or an errno value (EINVAL for a negative size, a
  * non-directory, or an offset that does not fit in 32 bits; EBADF if the
  * file is not open for reading; otherwise an error from the VFS or
  * copyout).
  *
  * NOTE(review): the panic and uprintf messages below name
  * "svr4_sys_getdents64" although this is svr4_sys_getdents.
  */
 int
 svr4_sys_getdents(td, uap)
 	struct thread *td;
 	struct svr4_sys_getdents_args *uap;
 {
 	struct dirent *bdp;
 	struct vnode *vp;
 	caddr_t inp, buf;	/* BSD-format */
 	int len, reclen;	/* BSD-format */
 	caddr_t outp;		/* SVR4-format */
 	int resid, svr4_reclen;	/* SVR4-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	struct svr4_dirent idb;
 	off_t off;		/* true file offset */
 	int buflen, error, eofflag, vfslocked;
 	u_long *cookiebuf = NULL, *cookie;
 	int ncookies = 0, *retval = td->td_retval;
 
 	if (uap->nbytes < 0)
 		return (EINVAL);
 
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	/* Kernel buffer is the request size, capped at MAXBSIZE. */
 	buflen = min(MAXBSIZE, uap->nbytes);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	off = fp->f_offset;
 again:
 	/*
 	 * (Re)build the uio describing the kernel buffer, starting at off.
 	 *
 	 * NOTE(review): cookiebuf is not freed before VOP_READDIR() runs
 	 * again via "goto again" (svr4_sys_getdents64 does free it) -- if
 	 * each call allocates a new array this leaks; confirm.
 	 */
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	/*
          * First we read into the malloc'ed buffer, then
          * we massage it into user space, one record at a time.
          */
 	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
 	    &cookiebuf);
 	if (error) {
 		goto out;
 	}
 
 	inp = buf;
 	outp = uap->buf;
 	resid = uap->nbytes;
 	/* Nothing was read: report zero bytes. */
 	if ((len = buflen - auio.uio_resid) == 0)
 		goto eof;
 
 	/*
 	 * NOTE(review): cookie is dereferenced below without checking that
 	 * VOP_READDIR() actually supplied a cookie array -- confirm every
 	 * filesystem reachable here does.
 	 */
 	for (cookie = cookiebuf; len > 0; len -= reclen) {
 		bdp = (struct dirent *)inp;
 		reclen = bdp->d_reclen;
 		/* Record lengths must be a multiple of four. */
 		if (reclen & 3)
 			panic("svr4_sys_getdents64: bad reclen");
 		off = *cookie++;	/* each entry points to the next */
 		/* The SVR4 dirent only has room for a 32-bit offset. */
 		if ((off >> 32) != 0) {
 			uprintf("svr4_sys_getdents64: dir offset too large for emulated program");
 			error = EINVAL;
 			goto out;
 		}
 		if (bdp->d_fileno == 0) {
 			inp += reclen;	/* it is a hole; squish it out */
 			continue;
 		}
 		svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen);
 		if (reclen > len || resid < svr4_reclen) {
 			/* entry too big for buffer, so just stop */
 			/* (outp is bumped so the "try again" test below fails) */
 			outp++;
 			break;
 		}
 		/*
 		 * Build the SVR4-shaped dirent in a local (idb) so that
 		 * user memory is only touched by the copyout() call.
 		 */
 		idb.d_ino = (svr4_ino_t)bdp->d_fileno;
 		idb.d_off = (svr4_off_t)off;
 		idb.d_reclen = (u_short)svr4_reclen;
 		strcpy(idb.d_name, bdp->d_name);
 		if ((error = copyout((caddr_t)&idb, outp, svr4_reclen)))
 			goto out;
 		/* advance past this real entry */
 		inp += reclen;
 		/* advance output past SVR4-shaped entry */
 		outp += svr4_reclen;
 		resid -= svr4_reclen;
 	}
 
 	/* if we squished out the whole block, try again */
 	if (outp == uap->buf)
 		goto again;
 	fp->f_offset = off;	/* update the vnode offset */
 
 eof:
 	*retval = uap->nbytes - resid;
 out:
 	/* Common exit: unlock, drop the file reference, free buffers. */
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	if (cookiebuf)
 		free(cookiebuf, M_TEMP);
 	free(buf, M_TEMP);
 	return error;
 }
 
 
 int
 svr4_sys_mmap(td, uap)
 	struct thread *td;
 	struct svr4_sys_mmap_args *uap;
 {
 	struct mmap_args	 mm;
 	int             *retval;
 
 	retval = td->td_retval;
 #define _MAP_NEW	0x80000000
 	/*
          * Verify the arguments.
          */
 	if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
 		return EINVAL;	/* XXX still needed? */
 
 	if (uap->len == 0)
 		return EINVAL;
 
 	mm.prot = uap->prot;
 	mm.len = uap->len;
 	mm.flags = uap->flags & ~_MAP_NEW;
 	mm.fd = uap->fd;
 	mm.addr = uap->addr;
 	mm.pos = uap->pos;
 
 	return mmap(td, &mm);
 }
 
 int
 svr4_sys_mmap64(td, uap)
 	struct thread *td;
 	struct svr4_sys_mmap64_args *uap;
 {
 	struct mmap_args	 mm;
 	void		*rp;
 
 #define _MAP_NEW	0x80000000
 	/*
          * Verify the arguments.
          */
 	if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
 		return EINVAL;	/* XXX still needed? */
 
 	if (uap->len == 0)
 		return EINVAL;
 
 	mm.prot = uap->prot;
 	mm.len = uap->len;
 	mm.flags = uap->flags & ~_MAP_NEW;
 	mm.fd = uap->fd;
 	mm.addr = uap->addr;
 	mm.pos = uap->pos;
 
 	rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz));
 	if ((mm.flags & MAP_FIXED) == 0 &&
 	    mm.addr != 0 && (void *)mm.addr < rp)
 		mm.addr = rp;
 
 	return mmap(td, &mm);
 }
 
 
 /*
  * SVR4 fchroot(2): make the directory open on uap->fd the root directory
  * of the calling process.
  *
  * Requires the PRIV_VFS_FCHROOT privilege.  The vnode is validated with
  * change_dir() (and the MAC chroot check when MAC is configured) while
  * locked, then installed with change_root() after unlocking.
  *
  * Returns 0 or an errno value.
  */
 int
 svr4_sys_fchroot(td, uap)
 	struct thread *td;
 	struct svr4_sys_fchroot_args *uap;
 {
 	struct filedesc	*fdp = td->td_proc->p_fd;
 	struct vnode	*vp;
 	struct file	*fp;
 	int		 error, vfslocked;
 
 	if ((error = priv_check(td, PRIV_VFS_FCHROOT)) != 0)
 		return error;
 	if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
 		return error;
 	vp = fp->f_vnode;
 	/* Take our own vnode reference before dropping the file. */
 	VREF(vp);
 	fdrop(fp, td);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = change_dir(vp, td);
 	if (error)
 		goto fail;
 #ifdef MAC
 	error = mac_vnode_check_chroot(td->td_ucred, vp);
 	if (error)
 		goto fail;
 #endif
 	/* Success path: unlock first, then release our reference. */
 	VOP_UNLOCK(vp, 0, td);
 	error = change_root(vp, td);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 fail:
 	/* Failure path: the vnode is still locked, so vput() it. */
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 
 /*
  * Common back end for the SVR4 mknod variants: create a FIFO with
  * kern_mkfifo() when the mode says S_ISFIFO, otherwise create a node
  * with kern_mknod().  The retval argument is accepted but not used.
  *
  * Returns the result of the kern_* call.
  */
 static int
 svr4_mknod(td, retval, path, mode, dev)
 	struct thread *td;
 	register_t *retval;
 	char *path;
 	svr4_mode_t mode;
 	svr4_dev_t dev;
 {
 	char *altpath;
 	int rc;
 
 	CHECKALTEXIST(td, path, &altpath);
 
 	if (!S_ISFIFO(mode))
 		rc = kern_mknod(td, altpath, UIO_SYSSPACE, mode, dev);
 	else
 		rc = kern_mkfifo(td, altpath, UIO_SYSSPACE, mode);
 
 	free(altpath, M_TEMP);
 	return (rc);
 }
 
 
 int
 svr4_sys_mknod(td, uap)
 	register struct thread *td;
 	struct svr4_sys_mknod_args *uap;
 {
         int *retval = td->td_retval;
 	return svr4_mknod(td, retval,
 			  uap->path, uap->mode,
 			  (svr4_dev_t)svr4_to_bsd_odev_t(uap->dev));
 }
 
 
 int
 svr4_sys_xmknod(td, uap)
 	struct thread *td;
 	struct svr4_sys_xmknod_args *uap;
 {
         int *retval = td->td_retval;
 	return svr4_mknod(td, retval,
 			  uap->path, uap->mode,
 			  (svr4_dev_t)svr4_to_bsd_dev_t(uap->dev));
 }
 
 
 /*
  * SVR4 vhangup(2): accepted and ignored; always reports success.
  */
 int
 svr4_sys_vhangup(td, uap)
 	struct thread *td;
 	struct svr4_sys_vhangup_args *uap;
 {
 
 	return (0);
 }
 
 
 /*
  * SVR4 sysconfig(2): report the configuration value selected by
  * uap->name in td_retval[0].
  *
  * Returns 0 on success, or EINVAL (leaving td_retval untouched) for a
  * name that is not handled.
  */
 int
 svr4_sys_sysconfig(td, uap)
 	struct thread *td;
 	struct svr4_sys_sysconfig_args *uap;
 {
 	int *retval, value;
 
 	retval = &(td->td_retval[0]);
 
 	/* Pick the answer first; it is stored once, after the switch. */
 	switch (uap->name) {
 	case SVR4_CONFIG_UNUSED:
 		value = 0;
 		break;
 	case SVR4_CONFIG_NGROUPS:
 		value = NGROUPS_MAX;
 		break;
 	case SVR4_CONFIG_CHILD_MAX:
 		value = maxproc;
 		break;
 	case SVR4_CONFIG_OPEN_FILES:
 		value = maxfiles;
 		break;
 	case SVR4_CONFIG_POSIX_VER:
 		value = 198808;
 		break;
 	case SVR4_CONFIG_PAGESIZE:
 		value = PAGE_SIZE;
 		break;
 	case SVR4_CONFIG_CLK_TCK:
 		value = 60;	/* should this be `hz', ie. 100? */
 		break;
 	case SVR4_CONFIG_XOPEN_VER:
 		value = 2;	/* XXX: What should that be? */
 		break;
 	case SVR4_CONFIG_PROF_TCK:
 		value = 60;	/* XXX: What should that be? */
 		break;
 	case SVR4_CONFIG_NPROC_CONF:
 		value = 1;	/* Only one processor for now */
 		break;
 	case SVR4_CONFIG_NPROC_ONLN:
 		value = 1;	/* And it better be online */
 		break;
 	case SVR4_CONFIG_AIO_LISTIO_MAX:
 	case SVR4_CONFIG_AIO_MAX:
 	case SVR4_CONFIG_AIO_PRIO_DELTA_MAX:
 		value = 0;	/* No aio support */
 		break;
 	case SVR4_CONFIG_DELAYTIMER_MAX:
 		value = 0;	/* No delaytimer support */
 		break;
 	case SVR4_CONFIG_MQ_OPEN_MAX:
 		value = msginfo.msgmni;
 		break;
 	case SVR4_CONFIG_MQ_PRIO_MAX:
 		value = 0;	/* XXX: Don't know */
 		break;
 	case SVR4_CONFIG_RTSIG_MAX:
 		value = 0;
 		break;
 	case SVR4_CONFIG_SEM_NSEMS_MAX:
 		value = seminfo.semmni;
 		break;
 	case SVR4_CONFIG_SEM_VALUE_MAX:
 		value = seminfo.semvmx;
 		break;
 	case SVR4_CONFIG_SIGQUEUE_MAX:
 		value = 0;	/* XXX: Don't know */
 		break;
 	case SVR4_CONFIG_SIGRT_MIN:
 	case SVR4_CONFIG_SIGRT_MAX:
 		value = 0;	/* No real time signals */
 		break;
 	case SVR4_CONFIG_TIMER_MAX:
 		value = 3;	/* XXX: real, virtual, profiling */
 		break;
 #if defined(NOTYET)
 	case SVR4_CONFIG_PHYS_PAGES:
 #if defined(UVM)
 		value = uvmexp.free;	/* XXX: free instead of total */
 #else
 		value = cnt.v_free_count;	/* XXX: free instead of total */
 #endif
 		break;
 	case SVR4_CONFIG_AVPHYS_PAGES:
 #if defined(UVM)
 		value = uvmexp.active;	/* XXX: active instead of avg */
 #else
 		value = cnt.v_active_count;	/* XXX: active instead of avg */
 #endif
 		break;
 #endif /* NOTYET */
 
 	default:
 		return (EINVAL);
 	}
 
 	*retval = value;
 	return (0);
 }
 
 /* ARGSUSED */
 int
 svr4_sys_break(td, uap)
 	struct thread *td;
 	struct svr4_sys_break_args *uap;
 {
 	struct obreak_args ap;
 
 	ap.nsize = uap->nsize;
 	return (obreak(td, &ap));
 }
 
 /*
  * Convert a timeval into clock ticks at hz ticks per second: whole
  * seconds are multiplied by hz and the microseconds are divided by the
  * (integer) number of microseconds per tick.
  */
 static __inline clock_t
 timeval_to_clock_t(tv)
 	struct timeval *tv;
 {
 	const int usec_per_tick = 1000000 / hz;
 
 	return (tv->tv_sec * hz + tv->tv_usec / usec_per_tick);
 }
 
 
 int
 svr4_sys_times(td, uap)
 	struct thread *td;
 	struct svr4_sys_times_args *uap;
 {
 	struct timeval tv, utime, stime, cutime, cstime;
 	struct tms tms;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 	calcru(p, &utime, &stime);
 	PROC_SUNLOCK(p);
 	calccru(p, &cutime, &cstime);
 	PROC_UNLOCK(p);
 
 	tms.tms_utime = timeval_to_clock_t(&utime);
 	tms.tms_stime = timeval_to_clock_t(&stime);
 
 	tms.tms_cutime = timeval_to_clock_t(&cutime);
 	tms.tms_cstime = timeval_to_clock_t(&cstime);
 
 	error = copyout(&tms, uap->tp, sizeof(tms));
 	if (error)
 		return (error);
 
 	microtime(&tv);
 	td->td_retval[0] = (int)timeval_to_clock_t(&tv);
 	return (0);
 }
 
 
 int
 svr4_sys_ulimit(td, uap)
 	struct thread *td;
 	struct svr4_sys_ulimit_args *uap;
 {
         int *retval = td->td_retval;
 	int error;
 
 	switch (uap->cmd) {
 	case SVR4_GFILLIM:
 		PROC_LOCK(td->td_proc);
 		*retval = lim_cur(td->td_proc, RLIMIT_FSIZE) / 512;
 		PROC_UNLOCK(td->td_proc);
 		if (*retval == -1)
 			*retval = 0x7fffffff;
 		return 0;
 
 	case SVR4_SFILLIM:
 		{
 			struct rlimit krl;
 
 			krl.rlim_cur = uap->newlimit * 512;
 			PROC_LOCK(td->td_proc);
 			krl.rlim_max = lim_max(td->td_proc, RLIMIT_FSIZE);
 			PROC_UNLOCK(td->td_proc);
 
 			error = kern_setrlimit(td, RLIMIT_FSIZE, &krl);
 			if (error)
 				return error;
 
 			PROC_LOCK(td->td_proc);
 			*retval = lim_cur(td->td_proc, RLIMIT_FSIZE);
 			PROC_UNLOCK(td->td_proc);
 			if (*retval == -1)
 				*retval = 0x7fffffff;
 			return 0;
 		}
 
 	case SVR4_GMEMLIM:
 		{
 			struct vmspace *vm = td->td_proc->p_vmspace;
 			register_t r;
 
 			PROC_LOCK(td->td_proc);
 			r = lim_cur(td->td_proc, RLIMIT_DATA);
 			PROC_UNLOCK(td->td_proc);
 
 			if (r == -1)
 				r = 0x7fffffff;
 			mtx_lock(&Giant);	/* XXX */
 			r += (long) vm->vm_daddr;
 			mtx_unlock(&Giant);
 			if (r < 0)
 				r = 0x7fffffff;
 			*retval = r;
 			return 0;
 		}
 
 	case SVR4_GDESLIM:
 		PROC_LOCK(td->td_proc);
 		*retval = lim_cur(td->td_proc, RLIMIT_NOFILE);
 		PROC_UNLOCK(td->td_proc);
 		if (*retval == -1)
 			*retval = 0x7fffffff;
 		return 0;
 
 	default:
 		return EINVAL;
 	}
 }
 
 /*
  * Look a process up by pid, trying pfind() first and falling back to
  * zpfind() (zombies) when that finds nothing.  Returns whatever the
  * successful lookup returned, or NULL if both fail.
  */
 static struct proc *
 svr4_pfind(pid)
 	pid_t pid;
 {
 	struct proc *found;
 
 	found = pfind(pid);		/* live processes */
 	if (found == NULL)
 		found = zpfind(pid);	/* zombies */
 
 	return (found);
 }
 
 
 /*
  * SVR4 pgrpsys: multiplexed process-group / session call.  uap->cmd
  * selects the operation:
  *
  *   0  getpgrp()          1  setpgrp()
  *   2  getsid(pid)        3  setsid()
  *   4  getpgid(pid)       5  setpgid(pid, pgid)
  *
  * Results are returned through td_retval[0].  Returns 0 or an errno
  * value (ESRCH if the pid is not found, EINVAL for an unknown command,
  * or the error from setsid()/setpgid()).
  */
 int
 svr4_sys_pgrpsys(td, uap)
 	struct thread *td;
 	struct svr4_sys_pgrpsys_args *uap;
 {
         int *retval = td->td_retval;
 	struct proc *p = td->td_proc;
 
 	switch (uap->cmd) {
 	case 1:			/* setpgrp() */
 		/*
 		 * SVR4 setpgrp() (which takes no arguments) has the
 		 * semantics that the session ID is also created anew, so
 		 * in almost every sense, setpgrp() is identical to
 		 * setsid() for SVR4.  (Under BSD, the difference is that
 		 * a setpgid(0,0) will not create a new session.)
 		 */
 		/* The result of setsid() is ignored here. */
 		setsid(td, NULL);
 		/*FALLTHROUGH*/
 
 	case 0:			/* getpgrp() */
 		PROC_LOCK(p);
 		*retval = p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 
 	case 2:			/* getsid(pid) */
 		/*
 		 * pid 0 means the caller, which we lock ourselves.  Otherwise
 		 * p comes from svr4_pfind(); it is unlocked below, so it is
 		 * presumably returned locked -- confirm against pfind()/zpfind().
 		 */
 		if (uap->pid == 0)
 			PROC_LOCK(p);
 		else if ((p = svr4_pfind(uap->pid)) == NULL)
 			return ESRCH;
 		/*
 		 * This has already been initialized to the pid of
 		 * the session leader.
 		 */
 		*retval = (register_t) p->p_session->s_sid;
 		PROC_UNLOCK(p);
 		return 0;
 
 	case 3:			/* setsid() */
 		return setsid(td, NULL);
 
 	case 4:			/* getpgid(pid) */
 
 		/* Same locking convention as getsid(pid) above. */
 		if (uap->pid == 0)
 			PROC_LOCK(p);
 		else if ((p = svr4_pfind(uap->pid)) == NULL)
 			return ESRCH;
 
 		*retval = (int) p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 
 	case 5:			/* setpgid(pid, pgid); */
 		{
 			struct setpgid_args sa;
 
 			sa.pid = uap->pid;
 			sa.pgid = uap->pgid;
 			return setpgid(td, &sa);
 		}
 
 	default:
 		return EINVAL;
 	}
 }
 
 /*
  * Argument layout read by svr4_hrtcntl().  svr4_sys_hrtsys() casts its own
  * argument structure to this type when it dispatches SVR4_HRT_CNTL.
  */
 struct svr4_hrtcntl_args {
 	int 			cmd;	/* not read by svr4_hrtcntl() */
 	int 			fun;	/* SVR4_HRT_CNTL_* sub-function */
 	int 			clk;	/* clock id; TOFD requires SVR4_HRT_CLK_STD */
 	svr4_hrt_interval_t *	iv;	/* not read by svr4_hrtcntl() */
 	svr4_hrt_time_t *	ti;	/* copyout() destination for TOFD */
 };
 
 
 static int
 svr4_hrtcntl(td, uap, retval)
 	struct thread *td;
 	struct svr4_hrtcntl_args *uap;
 	register_t *retval;
 {
 	switch (uap->fun) {
 	case SVR4_HRT_CNTL_RES:
 		DPRINTF(("htrcntl(RES)\n"));
 		*retval = SVR4_HRT_USEC;
 		return 0;
 
 	case SVR4_HRT_CNTL_TOFD:
 		DPRINTF(("htrcntl(TOFD)\n"));
 		{
 			struct timeval tv;
 			svr4_hrt_time_t t;
 			if (uap->clk != SVR4_HRT_CLK_STD) {
 				DPRINTF(("clk == %d\n", uap->clk));
 				return EINVAL;
 			}
 			if (uap->ti == NULL) {
 				DPRINTF(("ti NULL\n"));
 				return EINVAL;
 			}
 			microtime(&tv);
 			t.h_sec = tv.tv_sec;
 			t.h_rem = tv.tv_usec;
 			t.h_res = SVR4_HRT_USEC;
 			return copyout(&t, uap->ti, sizeof(t));
 		}
 
 	case SVR4_HRT_CNTL_START:
 		DPRINTF(("htrcntl(START)\n"));
 		return ENOSYS;
 
 	case SVR4_HRT_CNTL_GET:
 		DPRINTF(("htrcntl(GET)\n"));
 		return ENOSYS;
 	default:
 		DPRINTF(("Bad htrcntl command %d\n", uap->fun));
 		return ENOSYS;
 	}
 }
 
 
 /*
  * SVR4 hrtsys entry point.  Only SVR4_HRT_CNTL is implemented (delegated
  * to svr4_hrtcntl()); the alarm, sleep and cancel commands return ENOSYS
  * and an unrecognised command returns EINVAL.
  */
 int
 svr4_sys_hrtsys(td, uap) 
 	struct thread *td;
 	struct svr4_sys_hrtsys_args *uap;
 {
 	int *result = td->td_retval;
 	int rc;
 
 	switch (uap->cmd) {
 	case SVR4_HRT_CNTL:
 		/* svr4_hrtcntl() reads the same arguments via its own view. */
 		rc = svr4_hrtcntl(td, (struct svr4_hrtcntl_args *) uap,
 		    result);
 		break;
 
 	case SVR4_HRT_ALRM:
 		DPRINTF(("hrtalarm\n"));
 		rc = ENOSYS;
 		break;
 
 	case SVR4_HRT_SLP:
 		DPRINTF(("hrtsleep\n"));
 		rc = ENOSYS;
 		break;
 
 	case SVR4_HRT_CAN:
 		DPRINTF(("hrtcancel\n"));
 		rc = ENOSYS;
 		break;
 
 	default:
 		DPRINTF(("Bad hrtsys command %d\n", uap->cmd));
 		rc = EINVAL;
 		break;
 	}
 
 	return (rc);
 }
 
 
 static int
 svr4_setinfo(pid, ru, st, s)
 	pid_t pid;
 	struct rusage *ru;
 	int st;
 	svr4_siginfo_t *s;
 {
 	svr4_siginfo_t i;
 	int sig;
 
 	memset(&i, 0, sizeof(i));
 
 	i.svr4_si_signo = SVR4_SIGCHLD;
 	i.svr4_si_errno = 0;	/* XXX? */
 
 	i.svr4_si_pid = pid;
 	if (ru) {
 		i.svr4_si_stime = ru->ru_stime.tv_sec;
 		i.svr4_si_utime = ru->ru_utime.tv_sec;
 	}
 
 	if (WIFEXITED(st)) {
 		i.svr4_si_status = WEXITSTATUS(st);
 		i.svr4_si_code = SVR4_CLD_EXITED;
 	} else if (WIFSTOPPED(st)) {
 		sig = WSTOPSIG(st);
 		if (sig >= 0 && sig < NSIG)
 			i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
 
 		if (i.svr4_si_status == SVR4_SIGCONT)
 			i.svr4_si_code = SVR4_CLD_CONTINUED;
 		else
 			i.svr4_si_code = SVR4_CLD_STOPPED;
 	} else {
 		sig = WTERMSIG(st);
 		if (sig >= 0 && sig < NSIG)
 			i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
 
 		if (WCOREDUMP(st))
 			i.svr4_si_code = SVR4_CLD_DUMPED;
 		else
 			i.svr4_si_code = SVR4_CLD_KILLED;
 	}
 
 	DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n",
 		 i.svr4_si_pid, i.svr4_si_signo, i.svr4_si_code, i.svr4_si_errno,
 		 i.svr4_si_status));
 
 	return copyout(&i, s, sizeof(i));
 }
 
 
 /*
  * SVR4 waitsys(): wait for a state change in a child and describe it in
  * an SVR4 siginfo copied out to uap->info.
  *
  * uap->grp selects how uap->id is interpreted: SVR4_P_PID waits for that
  * pid, SVR4_P_PGID for the caller's own process group (uap->id is not
  * consulted), SVR4_P_ALL for any child; other values return EINVAL.
  *
  * When SVR4_WNOWAIT is clear and SVR4_WEXITED or SVR4_WTRAPPED is set the
  * request is translated and handed to kern_wait().  Otherwise the children
  * are scanned here, sleeping and rescanning until one matches unless
  * SVR4_WNOHANG is set.  On success *td->td_retval is 0 and the return
  * value is that of svr4_setinfo(); ECHILD is returned when no child
  * matches the selector.
  */
 int
 svr4_sys_waitsys(td, uap)
 	struct thread *td;
 	struct svr4_sys_waitsys_args *uap;
 {
 	struct rusage ru;
 	pid_t pid;
 	int nfound, status;
 	int error, *retval = td->td_retval;
 	struct proc *p, *q;
 
 	DPRINTF(("waitsys(%d, %d, %p, %x)\n", 
 	         uap->grp, uap->id,
 		 uap->info, uap->options));
 
 	/* Turn the SVR4 (grp, id) pair into a wait4()-style pid selector. */
 	q = td->td_proc;
 	switch (uap->grp) {
 	case SVR4_P_PID:
 		pid = uap->id;
 		break;
 
 	case SVR4_P_PGID:
 		PROC_LOCK(q);
 		pid = -q->p_pgid;
 		PROC_UNLOCK(q);
 		break;
 
 	case SVR4_P_ALL:
 		pid = WAIT_ANY;
 		break;
 
 	default:
 		return EINVAL;
 	}
 
 	/* Hand off the easy cases to kern_wait(). */
 	if (!(uap->options & (SVR4_WNOWAIT)) &&
 	    (uap->options & (SVR4_WEXITED | SVR4_WTRAPPED))) {
 		int options;
 
 		options = 0;
 		if (uap->options & SVR4_WSTOPPED)
 			options |= WUNTRACED;
 		if (uap->options & SVR4_WCONTINUED)
 			options |= WCONTINUED;
 		if (uap->options & SVR4_WNOHANG)
 			options |= WNOHANG;
 
 		error = kern_wait(td, pid, &status, options, &ru);
 		if (error)
 			return (error);
 		/*
 		 * *retval is read back as the pid to report; with WNOHANG a
 		 * value of 0 is reported without usage or status.
 		 */
 		if (uap->options & SVR4_WNOHANG && *retval == 0)
 			error = svr4_setinfo(*retval, NULL, 0, uap->info);
 		else
 			error = svr4_setinfo(*retval, &ru, status, uap->info);
 		*retval = 0;
 		return (error);
 	}
 
 	/*
 	 * Ok, handle the weird cases.  Either WNOWAIT is set (meaning we
 	 * just want to see if there is a process to harvest, we don't
 	 * want to actually harvest it), or WEXIT and WTRAPPED are clear
 	 * meaning we want to ignore zombies.  Either way, we don't have
 	 * to handle harvesting zombies here.  We do have to duplicate the
 	 * other portions of kern_wait() though, especially for the
 	 * WCONTINUED and WSTOPPED.
 	 */
 loop:
 	nfound = 0;
 	sx_slock(&proctree_lock);
 	LIST_FOREACH(p, &q->p_children, p_sibling) {
 		PROC_LOCK(p);
 		/* Skip children that match neither the pid nor the pgid. */
 		if (pid != WAIT_ANY &&
 		    p->p_pid != pid && p->p_pgid != -pid) {
 			PROC_UNLOCK(p);
 			DPRINTF(("pid %d pgid %d != %d\n", p->p_pid,
 				 p->p_pgid, pid));
 			continue;
 		}
 		if (p_canwait(td, p)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 
 		nfound++;
 
 		PROC_SLOCK(p);
 		/*
 		 * See if we have a zombie.  If so, WNOWAIT should be set,
 		 * as otherwise we should have called kern_wait() up above.
 		 */
 		if ((p->p_state == PRS_ZOMBIE) && 
 		    ((uap->options & (SVR4_WEXITED|SVR4_WTRAPPED)))) {
 			PROC_SUNLOCK(p);
 			KASSERT(uap->options & SVR4_WNOWAIT,
 			    ("WNOWAIT is clear"));
 
 			/* Found a zombie, so cache info in local variables. */
 			pid = p->p_pid;
 			status = p->p_xstat;
 			ru = p->p_ru;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			sx_sunlock(&proctree_lock);
 
 			/* Copy the info out to userland. */
 			*retval = 0;
 			DPRINTF(("found %d\n", pid));
 			return (svr4_setinfo(pid, &ru, status, uap->info));
 		}
 
 		/*
 		 * See if we have a stopped or continued process.
 		 * XXX: This duplicates the same code in kern_wait().
 		 */
 		if ((p->p_flag & P_STOPPED_SIG) &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    (p->p_flag & P_WAITED) == 0 &&
 		    (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) {
 			PROC_SUNLOCK(p);
 			/* Without WNOWAIT the stop is consumed. */
 		        if (((uap->options & SVR4_WNOWAIT)) == 0)
 				p->p_flag |= P_WAITED;
 			sx_sunlock(&proctree_lock);
 			pid = p->p_pid;
 			status = W_STOPCODE(p->p_xstat);
 			ru = p->p_ru;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 
 			/*
 			 * NOTE(review): p is dereferenced below after both
 			 * its lock and proctree_lock were dropped -- confirm
 			 * that this is safe.
 			 */
 		        if (((uap->options & SVR4_WNOWAIT)) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			*retval = 0;
 			DPRINTF(("jobcontrol %d\n", pid));
 			return (svr4_setinfo(pid, &ru, status, uap->info));
 		}
 		PROC_SUNLOCK(p);
 		if (uap->options & SVR4_WCONTINUED &&
 		    (p->p_flag & P_CONTINUED)) {
 			sx_sunlock(&proctree_lock);
 			/* Without WNOWAIT the continue event is consumed. */
 		        if (((uap->options & SVR4_WNOWAIT)) == 0)
 				p->p_flag &= ~P_CONTINUED;
 			pid = p->p_pid;
 			ru = p->p_ru;
 			status = SIGCONT;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 
 		        if (((uap->options & SVR4_WNOWAIT)) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			*retval = 0;
 			DPRINTF(("jobcontrol %d\n", pid));
 			return (svr4_setinfo(pid, &ru, status, uap->info));
 		}
 		PROC_UNLOCK(p);
 	}
 
 	/* No child matched the selector at all. */
 	if (nfound == 0) {
 		sx_sunlock(&proctree_lock);
 		return (ECHILD);
 	}
 
 	/* Matching children exist but none is reportable yet. */
 	if (uap->options & SVR4_WNOHANG) {
 		sx_sunlock(&proctree_lock);
 		*retval = 0;
 		return (svr4_setinfo(0, NULL, 0, uap->info));
 	}
 
 	/*
 	 * Take the caller's process lock before dropping proctree_lock, then
 	 * either consume a pending P_STATCHILD flag or sleep on q before
 	 * rescanning the children.
 	 */
 	PROC_LOCK(q);
 	sx_sunlock(&proctree_lock);
 	if (q->p_flag & P_STATCHILD) {
 		q->p_flag &= ~P_STATCHILD;
 		error = 0;
 	} else
 		error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "svr4_wait", 0);
 	PROC_UNLOCK(q);
 	if (error)
 		return error;
 	goto loop;
 }
 
 
 /*
  * Fill an SVR4 statvfs structure from a native struct statfs.
  *
  * The native I/O size is reported as f_bsize and the native block size as
  * f_frsize; f_favail mirrors f_ffree; only the first word of the fsid is
  * kept.  The file system type name is copied into both f_basetype and
  * f_fstr, each sized by its destination field.
  */
 static void
 bsd_statfs_to_svr4_statvfs(bfs, sfs)
 	const struct statfs *bfs;
 	struct svr4_statvfs *sfs;
 {
 	/* Sizes and counters. */
 	sfs->f_frsize = bfs->f_bsize;
 	sfs->f_bsize = bfs->f_iosize; /* XXX */
 	sfs->f_blocks = bfs->f_blocks;
 	sfs->f_bavail = bfs->f_bavail;
 	sfs->f_bfree = bfs->f_bfree;
 	sfs->f_files = bfs->f_files;
 	sfs->f_favail = bfs->f_ffree;
 	sfs->f_ffree = bfs->f_ffree;
 
 	/* Identification. */
 	sfs->f_fsid = bfs->f_fsid.val[0];
 	sfs->f_namemax = MAXNAMLEN;
 	memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
 	memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
 
 	/* Only the read-only and nosuid mount flags are translated. */
 	sfs->f_flag =
 	    ((bfs->f_flags & MNT_RDONLY) ? SVR4_ST_RDONLY : 0) |
 	    ((bfs->f_flags & MNT_NOSUID) ? SVR4_ST_NOSUID : 0);
 
 	memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
 }
 
 
 /*
  * Fill an SVR4 statvfs64 structure from a native struct statfs.
  *
  * The native I/O size is reported as f_bsize and the native block size as
  * f_frsize; f_favail mirrors f_ffree; only the first word of the fsid is
  * kept.  The file system type name is copied into both f_basetype and
  * f_fstr, each sized by its destination field.
  */
 static void
 bsd_statfs_to_svr4_statvfs64(bfs, sfs)
 	const struct statfs *bfs;
 	struct svr4_statvfs64 *sfs;
 {
 	/* Sizes and counters. */
 	sfs->f_frsize = bfs->f_bsize;
 	sfs->f_bsize = bfs->f_iosize; /* XXX */
 	sfs->f_blocks = bfs->f_blocks;
 	sfs->f_bavail = bfs->f_bavail;
 	sfs->f_bfree = bfs->f_bfree;
 	sfs->f_files = bfs->f_files;
 	sfs->f_favail = bfs->f_ffree;
 	sfs->f_ffree = bfs->f_ffree;
 
 	/* Identification. */
 	sfs->f_fsid = bfs->f_fsid.val[0];
 	sfs->f_namemax = MAXNAMLEN;
 	memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
 	memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
 
 	/* Only the read-only and nosuid mount flags are translated. */
 	sfs->f_flag =
 	    ((bfs->f_flags & MNT_RDONLY) ? SVR4_ST_RDONLY : 0) |
 	    ((bfs->f_flags & MNT_NOSUID) ? SVR4_ST_NOSUID : 0);
 
 	memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
 }
 
 
 int
 svr4_sys_statvfs(td, uap)
 	struct thread *td;
 	struct svr4_sys_statvfs_args *uap;
 {
 	struct svr4_statvfs sfs;
 	struct statfs bfs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
 	free(path, M_TEMP);
 	if (error)
 		return (error);
 	bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
 	return copyout(&sfs, uap->fs, sizeof(sfs));
 }
 
 
 int
 svr4_sys_fstatvfs(td, uap)
 	struct thread *td;
 	struct svr4_sys_fstatvfs_args *uap;
 {
 	struct svr4_statvfs sfs;
 	struct statfs bfs;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &bfs);
 	if (error)
 		return (error);
 	bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
 	return copyout(&sfs, uap->fs, sizeof(sfs));
 }
 
 
 int
 svr4_sys_statvfs64(td, uap)
 	struct thread *td;
 	struct svr4_sys_statvfs64_args *uap;
 {
 	struct svr4_statvfs64 sfs;
 	struct statfs bfs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
 	free(path, M_TEMP);
 	if (error)
 		return (error);
 	bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
 	return copyout(&sfs, uap->fs, sizeof(sfs));
 }
 
 
 int
 svr4_sys_fstatvfs64(td, uap) 
 	struct thread *td;
 	struct svr4_sys_fstatvfs64_args *uap;
 {
 	struct svr4_statvfs64 sfs;
 	struct statfs bfs;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &bfs);
 	if (error)
 		return (error);
 	bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
 	return copyout(&sfs, uap->fs, sizeof(sfs));
 }
 
 int
 svr4_sys_alarm(td, uap)
 	struct thread *td;
 	struct svr4_sys_alarm_args *uap;
 {
         struct itimerval itv, oitv;
 	int error;
 
 	timevalclear(&itv.it_interval);
 	itv.it_value.tv_sec = uap->sec;
 	itv.it_value.tv_usec = 0;
 	error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
 	if (error)
 		return (error);
 	if (oitv.it_value.tv_usec != 0)
 		oitv.it_value.tv_sec++;
 	td->td_retval[0] = oitv.it_value.tv_sec;
 	return (0);
 }
 
 int
 svr4_sys_gettimeofday(td, uap)
 	struct thread *td;
 	struct svr4_sys_gettimeofday_args *uap;
 {
 	if (uap->tp) {
 		struct timeval atv;
 
 		microtime(&atv);
 		return copyout(&atv, uap->tp, sizeof (atv));
 	}
 
 	return 0;
 }
 
 /*
  * SVR4 facl(): ACLs are not supported.  The return value slot is always
  * cleared first; SVR4_SYS_GETACLCNT then succeeds, SVR4_SYS_GETACL copies
  * that zero out, SVR4_SYS_SETACL fails with ENOSYS and any other command
  * with EINVAL.
  */
 int
 svr4_sys_facl(td, uap)
 	struct thread *td;
 	struct svr4_sys_facl_args *uap;
 {
 	int *count = td->td_retval;
 
 	*count = 0;
 
 	switch (uap->cmd) {
 	case SVR4_SYS_GETACLCNT:
 		return (0);
 
 	case SVR4_SYS_GETACL:
 		/*
 		 * NOTE(review): the copyout() destination is the address of
 		 * the `num' member of the argument structure itself rather
 		 * than a pointer taken from it -- confirm this is intended.
 		 */
 		return (copyout(count, &uap->num, sizeof(uap->num)));
 
 	case SVR4_SYS_SETACL:
 		/* We don't support acls on any filesystem */
 		return (ENOSYS);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 
 /*
  * SVR4 acl(): currently handled exactly like facl() by reinterpreting the
  * argument structure and delegating to svr4_sys_facl().
  */
 int
 svr4_sys_acl(td, uap)
 	struct thread *td;
 	struct svr4_sys_acl_args *uap;
 {
 	struct svr4_sys_facl_args *fargs;
 
 	/* XXX: for now the same */
 	fargs = (struct svr4_sys_facl_args *)uap;
 	return (svr4_sys_facl(td, fargs));
 }
 
 /*
  * SVR4 auditsys(): auditing is not implemented, so every request is
  * ignored and reported as successful.
  */
 int
 svr4_sys_auditsys(td, uap)
 	struct thread *td;
 	struct svr4_sys_auditsys_args *uap;
 {
 	/* XXX: Big brother is *not* watching. */
 	return (0);
 }
 
 int
 svr4_sys_memcntl(td, uap)
 	struct thread *td;
 	struct svr4_sys_memcntl_args *uap;
 {
 	switch (uap->cmd) {
 	case SVR4_MC_SYNC:
 		{
 			struct msync_args msa;
 
 			msa.addr = uap->addr;
 			msa.len = uap->len;
 			msa.flags = (int)uap->arg;
 
 			return msync(td, &msa);
 		}
 	case SVR4_MC_ADVISE:
 		{
 			struct madvise_args maa;
 
 			maa.addr = uap->addr;
 			maa.len = uap->len;
 			maa.behav = (int)uap->arg;
 
 			return madvise(td, &maa);
 		}
 	case SVR4_MC_LOCK:
 	case SVR4_MC_UNLOCK:
 	case SVR4_MC_LOCKAS:
 	case SVR4_MC_UNLOCKAS:
 		return EOPNOTSUPP;
 	default:
 		return ENOSYS;
 	}
 }
 
 
 int
 svr4_sys_nice(td, uap)
 	struct thread *td;
 	struct svr4_sys_nice_args *uap;
 {
 	struct setpriority_args ap;
 	int error;
 
 	ap.which = PRIO_PROCESS;
 	ap.who = 0;
 	ap.prio = uap->prio;
 
 	if ((error = setpriority(td, &ap)) != 0)
 		return error;
 
 	/* the cast is stupid, but the structures are the same */
 	if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0)
 		return error;
 
 	return 0;
 }
 
 int
 svr4_sys_resolvepath(td, uap)
 	struct thread *td;
 	struct svr4_sys_resolvepath_args *uap;
 {
 	struct nameidata nd;
 	int error, *retval = td->td_retval;
 	unsigned int ncopy;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE,
 	    uap->path, td);
 
 	if ((error = namei(&nd)) != 0)
 		return error;
 	vfslocked = NDHASGIANT(&nd);
 
 	ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1);
 	if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0)
 		goto bad;
 
 	*retval = ncopy;
 bad:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return error;
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/gfs.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/gfs.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/gfs.c	(revision 175202)
@@ -1,884 +1,884 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /* Portions Copyright 2007 Shivakumar GN */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/cmn_err.h>
 #include <sys/debug.h>
 #include <sys/dirent.h>
 #include <sys/kmem.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/sysmacros.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/cred.h>
 #include <sys/kdb.h>
 
 #include <sys/gfs.h>
 
 /*
  * Generic pseudo-filesystem routines.
  *
  * There are significant similarities between the implementation of certain file
  * system entry points across different filesystems.  While one could attempt to
  * "choke up on the bat" and incorporate common functionality into a VOP
  * preamble or postamble, such an approach is limited in the benefit it can
  * provide.  In this file we instead define a toolkit of routines which can be
  * called from a filesystem (with in-kernel pseudo-filesystems being the focus
  * of the exercise) in a more component-like fashion.
  *
  * There are three basic classes of routines:
  *
  * 1) Lowlevel support routines
  *
  *    These routines are designed to play a support role for existing
  *    pseudo-filesystems (such as procfs).  They simplify common tasks,
  *    without enforcing the filesystem to hand over management to GFS.  The
  *    routines covered are:
  *
  *	gfs_readdir_init()
  *	gfs_readdir_emit()
  *	gfs_readdir_emitn()
  *	gfs_readdir_pred()
  *	gfs_readdir_fini()
  *	gfs_lookup_dot()
  *
  * 2) Complete GFS management
  *
  *    These routines take a more active role in management of the
  *    pseudo-filesystem.  They handle the relationship between vnode private
  *    data and VFS data, as well as the relationship between vnodes in the
  *    directory hierarchy.
  *
  *    In order to use these interfaces, the first member of every private
  *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
  *    to GFS.
  *
  * 	gfs_file_create()
  * 	gfs_dir_create()
  * 	gfs_root_create()
  *
  *	gfs_file_inactive()
  *	gfs_dir_inactive()
  *	gfs_dir_lookup()
  *	gfs_dir_readdir()
  *
  * 	gfs_vop_inactive()
  * 	gfs_vop_lookup()
  * 	gfs_vop_readdir()
  * 	gfs_vop_map()
  *
  * 3) Single File pseudo-filesystems
  *
  *    This routine creates a rooted file to be overlaid on top of another
  *    file in the physical filespace.
  *
  *    Note that the parent is NULL (actually the vfs), but there is nothing
  *    technically keeping such a file from utilizing the "Complete GFS
  *    management" set of routines.
  *
  * 	gfs_root_create_file()
  */
 
 /*
  * Low level directory routines
  *
  * These routines provide some simple abstractions for reading directories.
  * They are designed to be used by existing pseudo filesystems (namely procfs)
  * that already have a complicated management infrastructure.
  */
 
 /*
  * gfs_readdir_init: initiate a generic readdir
  *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
  *   name_max	- the directory's maximum file name length
  *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
  *   uiop	- the uiop passed to readdir
  *   parent	- the parent directory's inode
  *   self	- this directory's inode
  *
  * Returns 0 or a non-zero errno.
  *
  * Typical VOP_READDIR usage of gfs_readdir_*:
  *
  *	if ((error = gfs_readdir_init(...)) != 0)
  *		return (error);
  *	eof = 0;
  *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
  *		if (!consumer_entry_at(voffset))
  *			voffset = consumer_next_entry(voffset);
  *		if (consumer_eof(voffset)) {
  *			eof = 1;
  *			break;
  *		}
  *		if ((error = gfs_readdir_emit(..., voffset,
  *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
  *			break;
  *	}
  *	return (gfs_readdir_fini(..., error, eofp, eof));
  *
  * As you can see, a zero result from gfs_readdir_pred() or
  * gfs_readdir_emit() indicates that processing should continue,
  * whereas a non-zero result indicates that the loop should terminate.
  * Most consumers need do nothing more than let gfs_readdir_fini()
  * determine what the cause of failure was and return the appropriate
  * value.
  */
 /*
  * Validate the uio and set up `st' for a readdir pass.  Returns EINVAL for
  * a negative offset, a non-positive residual, or an offset that is not a
  * multiple of ureclen; otherwise allocates the scratch dirent sized for
  * name_max and returns 0.
  */
 int
 gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
     uio_t *uiop, ino64_t parent, ino64_t self)
 {
 	if (uiop->uio_loffset < 0)
 		return (EINVAL);
 	if (uiop->uio_resid <= 0)
 		return (EINVAL);
 	if ((uiop->uio_loffset % ureclen) != 0)
 		return (EINVAL);
 
 	st->grd_self = self;
 	st->grd_parent = parent;
 	st->grd_ureclen = ureclen;
 	st->grd_oresid = uiop->uio_resid;
 
 	/* The scratch entry is released again by gfs_readdir_fini(). */
 	st->grd_namlen = name_max;
 	st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
 
 	return (0);
 }
 
 /*
  * gfs_readdir_emit_int: internal routine to emit directory entry
  *
  *   st		- the current readdir state, which must have d_ino and d_name
  *                set
  *   uiop	- caller-supplied uio pointer
  *   next	- the offset of the next entry
  */
 /*
  * Move the entry currently staged in st->grd_dirent to the caller's uio.
  *
  * Returns 0 on success, EFAULT if uiomove() fails, and when the record
  * does not fit either EINVAL (nothing has been returned yet) or -1 (stop
  * the loop).  On success the uio offset becomes `next' and, when a cookie
  * array is supplied, `next' is appended to it.
  */
 static int
 gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
     int *ncookies, u_long **cookies)
 {
 	int namelen, recsize;
 
 	namelen = strlen(st->grd_dirent->d_name);
 	recsize = DIRENT64_RECLEN(namelen);
 
 	/* Not enough room: an error only if no entry was returned yet. */
 	if (recsize > uiop->uio_resid)
 		return (uiop->uio_resid == st->grd_oresid ? EINVAL : -1);
 
 	/* XXX: This can change in the future. */
 	st->grd_dirent->d_type = DT_DIR;
 	st->grd_dirent->d_namlen = namelen;
 	st->grd_dirent->d_reclen = (ushort_t)recsize;
 
 	if (uiomove((caddr_t)st->grd_dirent, recsize, UIO_READ, uiop) != 0)
 		return (EFAULT);
 
 	uiop->uio_loffset = next;
 
 	if (*cookies != NULL) {
 		*(*cookies)++ = next;
 		--(*ncookies);
 		KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
 	}
 
 	return (0);
 }
 
 /*
  * gfs_readdir_emit: emit a directory entry
  *   voff       - the virtual offset (obtained from gfs_readdir_pred)
  *   ino        - the entry's inode
  *   name       - the entry's name
  *
  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
  * readdir loop should terminate.  A non-zero result (either errno or
  * -1) from this function is typically passed directly to
  * gfs_readdir_fini().
  */
 /*
  * Stage an entry (inode `ino', name `name') in the scratch dirent and emit
  * it.  The virtual offset is converted back to a record offset by adding
  * two records and scaling by grd_ureclen; the offset handed on is that of
  * the following record.  Returns whatever gfs_readdir_emit_int() returns.
  */
 int
 gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
     ino64_t ino, const char *name, int *ncookies, u_long **cookies)
 {
 	offset_t here, following;
 
 	here = (voff + 2) * st->grd_ureclen;
 
 	/*
 	 * Offsets between entries are meaningless, so every record is
 	 * treated as grd_ureclen long and the next offset is set explicitly.
 	 */
 	following = here + st->grd_ureclen;
 
 	(void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
 	st->grd_dirent->d_ino = ino;
 
 	return (gfs_readdir_emit_int(st, uiop, following, ncookies, cookies));
 }
 
 /*
  * gfs_readdir_pred: readdir loop predicate
  *   voffp - a pointer in which the next virtual offset should be stored
  *
  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
  * readdir loop should terminate.  A non-zero result (either errno or
  * -1) from this function is typically passed directly to
  * gfs_readdir_fini().
  */
 /*
  * Loop predicate for readdir.  Record slots 0 and 1 are emitted here as
  * "." and ".."; for any other slot the virtual offset (slot - 2) is stored
  * in *voffp and 0 is returned so the caller can emit its own entry.
  * Returns -1 once the uio has no room left, or the non-zero result of a
  * failed emit.
  */
 int
 gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp,
     int *ncookies, u_long **cookies)
 {
 	const char *dotname;
 	offset_t slot;
 	ino64_t dotino;
 	int error;
 
 	for (;;) {
 		if (uiop->uio_resid <= 0)
 			return (-1);
 
 		slot = uiop->uio_loffset / st->grd_ureclen;
 		if (slot == 0) {
 			dotino = st->grd_self;
 			dotname = ".";
 		} else if (slot == 1) {
 			dotino = st->grd_parent;
 			dotname = "..";
 		} else {
 			*voffp = slot - 2;
 			return (0);
 		}
 
 		error = gfs_readdir_emit(st, uiop, slot - 2, dotino, dotname,
 		    ncookies, cookies);
 		if (error != 0)
 			return (error);
 	}
 }
 
 /*
  * gfs_readdir_fini: generic readdir cleanup
  *   error	- if positive, an error to return
  *   eofp	- the eofp passed to readdir
  *   eof	- the eof value
  *
  * Returns a 0 on success, a non-zero errno on failure.  This result
  * should be returned from readdir.
  */
 /*
  * Release the scratch dirent and produce the value readdir should return:
  * a positive `error' is passed through unchanged; otherwise `eof' is
  * stored through eofp (when non-NULL) and 0 is returned.
  */
 int
 gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
 {
 	kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
 
 	if (error <= 0) {
 		/* Zero or the -1 "stop" marker: not a failure. */
 		if (eofp != NULL)
 			*eofp = eof;
 		error = 0;
 	}
 
 	return (error);
 }
 
 /*
  * gfs_lookup_dot
  *
  * Performs a basic check for "." and ".." directory entries.
  */
 int
 gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
 {
 	/*
 	 * Returns 0 with a held vnode in *vpp when nm is "", "." or "..",
 	 * and -1 (leaving *vpp untouched) for any other name.
 	 */
 	if (*nm == '\0' || strcmp(nm, ".") == 0) {
 		/* The directory itself; no vnode lock is taken here. */
 		VN_HOLD(dvp);
 		*vpp = dvp;
 		return (0);
 	} else if (strcmp(nm, "..") == 0) {
 		if (pvp == NULL) {
 			/* No parent: expected only for a root vnode. */
 			ASSERT(dvp->v_flag & VROOT);
 			VN_HOLD(dvp);
 			*vpp = dvp;
 		} else {
 			VN_HOLD(pvp);
 			*vpp = pvp;
 		}
 		/* Unlike the "." case, the ".." result is locked exclusively. */
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * gfs_file_create(): create a new GFS file
  *
  *   size	- size of private data structure (v_data)
  *   pvp	- parent vnode (GFS directory)
  *   ops	- vnode operations vector
  *
  * In order to use this interface, the parent vnode must have been created by
  * gfs_dir_create(), and the private data stored in v_data must have a
  * 'gfs_file_t' as its first field.
  *
  * Given these constraints, this routine will automatically:
  *
  * 	- Allocate v_data for the vnode
  * 	- Initialize necessary fields in the vnode
  * 	- Hold the parent
  */
 /*
  * Allocate `size' bytes of zeroed private data and a new vnode, link the
  * two together, record the parent, and insert the vnode on the mount's
  * vnode list.  A non-NULL parent is held.  Returns the new vnode.
  */
 vnode_t *
 gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
 {
 	gfs_file_t *priv;
 	vnode_t *newvp;
 	int error;
 
 	/* Private data first, then the vnode that will own it. */
 	priv = kmem_zalloc(size, KM_SLEEP);
 	error = getnewvnode("zfs", vfsp, ops, &newvp);
 	ASSERT(error == 0);
 	newvp->v_data = (caddr_t)priv;
 
 	/* Back-pointers and bookkeeping. */
 	priv->gfs_type = GFS_FILE;
 	priv->gfs_size = size;
 	priv->gfs_parent = pvp;
 	priv->gfs_vnode = newvp;
 
 	error = insmntque(newvp, vfsp);
 	KASSERT(error == 0, ("insmntque() failed: error %d", error));
 
 	/* The child keeps a hold on its parent. */
 	if (pvp != NULL)
 		VN_HOLD(pvp);
 
 	return (newvp);
 }
 
 /*
  * gfs_dir_create: creates a new directory in the parent
  *
  *   size	- size of private data structure (v_data)
  *   pvp	- parent vnode (GFS directory)
  *   ops	- vnode operations vector
  *   entries	- NULL-terminated list of static entries (if any)
  *   maxlen	- maximum length of a directory entry
  *   readdir_cb	- readdir callback (see gfs_dir_readdir)
  *   inode_cb	- inode callback (see gfs_dir_readdir)
  *   lookup_cb	- lookup callback (see gfs_dir_lookup)
  *
  * In order to use this function, the first member of the private vnode
  * structure (v_data) must be a gfs_dir_t.  For each directory, there are
  * static entries, defined when the structure is initialized, and dynamic
  * entries, retrieved through callbacks.
  *
  * If a directory has static entries, then it must supply a inode callback,
  * which will compute the inode number based on the parent and the index.
  * For a directory with dynamic entries, the caller must supply a readdir
  * callback and a lookup callback.  If a static lookup fails, we fall back to
  * the supplied lookup callback, if any.
  *
  * This function also performs the same initialization as gfs_file_create().
  */
 /*
  * Create a GFS directory vnode.  The vnode comes from gfs_file_create()
  * and is then marked VDIR/GFS_DIR; the static entry table (terminated by
  * a NULL gfse_name) is counted and copied, the callbacks are recorded and
  * the directory lock is initialised.  Returns the new vnode.
  */
 vnode_t *
 gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops,
     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
 {
 	gfs_dirent_t *cursor;
 	gfs_dir_t *dir;
 	vnode_t *dirvp;
 
 	dirvp = gfs_file_create(struct_size, pvp, vfsp, ops);
 	dirvp->v_type = VDIR;
 
 	dir = dirvp->v_data;
 	dir->gfsd_file.gfs_type = GFS_DIR;
 	dir->gfsd_maxlen = maxlen;
 
 	if (entries != NULL) {
 		/* Count the entries, then keep a private copy of them. */
 		cursor = entries;
 		while (cursor->gfse_name != NULL) {
 			dir->gfsd_nstatic++;
 			cursor++;
 		}
 
 		dir->gfsd_static = kmem_alloc(
 		    dir->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
 		bcopy(entries, dir->gfsd_static,
 		    dir->gfsd_nstatic * sizeof (gfs_dirent_t));
 	}
 
 	dir->gfsd_inode = inode_cb;
 	dir->gfsd_lookup = lookup_cb;
 	dir->gfsd_readdir = readdir_cb;
 
 	mutex_init(&dir->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (dirvp);
 }
 
 /*
  * gfs_root_create(): create a root vnode for a GFS filesystem
  *
  * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.  The
  * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
  */
 /*
  * Create the root directory vnode of a GFS file system: hold the vfs,
  * build a parentless directory with gfs_dir_create(), set its inode
  * number to `ino' and flag it VROOT.  Returns the new vnode.
  */
 vnode_t *
 gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
 {
 	gfs_file_t *rootfp;
 	vnode_t *rootvp;
 
 	VFS_HOLD(vfsp);
 	rootvp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb,
 	    maxlen, readdir_cb, lookup_cb);
 
 	/* The root has no parent callback to name it; set the inode here. */
 	rootfp = (gfs_file_t *)rootvp->v_data;
 	rootfp->gfs_ino = ino;
 	rootvp->v_flag |= VROOT;
 
 	return (rootvp);
 }
 
 /*
  * gfs_file_inactive()
  *
  * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
  * remove the given vnode from the parent directory and clean up any references
  * in the VFS layer.
  *
  * If the vnode was not removed (due to a race with vget), then NULL is
  * returned.  Otherwise, a pointer to the private data is returned.
  */
 void *
 gfs_file_inactive(vnode_t *vp)
 {
 	int i;
 	gfs_dirent_t *ge = NULL;
 	gfs_file_t *fp = vp->v_data;
 	gfs_dir_t *dp = NULL;
 	void *data;
 
 	/*
 	 * Overview: drop the parent's cached pointer to vp (if any), release
 	 * vp's remaining use reference, release the hold on the parent (or
 	 * on the vfs for a parentless vnode) and return vp's private data.
 	 * Note that in this version the value returned is always vp->v_data.
 	 */
 
 	/* A parentless vnode has no directory to search or lock. */
 	if (fp->gfs_parent == NULL)
 		goto found;
 
 	dp = fp->gfs_parent->v_data;
 
 	/*
 	 * First, see if this vnode is cached in the parent.
 	 */
 	gfs_dir_lock(dp);
 
 	/*
 	 * Find it in the set of static entries.
 	 */
 	for (i = 0; i < dp->gfsd_nstatic; i++)  {
 		ge = &dp->gfsd_static[i];
 
 		if (ge->gfse_vnode == vp)
 			goto found;
 	}
 
 	/*
 	 * If 'ge' is NULL, then it is a dynamic entry.
 	 */
 	ge = NULL;
 
 found:
 	/* When there is a parent, its directory lock is still held here. */
 	VI_LOCK(vp);
 	ASSERT(vp->v_count < 2);
 	/*
 	 * Really remove this vnode
 	 */
 	data = vp->v_data;
 	if (ge != NULL) {
 		/*
 		 * If this was a statically cached entry, simply set the
 		 * cached vnode to NULL.
 		 */
 		ge->gfse_vnode = NULL;
 	}
 	/*
 	 * NOTE(review): only the else branch unlocks the interlock
 	 * explicitly, so vdropl() presumably releases it -- confirm.
 	 */
 	if (vp->v_count == 1) {
 		vp->v_usecount--;
 		vdropl(vp);
 	} else {
 		VI_UNLOCK(vp);
 	}
 
 	/*
 	 * Free vnode and release parent
 	 */
 	if (fp->gfs_parent) {
 		gfs_dir_unlock(dp);
 		/* Undo the hold on the parent by hand under its interlock. */
 		VI_LOCK(fp->gfs_parent);
 		fp->gfs_parent->v_usecount--;
 		VI_UNLOCK(fp->gfs_parent);
 	} else {
 		ASSERT(vp->v_vfsp != NULL);
 		VFS_RELE(vp->v_vfsp);
 	}
 
 	return (data);
 }
 
 /*
  * gfs_dir_inactive()
  *
  * Same as above, but for directories.
  */
 /*
  * Directory flavour of gfs_file_inactive(): after the common teardown the
  * directory lock is destroyed and the copied static entry table, if any,
  * is freed.  Returns the private data from gfs_file_inactive(), or NULL
  * when that returned NULL.
  */
 void *
 gfs_dir_inactive(vnode_t *vp)
 {
 	gfs_dir_t *dir;
 
 	ASSERT(vp->v_type == VDIR);
 
 	dir = gfs_file_inactive(vp);
 	if (dir == NULL)
 		return (NULL);
 
 	mutex_destroy(&dir->gfsd_lock);
 	if (dir->gfsd_nstatic != 0) {
 		kmem_free(dir->gfsd_static,
 		    dir->gfsd_nstatic * sizeof (gfs_dirent_t));
 	}
 
 	return (dir);
 }
 
 /*
  * gfs_dir_lookup()
  *
  * Looks up the given name in the directory and returns the corresponding vnode,
  * if found.
  *
  * First, we search statically defined entries, if any.  If a match is found,
  * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
  * existing vnode.  Otherwise, we call the static entry's callback routine,
  * caching the result if necessary.
  *
  * If no static entry is found, we invoke the lookup callback, if any.  The
  * arguments to this callback are:
  *
  *	int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
  *
  *	pvp	- parent vnode
  *	nm	- name of entry
  *	vpp	- pointer to resulting vnode
  *
  * 	Returns 0 on success, non-zero on error.
  */
 int
 gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
 {
 	int i;
 	gfs_dirent_t *ge;
 	vnode_t *vp;
 	gfs_dir_t *dp = dvp->v_data;
 	int ret = 0;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	/*
 	 * Dot entries are delegated to gfs_lookup_dot(); a return of 0 means
 	 * it has already filled in *vpp, so there is nothing left to do.
 	 */
 	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
 		return (0);
 
 	gfs_dir_lock(dp);
 
 	/*
 	 * Search static entries.
 	 */
 	for (i = 0; i < dp->gfsd_nstatic; i++) {
 		ge = &dp->gfsd_static[i];
 
 		if (strcmp(ge->gfse_name, nm) == 0) {
 			/* Fast path: a cached vnode only needs a new hold. */
 			if (ge->gfse_vnode) {
 				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
 				vp = ge->gfse_vnode;
 				VN_HOLD(vp);
 				goto out;
 			}
 
 			/*
 			 * We drop the directory lock, as the constructor will
 			 * need to do KM_SLEEP allocations.  If we return from
 			 * the constructor only to find that a parallel
 			 * operation has completed, and GFS_CACHE_VNODE is set
 			 * for this entry, we discard the result in favor of the
 			 * cached vnode.
 			 */
 			gfs_dir_unlock(dp);
 			vp = ge->gfse_ctor(dvp);
 			gfs_dir_lock(dp);
 
 			/* Remember which static slot this vnode belongs to. */
 			((gfs_file_t *)vp->v_data)->gfs_index = i;
 
 			/* Set the inode according to the callback. */
 			((gfs_file_t *)vp->v_data)->gfs_ino =
 			    dp->gfsd_inode(dvp, i);
 
 			if (ge->gfse_flags & GFS_CACHE_VNODE) {
 				if (ge->gfse_vnode == NULL) {
 					ge->gfse_vnode = vp;
 				} else {
 					/*
 					 * A parallel constructor beat us to it;
 					 * return existing vnode.  We have to be
 					 * careful because we can't release the
 					 * current vnode while holding the
 					 * directory lock; its inactive routine
 					 * will try to lock this directory.
 					 */
 					vnode_t *oldvp = vp;
 					vp = ge->gfse_vnode;
 					VN_HOLD(vp);
 
 					gfs_dir_unlock(dp);
 					VN_RELE(oldvp);
 					gfs_dir_lock(dp);
 				}
 			}
 
 			goto out;
 		}
 	}
 
 	/*
 	 * See if there is a dynamic constructor.
 	 */
 	if (dp->gfsd_lookup) {
 		ino64_t ino;
 		gfs_file_t *fp;
 
 		/*
 		 * Once again, drop the directory lock, as the lookup routine
 		 * will need to allocate memory, or otherwise deadlock on this
 		 * directory.
 		 */
 		gfs_dir_unlock(dp);
 		ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
 		gfs_dir_lock(dp);
 		if (ret != 0)
 			goto out;
 
 		/* Dynamic entries have no static slot; mark them with -1. */
 		fp = (gfs_file_t *)vp->v_data;
 		fp->gfs_index = -1;
 		fp->gfs_ino = ino;
 	} else {
 		/*
 		 * No static entry found, and there is no lookup callback, so
 		 * return ENOENT.
 		 */
 		ret = ENOENT;
 	}
 
 out:
 	/* Every path that reaches here holds the directory lock. */
 	gfs_dir_unlock(dp);
 
 	/* vp is only meaningful when ret == 0; otherwise hand back NULL. */
 	if (ret == 0)
 		*vpp = vp;
 	else
 		*vpp = NULL;
 
 	return (ret);
 }
 
 /*
  * gfs_dir_readdir: does a readdir() on the given directory
  *
  *    dvp	- directory vnode
  *    uiop	- uio structure
  *    eofp	- eof pointer
  *    data	- arbitrary data passed to readdir callback
  *
  * This routine does all the readdir() dirty work.  Even so, the caller must
  * supply two callbacks in order to get full compatibility.
  *
  * If the directory contains static entries, an inode callback must be
  * specified.  This avoids having to create every vnode and call VOP_GETATTR()
  * when reading the directory.  This function has the following arguments:
  *
  *	ino_t gfs_inode_cb(vnode_t *vp, int index);
  *
  * 	vp	- vnode for the directory
  * 	index	- index in original gfs_dirent_t array
  *
  * 	Returns the inode number for the given entry.
  *
  * For directories with dynamic entries, a readdir callback must be provided.
  * This is significantly more complex, thanks to the particulars of
  * VOP_READDIR().
  *
  *	int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
  *	    offset_t *off, offset_t *nextoff, void *data)
  *
  *	vp	- directory vnode
  *	dp	- directory entry, sized according to maxlen given to
  *		  gfs_dir_create().  callback must fill in d_name and
  *		  d_ino.
  *	eofp	- callback must set to 1 when EOF has been reached
  *	off	- on entry, the last offset read from the directory.  Callback
  *		  must set to the offset of the current entry, typically left
  *		  untouched.
  *	nextoff	- callback must set to offset of next entry.  Typically
  *		  (off + 1)
  *	data	- caller-supplied data
  *
  *	Return 0 on success, or error on failure.
  */
 int
 gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
     u_long **cookies, void *data)
 {
 	gfs_readdir_state_t gstate;
 	int error, eof = 0;
 	ino64_t ino, pino;
 	offset_t off, next;
 	gfs_dir_t *dp = dvp->v_data;
 
 	ino = dp->gfsd_file.gfs_ino;
 
 	/* A directory without a GFS parent reports itself as its parent. */
 	if (dp->gfsd_file.gfs_parent == NULL)
 		pino = ino;		/* root of filesystem */
 	else
 		pino = ((gfs_file_t *)
 		    (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
 
 	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
 	    pino, ino)) != 0)
 		return (error);
 
 	/*
 	 * ncookies/cookies are passed through untouched to the
 	 * gfs_readdir_*() helpers.  Each iteration emits either one static
 	 * entry or one entry produced by the dynamic readdir callback.
 	 */
 	while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
 	    cookies)) == 0 && !eof) {
 
 		if (off >= 0 && off < dp->gfsd_nstatic) {
 			/* Static entry: off indexes gfsd_static directly. */
 			ino = dp->gfsd_inode(dvp, off);
 
 			if ((error = gfs_readdir_emit(&gstate, uiop,
 			    off, ino, dp->gfsd_static[off].gfse_name, ncookies,
 			    cookies)) != 0)
 				break;
 
 		} else if (dp->gfsd_readdir) {
 			/*
 			 * Dynamic entry: the callback works in offsets
 			 * relative to the end of the static entries.
 			 */
 			off -= dp->gfsd_nstatic;
 
 			if ((error = dp->gfsd_readdir(dvp,
 			    gstate.grd_dirent, &eof, &off, &next,
 			    data)) != 0 || eof)
 				break;
 
 			/*
 			 * Convert back to directory offsets.  The extra 2
 			 * presumably accounts for "." and ".." -- TODO
 			 * confirm against gfs_readdir_pred().
 			 */
 			off += dp->gfsd_nstatic + 2;
 			next += dp->gfsd_nstatic + 2;
 
 			if ((error = gfs_readdir_emit_int(&gstate, uiop,
 			    next, ncookies, cookies)) != 0)
 				break;
 		} else {
 			/*
 			 * Offset is beyond the end of the static entries, and
 			 * we have no dynamic entries.  Set EOF.
 			 */
 			eof = 1;
 		}
 	}
 
 	return (gfs_readdir_fini(&gstate, error, eofp, eof));
 }
 
 /*
  * gfs_vop_readdir: VOP_READDIR() entry point
  *
  * For use directly in vnode ops table.  Given a GFS directory, calls
  * gfs_dir_readdir() as necessary.
  */
 /* ARGSUSED */
 int
 gfs_vop_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	uio_t *uiop = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	int ncookies = 0;
 	u_long *cookies = NULL;
 	int error;
 
 	if (ap->a_ncookies) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
 		*ap->a_cookies = cookies;
 		*ap->a_ncookies = ncookies;
 	}
 
 	error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
 
 	if (error == 0) {
 		/* Subtract unused cookies */
 		if (ap->a_ncookies)
 			*ap->a_ncookies -= ncookies;
 	} else if (ap->a_ncookies) {
 		free(*ap->a_cookies, M_TEMP);
 		*ap->a_cookies = NULL;
 		*ap->a_ncookies = 0;
 	}
 
 	return (error);
 }
 
 /*
  * gfs_vop_inactive: VOP_INACTIVE() entry point
  *
  * Dispatches to gfs_dir_inactive() or gfs_file_inactive() depending on the
  * GFS node type, kmem_free()s the private data if one was handed back, and
  * clears the vnode's v_data pointer.  Always returns 0.
  */
 /* ARGSUSED */
 int
 gfs_vop_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	gfs_file_t *node = vp->v_data;
 	void *priv;
 
 	priv = (node->gfs_type == GFS_DIR) ?
 	    gfs_dir_inactive(vp) : gfs_file_inactive(vp);
 
 	/* gfs_size is read as an argument, before the memory is released. */
 	if (priv != NULL)
 		kmem_free(priv, node->gfs_size);
 	vp->v_data = NULL;
 	return (0);
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 175202)
@@ -1,1119 +1,1119 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/gfs.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/mount.h>
 
 /*
  * One entry in a snapshot directory's AVL tree (zfsctl_snapdir_t.sd_snaps).
  */
 typedef struct {
 	char		*se_name;	/* snapshot name; AVL sort key */
 	vnode_t		*se_root;	/* vnode the snapshot is mounted on */
 	avl_node_t	se_node;	/* linkage in sd_snaps */
 } zfs_snapentry_t;
 
 /*
  * AVL comparator for zfs_snapentry_t: orders entries by se_name and
  * normalizes the strcmp() result to -1, 0 or 1.
  */
 static int
 snapentry_compare(const void *a, const void *b)
 {
 	const zfs_snapentry_t *lhs = a;
 	const zfs_snapentry_t *rhs = b;
 	int order = strcmp(lhs->se_name, rhs->se_name);
 
 	return ((order > 0) - (order < 0));
 }
 
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 
 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 
 /*
  * Private data of every .zfs vnode.  The gfs_dir_t must stay the first
  * member: the same v_data pointer is used both as gfs_dir_t and as
  * zfsctl_node_t in this file.
  */
 typedef struct zfsctl_node {
 	gfs_dir_t	zc_gfs_private;
 	uint64_t	zc_id;		/* object id reported by zfsctl_common_fid() */
 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
 } zfsctl_node_t;
 
 /*
  * Private data of the 'snapshot' directory vnode: a zfsctl_node_t plus the
  * set of snapshot entries that have been looked up.
  */
 typedef struct zfsctl_snapdir {
 	zfsctl_node_t	sd_node;
 	kmutex_t	sd_lock;	/* protects sd_snaps */
 	avl_tree_t	sd_snaps;	/* zfs_snapentry_t, keyed by se_name */
 } zfsctl_snapdir_t;
 
 /*
  * Root directory elements.  We have only a single static entry, 'snapshot'.
  */
 static gfs_dirent_t zfsctl_root_entries[] = {
 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
 	{ NULL }
 };
 
 /*
  * include . and .. in the calculation
  *
  * The array above has two elements (the real entry plus the NULL
  * terminator), so the macro evaluates to 3: "snapshot", "." and "..".
  */
 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
     sizeof (gfs_dirent_t)) + 1)
 
 
 /*
  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
  * directories.  This is called from the ZFS init routine, and initializes the
  * vnode ops vectors that we'll be using.
  */
 void
 zfsctl_init(void)
 {
 	/*
 	 * Intentionally empty: the vop_vector tables in this file are
 	 * statically initialized, so there is nothing to set up at runtime.
 	 */
 }
 
 void
 zfsctl_fini(void)
 {
 	/* Intentionally empty: zfsctl_init() sets nothing up to tear down. */
 }
 
 /*
  * Return the inode number associated with the 'snapshot' directory.
  */
 /* ARGSUSED */
 static ino64_t
 zfsctl_root_inode_cb(vnode_t *vp, int index)
 {
 	/* zfsctl_root_entries has a single real entry, at index 0. */
 	ASSERT(index == 0);
 	return (ZFSCTL_INO_SNAPDIR);
 }
 
 /*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
  * therefore checks against a vfs_count of 2 instead of 1.  This reference
  * is removed when the ctldir is destroyed in the unmount.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	vnode_t *vp, *rvp;
 	zfsctl_node_t *zcp;
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	/* Build the .zfs vnode as a GFS root with the static entry table. */
 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
 	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
 	zcp = vp->v_data;
 	zcp->zc_id = ZFSCTL_INO_ROOT;
 
 	/*
 	 * The filesystem's root vnode is fetched only to copy its creation
 	 * time into zc_cmtime, and is released again right afterwards.
 	 */
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
 	VN_URELE(rvp);
 
 	/*
 	 * We're only faking the fact that we have a root of a filesystem for
 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
 	 * for us.
 	 */
 	vp->v_vflag &= ~VV_ROOT;
 
 	zfsvfs->z_ctldir = vp;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  * There might still be more references if we were force unmounted, but only
  * new zfs_inactive() calls can occur and they don't reference .zfs
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	/* Drop the reference stored by zfsctl_create() and forget the vnode. */
 	VN_RELE(zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it.
  */
 vnode_t *
 zfsctl_root(znode_t *zp)
 {
 	vnode_t *ctldir;
 
 	ASSERT(zfs_has_ctldir(zp));
 
 	/* Take the hold on behalf of the caller before returning the vnode. */
 	ctldir = zp->z_zfsvfs->z_ctldir;
 	VN_HOLD(ctldir);
 	return (ctldir);
 }
 
 /*
  * Common open routine.  Disallow any write access.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	/* Any open that requests write access is refused. */
 	return ((ap->a_mode & FWRITE) ? EACCES : 0);
 }
 
 /*
  * Common close routine.  Nothing to do here.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	/* No per-open state is kept, so close always succeeds. */
 	return (0);
 }
 
 /*
  * Common access routine.  Disallow writes.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	/* Only the write bit is examined; all other access is allowed. */
 	return ((ap->a_mode & VWRITE) ? EACCES : 0);
 }
 
 /*
  * Common getattr function.  Fills in the attributes that are the same for
  * every .zfs node; callers set va_nodeid, va_nlink and va_size themselves.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	zfsctl_node_t	*node = vp->v_data;
 	timestruc_t	current;
 
 	/* Ownership, device and sequence number are all zero. */
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 
 	/* Read and search permission for everyone, no write permission. */
 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 	    S_IROTH | S_IXOTH;
 	vap->va_type = VDIR;
 
 	/*
 	 * We live in the now (for atime).
 	 */
 	gethrestime(&current);
 	vap->va_atime = current;
 
 	/* Birth, change and modification time all come from zc_cmtime. */
 	vap->va_birthtime = node->zc_cmtime;
 	vap->va_ctime = node->zc_cmtime;
 	vap->va_mtime = node->zc_cmtime;
 
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 }
 
 /*
  * Build a short file id for a .zfs node: the node's zc_id as the object
  * number and a generation number of 0.
  */
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_node_t	*node = vp->v_data;
 	fid_t		*fidp = (void *)ap->a_fid;
 	zfid_short_t	*zfid = (zfid_short_t *)fidp;
 	uint64_t	id = node->zc_id;
 	int		b;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* The length is stored through both views of the same buffer. */
 	fidp->fid_len = SHORT_FID_LEN;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	/* Object id, least-significant byte first. */
 	for (b = 0; b < sizeof (zfid->zf_object); b++)
 		zfid->zf_object[b] = (uint8_t)(id >> (8 * b));
 
 	/* .zfs znodes always have a generation number of 0 */
 	for (b = 0; b < sizeof (zfid->zf_gen); b++)
 		zfid->zf_gen[b] = 0;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Common reclaim routine: tears down the VM object and detaches the private
  * data pointer from the vnode.  The private data itself is not freed here.
  */
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 	/* v_data is cleared under the vnode interlock. */
 	VI_LOCK(vp);
 	vp->v_data = NULL;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  */
 
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 /*
  * Get root directory attributes: the common .zfs attributes plus the
  * fixed node id and entry count of the .zfs directory itself.
  */
 /* ARGSUSED */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *attrs = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	ZFS_ENTER(zfsvfs);
 
 	zfsctl_common_getattr(vp, attrs);
 	attrs->va_nodeid = ZFSCTL_INO_ROOT;
 	attrs->va_size = NROOT_ENTRIES;
 	attrs->va_nlink = NROOT_ENTRIES;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Special case the handling of "..".
  */
 /* ARGSUSED */
 /*
  * Look up 'nm' in the .zfs directory 'dvp'.  ".." resolves to the root of
  * the filesystem; everything else goes through gfs_dir_lookup().  The pnp,
  * flags, rdir and cr arguments are not used by this implementation.
  */
 int
 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
     int flags, vnode_t *rdir, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (strcmp(nm, "..") == 0) {
 		/*
 		 * VFS_ROOT() is asked for an exclusively locked vnode, which
 		 * is unlocked again here on success.
 		 */
 		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
 		if (err == 0)
 			VOP_UNLOCK(*vpp, 0, curthread);
 	} else {
 		err = gfs_dir_lookup(dvp, nm, vpp);
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (err);
 }
 
 /*
  * Special case the handling of "..".
  */
 /* ARGSUSED */
 /*
  * VOP_LOOKUP() wrapper around zfsctl_root_lookup(): copies the component
  * name into a NUL-terminated buffer, refuses RENAME/CREATE on the last
  * component, and locks the resulting vnode unless the name is ".".
  */
 int
 zfsctl_root_lookup_vop(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	char nm[NAME_MAX + 1];
 	int err;
 
 	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
 		return (EOPNOTSUPP);
 
 	/*
 	 * NOTE(review): the length is only checked by ASSERT; strlcpy() is
 	 * bounded by cn_namelen + 1, not by sizeof (nm).  Presumably the
 	 * caller never passes a component longer than NAME_MAX -- confirm.
 	 */
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
 	/* "." is the only name for which the result is left unlocked here. */
 	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 
 	return (err);
 }
 
 /*
  * Vnode operations for the .zfs directory itself.  Anything not listed
  * falls through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_root_lookup_vop,
 	.vop_inactive =	gfs_vop_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Build the full snapshot name "<dataset>@<name>" in zname, where
  * <dataset> is the name of the objset backing vp's filesystem.  'len' is
  * the size of the zname buffer.  Returns ENAMETOOLONG when the result
  * (including the terminating NUL) would not fit, 0 otherwise.
  */
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	dmu_objset_name(zfsvfs->z_os, zname);
 
 	/* dataset name + '@' + snapshot name must leave room for the NUL */
 	if (strlen(zname) + 1 + strlen(name) >= len)
 		return (ENAMETOOLONG);
 
 	(void) strcat(zname, "@");
 	(void) strcat(zname, name);
 	return (0);
 }
 
 /*
  * Unmount the snapshot called 'name' from the snapshot directory 'dvp' and
  * drop its entry from the AVL tree.  The caller must hold sd_lock.
  * Returns ENOENT if no such entry exists, or the error from
  * vn_vfswlock()/dounmount().  The 'cr' argument is not used.
  */
 static int
 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	zfs_snapentry_t search, *sep;
 	struct vop_inactive_args ap;
 	avl_index_t where;
 	int err;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 
 	/* Only se_name is consulted by snapentry_compare(). */
 	search.se_name = (char *)name;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
 		return (ENOENT);
 
 	ASSERT(vn_ismntpt(sep->se_root));
 
 	/* this will be dropped by dounmount() */
 	if ((err = vn_vfswlock(sep->se_root)) != 0)
 		return (err);
 
 	/*
 	 * NOTE(review): zfsctl_umount_snapshots() wraps its dounmount() call
 	 * in Giant; this call does not -- confirm whether that is intended.
 	 */
 	err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
 	if (err)
 		return (err);
 	ASSERT(sep->se_root->v_count == 1);
 	/* Only a_vp is read by gfs_vop_inactive(); a_td is left unset. */
 	ap.a_vp = sep->se_root;
 	gfs_vop_inactive(&ap);
 
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	kmem_free(sep, sizeof (zfs_snapentry_t));
 
 	return (0);
 }
 
 #if 0
 /*
  * (Compiled out.)  Re-key a snapshot entry under its new name 'nm' and
  * rewrite the tail of the mounted filesystem's mountpoint and resource
  * strings to match.  The caller must hold sd_lock.
  */
 static void
 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
 {
 	avl_index_t where;
 	vfs_t *vfsp;
 	refstr_t *pathref;
 	char newpath[MAXNAMELEN];
 	char *tail;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 	ASSERT(sep != NULL);
 
 	vfsp = vn_mountedvfs(sep->se_root);
 	ASSERT(vfsp != NULL);
 
 	vfs_lock_wait(vfsp);
 
 	/*
 	 * Change the name in the AVL tree.
 	 */
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	/*
 	 * Change the current mountpoint info:
 	 * 	- update the tail of the mntpoint path
 	 *	- update the tail of the resource path
 	 */
 	pathref = vfs_getmntpoint(vfsp);
 	/*
 	 * NOTE(review): strncpy() does not NUL-terminate when the source is
 	 * at least sizeof (newpath) bytes long -- confirm the source can
 	 * never be that long before re-enabling this code.
 	 */
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setmntpoint(vfsp, newpath);
 
 	pathref = vfs_getresource(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setresource(vfsp, newpath);
 
 	vfs_unlock(vfsp);
 }
 #endif
 
 #if 0
 /*
  * (Compiled out.)  Rename snapshot 'snm' to 'tnm' within the snapshot
  * directory.  Both names must live in the same directory, and the source
  * must have an entry in the AVL tree, otherwise ENOENT is returned.
  */
 static int
 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
     cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = sdvp->v_data;
 	zfs_snapentry_t search, *sep;
 	avl_index_t where;
 	char from[MAXNAMELEN], to[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(from, cr);
 	if (err)
 		return (err);
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
 	if (sdvp != tdvp)
 		return (EINVAL);
 
 	/* Renaming to the same name is a successful no-op. */
 	if (strcmp(snm, tnm) == 0)
 		return (0);
 
 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	search.se_name = (char *)snm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
 		mutex_exit(&sdp->sd_lock);
 		return (ENOENT);
 	}
 
 	/* The in-memory entry is only re-keyed once the rename succeeded. */
 	err = dmu_objset_rename(from, to, B_FALSE);
 	if (err == 0)
 		zfsctl_rename_snap(sdp, sep, tnm);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 #if 0
 /* ARGSUSED */
 /*
  * (Compiled out.)  Remove snapshot 'name': unmount it via
  * zfsctl_unmount_snap() and then destroy the underlying objset, all under
  * sd_lock.  The 'cwd' argument is not used.
  */
 static int
 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	char snapname[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(snapname, cr);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	err = zfsctl_unmount_snap(dvp, name, 0, cr);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		return (err);
 	}
 
 	err = dmu_objset_destroy(snapname);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  */
 /* ARGSUSED */
 /*
  * Returns 0 with *vpp held and exclusively locked on success (for names
  * other than the dot entries handled by gfs_lookup_dot()), or an errno
  * with *vpp set to NULL.
  *
  * Two fixes relative to the previous version of this function:
  *  - the "unmounted behind our backs" path now builds 'snapname' before
  *    jumping to domount; it used to pass the uninitialized buffer to
  *    domount();
  *  - the final return path now calls ZFS_EXIT(), matching the
  *    ZFS_ENTER() above, as every other return path already did.
  */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	char nm[NAME_MAX + 1];
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	objset_t *snap;
 	char snapname[MAXNAMELEN];
 	char *mountpoint;
 	zfs_snapentry_t *sep, search;
 	size_t mountpoint_len;
 	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	/* Work on a NUL-terminated copy of the component name. */
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
 		return (0);
 
 	*vpp = NULL;
 
 	/*
 	 * If we get a recursive call, that means we got called
 	 * from the domount() code while it was trying to look up the
 	 * spec (which looks like a local path for zfs).  We need to
 	 * add some flag to domount() to tell it not to do this lookup.
 	 */
 	if (MUTEX_HELD(&sdp->sd_lock))
 		return (ENOENT);
 
 	ZFS_ENTER(zfsvfs);
 
 	mutex_enter(&sdp->sd_lock);
 	search.se_name = (char *)nm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
 		*vpp = sep->se_root;
 		VN_HOLD(*vpp);
 		if ((*vpp)->v_mountedhere == NULL) {
 			/*
 			 * The snapshot was unmounted behind our backs,
 			 * try to remount it.  domount() needs the full
 			 * snapshot name, which has not been built yet on
 			 * this path.
 			 */
 			err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN,
 			    snapname);
 			if (err != 0)
 				goto out;
 			goto domount;
 		}
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * The requested snapshot is not currently mounted, look it up.
 	 */
 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (ENOENT);
 	}
 
 	/* Create the mount point vnode and remember it under this name. */
 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
 	VN_HOLD(*vpp);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	dmu_objset_close(snap);
 domount:
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
 	err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	/* FreeBSD: This line was moved from below to avoid a lock recursion. */
 	if (err == 0)
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 out:
 	mutex_exit(&sdp->sd_lock);
 	/* Balance the ZFS_ENTER() above on this path as well. */
 	ZFS_EXIT(zfsvfs);
 
 	/*
 	 * If we had an error, drop our hold on the vnode and
 	 * zfsctl_snapshot_inactive() will clean up.
 	 */
 	if (err) {
 		VN_RELE(*vpp);
 		*vpp = NULL;
 	}
 	return (err);
 }
 
 /* ARGSUSED */
 /*
  * Dynamic readdir callback for the 'snapshot' directory.  Fetches the
  * snapshot that follows *offp and fills in dp->d_name, dp->d_ino and
  * *nextp.  Sets *eofp and returns 0 when there are no more snapshots.
  *
  * Errors other than ENOENT from dmu_snapshot_list_next() are returned to
  * the caller; previously they fell through and the uninitialized
  * 'snapname' and 'id' were copied into the directory entry.
  */
 static int
 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
     offset_t *offp, offset_t *nextp, void *data)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	char snapname[MAXNAMELEN];
 	uint64_t id, cookie;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	cookie = *offp;
 	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
 	    &cookie);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		if (error == ENOENT) {
 			/* End of the snapshot list is not an error. */
 			*eofp = 1;
 			return (0);
 		}
 		return (error);
 	}
 
 	(void) strcpy(dp->d_name, snapname);
 	dp->d_ino = ZFSCTL_INO_SNAP(id);
 	*nextp = cookie;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Constructor for the 'snapshot' directory (the static entry of .zfs).
  * Creates the GFS directory vnode under 'pvp', gives it the fixed snapdir
  * id and the parent's ctime/mtime, and initializes the lock and the empty
  * AVL tree of snapshot entries.
  */
 vnode_t *
 zfsctl_mknode_snapdir(vnode_t *pvp)
 {
 	zfsctl_snapdir_t *snapdir;
 	zfsctl_node_t *parent;
 	vnode_t *vp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
 	    zfsctl_snapdir_readdir_cb, NULL);
 
 	snapdir = vp->v_data;
 	parent = pvp->v_data;
 	snapdir->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
 	snapdir->sd_node.zc_cmtime = parent->zc_cmtime;
 
 	mutex_init(&snapdir->sd_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&snapdir->sd_snaps, snapentry_compare,
 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
 
 	return (vp);
 }
 
 /* ARGSUSED */
 /*
  * Attributes of the 'snapshot' directory: the common .zfs attributes, the
  * GFS inode number, and a size/link count of the number of entries in the
  * AVL tree plus two.
  */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *attrs = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_snapdir_t *snapdir = vp->v_data;
 
 	ZFS_ENTER(zfsvfs);
 
 	zfsctl_common_getattr(vp, attrs);
 	attrs->va_nodeid = gfs_file_inode(vp);
 	attrs->va_size = avl_numnodes(&snapdir->sd_snaps) + 2;
 	attrs->va_nlink = attrs->va_size;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /* ARGSUSED */
 /*
  * Inactive routine for the 'snapshot' directory.  When gfs_dir_inactive()
  * hands back the private data, the snapdir-specific lock and AVL tree are
  * destroyed and the zfsctl_snapdir_t is freed.  Always returns 0.
  */
 static int
 zfsctl_snapdir_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	zfsctl_snapdir_t *snapdir = vp->v_data;
 	void *priv;
 
 	priv = gfs_dir_inactive(vp);
 	if (priv == NULL)
 		return (0);
 
 	ASSERT(avl_numnodes(&snapdir->sd_snaps) == 0);
 	mutex_destroy(&snapdir->sd_lock);
 	avl_destroy(&snapdir->sd_snaps);
 	kmem_free(priv, sizeof (zfsctl_snapdir_t));
 	return (0);
 }
 
 /*
  * Vnode operations for the 'snapshot' directory.  It differs from
  * zfsctl_ops_root in its getattr, lookup and inactive routines.
  */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_inactive =	zfsctl_snapdir_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Create the vnode that serves as the mount point for one snapshot, as a
  * GFS directory under 'pvp' with no static entries and no callbacks, and
  * record the snapshot's objset id in it.
  */
 static vnode_t *
 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 {
 	vnode_t *vp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
 	((zfsctl_node_t *)vp->v_data)->zc_id = objset;
 
 	return (vp);
 }
 
 /*
  * Inactive routine for a snapshot mount point vnode.  Removes the vnode's
  * entry from the parent snapshot directory's AVL tree and then disposes
  * of the vnode through gfs_vop_inactive().  If the vnode has been
  * re-referenced in the meantime (v_count > 1) nothing is torn down.
  */
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	struct vop_inactive_args iap;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int locked;
 	vnode_t *dvp;
 
 	/* The parent of a snapshot vnode is the snapshot directory. */
 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
 	sdp = dvp->v_data;
 	VOP_UNLOCK(dvp, 0, ap->a_td);
 
 	/*
 	 * sd_lock may already be held by this thread; only take (and later
 	 * drop) it if it is not.
 	 */
 	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
 		mutex_enter(&sdp->sd_lock);
 
 	if (vp->v_count > 1) {
 		/*
 		 * NOTE(review): unlike the normal path below, this early
 		 * return does not VN_RELE(dvp) -- confirm whether the hold
 		 * obtained by gfs_dir_lookup() is leaked here.
 		 */
 		if (!locked)
 			mutex_exit(&sdp->sd_lock);
 		return (0);
 	}
 	ASSERT(!vn_ismntpt(vp));
 
 	/* Linear scan: the tree is keyed by name, not by vnode. */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		if (sep->se_root == vp) {
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 			break;
 		}
 		sep = next;
 	}
 	ASSERT(sep != NULL);
 
 	if (!locked)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
 	 * This is safe to do because once this entry has been removed
 	 * from the AVL tree, it can't be found again, so cannot become
 	 * "active".  If we lookup the same name again we will end up
 	 * creating a new vnode.
 	 */
 	iap.a_vp = vp;
 	return (gfs_vop_inactive(&iap));
 }
 
 /*
  * Take a hold on *vpp and, if a filesystem is mounted on it, traverse()
  * into that filesystem with the given lock type.  Returns ENOENT when
  * nothing is mounted there.  The hold is taken in every case; pair each
  * call with zfsctl_traverse_end().  The 'td' argument is not used.
  */
 static int
 zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td)
 {
 	int error = ENOENT;
 
 	VN_HOLD(*vpp);
 	/* Snapshot should be already mounted, but just in case. */
 	if (vn_mountedvfs(*vpp) != NULL)
 		error = traverse(vpp, lktype);
 	return (error);
 }
 
 /*
  * Undo zfsctl_traverse_begin(): on error only the hold is dropped, on
  * success the vnode is released with vput().
  */
 static void
 zfsctl_traverse_end(vnode_t *vp, int err)
 {
 
 	if (err != 0)
 		VN_RELE(vp);
 	else
 		vput(vp);
 }
 
 static int
 zfsctl_snapshot_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td);
 	if (err == 0)
 		err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 static int
 zfsctl_snapshot_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread);
 	if (err == 0)
 		err = VOP_VPTOFH(vp, (void *)ap->a_fid);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  *
  * Only inactive, reclaim, getattr and fid are provided; getattr and fid
  * forward to the filesystem mounted on top of the vnode.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfsctl_snapshot_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_getattr =	zfsctl_snapshot_getattr,
 	.vop_fid =	zfsctl_snapshot_fid,
 };
 
 /*
  * Find the mounted snapshot whose objset id is 'objsetid' below the .zfs
  * directory of 'vfsp' and return its zfsvfs_t through *zfsvfsp.
  *
  * Returns 0 on success, the error from the "snapshot" lookup or from
  * traverse(), or EINVAL when no entry matches or when traverse() ends on
  * the mount point vnode itself.
  */
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *vp;
 	zfsctl_snapdir_t *sdp;
 	zfsctl_node_t *zcp;
 	zfs_snapentry_t *sep;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, kcred);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 	/* Linear scan: the tree is keyed by name, not by objset id. */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		vp = sep->se_root;
 		zcp = vp->v_data;
 		if (zcp->zc_id == objsetid)
 			break;
 
 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
 	}
 
 	if (sep != NULL) {
 		VN_HOLD(vp);
 		error = traverse(&vp, LK_SHARED | LK_RETRY);
 		if (error == 0) {
 			/*
 			 * An unchanged vp means traverse() did not move off
 			 * the mount point vnode.
 			 */
 			if (vp == sep->se_root)
 				error = EINVAL;
 			else
 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
 		}
 		mutex_exit(&sdp->sd_lock);
 		if (error == 0)
 			VN_URELE(vp);
 		else
 			VN_RELE(vp);
 	} else {
 		error = EINVAL;
 		mutex_exit(&sdp->sd_lock);
 	}
 
 	VN_RELE(dvp);
 
 	return (error);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	struct vop_inactive_args ap;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *svp;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, cr);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 
 	/*
 	 * NOTE(review): snapshot vnodes use zfsctl_ops_snapshot, whose
 	 * inactive routine in this file is zfsctl_snapshot_inactive(); the
 	 * references to zfsctl_snapdir_inactive() in the comments below look
 	 * stale -- confirm.
 	 */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		svp = sep->se_root;
 		/* Fetch the successor first; sep may be freed below. */
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		/*
 		 * If this snapshot is not mounted, then it must
 		 * have just been unmounted by somebody else, and
 		 * will be cleaned up by zfsctl_snapdir_inactive().
 		 */
 		if (vn_ismntpt(svp)) {
 			if ((error = vn_vfswlock(svp)) != 0)
 				goto out;
 
 			/*
 			 * Increase usecount, so dounmount() won't vrele() it
 			 * to 0 and call zfsctl_snapdir_inactive().
 			 */
 			VN_HOLD(svp);
 			/* Note: vfsp is reused for the snapshot's mount. */
 			vfsp = vn_mountedvfs(svp);
 			mtx_lock(&Giant);
 			error = dounmount(vfsp, fflags, curthread);
 			mtx_unlock(&Giant);
 			if (error != 0) {
 				VN_RELE(svp);
 				goto out;
 			}
 
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 
 			/*
 			 * We can't use VN_RELE(), as that will try to
 			 * invoke zfsctl_snapdir_inactive(), and that
 			 * would lead to an attempt to re-grab the sd_lock.
 			 */
 			ASSERT3U(svp->v_count, ==, 1);
 			ap.a_vp = svp;
 			gfs_vop_inactive(&ap);
 		}
 		sep = next;
 	}
 out:
 	mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	return (error);
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	(revision 175202)
@@ -1,430 +1,430 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/spa.h>
 #include <sys/zil.h>
 #include <sys/byteorder.h>
 #include <sys/stat.h>
 #include <sys/acl.h>
 #include <sys/atomic.h>
 #include <sys/cred.h>
 #include <sys/namei.h>
 
 /*
  * Functions to replay ZFS intent log (ZIL) records
  * The functions are called through a function vector (zfs_replay_vector)
  * which is indexed by the transaction type.
  */
 
 /*
  * Build a vattr from the raw 64-bit fields carried in a log record.
  * The structure is reset with VATTR_NULL() first; the mask, ids and
  * device number are then narrowed or converted to their vattr types.
  */
 static void
 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
 	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
 {
 	VATTR_NULL(vap);
 
 	/* Identity of the object. */
 	vap->va_nodeid = nodeid;
 	vap->va_rdev = zfs_cmpldev(rdev);
 
 	/* Ownership. */
 	vap->va_gid = (gid_t)gid;
 	vap->va_uid = (uid_t)uid;
 
 	/* File type and permission bits both come from 'mode'. */
 	vap->va_mode = mode & MODEMASK;
 	vap->va_type = IFTOVT(mode);
 
 	vap->va_mask = (uint_t)mask;
 }
 
 /*
  * Replay handler for records that cannot be replayed; it ignores its
  * arguments and always returns ENOTSUP.  Used for slot 0 of
  * zfs_replay_vector ("no such transaction type").
  */
 /* ARGSUSED */
 static int
 zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
 {
 	return (ENOTSUP);
 }
 
 /*
  * Replay a TX_CREATE, TX_MKDIR, TX_MKXATTR or TX_SYMLINK record.
  *
  * The parent directory is looked up by lr_doid and locked exclusively
  * while the operation matching lrc_txtype is issued.  Any other
  * transaction type yields ENOTSUP.  Returns 0 or an errno value.
  */
 static int
 zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_create_t */
 	char *link;			/* symlink content follows name */
 	znode_t *dzp;
 	vnode_t *vp = NULL;
 	vattr_t va;
 	struct componentname cn;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
 	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
 
 	/*
 	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 	 * eventually end up in zfs_mknode(), which assigns the object's
 	 * creation time and generation number.  The generic VOP_CREATE()
 	 * doesn't have either concept, so we smuggle the values inside
 	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
 	 */
 	ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
 	va.va_nblocks = lr->lr_gen;
 
 	/*
 	 * NOTE(review): cn is not zeroed and cn_namelen is not set here,
 	 * unlike in zfs_replay_remove() -- confirm the VOPs below only
 	 * read the four fields assigned.
 	 */
 	cn.cn_nameptr = name;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
 	cn.cn_flags = SAVENAME;
 
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_CREATE:
 		error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
 		break;
 	case TX_MKDIR:
 		error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
 		break;
 	case TX_MKXATTR:
 		error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
 		break;
 	case TX_SYMLINK:
 		/* The link target is the string right after the name. */
 		link = name + strlen(name) + 1;
 		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
 		break;
 	default:
 		error = ENOTSUP;
 	}
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 
 	/* On success, unlock and release the newly created vnode. */
 	if (error == 0 && vp != NULL) {
 		VOP_UNLOCK(vp, 0, curthread);
 		VN_RELE(vp);
 	}
 
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_REMOVE or TX_RMDIR record.
  *
  * The directory is found by lr_doid and the victim by a DELETE lookup
  * of the name stored after the record.  Any other transaction type
  * yields ENOTSUP.  Returns 0 or an errno value.
  */
 static int
 zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
 	znode_t *dzp;
 	struct componentname cn;
 	vnode_t *vp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	/* Describe a last-component DELETE lookup of 'name'. */
 	bzero(&cn, sizeof(cn));
 	cn.cn_nameptr = name;
 	cn.cn_namelen = strlen(name);
 	cn.cn_nameiop = DELETE;
 	cn.cn_flags = ISLASTCN | SAVENAME;
 	cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
 	if (error != 0) {
 		VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 		goto fail;
 	}
 
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_REMOVE:
 		error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
 		break;
 	case TX_RMDIR:
 		error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
 		break;
 	default:
 		error = ENOTSUP;
 	}
 	/* Drop the looked-up vnode, then unlock the directory. */
 	vput(vp);
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 fail:
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_LINK record: link the object lr_link_obj into directory
  * lr_doid under the name stored after the record.  Both vnodes are
  * locked exclusively around VOP_LINK().  Returns 0 or an errno value.
  */
 static int
 zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
 {
 	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
 	znode_t *dzp, *zp;
 	struct componentname cn;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 		return (error);
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
 		/* Undo the hold on the directory before bailing out. */
 		VN_RELE(ZTOV(dzp));
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): cn is not zeroed and cn_namelen is not set --
 	 * confirm VOP_LINK() only reads the fields assigned here.
 	 */
 	cn.cn_nameptr = name;
 	cn.cn_cred = kcred;
 	cn.cn_thread = curthread;
 	cn.cn_flags = SAVENAME;
 
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
-	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
 	/* Unlock in the reverse of the locking order. */
 	VOP_UNLOCK(ZTOV(zp), 0, curthread);
 	VOP_UNLOCK(ZTOV(dzp), 0, curthread);
 
 	VN_RELE(ZTOV(zp));
 	VN_RELE(ZTOV(dzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_RENAME record.
  *
  * The source and target directories are found by lr_sdoid and
  * lr_tdoid; the source and target names are two consecutive
  * NUL-terminated strings stored after the record.  The source entry
  * must exist; a missing target (EJUSTRETURN from the lookup) is
  * accepted.  Returns 0 or an errno value.
  */
 static int
 zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
 {
 	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
 	char *tname = sname + strlen(sname) + 1;
 	znode_t *sdzp, *tdzp;
 	struct componentname scn, tcn;
 	vnode_t *svp, *tvp;
 	kthread_t *td = curthread;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
 		return (error);
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
 		VN_RELE(ZTOV(sdzp));
 		return (error);
 	}
 
 	/* NULL marks "nothing to release" for the fail path below. */
 	svp = tvp = NULL;
 
 	/* Look up the source entry; both vnodes are unlocked afterwards. */
 	bzero(&scn, sizeof(scn));
 	scn.cn_nameptr = sname;
 	scn.cn_namelen = strlen(sname);
 	scn.cn_nameiop = DELETE;
 	scn.cn_flags = ISLASTCN | SAVENAME;
 	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	scn.cn_cred = kcred;
 	scn.cn_thread = td;
-	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
 	VOP_UNLOCK(ZTOV(sdzp), 0, td);
 	if (error != 0)
 		goto fail;
 	VOP_UNLOCK(svp, 0, td);
 
 	/* Look up the target; the target directory stays locked on success. */
 	bzero(&tcn, sizeof(tcn));
 	tcn.cn_nameptr = tname;
 	tcn.cn_namelen = strlen(tname);
 	tcn.cn_nameiop = RENAME;
 	tcn.cn_flags = ISLASTCN | SAVENAME;
 	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	tcn.cn_cred = kcred;
 	tcn.cn_thread = td;
-	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
 	if (error == EJUSTRETURN)
 		tvp = NULL;
 	else if (error != 0) {
 		VOP_UNLOCK(ZTOV(tdzp), 0, td);
 		goto fail;
 	}
 
 	/*
 	 * NOTE(review): the success path returns without any VN_RELE or
 	 * unlock; presumably VOP_RENAME() consumes the references and
 	 * locks on all four vnodes -- confirm.
 	 */
 	error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
 	return (error);
 fail:
 	if (svp != NULL)
 		vrele(svp);
 	if (tvp != NULL)
 		vrele(tvp);
 	VN_RELE(ZTOV(tdzp));
 	VN_RELE(ZTOV(sdzp));
 
 	return (error);
 }
 
 /*
  * Replay a TX_WRITE record: write lr_length bytes of the payload that
  * follows the record to object lr_foid at offset lr_offset.  A target
  * that no longer exists is not an error.  Returns 0 or an errno value.
  */
 static int
 zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 {
 	char *data = (char *)(lr + 1);	/* payload sits right after the record */
 	znode_t	*zp;
 	ssize_t resid;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);
 	if (error == ENOENT) {
 		/*
 		 * Writes may be logged out of order, so the file can
 		 * already be gone; silently drop the write.
 		 */
 		return (0);
 	}
 	if (error != 0)
 		return (error);
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
 	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
 	VN_RELE(ZTOV(zp));
 
 	return (error);
 }
 
 /*
  * Replay handler for TX_TRUNCATE.  It performs no replay: it logs a
  * message through ZFS_LOG() and returns EOPNOTSUPP.
  */
 static int
 zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
 {
 
 	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
 	return (EOPNOTSUPP);
 }
 
 /*
  * Replay a TX_SETATTR record: apply the logged mask, mode, uid, gid,
  * size, atime and mtime to object lr_foid through VOP_SETATTR().
  * A target that no longer exists is not an error.
  * Returns 0 or an errno value.
  */
 static int
 zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
 {
 	znode_t *zp;
 	vattr_t va;
 	vnode_t *vp;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 		/*
 		 * As we can log setattrs out of order, it's possible the
 		 * file has been removed. In this case just drop the setattr
 		 * and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 		return (error);
 	}
 
 	/* rdev is passed as 0; the node id is the object itself. */
 	zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
 	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
 
 	va.va_size = lr->lr_size;
 	ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
 	ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
 
 	vp = ZTOV(zp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_SETATTR(vp, &va, kcred, curthread);
 	VOP_UNLOCK(vp, 0, curthread);
 	VN_RELE(vp);
 
 	return (error);
 }
 
 /*
  * Replay a TX_ACL record for object lr_foid.  The ACE array follows
  * the record.  Unless TODO is defined, the ACL is not applied and
  * EOPNOTSUPP is returned for an existing object.  A target that no
  * longer exists is not an error.
  */
 static int
 zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
 {
 	ace_t *ace = (ace_t *)(lr + 1);	/* ACEs sit right after the record */
 #ifdef TODO
 	vsecattr_t vsa;
 #endif
 	znode_t *zp;
 	int error;
 
 	if (byteswap) {
 		byteswap_uint64_array(lr, sizeof (*lr));
 		zfs_ace_byteswap(ace, lr->lr_aclcnt);
 	}
 
 	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);
 	if (error == ENOENT) {
 		/*
 		 * ACL changes may be logged out of order, so the file
 		 * can already be gone; silently drop the record.
 		 */
 		return (0);
 	}
 	if (error != 0)
 		return (error);
 
 #ifdef TODO
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
 	vsa.vsa_aclentp = ace;
 
 	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
 #else
 	error = EOPNOTSUPP;
 #endif
 
 	VN_RELE(ZTOV(zp));
 
 	return (error);
 }
 
 /*
  * Callback vectors for replaying records.
  *
  * The table has TX_MAX_TYPE slots and is indexed by transaction type;
  * the comment on each entry names the type that slot is meant for.
  * Slot 0 is not a valid type and maps to zfs_replay_error().
  */
 zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_error,	/* 0 no such transaction type */
 	zfs_replay_create,	/* TX_CREATE */
 	zfs_replay_create,	/* TX_MKDIR */
 	zfs_replay_create,	/* TX_MKXATTR */
 	zfs_replay_create,	/* TX_SYMLINK */
 	zfs_replay_remove,	/* TX_REMOVE */
 	zfs_replay_remove,	/* TX_RMDIR */
 	zfs_replay_link,	/* TX_LINK */
 	zfs_replay_rename,	/* TX_RENAME */
 	zfs_replay_write,	/* TX_WRITE */
 	zfs_replay_truncate,	/* TX_TRUNCATE */
 	zfs_replay_setattr,	/* TX_SETATTR */
 	zfs_replay_acl,		/* TX_ACL */
 };
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 175202)
@@ -1,1021 +1,1021 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 
 /* Mutex named "zfs_debug", set up through MTX_SYSINIT. */
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 /* Root of the vfs.zfs sysctl tree. */
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 /* Debug level, exposed as the vfs.zfs.debug tunable and sysctl. */
 int zfs_debug_level = 0;
 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
     "Debug level");
 
 /* Forward declarations for the functions defined later in this file. */
 static int zfs_mount(vfs_t *vfsp, kthread_t *td);
 static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 /* VFS operations table for ZFS. */
 static struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 /* Register the table under the name "zfs" with the VFCF_JAIL flag. */
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
 
 /*
  * Count of active ZFS filesystems.
  * This is necessary to prevent our module from being unloaded
  * after a umount -f.  zfs_domount() bumps it with atomic_add_32()
  * when a mount succeeds.
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*
  * Sync one ZFS filesystem (vfsp != NULL) or every pool (vfsp == NULL).
  * Nothing is written while the kernel is panicking.  Returns 0, or
  * the error from vfs_stdsync() for the single-filesystem case.
  */
 /*ARGSUSED*/
 static int
 zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	/*
 	 * Data integrity is job one.  A compromised kernel must not
 	 * write to the storage pool, so never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	if (vfsp == NULL) {
 		/*
 		 * Sync all ZFS filesystems, as sync(1M) requests.
 		 * Unlike other filesystems, ZFS honors this by waiting
 		 * for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 		return (0);
 	}
 
 	/*
 	 * Sync a specific filesystem.
 	 */
 	zfsvfs = vfsp->vfs_data;
 
 	error = vfs_stdsync(vfsp, waitfor, td);
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 	if (zfsvfs->z_log == NULL)
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	else
 		zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * "atime" property callback: record the new setting in z_atime and
  * mirror it in the mount's MNT_NOATIME flag and atime/noatime options.
  */
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval != TRUE) {
 		/* atime updates switched off */
 		zfsvfs->z_atime = FALSE;
 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 		return;
 	}
 
 	/* atime updates switched on */
 	zfsvfs->z_atime = TRUE;
 	zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 	vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 	vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 }
 
 /*
  * "xattr" property callback: swap the xattr/noxattr mount options.
  * The VFS_XATTR flag itself is only touched when TODO is defined.
  */
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval != TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 #ifdef TODO
 	zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 #endif
 	vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 	vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 }
 
 /*
  * "recordsize" property callback: store the new block size in both
  * z_max_blksz and the mount's vfs_bsize.  A value outside
  * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] or failing ISP2() is replaced
  * by SPA_MAXBLOCKSIZE.
  */
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (!(newval >= SPA_MINBLOCKSIZE &&
 	    newval <= SPA_MAXBLOCKSIZE && ISP2(newval)))
 		newval = SPA_MAXBLOCKSIZE;
 
 	zfsvfs->z_vfs->vfs_bsize = newval;
 	zfsvfs->z_max_blksz = newval;
 }
 
 /*
  * "readonly" property callback: set or clear VFS_RDONLY on the mount
  * and swap the ro/rw mount options to match.
  */
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == 0) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 	zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 	vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 	vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 }
 
 /*
  * "setuid" property callback: set or clear VFS_NOSETUID on the mount
  * and swap the setuid/nosetuid mount options to match.
  */
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval != FALSE) {
 		/* setuid allowed */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 		return;
 	}
 
 	/* setuid disallowed */
 	zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 	vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 	vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 }
 
 /*
  * "exec" property callback: set or clear VFS_NOEXEC on the mount and
  * swap the exec/noexec mount options to match.
  */
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval != FALSE) {
 		/* execution allowed */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 		return;
 	}
 
 	/* execution disallowed */
 	zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 	vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 	vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 }
 
 /* "snapdir" property callback: store the new value in z_show_ctldir. */
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_show_ctldir = newval;
 }
 
 /* "aclmode" property callback: store the new value in z_acl_mode. */
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_acl_mode = newval;
 }
 
 /* "aclinherit" property callback: store the new value in z_acl_inherit. */
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_acl_inherit = newval;
 }
 
 /*
  * Re-apply the temporary properties selected by the mount options
  * currently set on vfsp (used for remounts).  Returns EROFS when "rw"
  * is requested on a snapshot, 0 otherwise.
  */
 static int
 zfs_refresh_properties(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 	/*
 	 * A remount defaults to "rw" unless "ro" is given explicitly;
 	 * a snapshot can never become writable.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
 		readonly_changed_cb(zfsvfs, B_TRUE);
 	else if (!dmu_objset_is_snapshot(zfsvfs->z_os))
 		readonly_changed_cb(zfsvfs, B_FALSE);
 	else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 		return (EROFS);
 
 	/* "nosuid" and "nosetuid" both turn setuid off. */
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL) ||
 	    vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 		setuid_changed_cb(zfsvfs, B_FALSE);
 	else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 		setuid_changed_cb(zfsvfs, B_TRUE);
 
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec_changed_cb(zfsvfs, B_FALSE);
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec_changed_cb(zfsvfs, B_TRUE);
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime_changed_cb(zfsvfs, B_TRUE);
 	} else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime_changed_cb(zfsvfs, B_FALSE);
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr_changed_cb(zfsvfs, B_TRUE);
 	} else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr_changed_cb(zfsvfs, B_FALSE);
 	}
 
 	return (0);
 }
 
 /*
  * Register the dataset property callbacks for this mount and then
  * re-apply any readonly/setuid/exec/xattr overrides that were given
  * as mount options.  On a registration failure every callback is
  * unregistered again and the error is returned; otherwise returns 0.
  */
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	int readonly, do_readonly = FALSE;
 	int setuid, do_setuid = FALSE;
 	int exec, do_exec = FALSE;
 	int xattr, do_xattr = FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * Registering the callbacks destroys any mount options we may
 	 * have.  To allow temporary overrides through mount options,
 	 * remember the current values now and restore them once the
 	 * callbacks are in place.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 		do_readonly = B_TRUE;
 		readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		do_readonly = B_TRUE;
 		readonly = B_FALSE;
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL) ||
 	    vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 		do_setuid = B_TRUE;
 		setuid = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 		do_setuid = B_TRUE;
 		setuid = B_TRUE;
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		do_exec = B_TRUE;
 		exec = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		do_exec = B_TRUE;
 		exec = B_TRUE;
 	}
 
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		do_xattr = B_TRUE;
 		xattr = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		do_xattr = B_TRUE;
 		xattr = B_TRUE;
 	}
 
 	/*
 	 * Register property callbacks, stopping at the first failure.
 	 */
 	ds = dmu_objset_ds(os);
 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "xattr", xattr_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "recordsize", blksz_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "readonly", readonly_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "setuid", setuid_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "exec", exec_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "snapdir", snapdir_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "aclmode", acl_mode_changed_cb, zfsvfs);
 	if (error == 0)
 		error = dsl_prop_register(ds,
 		    "aclinherit", acl_inherit_changed_cb, zfsvfs);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 
 	return (0);
 
 unregister:
 	/*
 	 * Some of these callbacks may never have been registered; that
 	 * is fine, the call simply returns ENOMSG, which is ignored.
 	 */
 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 	    zfsvfs);
 	return (error);
 
 }
 
 /*
  * Mount the dataset named 'osname' on vfsp.
  *
  * Allocates and initialises the per-mount zfsvfs, opens the objset
  * (falling back to read-only when the first open returns EROFS),
  * initialises the filesystem and then either applies the fixed
  * snapshot settings or registers the property callbacks, drains the
  * unlinked set and replays the intent log.
  *
  * Returns 0 on success, in which case zfs_active_fs_count has been
  * incremented.  On failure the zfsvfs is freed and vfsp->vfs_data no
  * longer refers to it.
  */
 static int
 zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
 {
 	cred_t *cr = td->td_ucred;
 	uint64_t recordsize, readonly;
 	int error = 0;
 	int mode;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp = NULL;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	/*
 	 * Initialize the zfs-specific filesystem structure.
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 	zfsvfs->z_vfs = vfsp;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_assign = TXG_NOWAIT;
 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
 
 	if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL)) != 0)
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 
 	if ((error = dsl_prop_get_integer(osname, "readonly", &readonly,
 	    NULL)) != 0)
 		goto out;
 
 	if (readonly)
 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 	else
 		mode = DS_MODE_PRIMARY;
 
 	/* Retry read-only if the objset cannot be opened for writing. */
 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
 	if (error == EROFS) {
 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
 		    &zfsvfs->z_os);
 	}
 
 	if (error)
 		goto out;
 
 	if ((error = zfs_init_fs(zfsvfs, &zp, cr)) != 0)
 		goto out;
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t xattr;
 
 		/* Snapshots get fixed settings instead of callbacks. */
 		ASSERT(mode & DS_MODE_READONLY);
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if ((error = dsl_prop_get_integer(osname, "xattr", &xattr,
 		    NULL)) != 0)
 			goto out;
 		xattr_changed_cb(zfsvfs, xattr);
 		zfsvfs->z_issnap = B_TRUE;
 	} else {
 		error = zfs_register_callbacks(vfsp);
 		if (error)
 			goto out;
 
 		zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 */
 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
 		    zfs_replay_vector);
 
 		if (!zil_disable)
 			zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 	}
 
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		if (zfsvfs->z_os)
 			dmu_objset_close(zfsvfs->z_os);
 		rw_destroy(&zfsvfs->z_um_lock);
 		mutex_destroy(&zfsvfs->z_znodes_lock);
 		/*
 		 * Do not leave the mount pointing at the structure that
 		 * is about to be freed.
 		 */
 		if (vfsp->vfs_data == zfsvfs)
 			vfsp->vfs_data = NULL;
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	} else {
 		atomic_add_32(&zfs_active_fs_count, 1);
 	}
 
 	return (error);
 
 }
 
 /*
  * Unregister every property callback for this mount.  Nothing is
  * done for a snapshot.  Each unregistration is expected to succeed
  * and is checked with VERIFY().
  */
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 	struct dsl_dataset *ds;
 
 	if (dmu_objset_is_snapshot(os))
 		return;
 
 	/*
 	 * Unregister properties.
 	 */
 	ds = dmu_objset_ds(os);
 
 	VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclinherit",
 	    acl_inherit_changed_cb, zfsvfs) == 0);
 }
 
 /*
  * VFS mount entry point.  A remount only refreshes the temporary
  * properties; a fresh mount takes the dataset name from the "from"
  * option (EINVAL if it cannot be fetched) and runs zfs_domount()
  * with Giant dropped.
  */
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp, kthread_t *td)
 {
 	int error;
 	char *from;
 
 	/*
 	 * For a remount, just re-apply the temporary properties that
 	 * correspond to the options set in the current VFS options.
 	 */
 	if ((vfsp->vfs_flag & MS_REMOUNT) != 0)
 		return (zfs_refresh_properties(vfsp));
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL) != 0)
 		return (EINVAL);
 
 	DROP_GIANT();
 	error = zfs_domount(vfsp, from, td);
 	PICKUP_GIANT();
 
 	return (error);
 }
 
 /*
  * Fill in *statp for the filesystem mounted on vfsp: block sizes and
  * counts, object (file) counts, type name, mount names and the
  * maximum name length.  Returns 0.
  */
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
 
 	/*
 	 * We're a zfs filesystem; the mount names are copied from the
 	 * mount's own statfs record.
 	 */
 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 	statp->f_namemax = ZFS_MAXNAMELEN;
 
 	/*
 	 * The pool uses several block sizes; both the block size and
 	 * the I/O size are reported as the mount's vfs_bsize.
 	 */
 	statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
 	statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
 
 	/*
 	 * Block totals, expressed in units of f_bsize.  There is no
 	 * root reservation, so available equals free.
 	 */
 	statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
 	statp->f_bfree = availbytes / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree;
 
 	/*
 	 * ZFS doesn't preallocate files.  Free files are reported as
 	 * the smaller of the available objects and the free blocks
 	 * (each object takes at least a block); total files are that
 	 * plus the objects already in use.
 	 */
 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
 	statp->f_files = statp->f_ffree + usedobjs;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Return the root vnode of the filesystem in *vpp, locked with the
  * caller's 'flags' and marked VV_ROOT.  Returns 0 or the error from
  * zfs_zget() or vn_lock().
  */
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0) {
 		*vpp = ZTOV(rootzp);
 		/*
 		 * NOTE(review): VV_ROOT is set and *vpp is left in place
 		 * even when vn_lock() fails -- confirm whether the
 		 * vnode obtained from zfs_zget() should be released and
 		 * *vpp cleared in that case.
 		 */
-		error = vn_lock(*vpp, flags, td);
+		error = vn_lock(*vpp, flags);
 		(*vpp)->v_vflag |= VV_ROOT;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * VFS unmount entry point.
  *
  * After the policy check, any snapshots mounted under the control
  * directory are unmounted and the control directory is destroyed,
  * then all vnodes are flushed.  If that flush fails the control
  * directory is recreated (for non-snapshots) and the error returned.
  * A forced unmount additionally waits for z_op_cnt to drop to zero
  * before the objset is closed.  Returns 0 or an errno value.
  */
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
 		return (ret);
 
 	(void) dnlc_purge_vfsp(vfsp, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		/* This flush is expected to report EBUSY (see ASSERT). */
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			/* Refuse if the control dir has other users. */
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		/* Unmount failed: put the control directory back. */
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 	if (fflag & MS_FORCE) {
 		MNT_ILOCK(vfsp);
 		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
 		MNT_IUNLOCK(vfsp);
 		zfsvfs->z_unmounted1 = B_TRUE;
 
 		/*
 		 * Wait for all zfs threads to leave zfs.
 		 * Grabbing a rwlock as reader in all vops and
 		 * as writer here doesn't work because it too easy to get
 		 * multiple reader enters as zfs can re-enter itself.
 		 * This can lead to deadlock if there is an intervening
 		 * rw_enter as writer.
 		 * So a file system threads ref count (z_op_cnt) is used.
 		 * A polling loop on z_op_cnt may seem inefficient, but
 		 * - this saves all threads on exit from having to grab a
 		 *   mutex in order to cv_signal
 		 * - only occurs on forced unmount in the rare case when
 		 *   there are outstanding threads within the file system.
 		 */
 		while (zfsvfs->z_op_cnt) {
 			delay(1);
 		}
 	}
 
 	zfs_objset_close(zfsvfs);
 	VFS_RELE(vfsp);
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 /*
  * Look up the znode with object number "ino" and hand back its vnode.
  *
  *	IN:	vfsp	- mount to search; vfs_data holds the zfsvfs.
  *		ino	- object number passed to zfs_zget().
  *		flags	- lock flags passed to vn_lock().
  *
  *	OUT:	vpp	- the vnode on success, NULL on failure.
  *
  *	RETURN:	0 if success
  *		EINVAL if the znode exists but is unlinked
  *		error code from zfs_zget() otherwise
  */
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int 		err;
 
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	/* An unlinked znode is not handed out; drop the hold just taken. */
 	if (err == 0 && zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err != 0)
 		*vpp = NULL;
 	else {
 		*vpp = ZTOV(zp);
 		/* NOTE(review): the result of vn_lock() is not checked. */
-		vn_lock(*vpp, flags, curthread);
+		vn_lock(*vpp, flags);
 	}
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Translate a file identifier into a locked vnode.
  *
  *	IN:	vfsp	- mount the fid refers to.
  *		fidp	- file identifier; fid_len must be SHORT_FID_LEN or
  *			  LONG_FID_LEN.
  *
  *	OUT:	vpp	- exclusively locked vnode on success, NULL otherwise.
  *
  *	RETURN:	0 if success
  *		EINVAL for an unsupported fid length, a failed objset lookup,
  *		an unlinked znode or a generation mismatch
  *		error code from zfs_zget() otherwise
  *
  * The multi-byte fid fields are assembled least significant byte first.
  */
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
 {
 	kthread_t	*td = curthread;
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * A long fid also names an objset: decode its id and switch zfsvfs
 	 * to the one returned by zfsctl_lookup_objset().
 	 * NOTE(review): setgen is decoded but not used later in this function.
 	 */
 	if (fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (EINVAL);
 		ZFS_ENTER(zfsvfs);
 	}
 
 	/* Both fid flavours carry the object number and generation. */
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/* A zero fid_gen means we are in the .zfs control directories */
 	if (fid_gen == 0 &&
 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		/* XXX: LK_RETRY? */
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 		return (0);
 	}
 
 	/*
 	 * "i" still holds sizeof (zfid->zf_gen) from the loop above, so the
 	 * mask keeps exactly as many low bytes as the fid generation has.
 	 */
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 	if (err = zfs_zget(zfsvfs, object, &zp)) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	/* A masked on-disk generation of 0 is compared as 1. */
 	zp_gen = zp->z_phys->zp_gen & gen_mask;
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	*vpp = ZTOV(zp);
 	/* XXX: LK_RETRY? */
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	/* "td" is still needed here even though vn_lock() no longer takes it. */
 	vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Tear down the objset behind a zfsvfs: release dbuf holds kept by
  * znodes, unregister property callbacks (non-snapshots only), set
  * z_unmounted2, close the ZIL, evict dbufs and close the objset.
  */
 static void
 zfs_objset_close(zfsvfs_t *zfsvfs)
 {
 	znode_t		*zp, *nextzp;
 	objset_t	*os = zfsvfs->z_os;
 
 	/*
 	 * For forced unmount, at this point all vops except zfs_inactive
 	 * are erroring EIO. We need to now suspend zfs_inactive threads
 	 * while we are freeing dbufs before switching zfs_inactive
 	 * to use behaviour without an objset.
 	 */
 	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
 
 	/*
 	 * Release all holds on dbufs
 	 * Note, although we have stopped all other vop threads and
 	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
 	 * which can zfs_znode_free() the znode.
 	 * So we lock z_all_znodes; search the list for a held
 	 * dbuf; drop the lock (we know zp can't disappear if we hold
 	 * a dbuf lock); then regrab the lock and restart.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
 		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
 		if (zp->z_dbuf_held) {
 			/* dbufs should only be held when force unmounting */
 			zp->z_dbuf_held = 0;
 			mutex_exit(&zfsvfs->z_znodes_lock);
 			dmu_buf_rele(zp->z_dbuf, NULL);
 			/* Start again */
 			mutex_enter(&zfsvfs->z_znodes_lock);
 			nextzp = list_head(&zfsvfs->z_all_znodes);
 		}
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * Unregister properties.
 	 */
 	if (!dmu_objset_is_snapshot(os))
 		zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Switch zfs_inactive to behaviour without an objset.
 	 * It just tosses cached pages and frees the znode & vnode.
 	 * Then re-enable zfs_inactive threads in that new behaviour.
 	 */
 	zfsvfs->z_unmounted2 = B_TRUE;
 	rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
 
 	/*
 	 * Close the zil. Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	/*
 	 * Evict all dbufs so that cached znodes will be freed.
 	 * If the first attempt returns non-zero, wait for the pool to
 	 * sync and try once more, ignoring the second result.
 	 */
 	if (dmu_objset_evict_dbufs(os, 1)) {
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 		(void) dmu_objset_evict_dbufs(os, 0);
 	}
 
 	/*
 	 * Finally close the objset
 	 */
 	dmu_objset_close(os);
 }
 
 /*
  * Release the zfsvfs attached to vfsp: destroy its ZFS_OBJ_MTX_SZ hold
  * mutexes, z_um_lock and z_znodes_lock, free the structure and decrement
  * zfs_active_fs_count.
  */
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	int i;
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	rw_destroy(&zfsvfs->z_um_lock);
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 
 	/* One fewer active file system; zfs_busy() tests this counter. */
 	atomic_add_32(&zfs_active_fs_count, -1);
 }
 
 #ifdef __i386__
 /* Value of desiredvnodes saved by zfs_vnodes_adjust() for later restore. */
 static int desiredvnodes_backup;
 #endif
 
 /*
  * On i386 only: remember the current desiredvnodes and, if it still has
  * the value computed below, lower it to three quarters.  On other
  * architectures this function does nothing.
  */
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int val;
 
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * We calculate newdesiredvnodes the same way it is done in
 	 * vntblinit(). If it is equal to desiredvnodes, it means that
 	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
 	val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	if (desiredvnodes == val)
 		desiredvnodes = (3 * desiredvnodes) / 4;
 #endif
 }
 
 /*
  * On i386 only: restore desiredvnodes to the value saved in
  * desiredvnodes_backup.  A no-op on other architectures.
  */
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 /*
  * Initialize the ZFS file system layer: print the version banner, set up
  * the .zfs control directory and znode subsystems, then adjust the vnode
  * limit (which only has an effect on i386).
  */
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 }
 
 /*
  * Undo zfs_init(): tear down the .zfs control directory and znode
  * subsystems and restore the vnode limit.
  */
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 /*
  * Return non-zero while zfs_active_fs_count is non-zero, 0 otherwise.
  */
 int
 zfs_busy(void)
 {
 	return (zfs_active_fs_count != 0);
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 175202)
@@ -1,3599 +1,3599 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sf_buf.h>
 #include <sys/sched.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *	A ZFS_EXIT(zfsvfs) is needed before all returns.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
  *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
  *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
  *	This is critical because we don't want to block while holding locks.
  *	Note, in particular, that if a lock is sometimes acquired before
  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
  *	use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 /*
  * Open a file.  The only work done is bookkeeping: when the open flags
  * include FSYNC or FDSYNC, the znode's z_sync_cnt is incremented.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(*vpp);
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 	return (0);
 }
 
 /*
  * Close a file.  Mirrors zfs_open() by decrementing z_sync_cnt for
  * FSYNC/FDSYNC opens, then calls cleanlocks() and cleanshares() for the
  * current pid.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(vp);
 
 	/* Decrement the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	/*
 	 * Clean up any locks held by this process on the vp.
 	 */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());
 
 	return (0);
 }
 
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
  *
  *	RETURN:	0 with *off updated if success
  *		ENXIO if the starting offset is at or beyond end of file,
  *		or if no data is found before end of file
  *		otherwise the error from dmu_offset_next()
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	uint64_t noff = (uint64_t)*off; /* new offset */
 	uint64_t file_sz;
 	int error;
 	boolean_t hole;
 
 	file_sz = zp->z_phys->zp_size;
 	if (noff >= file_sz)  {
 		return (ENXIO);
 	}
 
 	/* Any cmd other than _FIO_SEEK_HOLE is treated as a data search. */
 	if (cmd == _FIO_SEEK_HOLE)
 		hole = B_TRUE;
 	else
 		hole = B_FALSE;
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 
 	/* end of file? */
 	if ((error == ESRCH) || (noff > file_sz)) {
 		/*
 		 * Handle the virtual hole at the end of file.
 		 */
 		if (hole) {
 			*off = file_sz;
 			return (0);
 		}
 		return (ENXIO);
 	}
 
 	/* Never move the caller's offset backwards. */
 	if (noff < *off)
 		return (error);
 	*off = noff;
 	return (error);
 }
 
 /*
  * Handle the small set of ioctls ZFS implements at the vnode level.
  *
  *	_FIOFFS, _FIOGDIO, _FIOSDIO	- accepted and ignored (return 0).
  *	_FIO_SEEK_DATA, _FIO_SEEK_HOLE	- copy in an offset, run zfs_holey()
  *					  and copy the result back out.
  *
  *	RETURN:	0 if success
  *		EFAULT if the offset cannot be copied in or out
  *		ENOTTY for any other command
  *		error code from zfs_holey() otherwise
  */
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp)
 {
 	offset_t off;
 	int error;
 	zfsvfs_t *zfsvfs;
 
 	switch (com) {
 	    case _FIOFFS:
 		return (0);
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	    case _FIOGDIO:
 	    case _FIOSDIO:
 		return (0);
 
 	    case _FIO_SEEK_DATA:
 	    case _FIO_SEEK_HOLE:
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (EFAULT);
 
 		zfsvfs = VTOZ(vp)->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (EFAULT);
 		return (0);
 	}
 	return (ENOTTY);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	the file is memory mapped.
  *
  *	IN:	vp	- vnode being written; must have a v_object.
  *		nbytes	- number of bytes to take from the uio.
  *		uio	- source data and file offset.
  *		tx	- DMU transaction the writes are charged to.
  *
  *	RETURN:	0 if success
  *		error code from dmu_write_uio() or uiomove() otherwise
  */
 static int
 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	vm_page_t m;
 	struct sf_buf *sf;
 	int64_t start, off;
 	int len = nbytes;
 	int error = 0;
 	uint64_t dirbytes;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;	/* offset within the first page */
 	/*
 	 * dirbytes accumulates bytes for pages that are not resident and
 	 * valid; they are written with a single dmu_write_uio() call.
 	 */
 	dirbytes = 0;
 	VM_OBJECT_LOCK(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 		uint64_t fsize;
 
 again:
 		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 			uint64_t woff;
 			caddr_t va;
 
 			/* The page may have changed while asleep: look it up again. */
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
 				goto again;
 			fsize = obj->un_pager.vnp.vnp_size;
 			vm_page_busy(m);
 			vm_page_lock_queues();
 			vm_page_undirty(m);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(obj);
 			/* Flush the pending non-mapped bytes first. */
 			if (dirbytes > 0) {
 				error = dmu_write_uio(os, zp->z_id, uio,
 				    dirbytes, tx);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				/* File offset that the mapping at va starts at. */
 				woff = uio->uio_loffset - off;
 				error = uiomove(va + off, bytes, UIO_WRITE, uio);
 				/*
 				 * The uiomove() above could have been partially
 				 * successful, that's why we call dmu_write()
 				 * below unconditionally. The page was marked
 				 * non-dirty above and we would lose the changes
 				 * without doing so. If the uiomove() failed
 				 * entirely, well, we just write what we got
 				 * before one more time.
 				 */
 				dmu_write(os, zp->z_id, woff,
 				    MIN(PAGESIZE, fsize - woff), va, tx);
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 		} else {
 			dirbytes += bytes;
 		}
 		len -= bytes;
 		off = 0;	/* later pages start at their beginning */
 		if (error)
 			break;
 	}
 	VM_OBJECT_UNLOCK(obj);
 	/* Write whatever non-mapped tail is still pending. */
 	if (error == 0 && dirbytes > 0)
 		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	the file is memory mapped.
  *
  *	IN:	vp	- vnode being read; must have a v_object.
  *		nbytes	- number of bytes to transfer.
  *		uio	- destination buffer and file offset.
  *
  *	RETURN:	0 if success
  *		error code from dmu_read_uio(), dmu_read() or uiomove()
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	vm_page_t m;
 	struct sf_buf *sf;
 	int64_t start, off;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 	uint64_t dirbytes;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;	/* offset within the first page */
 	/*
 	 * dirbytes accumulates bytes that must come from the DMU; they are
 	 * read with a single dmu_read_uio() call.
 	 */
 	dirbytes = 0;
 	VM_OBJECT_LOCK(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 again:
 		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
 			/* Resident, valid page: copy straight out of it. */
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 				goto again;
 			vm_page_busy(m);
 			VM_OBJECT_UNLOCK(obj);
 			if (dirbytes > 0) {
 				error = dmu_read_uio(os, zp->z_id, uio,
 				    dirbytes);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				error = uiomove(va + off, bytes, UIO_READ, uio);
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
 			/*
 			 * The code below is here to make sendfile(2) work
 			 * correctly with ZFS. As pointed out by ups@
 			 * sendfile(2) should be changed to use VOP_GETPAGES(),
 			 * but it pessimize performance of sendfile/UFS, that's
 			 * why I handle this special case in ZFS code.
 			 */
 			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
 				goto again;
 			vm_page_busy(m);
 			VM_OBJECT_UNLOCK(obj);
 			if (dirbytes > 0) {
 				error = dmu_read_uio(os, zp->z_id, uio,
 				    dirbytes);
 				dirbytes = 0;
 			}
 			if (error == 0) {
 				sched_pin();
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				/* Fill the page itself from the DMU. */
 				error = dmu_read(os, zp->z_id, start + off,
 				    bytes, (void *)(va + off));
 				sf_buf_free(sf);
 				sched_unpin();
 			}
 			VM_OBJECT_LOCK(obj);
 			vm_page_wakeup(m);
 			/* No uiomove() happened here; only uio_resid is adjusted. */
 			if (error == 0)
 				uio->uio_resid -= bytes;
 		} else {
 			dirbytes += bytes;
 		}
 		len -= bytes;
 		off = 0;	/* later pages start at their beginning */
 		if (error)
 			break;
 	}
 	VM_OBJECT_UNLOCK(obj);
 	/* Read whatever DMU-backed tail is still pending. */
 	if (error == 0 && dirbytes > 0)
 		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
 	return (error);
 }
 
 /*
  * Chunk size, in bytes, used by zfs_read(): each iteration of its read
  * loop transfers at most up to the next multiple of this value.
  */
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller.
  *		ct	- caller context, passed to chklock().
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os = zfsvfs->z_os;
 	ssize_t		n, nbytes;
 	int		error;
 	rl_t		*rl;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
 		if (error = chklock(vp, FREAD,
 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (ioflag & FRSYNC)
 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
 		error = 0;
 		goto out;
 	}
 
 	/* Clamp the request so it does not run past end-of-file. */
 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
 
 	while (n > 0) {
 		/* Stop each pass at the next zfs_read_chunk_size boundary. */
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 		/* Go through mappedread() when vn_has_cached_data() says so. */
 		if (vn_has_cached_data(vp))
 			error = mappedread(vp, nbytes, uio);
 		else
 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 		if (error)
 			break;
 
 		n -= nbytes;
 	}
 
 out:
 	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Fault in the pages of the first n bytes specified by the uio structure.
  * 1 byte in each page is touched and the uio struct is unmodified.
  * Any error will exit this routine as this is only a best
  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
  *
  *	IN:	n	- number of bytes, counted from the first iovec,
  *			  whose pages should be touched.
  *		uio	- describes the buffers; nothing is done unless
  *			  uio_segflg is UIO_USERSPACE.
  */
 static void
 zfs_prefault_write(ssize_t n, struct uio *uio)
 {
 	struct iovec *iov;
 	ulong_t cnt, incr;
 	caddr_t p;
 
 	if (uio->uio_segflg != UIO_USERSPACE)
 		return;
 
 	iov = uio->uio_iov;
 
 	while (n) {
 		cnt = MIN(iov->iov_len, n);
 		if (cnt == 0) {
 			/* empty iov entry */
 			iov++;
 			continue;
 		}
 		n -= cnt;
 		/*
 		 * touch each page in this segment.
 		 */
 		p = iov->iov_base;
 		while (cnt) {
 			/* fubyte() returning -1 ends the best-effort pass. */
 			if (fubyte(p) == -1)
 				return;
 			incr = MIN(cnt, PAGESIZE);
 			p += incr;
 			cnt -= incr;
 		}
 		/*
 		 * touch the last byte in case it straddles a page.
 		 */
 		p--;
 		if (fubyte(p) == -1)
 			return;
 		iov++;
 	}
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- IO_APPEND flag set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context, passed to chklock().
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error;
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	/*
 	 * "limit" is initialised to MAXOFFSET_T above and not changed in
 	 * between, so this check leaves it at MAXOFFSET_T.
 	 */
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 */
 	zfs_prefault_write(n, uio);
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & IO_APPEND) {
 		/*
 		 * Range lock for a file append:
 		 * The value for the start of range will be determined by
 		 * zfs_range_lock() (to guarantee append semantics).
 		 * If this write will cause the block size to increase,
 		 * zfs_range_lock() will lock the entire file, so we must
 		 * later reduce the range after we grow the block size.
 		 */
 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 		if (rl->r_len == UINT64_MAX) {
 			/* overlocked, zp_size can't change */
 			woff = uio->uio_loffset = zp->z_phys->zp_size;
 		} else {
 			woff = uio->uio_loffset = rl->r_off;
 		}
 	} else {
 		woff = uio->uio_loffset;
 		/*
 		 * Validate file offset
 		 */
 		if (woff < 0) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		/*
 		 * If we need to grow the block size then zfs_range_lock()
 		 * will lock a wider range than we request here.
 		 * Later after growing the block size we reduce the range.
 		 */
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	/* Trim the request so it ends at "limit". */
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	end_size = MAX(zp->z_phys->zp_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		/*
 		 * Start a transaction.
 		 */
 		woff = uio->uio_loffset;
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		error = dmu_tx_assign(tx, zfsvfs->z_assign);
 		if (error) {
 			/* ERESTART under TXG_NOWAIT: wait, then retry the chunk. */
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				continue;
 			}
 			dmu_tx_abort(tx);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		/* Tell the VM about the new size before extending the file. */
 		if (woff + nbytes > zp->z_phys->zp_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		rw_enter(&zp->z_map_lock, RW_READER);
 
 		/* tx_bytes ends up as the number of bytes actually moved. */
 		tx_bytes = uio->uio_resid;
 		if (vn_has_cached_data(vp)) {
 			rw_exit(&zp->z_map_lock);
 			error = mappedwrite(vp, nbytes, uio, tx);
 		} else {
 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
 			    uio, nbytes, tx);
 			rw_exit(&zp->z_map_lock);
 		}
 		tx_bytes -= uio->uio_resid;
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to do this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(cr,
 		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
 		    zp->z_phys->zp_uid == 0) != 0) {
 			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		/*
 		 * Update time stamp.  NOTE: This marks the bonus buffer as
 		 * dirty, so we don't have to do it again for zp_size.
 		 */
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
 			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
 			    uio->uio_loffset);
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 	}
 
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Synchronous writes wait for the intent log before returning. */
 	if (ioflag & (FSYNC | FDSYNC))
 		zil_commit(zilog, zp->z_last_itx, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Completion callback that zfs_get_data() hands to dmu_sync().
  *
  *	IN:	db	- dbuf that was held for the sync.
  *		vzgd	- the zgd_t allocated by zfs_get_data(); freed here.
  *
  * Releases the dbuf, the range lock and the vnode hold, records the vdev
  * of the written block with zil_add_vdev(), and frees the zgd.  The whole
  * body runs between VFS_LOCK_GIANT() and VFS_UNLOCK_GIANT().
  */
 void
 zfs_get_done(dmu_buf_t *db, void *vzgd)
 {
 	zgd_t *zgd = (zgd_t *)vzgd;
 	rl_t *rl = zgd->zgd_rl;
 	vnode_t *vp = ZTOV(rl->r_zp);
 	int vfslocked;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
 	dmu_buf_rele(db, vzgd);
 	zfs_range_unlock(rl);
 	VN_RELE(vp);
 	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
 	kmem_free(zgd, sizeof (zgd_t));
 	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  *	IN:	arg	- the zfsvfs_t of the file system.
  *		lr	- write log record; lr_foid, lr_offset and lr_length
  *			  describe the data wanted.
  *		buf	- destination for an immediate write, or NULL to
  *			  request an indirect write.
  *		zio	- parent zio passed to dmu_sync().
  *
  *	RETURN:	0 if success, or if dmu_sync() returned EINPROGRESS and
  *		cleanup was deferred to zfs_get_done()
  *		ENOENT if the file is gone, unlinked or truncated below
  *		the requested offset
  *		error code from dmu_sync() otherwise
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t off = lr->lr_offset;
 	dmu_buf_t *db;
 	rl_t *rl;
 	zgd_t *zgd;
 	int dlen = lr->lr_length;		/* length of user data */
 	int error = 0;
 
 	ASSERT(zio);
 	ASSERT(dlen != 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
 		return (ENOENT);
 	if (zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		return (ENOENT);
 	}
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		rl = zfs_range_lock(zp, off, dlen, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
 			error = ENOENT;
 			goto out;
 		}
 		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
 	} else { /* indirect write */
 		uint64_t boff; /* block starting offset */
 
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			/* Non-power-of-two block sizes lock from offset 0. */
 			if (ISP2(zp->z_blksz)) {
 				boff = P2ALIGN_TYPED(off, zp->z_blksz,
 				    uint64_t);
 			} else {
 				boff = 0;
 			}
 			dlen = zp->z_blksz;
 			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
 			if (zp->z_blksz == dlen)
 				break;
 			zfs_range_unlock(rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
 			error = ENOENT;
 			goto out;
 		}
 		/* State that zfs_get_done() needs to finish the job. */
 		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
 		zgd->zgd_rl = rl;
 		zgd->zgd_zilog = zfsvfs->z_log;
 		zgd->zgd_bp = &lr->lr_blkptr;
 		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
 		ASSERT(boff == db->db_offset);
 		lr->lr_blkoff = off - boff;
 		error = dmu_sync(zio, db, &lr->lr_blkptr,
 		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
 		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
 		if (error == 0) {
 			zil_add_vdev(zfsvfs->z_log,
 			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
 		}
 		/*
 		 * If we get EINPROGRESS, then we need to wait for a
 		 * write IO initiated by dmu_sync() to complete before
 		 * we can release this dbuf.  We will finish everything
 		 * up in the zfs_get_done() callback.
 		 */
 		if (error == EINPROGRESS)
 			return (0);
 		dmu_buf_rele(db, zgd);
 		kmem_free(zgd, sizeof (zgd_t));
 	}
 out:
 	zfs_range_unlock(rl);
 	VN_RELE(ZTOV(zp));
 	return (error);
 }
 
 /*
  * Check whether the caller's credentials grant the requested access
  * mode on the vnode; the decision is delegated to zfs_zaccess_rwx().
  * Returns its result unchanged.  The flags argument is not used.
  */
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
 {
 	znode_t *node = VTOZ(vp);
 	zfsvfs_t *zfsvfs = node->z_zfsvfs;
 	int rc;
 
 	ZFS_ENTER(zfsvfs);
 
 	rc = zfs_zaccess_rwx(node, mode, cr);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (rc);
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- componentname of the lookup; cn_flags and
  *			  cn_lkflags are read, SAVENAME may be set in
  *			  cn_flags.
  *		nameiop	- namei operation; CREATE, RENAME and DELETE get
  *			  special treatment on the last component.
  *		cr	- credentials of caller.
  *		td	- thread, used when unlocking dvp for "..".
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.  On
  *			  success it is locked with cn_lkflags unless nm
  *			  is ".".
  *
  *	RETURN:	0 if success
  *		EJUSTRETURN if the last component of a CREATE or RENAME
  *		was not found
  *		error code if failure
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td)
 {
 
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error;
 
 	ZFS_ENTER(zfsvfs);
 
 	*vpp = NULL;
 
 	/*
 	 * NOTE: the block below refers to "flags", which is not a parameter
 	 * of this function, so it cannot be enabled as-is.
 	 */
 #ifdef TODO
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
 			VN_RELE(*vpp);
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 #endif	/* TODO */
 
 	if (dvp->v_type != VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (ENOTDIR);
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
 
 		/*
 		 * Convert device special files
 		 */
 		if (IS_DEVVP(*vpp)) {
 			vnode_t	*svp;
 
 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 			VN_RELE(*vpp);
 			if (svp == NULL)
 				error = ENOSYS;
 			else
 				*vpp = svp;
 		}
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			/* A missing last component is reported as EJUSTRETURN. */
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 	/* Lock the result, except when the name is "." (dvp itself). */
 	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
 		int ltype = 0;
 
 		/*
 		 * For "..", dvp is unlocked while *vpp is being locked and
 		 * is re-locked afterwards with its previous lock type.
 		 */
 		if (cnp->cn_flags & ISDOTDOT) {
 			ltype = VOP_ISLOCKED(dvp, td);
 			VOP_UNLOCK(dvp, 0, td);
 		}
-		error = vn_lock(*vpp, cnp->cn_lkflags, td);
+		error = vn_lock(*vpp, cnp->cn_lkflags);
 		if (cnp->cn_flags & ISDOTDOT)
-			vn_lock(dvp, ltype | LK_RETRY, td);
+			vn_lock(dvp, ltype | LK_RETRY);
 		if (error != 0) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 			return (error);
 		}
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(dvp, *vpp, cnp);
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the vp of the created or trunc'd file.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		td	- calling thread.
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry, locked
  *			  exclusively on success.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	objset_t	*os = zfsvfs->z_os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	zoid;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after waiting for a transaction group (ERESTART). */
 top:
 	*vpp = NULL;
 
 	/* Strip the sticky bit if the policy check does not allow it. */
 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~VSVTX;
 
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		VN_HOLD(dvp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible VN_HOLD(zp) */
 		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
 			if (strcmp(name, "..") == 0)
 				error = EISDIR;
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	zoid = zp ? zp->z_id : -1ULL;
 
 	if (zp == NULL) {
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
 		    (vap->va_type != VREG)) {
 			error = EINVAL;
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		dmu_tx_hold_bonus(tx, dzp->z_id);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, SPA_MAXBLOCKSIZE);
 		error = dmu_tx_assign(tx, zfsvfs->z_assign);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 		ASSERT(zp->z_id == zoid);
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
 		dmu_tx_commit(tx);
 	} else {
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl == EXCL) {
 			error = EEXIST;
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
 			error = EISDIR;
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if ((ZTOV(zp)->v_type == VREG) &&
 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 			if (error == ERESTART &&
 			    zfsvfs->z_assign == TXG_NOWAIT) {
 				/* NB: we already did dmu_tx_wait() */
 				zfs_dirent_unlock(dl);
 				VN_RELE(ZTOV(zp));
 				goto top;
 			}
 		}
 	}
 out:
 
 	/* On success hand back the vnode locked exclusively. */
 	if (error == 0) {
 		*vpp = ZTOV(zp);
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	/* dl is NULL when the empty name referred to the directory itself. */
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			VN_RELE(ZTOV(zp));
 	} else {
 		/* Same value that was already stored above for error == 0. */
 		*vpp = ZTOV(zp);
 		/*
 		 * If vnode is for a device return a specfs vnode instead.
 		 */
 		if (IS_DEVVP(*vpp)) {
 			struct vnode *svp;
 
 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 			VN_RELE(*vpp);
 			if (svp == NULL) {
 				error = ENOSYS;
 			}
 			*vpp = svp;
 		}
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		EPERM if the entry is a directory (use rmdir instead)
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 static int
 zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	znode_t		*xzp = NULL;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	uint64_t	acl_obj, xattr_obj;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after waiting for a transaction group (ERESTART). */
 top:
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = EPERM;
 		goto out;
 	}
 
 	vnevent_remove(vp);
 
 	dnlc_remove(dvp, name);
 
 	/*
 	 * Hard-wired to FALSE here, so together with the "if (0 && ...)"
 	 * test further down the delete_now paths are never taken.
 	 */
 	may_delete_now = FALSE;
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	if (may_delete_now)
 		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
 
 	/* are there any extended attributes? */
 	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
 		/* XXX - do we need this if we are deleting? */
 		dmu_tx_hold_bonus(tx, xattr_obj);
 	}
 
 	/* are there any additional acls */
 	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
 	    may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	/* Disabled by the constant 0: delete_now keeps its initial FALSE. */
 	if (0 && unlinked) {
 		VI_LOCK(vp);
 		delete_now = may_delete_now &&
 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
 		    zp->z_phys->zp_xattr == xattr_obj &&
 		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
 		VI_UNLOCK(vp);
 	}
 
 	if (delete_now) {
 		if (zp->z_phys->zp_xattr) {
 			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
 			dmu_buf_will_dirty(xzp->z_dbuf, tx);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = 1;
 			xzp->z_phys->zp_links = 0;
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
 		}
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		vp->v_count--;
 		ASSERT3U(vp->v_count, ==, 0);
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		zfs_znode_delete(zp, tx);
 		VFS_RELE(zfsvfs->z_vfs);
 	} else if (unlinked) {
 		/* Last link gone: queue the znode on the unlinked set. */
 		zfs_unlinked_add(zp, tx);
 	}
 
 	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
 
 	dmu_tx_commit(tx);
 out:
 	zfs_dirent_unlock(dl);
 
 	if (!delete_now) {
 		VN_RELE(vp);
 	} else if (xzp) {
 		/* this rele delayed to prevent nesting transactions */
 		VN_RELE(ZTOV(xzp));
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dvp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dvp	- vnode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *
  *	OUT:	vpp	- vnode of created directory, locked exclusively
  *			  on success.
  *
  *	RETURN:	0 if success
  *		EINVAL if dvp is an extended attribute directory
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	uint64_t	zoid = 0;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ASSERT(vap->va_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Subdirectories are refused inside extended attribute directories. */
 	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 	/* Restart point after waiting for a transaction group (ERESTART). */
 top:
 	*vpp = NULL;
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, SPA_MAXBLOCKSIZE);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
 	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
 	dmu_tx_commit(tx);
 
 	/* Hand the new directory back locked exclusively. */
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 
 	zfs_dirent_unlock(dl);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- vnode of current working directory.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		ENOTDIR if the entry is not a directory
  *		EINVAL if the entry is the current working directory
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 static int
 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after waiting for a transaction group (ERESTART). */
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 
 	if (vp == cwd) {
 		error = EINVAL;
 		goto out;
 	}
 
 	vnevent_rmdir(vp);
 
 	/*
 	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* Undo both rwlocks, the dirent lock and the vnode hold. */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(dvp);
 #endif
 
 	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
 
 	/* Only log the removal if the link was actually destroyed. */
 	if (error == 0)
 		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
 
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 out:
 	zfs_dirent_unlock(dl);
 
 	VN_RELE(vp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *		ncookies - if not NULL, a cookie array is allocated and
  *			  returned through cookies; both pointers are
  *			  expected to be supplied together.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected.
  *		ncookies - number of cookies stored (0 on error).
  *		cookies	- M_TEMP array holding the offset following each
  *			  returned entry (NULL on error).
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		ncooks;
 	u_long		*cooks = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 * A bounce buffer is only needed when the entries cannot be written
 	 * straight into a single kernel-space iovec.
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			objnum = zp->z_phys->zp_parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = ENXIO;
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = EINVAL;
 				goto update;
 			}
 			break;
 		}
 		/*
 		 * Add this entry:
 		 */
 		odp->d_ino = objnum;
 		odp->d_reclen = reclen;
 		odp->d_namlen = strlen(zap.za_name);
 		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 		odp->d_type = type;
 		outcount += reclen;
 		odp = (dirent64_t *)((intptr_t)odp + reclen);
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		/* Record the offset that follows the entry just emitted. */
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = uio->uio_loffset;
 	}
 
 update:
 	zap_cursor_fini(&zc);
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	/* Running off the end of the directory is not an error. */
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	/*
 	 * On failure hand back no cookies.  The array was allocated under
 	 * the "ncookies != NULL" test above, so use the same test here;
 	 * *ncookies is written below and *cookies only holds a valid
 	 * pointer when that allocation happened.
 	 */
 	if (error != 0 && ncookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /*
  * Flush a file's outstanding intent-log records: commits the ZIL up to
  * the znode's last itx for this object.  Always returns 0.  The syncflag
  * and cr arguments are not used.
  */
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
 {
 	znode_t *node = VTOZ(vp);
 	zfsvfs_t *zfsvfs = node->z_zfsvfs;
 	zilog_t *log;
 
 	ZFS_ENTER(zfsvfs);
 
 	log = zfsvfs->z_log;
 	zil_commit(log, node->z_last_itx, node->z_id);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *		flags	- [UNUSED]
  *		cr	- credentials of caller.
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 if success
  *		error from zfs_zaccess() if the ACL is not trivial, the
  *		caller is not the owner and ACE_READ_ATTRIBUTES is denied
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	znode_phys_t *pzp = zp->z_phys;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	int	error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 	mutex_enter(&zp->z_lock);
 
 	vap->va_type = IFTOVT(pzp->zp_mode);
 	vap->va_mode = pzp->zp_mode & ~S_IFMT;
 	vap->va_uid = zp->z_phys->zp_uid;
 	vap->va_gid = zp->z_phys->zp_gid;
 	vap->va_nodeid = zp->z_id;
 	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
 	vap->va_size = pzp->zp_size;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
 
 	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 *
 	 * Note that vap has already been filled in above when this check
 	 * fails and the error is returned.
 	 */
 	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
 	    (zp->z_phys->zp_uid != crgetuid(cr))) {
 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
 			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	/* Block size and block count come from the znode's dbuf. */
 	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values; va_mask selects which ones.
  *			  va_mode and va_mask may be modified along the way.
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *		cr	- credentials of caller.
  *		ct	- caller context [UNUSED]
  *
  *	RETURN:	0 if success (also when va_mask is 0)
  *		EINVAL if AT_NOSET is requested, or AT_SIZE on a vnode
  *		that is neither VREG nor VFIFO
  *		EISDIR if AT_SIZE is requested on a directory
  *		EROFS if the file system is mounted read-only
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
 {
 	struct znode	*zp = VTOZ(vp);
 	znode_phys_t	*pzp = zp->z_phys;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (EINVAL);
 
 	if (mask & AT_SIZE && vp->v_type == VDIR)
 		return (EISDIR);
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Restart point after waiting for a transaction group (ERESTART). */
 top:
 	attrzp = NULL;
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (EROFS);
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		do {
 			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 			/* NB: we already did dmu_tx_wait() if necessary */
 		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME))
 		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = pzp->zp_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	/* Snapshot the current mode and ownership for the policy checks. */
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = pzp->zp_mode;
 	oldva.va_uid = zp->z_phys->zp_uid;
 	oldva.va_gid = zp->z_phys->zp_gid;
 	mutex_exit(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		/* saved_mask is only valid (and only used) when trim_mask != 0 */
 		if (trim_mask)
 			vap->va_mask |= saved_mask;
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = pzp->zp_mode;
 
 		/* Keep the file type bits, take the rest from the request. */
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_phys->zp_acl.z_acl_extern_obj)
 			dmu_tx_hold_write(tx,
 			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
 		else
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
 	}
 
 	/*
 	 * An ownership change is mirrored onto the extended attribute
 	 * znode (if there is one), so hold it in this transaction too.
 	 */
 	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
 		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
 		if (err) {
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		dmu_tx_hold_bonus(tx, attrzp->z_id);
 	}
 
 	err = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (err) {
 		if (attrzp)
 			VN_RELE(ZTOV(attrzp));
 		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	mutex_enter(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
 		ASSERT3U(err, ==, 0);
 	}
 
 	if (attrzp)
 		mutex_enter(&attrzp->z_lock);
 
 	if (mask & AT_UID) {
 		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
 		if (attrzp) {
 			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
 		}
 	}
 
 	if (mask & AT_GID) {
 		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
 		if (attrzp)
 			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
 	}
 
 	if (attrzp)
 		mutex_exit(&attrzp->z_lock);
 
 	if (mask & AT_ATIME)
 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
 
 	if (mask & AT_MTIME)
 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
 
 	if (mask & AT_SIZE)
 		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
 	else if (mask != 0)
 		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
 
 	mutex_exit(&zp->z_lock);
 
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 
 	dmu_tx_commit(tx);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * One entry of the singly linked list that zfs_rename_lock() builds and
  * zfs_rename_unlock() tears down.  Each entry records one rwlock that was
  * acquired and, optionally, a znode whose vnode must be released.
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held (NULL if none) */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Tear down the list built by zfs_rename_lock(): for each entry, release
  * the held vnode (if one was recorded), drop the rwlock and free the
  * entry.  *zlpp is NULL once the list has been consumed.
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t *cur, *next;
 
 	for (cur = *zlpp; cur != NULL; cur = next) {
 		/* Remember the successor before the entry is freed. */
 		next = cur->zl_next;
 
 		if (cur->zl_znode != NULL)
 			VN_RELE(ZTOV(cur->zl_znode));
 		rw_exit(cur->zl_rwlock);
 
 		*zlpp = next;
 		kmem_free(cur, sizeof (*cur));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  *
  *	IN:	szp	- znode being renamed.
  *		tdzp	- target directory; the walk starts here.
  *		sdzp	- source directory; the walk stops on reaching it.
  *
  *	OUT:	zlpp	- list of the locks taken.  Locks may be held on
  *			  every return path (including errors), so the
  *			  caller releases them with zfs_rename_unlock().
  *
  *	RETURN:	0 if the walk found no conflict
  *		EINVAL if tdzp is szp or lies beneath it
  *		error from zfs_zget() if a parent znode cannot be obtained
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = zp->z_zfsvfs->z_root;
 	uint64_t	*oidp = &zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 *
 				 * rw == RW_READER means at least one pass
 				 * has completed, so zl is the entry most
 				 * recently pushed, i.e. the list head that
 				 * is also stored in *zlpp.  The state below
 				 * is reset to the first-pass values.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = &zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				/* 'continue' re-tests the loop condition. */
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (*oidp == szp->z_id)		/* We're a descendant of szp */
 			return (EINVAL);
 
 		if (*oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
 			if (error)
 				return (error);
 			/* Saved so zfs_rename_unlock() releases its vnode. */
 			zl->zl_znode = zp;
 		}
 		/* Step up: next pass examines zp's parent under zp's lock. */
 		oidp = &zp->z_phys->zp_parent;
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *		(EXDEV when the directories are on different filesystems,
  *		EINVAL for attribute/non-attribute mixes, "." / ".."
  *		names and moves of a directory beneath itself)
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
 static int
 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
 {
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = VTOZ(sdvp);
 	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr, error;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * Make sure we have the real vp for the target directory.
 	 */
 	if (VOP_REALVP(tdvp, &realvp) == 0)
 		tdvp = realvp;
 
 	if (tdvp->v_vfsp != sdvp->v_vfsp) {
 		ZFS_EXIT(zfsvfs);
 		return (EXDEV);
 	}
 
 	tdzp = VTOZ(tdvp);
 	/*
 	 * Retry point: the dmu_tx_assign() failure path below releases
 	 * everything acquired after this label before jumping back here.
 	 */
 top:
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
 	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		cmp = strcmp(snm, tnm);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 	}
 	/* Both locks are always attempted; failures are sorted out below. */
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
 		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
 	} else {
 		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				VN_RELE(ZTOV(tzp));
 		}
 		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (serr);
 	}
 	if (terr) {
 		/* Only the source lock succeeded; undo it. */
 		zfs_dirent_unlock(sdl);
 		VN_RELE(ZTOV(szp));
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (terr);
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
 		goto out;
 
 	if (ZTOV(szp)->v_type == VDIR) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 *
 		 * On failure zl may still carry locks; 'out' drops them.
 		 */
 		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if (ZTOV(szp)->v_type == VDIR) {
 			if (ZTOV(tzp)->v_type != VDIR) {
 				error = ENOTDIR;
 				goto out;
 			}
 		} else {
 			if (ZTOV(tzp)->v_type == VDIR) {
 				error = EISDIR;
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	}
 
 	vnevent_rename_src(ZTOV(szp));
 	if (tzp)
 		vnevent_rename_dest(ZTOV(tzp));
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
 	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp)
 		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
 	if (tzp)
 		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/*
 		 * Release every lock and hold taken since 'top' before
 		 * either retrying or returning.
 		 */
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		/* Create the new link first, then remove the old one. */
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 			ASSERT(error == 0);
 			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
 			    sdl->dl_name, tdzp, tdl->dl_name, szp);
 		}
 #ifdef FREEBSD_NAMECACHE
 		if (error == 0) {
 			cache_purge(sdvp);
 			cache_purge(tdvp);
 		}
 #endif
 	}
 
 	/* The transaction is committed whether or not the steps succeeded. */
 	dmu_tx_commit(tx);
 out:
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	VN_RELE(ZTOV(szp));
 	if (tzp)
 		VN_RELE(ZTOV(tzp));
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		name	- Name for new symlink entry.
  *		vap	- Attributes of new entry.
  *		link	- Target path of new symlink.
  *		cr	- credentials of caller.
  *		td	- thread; referenced only by the vn_lock() call.
  *
  *	OUT:	vpp	- vnode of the new symlink, returned locked
  *			  exclusively (set only on success).
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	uint64_t	zoid;
 	int		len = strlen(link);
 	int		error;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 	/* Retry point after a restartable dmu_tx_assign() failure. */
 top:
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (ENAMETOOLONG);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_bonus(tx, dzp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* The directory lock is dropped before retrying or failing. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	dmu_buf_will_dirty(dzp->z_dbuf, tx);
 
 	/*
 	 * Create a new object for the symlink.
 	 * Put the link content into bonus buffer if it will fit;
 	 * otherwise, store it just like any other file data.
 	 */
 	zoid = 0;
 	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
 		/* The path is stored directly after the znode_phys_t. */
 		if (len != 0)
 			bcopy(link, zp->z_phys + 1, len);
 	} else {
 		dmu_buf_t *dbp;
 
 		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
 
 		/*
 		 * Nothing can access the znode yet so no locking needed
 		 * for growing the znode's blocksize.
 		 */
 		zfs_grow_blocksize(zp, len, tx);
 
 		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
 		dmu_buf_will_dirty(dbp, tx);
 
 		ASSERT3U(len, <=, dbp->db_size);
 		bcopy(link, dbp->db_data, len);
 		dmu_buf_rele(dbp, FTAG);
 	}
 	zp->z_phys->zp_size = len;
 
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 	/*
 	 * No goto targets this label.  error is 0 on the fall-through path
 	 * because the zfs_link_create() result above is discarded and every
 	 * earlier failure either returned or jumped back to 'top'.
 	 */
 out:
 	if (error == 0) {
 		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
 		*vpp = ZTOV(zp);
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Copy the path stored in the symbolic link vp into the buffer described
  * by the caller's uio structure.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to receive the link path.
  *		cr	- credentials of caller (not referenced).
  *
  *	OUT:	uio	- receives at most uio_resid bytes of the path.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_buf_t	*dbp;
 	size_t		linklen;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 
 	linklen = (size_t)zp->z_phys->zp_size;
 
 	if (linklen + sizeof (znode_phys_t) > zp->z_dbuf->db_size) {
 		/*
 		 * The path does not fit behind the znode_phys_t in the
 		 * znode's own buffer, so it is read from the object's
 		 * data at offset 0.
 		 */
 		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		error = uiomove(dbp->db_data,
 		    MIN(linklen, uio->uio_resid), UIO_READ, uio);
 		dmu_buf_rele(dbp, FTAG);
 	} else {
 		/* The path sits directly after the znode_phys_t. */
 		error = uiomove(zp->z_phys + 1,
 		    MIN(linklen, uio->uio_resid), UIO_READ, uio);
 	}
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory entry in tdvp that refers to the existing
  * file svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
 	int		error;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 
 	if (VOP_REALVP(svp, &realvp) == 0)
 		svp = realvp;
 
 	if (svp->v_vfsp != tdvp->v_vfsp) {
 		error = EXDEV;
 		goto out;
 	}
 
 	szp = VTOZ(svp);
 top:
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
 	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* Non-owners need the basic-link privilege. */
 	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
 	    secpolicy_basic_link(cr) != 0) {
 		error = EPERM;
 		goto out;
 	}
 
 	error = zfs_zaccess(dzp, ACE_ADD_FILE, cr);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW);
 	if (error != 0)
 		goto out;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error != 0) {
 		/* Drop the directory lock before retrying or failing. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	error = zfs_link_create(dl, szp, tx, 0);
 	if (error == 0)
 		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
 
 	dmu_tx_commit(tx);
 	zfs_dirent_unlock(dl);
 
 out:
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Inactive handler for vp (reached through zfs_freebsd_inactive()).
  *
  * If the filesystem is flagged z_unmounted2, zero the vnode's v_count,
  * free the znode when it no longer has a dbuf, and drop the VFS hold.
  * Otherwise, for a znode that is not unlinked and has z_atime_dirty set,
  * dirty its dbuf in a transaction and clear the flag, then hand the
  * znode to zfs_zinactive().
  *
  * The whole routine runs with z_um_lock held as reader.
  * cr is not referenced.
  */
 void
 zfs_inactive(vnode_t *vp, cred_t *cr)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	rw_enter(&zfsvfs->z_um_lock, RW_READER);
 	if (zfsvfs->z_unmounted2) {
 		ASSERT(zp->z_dbuf_held == 0);
 
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		vp->v_count = 0; /* count arrives as 1 */
 		VI_UNLOCK(vp);
 		if (zp->z_dbuf == NULL) {
 			/* z_lock is released before the znode is freed. */
 			mutex_exit(&zp->z_lock);
 			zfs_znode_free(zp);
 		} else {
 			mutex_exit(&zp->z_lock);
 		}
 		rw_exit(&zfsvfs->z_um_lock);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			/* Failure is not reported; z_atime_dirty stays set. */
 			dmu_tx_abort(tx);
 		} else {
 			dmu_buf_will_dirty(zp->z_dbuf, tx);
 			mutex_enter(&zp->z_lock);
 			zp->z_atime_dirty = 0;
 			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_um_lock);
 }
 
 /*
  * zfs_fid() writes a zfid_short or zfid_long through a pointer to the
  * caller's struct fid, so both layouts must fit inside struct fid.
  */
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*
  * Fill in the file identifier for vp.
  *
  * The object number and generation are stored one byte at a time, least
  * significant byte first.  When z_parent differs from zfsvfs itself the
  * long form is produced, which additionally carries the objset id and a
  * zeroed objset generation.
  *
  *	RETURN:	0 always (after ZFS_ENTER succeeds)
  */
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid = (zfid_short_t *)fidp;
 	int		size, i;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (zfsvfs->z_parent != zfsvfs)
 		size = LONG_FID_LEN;
 	else
 		size = SHORT_FID_LEN;
 
 	fidp->fid_len = size;
 	zfid->zf_len = size;
 
 	/* Peel the object number off a byte at a time. */
 	for (i = 0; i < sizeof (zfid->zf_object); i++, object >>= 8)
 		zfid->zf_object[i] = (uint8_t)object;
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	if (gen == 0)
 		gen = 1;
 	for (i = 0; i < sizeof (zfid->zf_gen); i++, gen >>= 8)
 		zfid->zf_gen[i] = (uint8_t)gen;
 
 	if (size == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++, objsetid >>= 8)
 			zlfid->zf_setid[i] = (uint8_t)objsetid;
 
 		/* XXX - this should be the generation number for the objset */
 		i = sizeof (zlfid->zf_setgen);
 		while (i-- > 0)
 			zlfid->zf_setgen[i] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Answer the pathconf queries that ZFS handles itself, storing the value
  * in *valp.  Any other cmd yields EOPNOTSUPP.
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	zfs_dirlock_t	*dl;
 	int		error = 0;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = INT_MAX;
 		break;
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		break;
 
 #if 0
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		} else if (error == 0) {
 			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			VN_RELE(ZTOV(xzp));
 		}
 		ZFS_EXIT(zfsvfs);
 		break;
 #endif
 
 	case _PC_ACL_EXTENDED:
 		*valp = 0;	/* TODO */
 		break;
 
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 #ifdef TODO
 /*
  * Fetch the ACL of vp into vsecp via zfs_getacl(), bracketed by
  * ZFS_ENTER/ZFS_EXIT.  Compiled only when TODO is defined.
  */
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t		*node = VTOZ(vp);
 	zfsvfs_t	*fs = node->z_zfsvfs;
 	int		rc;
 
 	ZFS_ENTER(fs);
 	rc = zfs_getacl(node, vsecp, cr);
 	ZFS_EXIT(fs);
 
 	return (rc);
 }
 #endif	/* TODO */
 
 #ifdef TODO
 /*
  * Apply the ACL in vsecp to vp via zfs_setacl(), bracketed by
  * ZFS_ENTER/ZFS_EXIT.  Compiled only when TODO is defined.
  */
 /*ARGSUSED*/
 static int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
 	znode_t		*node = VTOZ(vp);
 	zfsvfs_t	*fs = node->z_zfsvfs;
 	int		rc;
 
 	ZFS_ENTER(fs);
 	rc = zfs_setacl(node, vsecp, cr);
 	ZFS_EXIT(fs);
 
 	return (rc);
 }
 #endif	/* TODO */
 
 /*
  * vop_open entry point: call zfs_open() and, when it succeeds, create
  * the vnode's VM object sized from the znode's zp_size.
  */
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vnode = ap->a_vp;
 	znode_t	*znode = VTOZ(vnode);
 	int	rc;
 
 	rc = zfs_open(&vnode, ap->a_mode, ap->a_cred);
 	if (rc != 0)
 		return (rc);
 
 	vnode_create_vobject(vnode, znode->z_phys->zp_size, ap->a_td);
 	return (0);
 }
 
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int mode;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 
 	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 
 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vop_stdfsync(ap);
 	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
 }
 
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
 }
 
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	/* No support for FreeBSD's chflags(2). */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
 	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
 
 	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
 	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
 
 	if (tdvp == tvp)
 		VN_RELE(tdvp);
 	else
 		VN_URELE(tdvp);
 	if (tvp)
 		VN_URELE(tvp);
 	VN_RELE(fdvp);
 	VN_RELE(fvp);
 
 	return (error);
 }
 
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
 }
 
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 /*
  * vop_inactive entry point: call zfs_inactive() with the thread's
  * credentials.  Always returns 0.
  */
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	zfs_inactive(ap->a_vp, ap->a_td->td_ucred);
 	return (0);
 }
 
 /*
  * vop_reclaim entry point: sever the association between the vnode and
  * its znode.
  *
  * The VM object is destroyed first.  For a znode that is not unlinked
  * the znode's vnode pointer is cleared and its dbuf hold is dropped.
  * In every case vp->v_data is cleared.  The VFS hold is released only
  * for a znode that is not unlinked whose vnode has v_count of zero.
  * Always returns 0.
  */
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs;
 	int rele = 1;
 
 	ASSERT(zp != NULL);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	mutex_enter(&zp->z_lock);
 	ASSERT(zp->z_phys);
 	ASSERT(zp->z_dbuf_held);
 	/* Saved now; used for VFS_RELE() once the locks are dropped. */
 	zfsvfs = zp->z_zfsvfs;
 	if (!zp->z_unlinked) {
 		zp->z_dbuf_held = 0;
 		ZTOV(zp) = NULL;
 		/* z_lock is dropped before the dbuf hold is released. */
 		mutex_exit(&zp->z_lock);
 		dmu_buf_rele(zp->z_dbuf, NULL);
 	} else {
 		/*
 		 * NOTE(review): an unlinked znode keeps its dbuf hold and
 		 * vnode pointer here - presumably it is cleaned up
 		 * elsewhere; confirm.
 		 */
 		mutex_exit(&zp->z_lock);
 	}
 	VI_LOCK(vp);
 	/* A vnode that still has users keeps the VFS hold. */
 	if (vp->v_count > 0)
 		rele = 0;
 	vp->v_data = NULL;
 	ASSERT(vp->v_holdcnt >= 1);
 	VI_UNLOCK(vp);
 	if (!zp->z_unlinked && rele)
 		VFS_RELE(zfsvfs->z_vfs);
 	return (0);
 }
 
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 
 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
 }
 
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
 	if (error == 0)
 		*ap->a_retval = val;
 	else if (error == EOPNOTSUPP)
 		error = vop_stdpathconf(ap);
 	return (error);
 }
 
 /*
  * Advisory record locking support
  */
 static int
 zfs_freebsd_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 	znode_t	*zp = VTOZ(ap->a_vp);
 
 	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
 }
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 
 struct vop_vector zfs_vnodeops = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfs_freebsd_inactive,
 	.vop_reclaim =	zfs_freebsd_reclaim,
 	.vop_access =	zfs_freebsd_access,
 #ifdef FREEBSD_NAMECACHE
 	.vop_lookup =	vfs_cache_lookup,
 	.vop_cachedlookup = zfs_freebsd_lookup,
 #else
 	.vop_lookup =	zfs_freebsd_lookup,
 #endif
 	.vop_getattr =	zfs_freebsd_getattr,
 	.vop_setattr =	zfs_freebsd_setattr,
 	.vop_create =	zfs_freebsd_create,
 	.vop_mknod =	zfs_freebsd_create,
 	.vop_mkdir =	zfs_freebsd_mkdir,
 	.vop_readdir =	zfs_freebsd_readdir,
 	.vop_fsync =	zfs_freebsd_fsync,
 	.vop_open =	zfs_freebsd_open,
 	.vop_close =	zfs_freebsd_close,
 	.vop_rmdir =	zfs_freebsd_rmdir,
 	.vop_ioctl =	zfs_freebsd_ioctl,
 	.vop_link =	zfs_freebsd_link,
 	.vop_symlink =	zfs_freebsd_symlink,
 	.vop_readlink =	zfs_freebsd_readlink,
 	.vop_read =	zfs_freebsd_read,
 	.vop_write =	zfs_freebsd_write,
 	.vop_remove =	zfs_freebsd_remove,
 	.vop_rename =	zfs_freebsd_rename,
 	.vop_advlock =	zfs_freebsd_advlock,
 	.vop_pathconf =	zfs_freebsd_pathconf,
 	.vop_bmap =	VOP_EOPNOTSUPP,
 	.vop_fid =	zfs_freebsd_fid,
 };
 
 /*
  * Vnode operation table for FIFOs on ZFS.  vop_default points at
  * fifo_specops; read, write and fsync are wired to VOP_PANIC.
  */
 struct vop_vector zfs_fifoops = {
 	.vop_default =	&fifo_specops,
 
 	/* Handled by ZFS. */
 	.vop_access =	zfs_freebsd_access,
 	.vop_getattr =	zfs_freebsd_getattr,
 	.vop_setattr =	zfs_freebsd_setattr,
 	.vop_fid =	zfs_freebsd_fid,
 	.vop_inactive =	zfs_freebsd_inactive,
 	.vop_reclaim =	zfs_freebsd_reclaim,
 
 	/* Wired to VOP_PANIC for FIFO vnodes. */
 	.vop_read =	VOP_PANIC,
 	.vop_write =	VOP_PANIC,
 	.vop_fsync =	VOP_PANIC,
 };
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 175201)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	(revision 175202)
@@ -1,1072 +1,1072 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #ifdef _KERNEL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/mntent.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/atomic.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/refcount.h>
 
 /*
  * Export sizeof(znode_t) as a read-only (CTLFLAG_RD) integer named
  * "znode" under the _debug_sizeof sysctl node.  Used by fstat(1).
  */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
     "sizeof(znode_t)");
 
 /*
  * Functions needed for userland (i.e. libzpool) are not put under
  * #ifdef _KERNEL; the rest of the functions have dependencies
  * (such as VFS logic) that will not compile easily in userland.
  */
 #ifdef _KERNEL
 /*
  * Cache from which znode_t objects are allocated.  Created by
  * zfs_znode_init() and destroyed by zfs_znode_fini(); NULL while no
  * cache exists.
  */
 struct kmem_cache *znode_cache = NULL;
 
 /*
  * Callback attached to a znode's bonus dbuf by zfs_znode_dmu_init()
  * (through dmu_buf_set_user_ie()); user_ptr is the znode.
  * NOTE(review): presumably invoked by the DMU when the dbuf is being
  * evicted - confirm against the DMU sources.
  *
  * One of three things happens, decided under zp->z_lock:
  *   - no vnode is attached: the znode is freed;
  *   - a vnode with v_count == 0 is attached: it is detached from the
  *     znode, recycled under its exclusive vnode lock, and the znode is
  *     then freed;
  *   - the vnode is still referenced: only z_dbuf is cleared.
  */
 /*ARGSUSED*/
 static void
 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
 {
 	znode_t *zp = user_ptr;
 	vnode_t *vp;
 
 	mutex_enter(&zp->z_lock);
 	vp = ZTOV(zp);
 	if (vp == NULL) {
 		mutex_exit(&zp->z_lock);
 		zfs_znode_free(zp);
 	} else if (vp->v_count == 0) {
 		/* Detach the vnode from the znode before z_lock is dropped. */
 		ZTOV(zp) = NULL;
 		/* Hold taken here is released by the vdrop() below. */
 		vhold(vp);
 		mutex_exit(&zp->z_lock);
 		/* vrecycle() is called with the vnode exclusively locked. */
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vrecycle(vp, curthread);
 		VOP_UNLOCK(vp, 0, curthread);
 		vdrop(vp);
 		zfs_znode_free(zp);
 	} else {
 		/* signal force unmount that this znode can be freed */
 		zp->z_dbuf = NULL;
 		mutex_exit(&zp->z_lock);
 	}
 }
 
 /*
  * Vnode operation vectors defined outside this file.  zfs_vnodeops is
  * passed to getnewvnode(); zfs_fifoops is installed on VFIFO vnodes.
  */
 extern struct vop_vector zfs_vnodeops;
 extern struct vop_vector zfs_fifoops;
 
 /*
  * XXX: We cannot use this function as a cache constructor, because
  *      there is one global cache for all file systems and we need
  *      to pass vfsp here, which is not possible, because argument
  *      'cdrarg' is defined at kmem_cache_create() time.
  */
 static int
 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
 {
 	znode_t *zp = buf;
 	vnode_t *vp;
 	vfs_t *vfsp = cdrarg;
 	int error;
 
 	if (cdrarg != NULL) {
 		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
 		ASSERT(error == 0);
 		zp->z_vnode = vp;
 		vp->v_data = (caddr_t)zp;
 		vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 		vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 	} else {
 		zp->z_vnode = NULL;
 	}
 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zp->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
 	zp->z_dbuf_held = 0;
 	zp->z_dirlocks = 0;
 	zp->z_lockf = NULL;
 	return (0);
 }
 
 /*
  * kmem cache destructor for znodes (registered in zfs_znode_init()).
  * Tears down the locks and the range AVL tree that
  * zfs_znode_cache_constructor() set up.  cdarg is not referenced.
  */
 /*ARGSUSED*/
 static void
 zfs_znode_cache_destructor(void *buf, void *cdarg)
 {
 	znode_t *zp = buf;
 
 	ASSERT(zp->z_dirlocks == 0);
 
 	/* Mutexes. */
 	mutex_destroy(&zp->z_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	mutex_destroy(&zp->z_range_lock);
 
 	/* Reader/writer locks. */
 	rw_destroy(&zp->z_map_lock);
 	rw_destroy(&zp->z_parent_lock);
 	rw_destroy(&zp->z_name_lock);
 
 	/* Range-lock tree. */
 	avl_destroy(&zp->z_range_avl);
 
 	ASSERT(zp->z_dbuf_held == 0);
 }
 
 /*
  * Create the global znode cache.  Only a destructor is registered; the
  * constructor slot is NULL and zfs_znode_cache_constructor() is called
  * explicitly by the allocation paths instead.
  */
 void
 zfs_znode_init(void)
 {
 	/* The cache must not exist yet. */
 	ASSERT(znode_cache == NULL);
 
 	znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t),
 	    0, NULL /* zfs_znode_cache_constructor */,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the global znode cache, if one exists, and reset the pointer
  * so that zfs_znode_init() may be called again.
  */
 void
 zfs_znode_fini(void)
 {
 	if (znode_cache != NULL) {
 		kmem_cache_destroy(znode_cache);
 		znode_cache = NULL;
 	}
 }
 
 /*
  * zfs_init_fs - Initialize the zfsvfs struct and the file system
  *	incore "master" object.  Verify version compatibility.
  *
  *	IN:	zfsvfs	- file system being set up; z_os and z_vfs are
  *			  read here
  *		cr	- credentials handed to zfs_create_fs() when the
  *			  objset has no master node yet
  *
  *	OUT:	zpp	- set to NULL on entry, then filled in with the
  *			  root znode by zfs_zget()
  *
  *	RETURN:	0 on success
  *		ENOTSUP if the on-disk ZPL version differs from
  *		ZPL_VERSION
  *		otherwise the error from zap_lookup() or zfs_zget()
  *
  * NOTE(review): the z_hold_mtx mutexes initialized below are not
  * destroyed on the error returns that follow them in this function;
  * confirm that the caller cleans them up.
  */
 int
 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
 {
 	objset_t	*os = zfsvfs->z_os;
 	uint64_t	version = ZPL_VERSION;
 	int		i, error;
 	dmu_object_info_t doi;
 	uint64_t fsid_guid;
 
 	*zpp = NULL;
 
 	/*
 	 * XXX - hack to auto-create the pool root filesystem at
 	 * the first attempted mount.
 	 */
 	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		ASSERT3U(error, ==, 0);
 		zfs_create_fs(os, cr, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/* Read the on-disk ZPL version and insist on an exact match. */
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
 	    &version);
 	if (error) {
 		return (error);
 	} else if (version != ZPL_VERSION) {
 		(void) printf("Mismatched versions:  File system "
 		    "is version %lld on-disk format, which is "
 		    "incompatible with this software version %lld!",
 		    (u_longlong_t)version, ZPL_VERSION);
 		return (ENOTSUP);
 	}
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
 	/*
 	 * '&' binds tighter than '|', so only vfc_typenum is masked with
 	 * 0xFF before being OR-ed into the shifted upper half.
 	 */
 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
 
 	/* Look up the object number of the root directory. */
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error)
 		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	/*
 	 * Create the per mount vop tables.
 	 */
 
 	/*
 	 * Initialize zget mutex's
 	 */
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/* Instantiate the root znode; this is what the caller gets back. */
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
 	if (error)
 		return (error);
 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
 
 	/* Look up the object number of the unlinked set. */
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error)
 		return (error);
 
 	return (0);
 }
 
 /*
  * define a couple of values we need available
  * for both 64 and 32 bit environments.
  *
  * Each definition is guarded by #ifndef, so a definition supplied by
  * an earlier header takes precedence over the fallback given here.
  */
 #ifndef NBITSMINOR64
 #define	NBITSMINOR64	32		/* shift used by zfs_expldev()/zfs_cmpldev() */
 #endif
 #ifndef MAXMAJ64
 #define	MAXMAJ64	0xffffffffUL
 #endif
 #ifndef	MAXMIN64
 #define	MAXMIN64	0xffffffffUL	/* mask applied to the minor in zfs_cmpldev() */
 #endif
 #ifndef major
 #define	major(x)	((int)(((u_int)(x) >> 8)&0xff))	/* major number */
 #endif
 #ifndef minor
 #define	minor(x)	((int)((x)&0xffff00ff))		/* minor number */
 #endif
 
 /*
  * Expand a dev_t into the 64-bit device number used privately by ZFS:
  * the major number goes above bit NBITSMINOR64 and the minor number is
  * OR-ed into the low bits.
  *
  * The standard expldev() is not suitable: it takes a dev32_t in LP64
  * and expands it to a long dev_t, whereas an interface is needed that
  * takes a dev32_t in ILP32 and expands it to a long dev_t.
  */
 static uint64_t
 zfs_expldev(dev_t dev)
 {
 	uint64_t shifted_major;
 
 	shifted_major = (uint64_t)major(dev) << NBITSMINOR64;
 
 	return (shifted_major | minor(dev));
 }
 /*
  * Compress a ZFS-private 64-bit device number back into a dev_t; the
  * inverse layout of zfs_expldev().
  *
  * The standard cmpldev() is not suitable: it takes a long dev_t and
  * compresses it to dev32_t in LP64, whereas a compaction of a long
  * dev_t to a dev32_t in ILP32 is needed.
  */
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
 	uint64_t hi_part, lo_part;
 
 	hi_part = dev >> NBITSMINOR64;	/* major number */
 	lo_part = dev & MAXMIN64;	/* minor number */
 
 	return (makedev(hi_part, lo_part));
 }
 
 /*
  * Construct a new znode/vnode pair and initialize it.
  *
  *	IN:	zfsvfs	- file system the znode belongs to
  *		db	- bonus dbuf; db_data becomes z_phys
  *		obj_num	- object number, stored in z_id
  *		blksz	- block size, stored in z_blksz
  *
  *	RETURN:	the new znode, already linked on zfsvfs->z_all_znodes
  *
  * This does not call dmu_set_user(); that is left to the caller, in
  * case the znode is not going to be returned.
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
 {
 	znode_t	*zp;
 	vnode_t *vnode;
 	int error;
 
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
 
 	ASSERT(zp->z_dirlocks == NULL);
 
 	/* Identity of the object this znode represents. */
 	zp->z_zfsvfs = zfsvfs;
 	zp->z_dbuf = db;
 	zp->z_phys = db->db_data;
 	zp->z_id = obj_num;
 	zp->z_blksz = blksz;
 
 	/* Fresh in-core state. */
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;
 	zp->z_dbuf_held = 0;
 	zp->z_mapcnt = 0;
 	zp->z_last_itx = 0;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
 
 	/* Make the znode visible on the per-filesystem list. */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/* The constructor attaches no vnode when z_vfs is NULL. */
 	vnode = ZTOV(zp);
 	if (vnode != NULL) {
 		error = insmntque(vnode, zfsvfs->z_vfs);
 		KASSERT(error == 0, ("insmntque() failed: error %d", error));
 
 		vnode->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
 		if (vnode->v_type == VDIR) {
 			/* z_prefetch default is enabled */
 			zp->z_zn_prefetch = B_TRUE;
 		} else if (vnode->v_type == VFIFO) {
 			vnode->v_op = &zfs_fifoops;
 		}
 	}
 
 	return (zp);
 }
 
 /*
  * Attach znode zp to its bonus dbuf and mark the dbuf as held.
  *
  * Under zp->z_lock this registers zp as the user of zp->z_dbuf (with
  * znode_pageout_func as the callback), sets VROOT on the vnode when zp
  * is the root of the file system, sets z_dbuf_held and takes a hold on
  * the vfs.  The vnode is dereferenced unconditionally in the root case,
  * so zp must have one there.
  */
 static void
 zfs_znode_dmu_init(znode_t *zp)
 {
 	znode_t		*nzp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_buf_t	*db = zp->z_dbuf;
 
 	mutex_enter(&zp->z_lock);
 
 	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
 
 	/*
 	 * there should be no
 	 * concurrent zgets on this object.
 	 */
 	ASSERT3P(nzp, ==, NULL);
 
 	/*
 	 * Slap on VROOT if we are the root znode
 	 */
 	if (zp->z_id == zfsvfs->z_root) {
 		ZTOV(zp)->v_flag |= VROOT;
 	}
 
 	/* Record the dbuf hold and pair it with a hold on the vfs. */
 	ASSERT(zp->z_dbuf_held == 0);
 	zp->z_dbuf_held = 1;
 	VFS_HOLD(zfsvfs->z_vfs);
 	mutex_exit(&zp->z_lock);
 }
 
 /*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode
  *		vap	- file attributes for new znode
  *		tx	- dmu transaction id for zap operations
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_XATTR	- new object is an attribute
  *			  IS_REPLAY	- intent log replay
  *		bonuslen - extra bytes added to sizeof (znode_phys_t)
  *			  when sizing the object's bonus buffer
  *
  *	OUT:	oid	- ID of created object
  *		zpp	- if non-NULL, receives the new znode after it
  *			  has been attached to its dbuf; if NULL, the
  *			  in-core znode is freed again before returning
  *
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
 	uint_t flag, znode_t **zpp, int bonuslen)
 {
 	dmu_buf_t	*dbp;
 	znode_phys_t	*pzp;
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	timestruc_t	now;
 	uint64_t	gen;
 	int		err;
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
 	/*
 	 * During replay the object id, creation time and generation are
 	 * taken from vap; otherwise they are generated here.
 	 */
 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
 		*oid = vap->va_nodeid;
 		flag |= IS_REPLAY;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
 		*oid = 0;
 		gethrestime(&now);
 		gen = dmu_tx_get_txg(tx);
 	}
 
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
 	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
 		/* Directories are ZAP objects. */
 		if (flag & IS_REPLAY) {
 			err = zap_create_claim(zfsvfs->z_os, *oid,
 			    DMU_OT_DIRECTORY_CONTENTS,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 			ASSERT3U(err, ==, 0);
 		} else {
 			*oid = zap_create(zfsvfs->z_os,
 			    DMU_OT_DIRECTORY_CONTENTS,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 		}
 	} else {
 		/* Everything else is a plain-file object. */
 		if (flag & IS_REPLAY) {
 			err = dmu_object_claim(zfsvfs->z_os, *oid,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 			ASSERT3U(err, ==, 0);
 		} else {
 			*oid = dmu_object_alloc(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 		}
 	}
 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
 	dmu_buf_will_dirty(dbp, tx);
 
 	/*
 	 * Initialize the znode physical data to zero.
 	 */
 	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
 	bzero(dbp->db_data, dbp->db_size);
 	pzp = dbp->db_data;
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
 		dzp->z_phys = pzp;
 		dzp->z_id = *oid;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
 		flag |= IS_XATTR;
 
 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
 	}
 
 	if (vap->va_type == VDIR) {
 		pzp->zp_size = 2;		/* contents ("." and "..") */
 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
 	}
 
 	pzp->zp_parent = dzp->z_id;
 	if (flag & IS_XATTR)
 		pzp->zp_flags |= ZFS_XATTR;
 
 	pzp->zp_gen = gen;
 
 	/* crtime and ctime always use 'now'; atime/mtime may come from vap. */
 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
 
 	if (vap->va_mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
 	} else {
 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
 	}
 
 	if (vap->va_mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
 	} else {
 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
 	}
 
 	/* zp_mode must be set before zfs_znode_alloc() derives v_type from it. */
 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
 
 	zfs_perm_init(zp, dzp, flag, vap, tx, cr);
 
 	if (zpp) {
 		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
 
 		/* Attach the znode to its dbuf under the per-object mutex. */
 		mutex_enter(hash_mtx);
 		zfs_znode_dmu_init(zp);
 		mutex_exit(hash_mtx);
 
 		*zpp = zp;
 	} else {
 		/* Caller does not want the znode: undo the in-core state. */
 		if (ZTOV(zp) != NULL)
 			ZTOV(zp)->v_count = 0;
 		dmu_buf_rele(dbp, NULL);
 		zfs_znode_free(zp);
 	}
 }
 
 /*
  * Obtain the in-core znode for object obj_num, creating it if needed.
  *
  *	IN:	zfsvfs	- file system the object belongs to
  *		obj_num	- object number to look up
  *
  *	OUT:	zpp	- the znode; NULL on every error return
  *
  *	RETURN:	0 on success
  *		error from dmu_bonus_hold() if the bonus buffer cannot
  *		be held
  *		EINVAL if the object's bonus buffer is not a znode
  *		ENOENT if the znode exists but has been unlinked
  *
  * The whole lookup runs under the per-object hold taken with
  * ZFS_OBJ_HOLD_ENTER() and dropped with ZFS_OBJ_HOLD_EXIT().
  */
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
 	vnode_t		*vp;
 	int err;
 
 	*zpp = NULL;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot hold a znode_phys_t. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
 		dmu_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (EINVAL);
 	}
 
 	ASSERT(db->db_object == obj_num);
 	ASSERT(db->db_offset == -1);
 	ASSERT(db->db_data != NULL);
 
 	zp = dmu_buf_get_user(db);
 
 	if (zp != NULL) {
 		/* A znode is already attached to this dbuf: reuse it. */
 		mutex_enter(&zp->z_lock);
 
 		ASSERT3U(zp->z_id, ==, obj_num);
 		if (zp->z_unlinked) {
 			dmu_buf_rele(db, NULL);
 			mutex_exit(&zp->z_lock);
 			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 			return (ENOENT);
 		} else if (zp->z_dbuf_held) {
 			/* The znode already owns a hold; drop the extra one. */
 			dmu_buf_rele(db, NULL);
 		} else {
 			/* Hand the hold taken above over to the znode. */
 			zp->z_dbuf_held = 1;
 			VFS_HOLD(zfsvfs->z_vfs);
 		}
 
 		if (ZTOV(zp) != NULL)
 			VN_HOLD(ZTOV(zp));
 		else {
 			/*
 			 * The znode has lost its vnode; give it a new one,
 			 * set up the same way zfs_znode_alloc() does.
 			 */
 			err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
 			    &zp->z_vnode);
 			ASSERT(err == 0);
 			vp = ZTOV(zp);
 			vp->v_data = (caddr_t)zp;
 			vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 			vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 			vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
 			switch (vp->v_type) {
 			case VDIR:
 				/* z_prefetch default is enabled */
 				zp->z_zn_prefetch = B_TRUE;
 				break;
 			case VFIFO:
 				/*
 				 * FIFOs need the fifo operations vector, as
 				 * in zfs_znode_alloc(); getnewvnode() above
 				 * installed zfs_vnodeops.
 				 */
 				vp->v_op = &zfs_fifoops;
 				break;
 			default:
 				break;
 			}
 			err = insmntque(vp, zfsvfs->z_vfs);
 			KASSERT(err == 0, ("insmntque() failed: error %d", err));
 		}
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		*zpp = zp;
 		return (0);
 	}
 
 	/*
 	 * Not found create new znode/vnode
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
 	ASSERT3U(zp->z_id, ==, obj_num);
 	zfs_znode_dmu_init(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 	*zpp = zp;
 	return (0);
 }
 
 /*
  * Free the DMU object behind zp (and its external ACL object, if it
  * has one) within transaction tx, then give up the znode's dbuf hold.
  * The frees are done under the per-object hold; the dbuf is released
  * after that hold has been dropped.
  */
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zfsvfs->z_os;
 	uint64_t acl_obj;
 	int error;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
 
 	/* A non-zero value names a separate object holding the ACL. */
 	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
 	if (acl_obj != 0) {
 		error = dmu_object_free(os, acl_obj, tx);
 		ASSERT3U(error, ==, 0);
 	}
 
 	error = dmu_object_free(os, zp->z_id, tx);
 	ASSERT3U(error, ==, 0);
 
 	zp->z_dbuf_held = 0;
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
 
 	dmu_buf_rele(zp->z_dbuf, NULL);
 }
 
 /*
  * Called when the vnode of zp may have lost its last reference.
  *
  * Under the per-object hold and zp->z_lock: if the vnode's v_count is
  * still greater than zero nothing is done.  Otherwise, if the znode has
  * been unlinked, the vnode is detached and recycled, the file is
  * removed with zfs_rmnode() and the vfs hold is released.  If the
  * znode is still linked it is left as it is.
  */
 void
 zfs_zinactive(znode_t *zp)
 {
 	vnode_t	*vp = ZTOV(zp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;
 
 	ASSERT(zp->z_dbuf_held && zp->z_phys);
 
 	/*
 	 * Don't allow a zfs_zget() while we're trying to release this znode
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
 	mutex_enter(&zp->z_lock);
 	/* v_count is examined with the vnode interlock held. */
 	VI_LOCK(vp);
 	if (vp->v_count > 0) {
 		/*
 		 * If the hold count is greater than zero, somebody has
 		 * obtained a new reference on this znode while we were
 		 * processing it here, so we are done.
 		 */
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		return;
 	}
 	VI_UNLOCK(vp);
 
 	/*
 	 * If this was the last reference to a file with no links,
 	 * remove the file from the file system.
 	 */
 	if (zp->z_unlinked) {
 		/* Detach the vnode before any lock is dropped. */
 		ZTOV(zp) = NULL;
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		ASSERT(vp->v_count == 0);
 		vrecycle(vp, curthread);
 		zfs_rmnode(zp);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
 	ASSERT(zp->z_phys);
 	ASSERT(zp->z_dbuf_held);
 	mutex_exit(&zp->z_lock);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 }
 
 /*
  * Unlink zp from its file system's z_all_znodes list (under
  * z_znodes_lock) and return its memory to the znode cache.
  */
 void
 zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *owner = zp->z_zfsvfs;
 	kmutex_t *list_lock = &owner->z_znodes_lock;
 
 	mutex_enter(list_lock);
 	list_remove(&owner->z_all_znodes, zp);
 	mutex_exit(list_lock);
 
 	kmem_cache_free(znode_cache, zp);
 }
 
 /*
  * Stamp the timestamps selected by flag (AT_ATIME, AT_MTIME, AT_CTIME)
  * with the current time.  The caller must hold zp->z_lock.
  *
  * With a transaction the dbuf is marked dirty in tx, z_atime_dirty is
  * cleared and z_seq is bumped; without one the znode is only flagged
  * through z_atime_dirty.
  */
 void
 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
 {
 	timestruc_t	stamp;
 
 	ASSERT(MUTEX_HELD(&zp->z_lock));
 
 	gethrestime(&stamp);
 
 	if (tx == NULL) {
 		zp->z_atime_dirty = 1;
 	} else {
 		dmu_buf_will_dirty(zp->z_dbuf, tx);
 		zp->z_atime_dirty = 0;
 		zp->z_seq++;
 	}
 
 	if (flag & AT_ATIME) {
 		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_atime);
 	}
 
 	if (flag & AT_MTIME) {
 		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_mtime);
 	}
 
 	if (flag & AT_CTIME) {
 		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_ctime);
 	}
 }
 
 /*
  * Update the requested znode timestamps with the current time.
  * If we are in a transaction, then go ahead and mark the znode
  * dirty in the transaction so the timestamps will go to disk.
  * Otherwise, we will get pushed next time the znode is updated
  * in a transaction, or when this znode eventually goes inactive.
  *
  * Why is this OK?
  *  1 - Only the ACCESS time is ever updated outside of a transaction.
  *  2 - Multiple consecutive updates will be collapsed into a single
  *	znode update by the transaction grouping semantics of the DMU.
  *
  *	IN:	zp	- znode whose timestamps are updated
  *		flag	- which of AT_ATIME, AT_MTIME, AT_CTIME to set
  *		tx	- open transaction, or NULL
  *
  * This is the wrapper that takes zp->z_lock around
  * zfs_time_stamper_locked().
  */
 void
 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
 {
 	mutex_enter(&zp->z_lock);
 	zfs_time_stamper_locked(zp, flag, tx);
 	mutex_exit(&zp->z_lock);
 }
 
 /*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of the file whose block size is grown
  *		size	- requested block size
  *		tx	- open transaction
  *
  * Nothing happens when size does not exceed the current block size,
  * or when the file is already larger than its (non-zero) current
  * block size: with more than one block in a file the block size
  * cannot change.  ENOTSUP from the DMU is silently accepted.
  *
  * NOTE: this function assumes that the znode is write locked.
  */
 void
 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
 {
 	int		error;
 	u_longlong_t	ignored;
 
 	if (size <= zp->z_blksz ||
 	    (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz))
 		return;
 
 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
 	    size, 0, tx);
 	if (error != ENOTSUP) {
 		ASSERT3U(error, ==, 0);
 
 		/* What blocksize did we actually get? */
 		dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &ignored);
 	}
 }
 
 /*
  * Free space in a file.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of section to free.
  *		len	- length of section to free (0 => to EOF).
  *		flag	- current file open mode flags (not referenced
  *			  in this function).
  *		log	- when true, the timestamps are updated and the
  *			  truncate is recorded in the intent log.
  *
  * 	RETURN:	0 if success
  *		error code if failure
  *
  * The only failure reported is the one from dmu_tx_assign(); the
  * result of vinvalbuf() at the end is stored but not returned.
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 {
 	vnode_t *vp = ZTOV(zp);
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog = zfsvfs->z_log;
 	rl_t *rl;
 	uint64_t end = off + len;
 	uint64_t size, new_blksz;
 	int error;
 
 	/* FIFOs are left alone. */
 	if (ZTOV(zp)->v_type == VFIFO)
 		return (0);
 
 	/*
 	 * If we will change zp_size then lock the whole file,
 	 * otherwise just lock the range being freed.
 	 */
 	if (len == 0 || off + len > zp->z_phys->zp_size) {
 		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 	} else {
 		rl = zfs_range_lock(zp, off, len, RL_WRITER);
 		/* recheck, in case zp_size changed */
 		if (off + len > zp->z_phys->zp_size) {
 			/* lost race: file size changed, lock whole file */
 			zfs_range_unlock(rl);
 			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 		}
 	}
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	size = zp->z_phys->zp_size;
 	if (len == 0 && size == off && off != 0) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
 
 	/* Declare to the transaction what is about to be modified. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	new_blksz = 0;
 	if (end > size &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
 			ASSERT(!ISP2(zp->z_blksz));
 			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
 		} else {
 			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
 		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
 	} else if (off < size) {
 		/*
 		 * If len == 0, we are truncating the file.
 		 */
 		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
 	}
 
 	error = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (error) {
 		/* Wait for the next txg only in the ERESTART/TXG_NOWAIT case. */
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
 			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
 
 	if (new_blksz)
 		zfs_grow_blocksize(zp, new_blksz, tx);
 
 	/* The size changes when extending the file or truncating it. */
 	if (end > size || len == 0)
 		zp->z_phys->zp_size = end;
 
 	if (off < size) {
 		objset_t *os = zfsvfs->z_os;
 		uint64_t rlen = len;
 
 		/* rlen: -1 when truncating, else clipped to the old size. */
 		if (len == 0)
 			rlen = -1;
 		else if (end > size)
 			rlen = size - off;
 		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
 	}
 
 	if (log) {
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
 		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 	}
 
 	zfs_range_unlock(rl);
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
 	rw_enter(&zp->z_map_lock, RW_WRITER);
 	if (end > size)
 		vnode_pager_setsize(vp, end);
 	else if (len == 0) {
 #if 0
 		error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
 #else
 		/* NOTE(review): 'error' is assigned here but never checked. */
 		error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
 		vnode_pager_setsize(vp, end);
 #endif
 	}
 	rw_exit(&zp->z_map_lock);
 
 	return (0);
 }
 
 /*
  * Populate an empty objset with the objects a ZFS file system needs:
  * the master node (holding the ZPL version), the unlinked set and the
  * root directory, all within transaction tx.
  *
  *	IN:	os	- objset to populate
  *		cr	- credentials passed on to zfs_mknode()
  *		tx	- open transaction
  *
  * A throw-away zfsvfs_t on the stack and a placeholder parent znode
  * are used so that zfs_mknode() can create the root directory.
  */
 void
 zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
 {
 	zfsvfs_t	zfsvfs;
 	uint64_t	moid, doid, roid = 0;
 	uint64_t	version = ZPL_VERSION;
 	int		error;
 	znode_t		*rootzp = NULL;
 	vattr_t		vattr;
 
 	/*
 	 * First attempt to create master node.
 	 */
 	/*
 	 * In an empty objset, there are no blocks to read and thus
 	 * there can be no i/o errors (which we assert below).
 	 */
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Set starting attributes.
 	 */
 
 	error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create a delete queue.
 	 */
 	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
 	 */
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = UID_ROOT;
 	vattr.va_gid = GID_WHEEL;
 
 	/* A NULL cdrarg makes the constructor skip vnode creation. */
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	zfs_znode_cache_constructor(rootzp, NULL, 0);
 	rootzp->z_zfsvfs = &zfsvfs;
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_dbuf_held = 0;
 
 	bzero(&zfsvfs, sizeof (zfsvfs_t));
 
 	zfsvfs.z_os = os;
 	zfsvfs.z_assign = TXG_NOWAIT;
 	zfsvfs.z_parent = &zfsvfs;
 
 	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	/*
 	 * With IS_ROOT_NODE, zfs_mknode() points rootzp's z_phys and z_id
 	 * at the new object; with zpp == NULL it frees the znode it
 	 * allocated for that object before returning.
 	 */
 	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
 	ASSERT3U(rootzp->z_id, ==, roid);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * rootzp was never put on z_all_znodes, so it goes straight back
 	 * to the cache rather than through zfs_znode_free().
 	 * NOTE(review): no list_destroy() is paired with the
 	 * list_create() above - confirm that none is needed.
 	 */
 	mutex_destroy(&zfsvfs.z_znodes_lock);
 	kmem_cache_free(znode_cache, rootzp);
 }
 #endif /* _KERNEL */
 
 /*
  * Look up the parent of an object.
  *
  *	IN:	osp	- objset containing the object
  *		obj	- object number to examine
  *
  *	OUT:	pobjp	- object number of the parent (zp_parent)
  *		is_xattrdir - non-zero when the object carries ZFS_XATTR
  *			  and is a directory
  *
  *	RETURN:	0 on success
  *		error from dmu_bonus_hold()
  *		EINVAL if the bonus buffer does not hold a znode
  *
  * The outputs are written on success only.
  */
 static int
 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	znode_phys_t *phys;
 	int error;
 
 	error = dmu_bonus_hold(osp, obj, FTAG, &db);
 	if (error != 0)
 		return (error);
 
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size >= sizeof (znode_phys_t)) {
 		phys = db->db_data;
 		*pobjp = phys->zp_parent;
 		*is_xattrdir = ((phys->zp_flags & ZFS_XATTR) != 0) &&
 		    S_ISDIR(phys->zp_mode);
 		error = 0;
 	} else {
 		error = EINVAL;
 	}
 
 	/* The hold is dropped on both outcomes. */
 	dmu_buf_rele(db, FTAG);
 
 	return (error);
 }
 
 /*
  * Build the path of object obj, relative to the root of objset osp.
  *
  *	IN:	osp	- objset containing the object
  *		obj	- object number to resolve
  *		len	- size of buf in bytes
  *
  *	OUT:	buf	- NUL-terminated path on success
  *
  *	RETURN:	0 on success
  *		EINVAL if len is not positive
  *		ENAMETOOLONG if the path does not fit in buf
  *		otherwise the error from zfs_obj_to_pobj() or
  *		zap_value_search()
  *
  * The path is assembled backwards from the end of buf, one "/name"
  * component per parent step, and moved to the front of buf at the
  * end.  An extended attribute directory appears as "<xattrdir>".
  */
 int
 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 {
 	char *path;
 	int error;
 
 	/* Room is needed for at least the terminating NUL. */
 	if (len <= 0)
 		return (EINVAL);
 
 	path = buf + len - 1;
 	*path = '\0';
 
 	for (;;) {
 		uint64_t pobj;
 		char component[MAXNAMELEN + 2];
 		size_t complen;
 		int is_xattrdir;
 
 		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
 		    &is_xattrdir)) != 0)
 			break;
 
 		/* An object that is its own parent is the root. */
 		if (pobj == obj) {
 			if (path[0] != '/') {
 				if (path == buf) {
 					error = ENAMETOOLONG;
 					break;
 				}
 				*--path = '/';
 			}
 			break;
 		}
 
 		component[0] = '/';
 		if (is_xattrdir) {
 			(void) sprintf(component + 1, "<xattrdir>");
 		} else {
 			error = zap_value_search(osp, pobj, obj, component + 1);
 			if (error != 0)
 				break;
 		}
 
 		/*
 		 * Checked at run time: an ASSERT alone is compiled out of
 		 * non-debug builds and would let the copy below write
 		 * before the start of buf.
 		 */
 		complen = strlen(component);
 		if (complen > (size_t)(path - buf)) {
 			error = ENAMETOOLONG;
 			break;
 		}
 		path -= complen;
 		bcopy(component, path, complen);
 		obj = pobj;
 	}
 
 	if (error == 0)
 		(void) memmove(buf, path, buf + len - path);
 	return (error);
 }
Index: head/sys/dev/hwpmc/hwpmc_mod.c
===================================================================
--- head/sys/dev/hwpmc/hwpmc_mod.c	(revision 175201)
+++ head/sys/dev/hwpmc/hwpmc_mod.c	(revision 175202)
@@ -1,4627 +1,4627 @@
 /*-
  * Copyright (c) 2003-2007 Joseph Koshy
  * Copyright (c) 2007 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/pmc.h>
 #include <sys/pmckern.h>
 #include <sys/pmclog.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <sys/linker.h>		/* needs to be after <sys/malloc.h> */
 
 #include <machine/atomic.h>
 #include <machine/md_var.h>
 
 /*
  * Types
  */
 
 enum pmc_flags {
 	PMC_FLAG_NONE	  = 0x00, /* do nothing */
 	PMC_FLAG_REMOVE   = 0x01, /* atomically remove entry from hash */
 	PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */
 };
 
 /*
  * The offset in sysent where the syscall is allocated.
  */
 
 static int pmc_syscall_num = NO_SYSCALL;
 struct pmc_cpu		**pmc_pcpu;	 /* per-cpu state */
 pmc_value_t		*pmc_pcpu_saved; /* saved PMC values: CSW handling */
 
 #define	PMC_PCPU_SAVED(C,R)	pmc_pcpu_saved[(R) + md->pmd_npmc*(C)]
 
 struct mtx_pool		*pmc_mtxpool;
 static int		*pmc_pmcdisp;	 /* PMC row dispositions */
 
 #define	PMC_ROW_DISP_IS_FREE(R)		(pmc_pmcdisp[(R)] == 0)
 #define	PMC_ROW_DISP_IS_THREAD(R)	(pmc_pmcdisp[(R)] > 0)
 #define	PMC_ROW_DISP_IS_STANDALONE(R)	(pmc_pmcdisp[(R)] < 0)
 
 #define	PMC_MARK_ROW_FREE(R) do {					  \
 	pmc_pmcdisp[(R)] = 0;						  \
 } while (0)
 
 #define	PMC_MARK_ROW_STANDALONE(R) do {					  \
 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
 		    __LINE__));						  \
 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
 	KASSERT(pmc_pmcdisp[(R)] >= (-mp_ncpus), ("[pmc,%d] row "	  \
 		"disposition error", __LINE__));			  \
 } while (0)
 
 #define	PMC_UNMARK_ROW_STANDALONE(R) do { 				  \
 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
 		    __LINE__));						  \
 } while (0)
 
 #define	PMC_MARK_ROW_THREAD(R) do {					  \
 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
 		    __LINE__));						  \
 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
 } while (0)
 
 #define	PMC_UNMARK_ROW_THREAD(R) do {					  \
 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
 		    __LINE__));						  \
 } while (0)
 
 
 /* various event handlers */
 static eventhandler_tag	pmc_exit_tag, pmc_fork_tag;
 
 /* Module statistics */
 struct pmc_op_getdriverstats pmc_stats;
 
 /* Machine/processor dependent operations */
 struct pmc_mdep  *md;
 
 /*
  * Hash tables mapping owner processes and target threads to PMCs.
  */
 
 struct mtx pmc_processhash_mtx;		/* spin mutex */
 static u_long pmc_processhashmask;
 static LIST_HEAD(pmc_processhash, pmc_process)	*pmc_processhash;
 
 /*
  * Hash table of PMC owner descriptors.  This table is protected by
  * the shared PMC "sx" lock.
  */
 
 static u_long pmc_ownerhashmask;
 static LIST_HEAD(pmc_ownerhash, pmc_owner)	*pmc_ownerhash;
 
 /*
  * List of PMC owners with system-wide sampling PMCs.
  */
 
 static LIST_HEAD(, pmc_owner)			pmc_ss_owners;
 
 
 /*
  * Prototypes
  */
 
 #ifdef	DEBUG
 static int	pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS);
 static int	pmc_debugflags_parse(char *newstr, char *fence);
 #endif
 
 static int	load(struct module *module, int cmd, void *arg);
 static int	pmc_attach_process(struct proc *p, struct pmc *pm);
 static struct pmc *pmc_allocate_pmc_descriptor(void);
 static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p);
 static int	pmc_attach_one_process(struct proc *p, struct pmc *pm);
 static int	pmc_can_allocate_rowindex(struct proc *p, unsigned int ri,
     int cpu);
 static int	pmc_can_attach(struct pmc *pm, struct proc *p);
 static void	pmc_capture_user_callchain(int cpu, struct trapframe *tf);
 static void	pmc_cleanup(void);
 static int	pmc_detach_process(struct proc *p, struct pmc *pm);
 static int	pmc_detach_one_process(struct proc *p, struct pmc *pm,
     int flags);
 static void	pmc_destroy_owner_descriptor(struct pmc_owner *po);
 static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
 static int	pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
 static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
     pmc_id_t pmc);
 static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
     uint32_t mode);
 static void	pmc_force_context_switch(void);
 static void	pmc_link_target_process(struct pmc *pm,
     struct pmc_process *pp);
 static void	pmc_log_all_process_mappings(struct pmc_owner *po);
 static void	pmc_log_kernel_mappings(struct pmc *pm);
 static void	pmc_log_process_mappings(struct pmc_owner *po, struct proc *p);
 static void	pmc_maybe_remove_owner(struct pmc_owner *po);
 static void	pmc_process_csw_in(struct thread *td);
 static void	pmc_process_csw_out(struct thread *td);
 static void	pmc_process_exit(void *arg, struct proc *p);
 static void	pmc_process_fork(void *arg, struct proc *p1,
     struct proc *p2, int n);
 static void	pmc_process_samples(int cpu);
 static void	pmc_release_pmc_descriptor(struct pmc *pmc);
 static void	pmc_remove_owner(struct pmc_owner *po);
 static void	pmc_remove_process_descriptor(struct pmc_process *pp);
 static void	pmc_restore_cpu_binding(struct pmc_binding *pb);
 static void	pmc_save_cpu_binding(struct pmc_binding *pb);
 static void	pmc_select_cpu(int cpu);
 static int	pmc_start(struct pmc *pm);
 static int	pmc_stop(struct pmc *pm);
 static int	pmc_syscall_handler(struct thread *td, void *syscall_args);
 static void	pmc_unlink_target_process(struct pmc *pmc,
     struct pmc_process *pp);
 
 /*
  * Kernel tunables and sysctl(8) interface.
  */
 
 SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
 
 static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "callchaindepth", &pmc_callchaindepth);
 SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_TUN|CTLFLAG_RD,
     &pmc_callchaindepth, 0, "depth of call chain records");
 
 #ifdef	DEBUG
 struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS;
 char	pmc_debugstr[PMC_DEBUG_STRSIZE];
 TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr,
     sizeof(pmc_debugstr));
 SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags,
     CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN,
     0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags");
 #endif
 
 /*
  * kern.hwpmc.hashrows -- determines the number of rows in the
  * of the hash table used to look up threads
  */
 
 static int pmc_hashsize = PMC_HASH_SIZE;
 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize);
 SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD,
     &pmc_hashsize, 0, "rows in hash tables");
 
 /*
  * kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU
  */
 
 static int pmc_nsamples = PMC_NSAMPLES;
 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples);
 SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD,
     &pmc_nsamples, 0, "number of PC samples per CPU");
 
 
 /*
  * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
  */
 
 static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE;
 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size);
 SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD,
     &pmc_mtxpool_size, 0, "size of spin mutex pool");
 
 
 /*
  * security.bsd.unprivileged_syspmcs -- allow non-root processes to
  * allocate system-wide PMCs.
  *
  * Allowing unprivileged processes to allocate system PMCs is convenient
  * if system-wide measurements need to be taken concurrently with other
  * per-process measurements.  This feature is turned off by default.
  */
 
 static int pmc_unprivileged_syspmcs = 0;
 TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs);
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW,
     &pmc_unprivileged_syspmcs, 0,
     "allow unprivileged process to allocate system PMCs");
 
 /*
  * Hash function.  Discard the lower 2 bits of the pointer since
  * these are always zero for our uses.  The hash multiplier is
  * round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
  */
 
 #if	LONG_BIT == 64
 #define	_PMC_HM		11400714819323198486u
 #elif	LONG_BIT == 32
 #define	_PMC_HM		2654435769u
 #else
 #error 	Must know the size of 'long' to compile
 #endif
 
 #define	PMC_HASH_PTR(P,M)	((((unsigned long) (P) >> 2) * _PMC_HM) & (M))
 
 /*
  * Syscall structures
  */
 
 /* The `sysent' for the new syscall */
 static struct sysent pmc_sysent = {
 	2,			/* sy_narg */
 	pmc_syscall_handler	/* sy_call */
 };
 
 static struct syscall_module_data pmc_syscall_mod = {
 	load,
 	NULL,
 	&pmc_syscall_num,
 	&pmc_sysent,
 	{ 0, NULL }
 };
 
 static moduledata_t pmc_mod = {
 	PMC_MODULE_NAME,
 	syscall_module_handler,
 	&pmc_syscall_mod
 };
 
 DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
 MODULE_VERSION(pmc, PMC_VERSION);
 
 #ifdef	DEBUG
 enum pmc_dbgparse_state {
 	PMCDS_WS,		/* in whitespace */
 	PMCDS_MAJOR,		/* seen a major keyword */
 	PMCDS_MINOR
 };
 
 static int
 pmc_debugflags_parse(char *newstr, char *fence)
 {
 	char c, *p, *q;
 	struct pmc_debugflags *tmpflags;
 	int error, found, *newbits, tmp;
 	size_t kwlen;
 
 	MALLOC(tmpflags, struct pmc_debugflags *, sizeof(*tmpflags),
 	    M_PMC, M_WAITOK|M_ZERO);
 
 	p = newstr;
 	error = 0;
 
 	for (; p < fence && (c = *p); p++) {
 
 		/* skip white space */
 		if (c == ' ' || c == '\t')
 			continue;
 
 		/* look for a keyword followed by "=" */
 		for (q = p; p < fence && (c = *p) && c != '='; p++)
 			;
 		if (c != '=') {
 			error = EINVAL;
 			goto done;
 		}
 
 		kwlen = p - q;
 		newbits = NULL;
 
 		/* lookup flag group name */
 #define	DBG_SET_FLAG_MAJ(S,F)						\
 		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
 			newbits = &tmpflags->pdb_ ## F;
 
 		DBG_SET_FLAG_MAJ("cpu",		CPU);
 		DBG_SET_FLAG_MAJ("csw",		CSW);
 		DBG_SET_FLAG_MAJ("logging",	LOG);
 		DBG_SET_FLAG_MAJ("module",	MOD);
 		DBG_SET_FLAG_MAJ("md", 		MDP);
 		DBG_SET_FLAG_MAJ("owner",	OWN);
 		DBG_SET_FLAG_MAJ("pmc",		PMC);
 		DBG_SET_FLAG_MAJ("process",	PRC);
 		DBG_SET_FLAG_MAJ("sampling", 	SAM);
 
 		if (newbits == NULL) {
 			error = EINVAL;
 			goto done;
 		}
 
 		p++;		/* skip the '=' */
 
 		/* Now parse the individual flags */
 		tmp = 0;
 	newflag:
 		for (q = p; p < fence && (c = *p); p++)
 			if (c == ' ' || c == '\t' || c == ',')
 				break;
 
 		/* p == fence or c == ws or c == "," or c == 0 */
 
 		if ((kwlen = p - q) == 0) {
 			*newbits = tmp;
 			continue;
 		}
 
 		found = 0;
 #define	DBG_SET_FLAG_MIN(S,F)						\
 		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
 			tmp |= found = (1 << PMC_DEBUG_MIN_ ## F)
 
 		/* a '*' denotes all possible flags in the group */
 		if (kwlen == 1 && *q == '*')
 			tmp = found = ~0;
 		/* look for individual flag names */
 		DBG_SET_FLAG_MIN("allocaterow", ALR);
 		DBG_SET_FLAG_MIN("allocate",	ALL);
 		DBG_SET_FLAG_MIN("attach",	ATT);
 		DBG_SET_FLAG_MIN("bind",	BND);
 		DBG_SET_FLAG_MIN("config",	CFG);
 		DBG_SET_FLAG_MIN("exec",	EXC);
 		DBG_SET_FLAG_MIN("exit",	EXT);
 		DBG_SET_FLAG_MIN("find",	FND);
 		DBG_SET_FLAG_MIN("flush",	FLS);
 		DBG_SET_FLAG_MIN("fork",	FRK);
 		DBG_SET_FLAG_MIN("getbuf",	GTB);
 		DBG_SET_FLAG_MIN("hook",	PMH);
 		DBG_SET_FLAG_MIN("init",	INI);
 		DBG_SET_FLAG_MIN("intr",	INT);
 		DBG_SET_FLAG_MIN("linktarget",	TLK);
 		DBG_SET_FLAG_MIN("mayberemove", OMR);
 		DBG_SET_FLAG_MIN("ops",		OPS);
 		DBG_SET_FLAG_MIN("read",	REA);
 		DBG_SET_FLAG_MIN("register",	REG);
 		DBG_SET_FLAG_MIN("release",	REL);
 		DBG_SET_FLAG_MIN("remove",	ORM);
 		DBG_SET_FLAG_MIN("sample",	SAM);
 		DBG_SET_FLAG_MIN("scheduleio",	SIO);
 		DBG_SET_FLAG_MIN("select",	SEL);
 		DBG_SET_FLAG_MIN("signal",	SIG);
 		DBG_SET_FLAG_MIN("swi",		SWI);
 		DBG_SET_FLAG_MIN("swo",		SWO);
 		DBG_SET_FLAG_MIN("start",	STA);
 		DBG_SET_FLAG_MIN("stop",	STO);
 		DBG_SET_FLAG_MIN("syscall",	PMS);
 		DBG_SET_FLAG_MIN("unlinktarget", TUL);
 		DBG_SET_FLAG_MIN("write",	WRI);
 		if (found == 0) {
 			/* unrecognized flag name */
 			error = EINVAL;
 			goto done;
 		}
 
 		if (c == 0 || c == ' ' || c == '\t') {	/* end of flag group */
 			*newbits = tmp;
 			continue;
 		}
 
 		p++;
 		goto newflag;
 	}
 
 	/* save the new flag set */
 	bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags));
 
  done:
 	FREE(tmpflags, M_PMC);
 	return error;
 }
 
 static int
 pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	char *fence, *newstr;
 	int error;
 	unsigned int n;
 
 	(void) arg1; (void) arg2; /* unused parameters */
 
 	n = sizeof(pmc_debugstr);
 	MALLOC(newstr, char *, n, M_PMC, M_ZERO|M_WAITOK);
 	(void) strlcpy(newstr, pmc_debugstr, n);
 
 	error = sysctl_handle_string(oidp, newstr, n, req);
 
 	/* if there is a new string, parse and copy it */
 	if (error == 0 && req->newptr != NULL) {
 		fence = newstr + (n < req->newlen ? n : req->newlen + 1);
 		if ((error = pmc_debugflags_parse(newstr, fence)) == 0)
 			(void) strlcpy(pmc_debugstr, newstr,
 			    sizeof(pmc_debugstr));
 	}
 
 	FREE(newstr, M_PMC);
 
 	return error;
 }
 #endif
 
 /*
  * Concurrency Control
  *
  * The driver manages the following data structures:
  *
  *   - target process descriptors, one per target process
  *   - owner process descriptors (and attached lists), one per owner process
  *   - lookup hash tables for owner and target processes
  *   - PMC descriptors (and attached lists)
  *   - per-cpu hardware state
  *   - the 'hook' variable through which the kernel calls into
  *     this module
  *   - the machine hardware state (managed by the MD layer)
  *
  * These data structures are accessed from:
  *
  * - thread context-switch code
  * - interrupt handlers (possibly on multiple cpus)
  * - kernel threads on multiple cpus running on behalf of user
  *   processes doing system calls
  * - this driver's private kernel threads
  *
  * = Locks and Locking strategy =
  *
  * The driver uses four locking strategies for its operation:
  *
  * - The global SX lock "pmc_sx" is used to protect internal
  *   data structures.
  *
  *   Calls into the module by syscall() start with this lock being
  *   held in exclusive mode.  Depending on the requested operation,
  *   the lock may be downgraded to 'shared' mode to allow more
  *   concurrent readers into the module.  Calls into the module from
  *   other parts of the kernel acquire the lock in shared mode.
  *
  *   This SX lock is held in exclusive mode for any operations that
  *   modify the linkages between the driver's internal data structures.
  *
  *   The 'pmc_hook' function pointer is also protected by this lock.
  *   It is only examined with the sx lock held in exclusive mode.  The
  *   kernel module is allowed to be unloaded only with the sx lock held
  *   in exclusive mode.  In normal syscall handling, after acquiring the
  *   pmc_sx lock we first check that 'pmc_hook' is non-null before
  *   proceeding.  This prevents races between the thread unloading the module
  *   and other threads seeking to use the module.
  *
  * - Lookups of target process structures and owner process structures
  *   cannot use the global "pmc_sx" SX lock because these lookups need
  *   to happen during context switches and in other critical sections
  *   where sleeping is not allowed.  We protect these lookup tables
  *   with their own private spin-mutexes, "pmc_processhash_mtx" and
  *   "pmc_ownerhash_mtx".
  *
  * - Interrupt handlers work in a lock free manner.  At interrupt
  *   time, handlers look at the PMC pointer (phw->phw_pmc) configured
  *   when the PMC was started.  If this pointer is NULL, the interrupt
  *   is ignored after updating driver statistics.  We ensure that this
  *   pointer is set (using an atomic operation if necessary) before the
  *   PMC hardware is started.  Conversely, this pointer is unset atomically
  *   only after the PMC hardware is stopped.
  *
  *   We ensure that everything needed for the operation of an
  *   interrupt handler is available without it needing to acquire any
  *   locks.  We also ensure that a PMC's software state is destroyed only
  *   after the PMC is taken off hardware (on all CPUs).
  *
  * - Context-switch handling with process-private PMCs needs more
  *   care.
  *
  *   A given process may be the target of multiple PMCs.  For example,
  *   PMCATTACH and PMCDETACH may be requested by a process on one CPU
  *   while the target process is running on another.  A PMC could also
  *   be getting released because its owner is exiting.  We tackle
  *   these situations in the following manner:
  *
  *   - each target process structure 'pmc_process' has an array
  *     of 'struct pmc *' pointers, one for each hardware PMC.
  *
  *   - At context switch IN time, each "target" PMC in RUNNING state
  *     gets started on hardware and a pointer to each PMC is copied into
  *     the per-cpu phw array.  The 'runcount' for the PMC is
  *     incremented.
  *
  *   - At context switch OUT time, all process-virtual PMCs are stopped
  *     on hardware.  The saved value is added to the PMCs value field
  *     only if the PMC is in a non-deleted state (the PMCs state could
  *     have changed during the current time slice).
  *
  *     Note that since in-between a switch IN on a processor and a switch
  *     OUT, the PMC could have been released on another CPU.  Therefore
  *     context switch OUT always looks at the hardware state to turn
  *     OFF PMCs and will update a PMC's saved value only if reachable
  *     from the target process record.
  *
  *   - OP PMCRELEASE could be called on a PMC at any time (the PMC could
  *     be attached to many processes at the time of the call and could
  *     be active on multiple CPUs).
  *
  *     We prevent further scheduling of the PMC by marking it as in
  *     state 'DELETED'.  If the runcount of the PMC is non-zero then
  *     this PMC is currently running on a CPU somewhere.  The thread
  *     doing the PMCRELEASE operation waits by repeatedly doing a
  *     pause() till the runcount comes to zero.
  *
  * The contents of a PMC descriptor (struct pmc) are protected using
  * a spin-mutex.  In order to save space, we use a mutex pool.
  *
  * In terms of lock types used by witness(4), we use:
  * - Type "pmc-sx", used by the global SX lock.
  * - Type "pmc-sleep", for sleep mutexes used by logger threads.
  * - Type "pmc-per-proc", for protecting PMC owner descriptors.
  * - Type "pmc-leaf", used for all other spin mutexes.
  */
 
 /*
  * save the cpu binding of the current kthread
  */
 
 static void
 pmc_save_cpu_binding(struct pmc_binding *pb)
 {
 	PMCDBG(CPU,BND,2, "%s", "save-cpu");
 	thread_lock(curthread);
 	pb->pb_bound = sched_is_bound(curthread);
 	pb->pb_cpu   = curthread->td_oncpu;
 	thread_unlock(curthread);
 	PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu);
 }
 
 /*
  * restore the cpu binding of the current thread
  */
 
 static void
 pmc_restore_cpu_binding(struct pmc_binding *pb)
 {
 	PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d",
 	    curthread->td_oncpu, pb->pb_cpu);
 	thread_lock(curthread);
 	if (pb->pb_bound)
 		sched_bind(curthread, pb->pb_cpu);
 	else
 		sched_unbind(curthread);
 	thread_unlock(curthread);
 	PMCDBG(CPU,BND,2, "%s", "restore-cpu done");
 }
 
 /*
  * move execution over the specified cpu and bind it there.
  */
 
 static void
 pmc_select_cpu(int cpu)
 {
 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
 	    ("[pmc,%d] bad cpu number %d", __LINE__, cpu));
 
 	/* never move to a disabled CPU */
 	KASSERT(pmc_cpu_is_disabled(cpu) == 0, ("[pmc,%d] selecting "
 	    "disabled CPU %d", __LINE__, cpu));
 
 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu);
 	thread_lock(curthread);
 	sched_bind(curthread, cpu);
 	thread_unlock(curthread);
 
 	KASSERT(curthread->td_oncpu == cpu,
 	    ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__,
 		cpu, curthread->td_oncpu));
 
 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
 }
 
 /*
  * Force a context switch.
  *
  * We do this by pause'ing for 1 tick -- invoking mi_switch() is not
  * guaranteed to force a context switch.
  */
 
 static void
 pmc_force_context_switch(void)
 {
 
 	pause("pmcctx", 1);
 }
 
 /*
  * Get the file name for an executable.  This is a simple wrapper
  * around vn_fullpath(9).
  */
 
 static void
 pmc_getfilename(struct vnode *v, char **fullpath, char **freepath)
 {
 	struct thread *td;
 
 	td = curthread;
 	*fullpath = "unknown";
 	*freepath = NULL;
-	vn_lock(v, LK_CANRECURSE | LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(v, LK_CANRECURSE | LK_EXCLUSIVE | LK_RETRY);
 	vn_fullpath(td, v, fullpath, freepath);
 	VOP_UNLOCK(v, 0, td);
 }
 
 /*
  * remove an process owning PMCs
  */
 
 void
 pmc_remove_owner(struct pmc_owner *po)
 {
 	struct pmc *pm, *tmp;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(OWN,ORM,1, "remove-owner po=%p", po);
 
 	/* Remove descriptor from the owner hash table */
 	LIST_REMOVE(po, po_next);
 
 	/* release all owned PMC descriptors */
 	LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) {
 		PMCDBG(OWN,ORM,2, "pmc=%p", pm);
 		KASSERT(pm->pm_owner == po,
 		    ("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po));
 
 		pmc_release_pmc_descriptor(pm);	/* will unlink from the list */
 	}
 
 	KASSERT(po->po_sscount == 0,
 	    ("[pmc,%d] SS count not zero", __LINE__));
 	KASSERT(LIST_EMPTY(&po->po_pmcs),
 	    ("[pmc,%d] PMC list not empty", __LINE__));
 
 	/* de-configure the log file if present */
 	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_deconfigure_log(po);
 }
 
 /*
  * remove an owner process record if all conditions are met.
  */
 
 static void
 pmc_maybe_remove_owner(struct pmc_owner *po)
 {
 
 	PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po);
 
 	/*
 	 * Remove owner record if
 	 * - this process does not own any PMCs
 	 * - this process has not allocated a system-wide sampling buffer
 	 */
 
 	if (LIST_EMPTY(&po->po_pmcs) &&
 	    ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) {
 		pmc_remove_owner(po);
 		pmc_destroy_owner_descriptor(po);
 	}
 }
 
 /*
  * Add an association between a target process and a PMC.
  */
 
 static void
 pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
 {
 	int ri;
 	struct pmc_target *pt;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	KASSERT(pm != NULL && pp != NULL,
 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
 	KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
 	    ("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d",
 		__LINE__, pm, pp->pp_proc->p_pid));
 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < ((int) md->pmd_npmc - 1),
 	    ("[pmc,%d] Illegal reference count %d for process record %p",
 		__LINE__, pp->pp_refcnt, (void *) pp));
 
 	ri = PMC_TO_ROWINDEX(pm);
 
 	PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
 	    pm, ri, pp);
 
 #ifdef	DEBUG
 	LIST_FOREACH(pt, &pm->pm_targets, pt_next)
 	    if (pt->pt_process == pp)
 		    KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets",
 				__LINE__, pp, pm));
 #endif
 
 	MALLOC(pt, struct pmc_target *, sizeof(struct pmc_target),
 	    M_PMC, M_ZERO|M_WAITOK);
 
 	pt->pt_process = pp;
 
 	LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next);
 
 	atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc,
 	    (uintptr_t)pm);
 
 	if (pm->pm_owner->po_owner == pp->pp_proc)
 		pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER;
 
 	/*
 	 * Initialize the per-process values at this row index.
 	 */
 	pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ?
 	    pm->pm_sc.pm_reloadcount : 0;
 
 	pp->pp_refcnt++;
 
 }
 
 /*
  * Removes the association between a target process and a PMC.
  */
 
 static void
 pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
 {
 	int ri;
 	struct proc *p;
 	struct pmc_target *ptgt;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	KASSERT(pm != NULL && pp != NULL,
 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
 
 	KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt < (int) md->pmd_npmc,
 	    ("[pmc,%d] Illegal ref count %d on process record %p",
 		__LINE__, pp->pp_refcnt, (void *) pp));
 
 	ri = PMC_TO_ROWINDEX(pm);
 
 	PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
 	    pm, ri, pp);
 
 	KASSERT(pp->pp_pmcs[ri].pp_pmc == pm,
 	    ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__,
 		ri, pm, pp->pp_pmcs[ri].pp_pmc));
 
 	pp->pp_pmcs[ri].pp_pmc = NULL;
 	pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
 
 	/* Remove owner-specific flags */
 	if (pm->pm_owner->po_owner == pp->pp_proc) {
 		pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS;
 		pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER;
 	}
 
 	pp->pp_refcnt--;
 
 	/* Remove the target process from the PMC structure */
 	LIST_FOREACH(ptgt, &pm->pm_targets, pt_next)
 		if (ptgt->pt_process == pp)
 			break;
 
 	KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
 		    "in pmc %p", __LINE__, pp->pp_proc, pp, pm));
 
 	LIST_REMOVE(ptgt, pt_next);
 	FREE(ptgt, M_PMC);
 
 	/* if the PMC now lacks targets, send the owner a SIGIO */
 	if (LIST_EMPTY(&pm->pm_targets)) {
 		p = pm->pm_owner->po_owner;
 		PROC_LOCK(p);
 		psignal(p, SIGIO);
 		PROC_UNLOCK(p);
 
 		PMCDBG(PRC,SIG,2, "signalling proc=%p signal=%d", p,
 		    SIGIO);
 	}
 }
 
 /*
  * Check if PMC 'pm' may be attached to target process 't'.
  */
 
 static int
 pmc_can_attach(struct pmc *pm, struct proc *t)
 {
 	struct proc *o;		/* pmc owner */
 	struct ucred *oc, *tc;	/* owner, target credentials */
 	int decline_attach, i;
 
 	/*
 	 * A PMC's owner can always attach that PMC to itself.
 	 */
 
 	if ((o = pm->pm_owner->po_owner) == t)
 		return 0;
 
 	PROC_LOCK(o);
 	oc = o->p_ucred;
 	crhold(oc);
 	PROC_UNLOCK(o);
 
 	PROC_LOCK(t);
 	tc = t->p_ucred;
 	crhold(tc);
 	PROC_UNLOCK(t);
 
 	/*
 	 * The effective uid of the PMC owner should match at least one
 	 * of the {effective,real,saved} uids of the target process.
 	 */
 
 	decline_attach = oc->cr_uid != tc->cr_uid &&
 	    oc->cr_uid != tc->cr_svuid &&
 	    oc->cr_uid != tc->cr_ruid;
 
 	/*
 	 * Every one of the target's group ids, must be in the owner's
 	 * group list.
 	 */
 	for (i = 0; !decline_attach && i < tc->cr_ngroups; i++)
 		decline_attach = !groupmember(tc->cr_groups[i], oc);
 
 	/* check the read and saved gids too */
 	if (decline_attach == 0)
 		decline_attach = !groupmember(tc->cr_rgid, oc) ||
 		    !groupmember(tc->cr_svgid, oc);
 
 	crfree(tc);
 	crfree(oc);
 
 	return !decline_attach;
 }
 
 /*
  * Attach a process to a PMC.
  */
 
 static int
 pmc_attach_one_process(struct proc *p, struct pmc *pm)
 {
 	int ri;
 	char *fullpath, *freepath;
 	struct pmc_process	*pp;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
 
 	/*
 	 * Locate the process descriptor corresponding to process 'p',
 	 * allocating space as needed.
 	 *
 	 * Verify that rowindex 'pm_rowindex' is free in the process
 	 * descriptor.
 	 *
 	 * If not, allocate space for a descriptor and link the
 	 * process descriptor and PMC.
 	 */
 	ri = PMC_TO_ROWINDEX(pm);
 
 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
 		return ENOMEM;
 
 	if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */
 		return EEXIST;
 
 	if (pp->pp_pmcs[ri].pp_pmc != NULL)
 		return EBUSY;
 
 	pmc_link_target_process(pm, pp);
 
 	if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) &&
 	    (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0)
 		pm->pm_flags |= PMC_F_NEEDS_LOGFILE;
 
 	pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */
 
 	/* issue an attach event to a configured log file */
 	if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) {
 		pmc_getfilename(p->p_textvp, &fullpath, &freepath);
 		pmclog_process_pmcattach(pm, p->p_pid, fullpath);
 		if (freepath)
 			FREE(freepath, M_TEMP);
 		if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 			pmc_log_process_mappings(pm->pm_owner, p);
 	}
 	/* mark process as using HWPMCs */
 	PROC_LOCK(p);
 	p->p_flag |= P_HWPMC;
 	PROC_UNLOCK(p);
 
 	return 0;
 }
 
 /*
  * Attach a process and optionally its children
  */
 
 static int
 pmc_attach_process(struct proc *p, struct pmc *pm)
 {
 	int error;
 	struct proc *top;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
 
 
 	/*
 	 * If this PMC successfully allowed a GETMSR operation
 	 * in the past, disallow further ATTACHes.
 	 */
 
 	if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0)
 		return EPERM;
 
 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
 		return pmc_attach_one_process(p, pm);
 
 	/*
 	 * Traverse all child processes, attaching them to
 	 * this PMC.
 	 */
 
 	sx_slock(&proctree_lock);
 
 	top = p;
 
 	for (;;) {
 		if ((error = pmc_attach_one_process(p, pm)) != 0)
 			break;
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top)
 				goto done;
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 	}
 
 	if (error)
 		(void) pmc_detach_process(top, pm);
 
  done:
 	sx_sunlock(&proctree_lock);
 	return error;
 }
 
 /*
  * Detach a single process 'p' from PMC 'pm'.
  *
  * Returns ESRCH if 'p' has no target descriptor and EINVAL if the
  * descriptor's slot for this PMC's row index does not hold 'pm'.
  * Otherwise the PMC is unlinked from the process and 0 is returned.
  *
  * When the process is no longer targeted by any PMC its descriptor is
  * taken out of the hash table and P_HWPMC is cleared in the struct
  * proc.  The descriptor itself is freed only if 'flags' contains
  * PMC_FLAG_REMOVE.
  */
 
 static int
 pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
 {
 	int ri;
 	struct pmc_process *pp;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	KASSERT(pm != NULL,
 	    ("[pmc,%d] null pm pointer", __LINE__));
 
 	ri = PMC_TO_ROWINDEX(pm);
 
 	PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
 	    pm, ri, p, p->p_pid, p->p_comm, flags);
 
 	pp = pmc_find_process_descriptor(p, 0);
 	if (pp == NULL)
 		return ESRCH;
 
 	if (pp->pp_pmcs[ri].pp_pmc != pm)
 		return EINVAL;
 
 	pmc_unlink_target_process(pm, pp);
 
 	/* Emit a detach record when the owner has a log file. */
 	if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_process_pmcdetach(pm, p->p_pid);
 
 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
 	    ("[pmc,%d] Illegal refcnt %d for process struct %p",
 		__LINE__, pp->pp_refcnt, pp));
 
 	if (pp->pp_refcnt == 0) {
 		/* last PMC gone: retire the descriptor */
 		pmc_remove_process_descriptor(pp);
 
 		if (flags & PMC_FLAG_REMOVE)
 			FREE(pp, M_PMC);
 
 		PROC_LOCK(p);
 		p->p_flag &= ~P_HWPMC;
 		PROC_UNLOCK(p);
 	}
 
 	return 0;
 }
 
 /*
  * Detach a process and optionally its descendants from a PMC.
  */
 
 static int
 pmc_detach_process(struct proc *p, struct pmc *pm)
 {
 	struct proc *top;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
 
 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
 		return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
 
 	/*
 	 * Traverse all children, detaching them from this PMC.  We
 	 * ignore errors since we could be detaching a PMC from a
 	 * partially attached proc tree.
 	 */
 
 	sx_slock(&proctree_lock);
 
 	top = p;
 
 	for (;;) {
 		(void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
 
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top)
 				goto done;
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 	}
 
  done:
 	sx_sunlock(&proctree_lock);
 
 	if (LIST_EMPTY(&pm->pm_targets))
 		pm->pm_flags &= ~PMC_F_ATTACH_DONE;
 
 	return 0;
 }
 
 
 /*
  * Thread context switch IN
  *
  * For the process owning thread 'td', load every attached PMC that is
  * in state PMC_STATE_RUNNING onto the current CPU: bump its run count,
  * configure the hardware row, write the saved count and start it.
  * Nothing is done if the process has no target descriptor.
  */
 
 static void
 pmc_process_csw_in(struct thread *td)
 {
 	int cpu;
 	unsigned int ri;
 	struct pmc *pm;
 	struct proc *p;
 	struct pmc_cpu *pc;
 	struct pmc_hw *phw;
 	struct pmc_process *pp;
 	pmc_value_t newvalue;
 
 	p = td->td_proc;
 
 	/* processes that are not PMC targets need no work */
 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL)
 		return;
 
 	KASSERT(pp->pp_proc == td->td_proc,
 	    ("[pmc,%d] not my thread state", __LINE__));
 
 	critical_enter(); /* no preemption from this point */
 
 	cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
 
 	PMCDBG(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
 	    p->p_pid, p->p_comm, pp);
 
 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
 	    ("[pmc,%d] wierd CPU id %d", __LINE__, cpu));
 
 	pc = pmc_pcpu[cpu];
 
 	/* visit every hardware row; 'ri' doubles as the pp_pmcs[] index */
 	for (ri = 0; ri < md->pmd_npmc; ri++) {
 
 		if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
 			continue;
 
 		KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
 		    ("[pmc,%d] Target PMC in non-virtual mode (%d)",
 			__LINE__, PMC_TO_MODE(pm)));
 
 		KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 		    ("[pmc,%d] Row index mismatch pmc %d != ri %d",
 			__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 		/*
 		 * Only PMCs that are marked as 'RUNNING' need
 		 * be placed on hardware.
 		 */
 
 		if (pm->pm_state != PMC_STATE_RUNNING)
 			continue;
 
 		/*
 		 * increment PMC runcount; pmc_wait_for_pmc_idle() spins
 		 * until this count returns to zero.
 		 */
 		atomic_add_rel_32(&pm->pm_runcount, 1);
 
 		/* configure the HWPMC we are going to use. */
 		md->pmd_config_pmc(cpu, ri, pm);
 
 		phw = pc->pc_hwpmcs[ri];
 
 		KASSERT(phw != NULL,
 		    ("[pmc,%d] null hw pointer", __LINE__));
 
 		KASSERT(phw->phw_pmc == pm,
 		    ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__,
 			phw->phw_pmc, pm));
 
 		/*
 		 * Write out saved value and start the PMC.
 		 *
 		 * Sampling PMCs use a per-process value, while
 		 * counting mode PMCs use a per-pmc value that is
 		 * inherited across descendants.
 		 *
 		 * The value is also remembered in PMC_PCPU_SAVED() so
 		 * that switch-out can compute the increment.
 		 */
 		if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
 			mtx_pool_lock_spin(pmc_mtxpool, pm);
 			newvalue = PMC_PCPU_SAVED(cpu,ri) =
 			    pp->pp_pmcs[ri].pp_pmcval;
 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
 		} else {
 			KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC,
 			    ("[pmc,%d] illegal mode=%d", __LINE__,
 			    PMC_TO_MODE(pm)));
 			mtx_pool_lock_spin(pmc_mtxpool, pm);
 			newvalue = PMC_PCPU_SAVED(cpu, ri) =
 			    pm->pm_gv.pm_savedvalue;
 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
 		}
 
 		PMCDBG(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue);
 
 		md->pmd_write_pmc(cpu, ri, newvalue);
 		md->pmd_start_pmc(cpu, ri);
 	}
 
 	/*
 	 * perform any other architecture/cpu dependent thread
 	 * switch-in actions.
 	 */
 
 	(void) (*md->pmd_switch_in)(pc, pp);
 
 	critical_exit();
 
 }
 
 /*
  * Thread context switch OUT.
  */
 
 static void
 pmc_process_csw_out(struct thread *td)
 {
 	int cpu;
 	enum pmc_mode mode;
 	unsigned int ri;
 	struct pmc *pm;
 	struct proc *p;
 	struct pmc_cpu *pc;
 	struct pmc_process *pp;
 	int64_t tmp;
 	pmc_value_t newvalue;
 
 	/*
 	 * Locate our process descriptor; this may be NULL if
 	 * this process is exiting and we have already removed
 	 * the process from the target process table.
 	 *
 	 * Note that due to kernel preemption, multiple
 	 * context switches may happen while the process is
 	 * exiting.
 	 *
 	 * Note also that if the target process cannot be
 	 * found we still need to deconfigure any PMCs that
 	 * are currently running on hardware.
 	 */
 
 	p = td->td_proc;
 	pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE);
 
 	/*
 	 * save PMCs
 	 */
 
 	critical_enter();
 
 	cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
 
 	PMCDBG(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
 	    p->p_pid, p->p_comm, pp);
 
 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
 	    ("[pmc,%d wierd CPU id %d", __LINE__, cpu));
 
 	pc = pmc_pcpu[cpu];
 
 	/*
 	 * When a PMC gets unlinked from a target PMC, it will
 	 * be removed from the target's pp_pmc[] array.
 	 *
 	 * However, on a MP system, the target could have been
 	 * executing on another CPU at the time of the unlink.
 	 * So, at context switch OUT time, we need to look at
 	 * the hardware to determine if a PMC is scheduled on
 	 * it.
 	 */
 
 	for (ri = 0; ri < md->pmd_npmc; ri++) {
 
 		pm = NULL;
 		(void) (*md->pmd_get_config)(cpu, ri, &pm);
 
 		if (pm == NULL)	/* nothing at this row index */
 			continue;
 
 		mode = PMC_TO_MODE(pm);
 		if (!PMC_IS_VIRTUAL_MODE(mode))
 			continue; /* not a process virtual PMC */
 
 		KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 		    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
 			__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 		/* Stop hardware if not already stopped */
 		if (pm->pm_stalled == 0)
 			md->pmd_stop_pmc(cpu, ri);
 
 		/* reduce this PMC's runcount */
 		atomic_subtract_rel_32(&pm->pm_runcount, 1);
 
 		/*
 		 * If this PMC is associated with this process,
 		 * save the reading.
 		 */
 
 		if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) {
 
 			KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
 			    ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__,
 				pm, ri, pp->pp_pmcs[ri].pp_pmc));
 
 			KASSERT(pp->pp_refcnt > 0,
 			    ("[pmc,%d] pp refcnt = %d", __LINE__,
 				pp->pp_refcnt));
 
 			md->pmd_read_pmc(cpu, ri, &newvalue);
 
 			tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
 
 			PMCDBG(CSW,SWI,1,"cpu=%d ri=%d tmp=%jd", cpu, ri,
 			    tmp);
 
 			if (mode == PMC_MODE_TS) {
 
 				/*
 				 * For sampling process-virtual PMCs,
 				 * we expect the count to be
 				 * decreasing as the 'value'
 				 * programmed into the PMC is the
 				 * number of events to be seen till
 				 * the next sampling interrupt.
 				 */
 				if (tmp < 0)
 					tmp += pm->pm_sc.pm_reloadcount;
 				mtx_pool_lock_spin(pmc_mtxpool, pm);
 				pp->pp_pmcs[ri].pp_pmcval -= tmp;
 				if ((int64_t) pp->pp_pmcs[ri].pp_pmcval < 0)
 					pp->pp_pmcs[ri].pp_pmcval +=
 					    pm->pm_sc.pm_reloadcount;
 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
 
 			} else {
 
 				/*
 				 * For counting process-virtual PMCs,
 				 * we expect the count to be
 				 * increasing monotonically, modulo a 64
 				 * bit wraparound.
 				 */
 				KASSERT((int64_t) tmp >= 0,
 				    ("[pmc,%d] negative increment cpu=%d "
 				     "ri=%d newvalue=%jx saved=%jx "
 				     "incr=%jx", __LINE__, cpu, ri,
 				     newvalue, PMC_PCPU_SAVED(cpu,ri), tmp));
 
 				mtx_pool_lock_spin(pmc_mtxpool, pm);
 				pm->pm_gv.pm_savedvalue += tmp;
 				pp->pp_pmcs[ri].pp_pmcval += tmp;
 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
 
 				if (pm->pm_flags & PMC_F_LOG_PROCCSW)
 					pmclog_process_proccsw(pm, pp, tmp);
 			}
 		}
 
 		/* mark hardware as free */
 		md->pmd_config_pmc(cpu, ri, NULL);
 	}
 
 	/*
 	 * perform any other architecture/cpu dependent thread
 	 * switch out functions.
 	 */
 
 	(void) (*md->pmd_switch_out)(pc, pp);
 
 	critical_exit();
 }
 
 /*
  * Log a KLD operation.
  */
 
 static void
 pmc_process_kld_load(struct pmckern_map_in *pkm)
 {
 	struct pmc_owner *po;
 
 	sx_assert(&pmc_sx, SX_LOCKED);
 
 	/*
 	 * Notify owners of system sampling PMCs about KLD operations.
 	 */
 
 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 	    	pmclog_process_map_in(po, (pid_t) -1, pkm->pm_address,
 		    (char *) pkm->pm_file);
 
 	/*
 	 * TODO: Notify owners of (all) process-sampling PMCs too.
 	 */
 
 	return;
 }
 
 static void
 pmc_process_kld_unload(struct pmckern_map_out *pkm)
 {
 	struct pmc_owner *po;
 
 	sx_assert(&pmc_sx, SX_LOCKED);
 
 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_process_map_out(po, (pid_t) -1,
 		    pkm->pm_address, pkm->pm_address + pkm->pm_size);
 
 	/*
 	 * TODO: Notify owners of process-sampling PMCs.
 	 */
 }
 
 /*
  * A mapping change for a process.
  */
 
 static void
 pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
 {
 	int ri;
 	pid_t pid;
 	char *fullpath, *freepath;
 	const struct pmc *pm;
 	struct pmc_owner *po;
 	const struct pmc_process *pp;
 
 	freepath = fullpath = NULL;
 	pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
 
 	pid = td->td_proc->p_pid;
 
 	/* Inform owners of all system-wide sampling PMCs. */
 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_process_map_in(po, pid, pkm->pm_address, fullpath);
 
 	if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
 		goto done;
 
 	/*
 	 * Inform sampling PMC owners tracking this process.
 	 */
 	for (ri = 0; ri < md->pmd_npmc; ri++)
 		if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
 		    PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 			pmclog_process_map_in(pm->pm_owner,
 			    pid, pkm->pm_address, fullpath);
 
   done:
 	if (freepath)
 		FREE(freepath, M_TEMP);
 }
 
 
 /*
  * Log an munmap request.
  */
 
 static void
 pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
 {
 	int ri;
 	pid_t pid;
 	struct pmc_owner *po;
 	const struct pmc *pm;
 	const struct pmc_process *pp;
 
 	pid = td->td_proc->p_pid;
 
 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_process_map_out(po, pid, pkm->pm_address,
 		    pkm->pm_address + pkm->pm_size);
 
 	if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
 		return;
 
 	for (ri = 0; ri < md->pmd_npmc; ri++)
 		if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
 		    PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 			pmclog_process_map_out(pm->pm_owner, pid,
 			    pkm->pm_address, pkm->pm_address + pkm->pm_size);
 }
 
 /*
  * Log mapping information about the kernel.
  *
  * Emits one map-in record (pid -1) to the owner of sampling PMC 'pm'
  * for each entry returned by linker_hwpmc_list_objects().  This is
  * done at most once per owner; PMC_PO_INITIAL_MAPPINGS_DONE records
  * that it has happened.
  */
 
 static void
 pmc_log_kernel_mappings(struct pmc *pm)
 {
 	struct pmc_owner *po;
 	struct pmckern_map_in *km, *kmbase;
 
 	sx_assert(&pmc_sx, SX_LOCKED);
 	KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
 	    ("[pmc,%d] non-sampling PMC (%p) desires mapping information",
 		__LINE__, (void *) pm));
 
 	po = pm->pm_owner;
 	if ((po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE) != 0)
 		return;		/* already logged for this owner */
 
 	/* The list ends at the first entry with a NULL pm_file. */
 	kmbase = linker_hwpmc_list_objects();
 	km = kmbase;
 	while (km->pm_file != NULL) {
 		PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file,
 		    (void *) km->pm_address);
 		pmclog_process_map_in(po, (pid_t) -1, km->pm_address,
 		    km->pm_file);
 		km++;
 	}
 	FREE(kmbase, M_LINKER);
 
 	po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE;
 }
 
 /*
  * Log the mappings for a single process.
  *
  * Currently an empty placeholder: neither 'po' nor 'p' is used and
  * nothing is logged.
  */
 
 static void
 pmc_log_process_mappings(struct pmc_owner *po, struct proc *p)
 {
 	/* intentionally empty */
 }
 
 /*
  * Log mappings for all processes in the system.
  *
  * Starts at the process with pid 1 and calls
  * pmc_log_process_mappings() on it and on each of its descendants,
  * walking the process tree in pre-order.  Panics if pid 1 cannot be
  * found.
  */
 
 static void
 pmc_log_all_process_mappings(struct pmc_owner *po)
 {
 	struct proc *p, *top;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	p = pfind(1);
 	if (p == NULL)
 		panic("[pmc,%d] Cannot find init", __LINE__);
 
 	/* the tree walk below runs under proctree_lock instead */
 	PROC_UNLOCK(p);
 
 	sx_slock(&proctree_lock);
 
 	top = p;
 	for (;;) {
 		pmc_log_process_mappings(po, p);
 
 		/* descend to the first child, if any */
 		if (!LIST_EMPTY(&p->p_children)) {
 			p = LIST_FIRST(&p->p_children);
 			continue;
 		}
 
 		/* leaf: climb until an unvisited sibling turns up */
 		while (p != top && LIST_NEXT(p, p_sibling) == NULL)
 			p = p->p_pptr;
 		if (p == top)
 			break;
 		p = LIST_NEXT(p, p_sibling);
 	}
 
 	sx_sunlock(&proctree_lock);
 }
 
 /*
  * The 'hook' invoked from the kernel proper
  */
 
 
 #ifdef	DEBUG
 /*
  * Printable names for the hook functions, used by the debug trace in
  * pmc_hook_handler(), which indexes this table with the hook's
  * function code.  Slot 0 is an empty string.
  */
 const char *pmc_hooknames[] = {
 	/* these strings correspond to PMC_FN_* in <sys/pmckern.h> */
 	"",
 	"EXEC",
 	"CSW-IN",
 	"CSW-OUT",
 	"SAMPLE",
 	"KLDLOAD",
 	"KLDUNLOAD",
 	"MMAP",
 	"MUNMAP",
 	"CALLCHAIN"
 };
 #endif
 
 /*
  * Dispatch a hook call from the kernel proper.
  *
  * 'function' selects the PMC_FN_* operation and 'arg' is its
  * operation-specific argument (cast per case below).  Unknown
  * function codes are ignored (and trip a KASSERT under DEBUG).
  * Always returns 0.
  */
 static int
 pmc_hook_handler(struct thread *td, int function, void *arg)
 {
 
 	/*
 	 * NOTE(review): pmc_hooknames[] only exists under DEBUG, so this
 	 * trace presumably compiles away otherwise.  It indexes the table
 	 * with 'function' before the switch below has validated it --
 	 * confirm callers only pass known PMC_FN_* values.
 	 */
 	PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function,
 	    pmc_hooknames[function], arg);
 
 	switch (function)
 	{
 
 	/*
 	 * Process exec()
 	 */
 
 	case PMC_FN_PROCESS_EXEC:
 	{
 		char *fullpath, *freepath;
 		unsigned int ri;
 		int is_using_hwpmcs;
 		struct pmc *pm;
 		struct proc *p;
 		struct pmc_owner *po;
 		struct pmc_process *pp;
 		struct pmckern_procexec *pk;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		p = td->td_proc;
 		/* presumably sets both path pointers; they are not pre-initialized here */
 		pmc_getfilename(p->p_textvp, &fullpath, &freepath);
 
 		pk = (struct pmckern_procexec *) arg;
 
 		/* Inform owners of SS mode PMCs of the exec event. */
 		LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 		    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 			    pmclog_process_procexec(po, PMC_ID_INVALID,
 				p->p_pid, pk->pm_entryaddr, fullpath);
 
 		PROC_LOCK(p);
 		is_using_hwpmcs = p->p_flag & P_HWPMC;
 		PROC_UNLOCK(p);
 
 		/* process is neither an owner nor a target: nothing else to do */
 		if (!is_using_hwpmcs) {
 			if (freepath)
 				FREE(freepath, M_TEMP);
 			break;
 		}
 
 		/*
 		 * PMCs are not inherited across an exec():  remove any
 		 * PMCs that this process is the owner of.
 		 */
 
 		if ((po = pmc_find_owner_descriptor(p)) != NULL) {
 			pmc_remove_owner(po);
 			pmc_destroy_owner_descriptor(po);
 		}
 
 		/*
 		 * If the process being exec'ed is not the target of any
 		 * PMC, we are done.
 		 */
 		if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) {
 			if (freepath)
 				FREE(freepath, M_TEMP);
 			break;
 		}
 
 		/*
 		 * Log the exec event to all monitoring owners.  Skip
 		 * owners who have already received the event because
 		 * they had system sampling PMCs active.
 		 */
 		for (ri = 0; ri < md->pmd_npmc; ri++)
 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
 				po = pm->pm_owner;
 				if (po->po_sscount == 0 &&
 				    po->po_flags & PMC_PO_OWNS_LOGFILE)
 					pmclog_process_procexec(po, pm->pm_id,
 					    p->p_pid, pk->pm_entryaddr,
 					    fullpath);
 			}
 
 		/* the path is not needed past this point */
 		if (freepath)
 			FREE(freepath, M_TEMP);
 
 
 		PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d",
 		    p, p->p_pid, p->p_comm, pk->pm_credentialschanged);
 
 		if (pk->pm_credentialschanged == 0) /* no change */
 			break;
 
 		/*
 		 * If the newly exec()'ed process has a different credential
 		 * than before, allow it to be the target of a PMC only if
 		 * the PMC's owner has sufficient privilege.
 		 */
 
 		for (ri = 0; ri < md->pmd_npmc; ri++)
 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL)
 				if (pmc_can_attach(pm, td->td_proc) != 0)
 					pmc_detach_one_process(td->td_proc,
 					    pm, PMC_FLAG_NONE);
 
 		KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
 		    ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__,
 			pp->pp_refcnt, pp));
 
 		/*
 		 * If this process is no longer the target of any
 		 * PMCs, we can remove the process entry and free
 		 * up space.
 		 *
 		 * NOTE(review): pmc_detach_one_process() already takes
 		 * the descriptor out of the hash table when its count
 		 * reaches zero (it only skips the FREE because
 		 * PMC_FLAG_REMOVE is not passed), so the removal below
 		 * looks like a second LIST_REMOVE of the same entry --
 		 * confirm this is benign.
 		 */
 
 		if (pp->pp_refcnt == 0) {
 			pmc_remove_process_descriptor(pp);
 			FREE(pp, M_PMC);
 			break;
 		}
 
 	}
 	break;
 
 	case PMC_FN_CSW_IN:
 		pmc_process_csw_in(td);
 		break;
 
 	case PMC_FN_CSW_OUT:
 		pmc_process_csw_out(td);
 		break;
 
 	/*
 	 * Process accumulated PC samples.
 	 *
 	 * This function is expected to be called by hardclock() for
 	 * each CPU that has accumulated PC samples.
 	 *
 	 * This function is to be executed on the CPU whose samples
 	 * are being processed.
 	 */
 	case PMC_FN_DO_SAMPLES:
 
 		/*
 		 * Clear the cpu specific bit in the CPU mask before
 		 * doing the rest of the processing.  If the NMI handler
 		 * gets invoked after the "atomic_clear_int()" call
 		 * below but before "pmc_process_samples()" gets
 		 * around to processing the interrupt, then we will
 		 * come back here at the next hardclock() tick (and
 		 * may find nothing to do if "pmc_process_samples()"
 		 * had already processed the interrupt).  We don't
 		 * lose the interrupt sample.
 		 */
 		atomic_clear_int(&pmc_cpumask, (1 << PCPU_GET(cpuid)));
 		pmc_process_samples(PCPU_GET(cpuid));
 		break;
 
 
 	/* KLD and mapping events: 'arg' carries the map-in/map-out record */
 	case PMC_FN_KLD_LOAD:
 		sx_assert(&pmc_sx, SX_LOCKED);
 		pmc_process_kld_load((struct pmckern_map_in *) arg);
 		break;
 
 	case PMC_FN_KLD_UNLOAD:
 		sx_assert(&pmc_sx, SX_LOCKED);
 		pmc_process_kld_unload((struct pmckern_map_out *) arg);
 		break;
 
 	case PMC_FN_MMAP:
 		sx_assert(&pmc_sx, SX_LOCKED);
 		pmc_process_mmap(td, (struct pmckern_map_in *) arg);
 		break;
 
 	case PMC_FN_MUNMAP:
 		sx_assert(&pmc_sx, SX_LOCKED);
 		pmc_process_munmap(td, (struct pmckern_map_out *) arg);
 		break;
 
 	case PMC_FN_USER_CALLCHAIN:
 		/*
 		 * Record a call chain.
 		 */
 		pmc_capture_user_callchain(PCPU_GET(cpuid),
 		    (struct trapframe *) arg);
 		break;
 
 	default:
 #ifdef	DEBUG
 		KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
 #endif
 		break;
 
 	}
 
 	return 0;
 }
 
 /*
  * Allocate a 'struct pmc_owner' descriptor for process 'p', initialize
  * it and insert it into the owner hash table.  Returns the new
  * descriptor.
  */
 
 static struct pmc_owner *
 pmc_allocate_owner_descriptor(struct proc *p)
 {
 	uint32_t hindex;
 	struct pmc_owner *po;
 	struct pmc_ownerhash *poh;
 
 	/* hash bucket this owner belongs to */
 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
 	poh = &pmc_ownerhash[hindex];
 
 	/* one zero-filled descriptor; the request may sleep (M_WAITOK) */
 	MALLOC(po, struct pmc_owner *, sizeof(struct pmc_owner),
 	    M_PMC, M_ZERO|M_WAITOK);
 
 	po->po_flags = 0;
 	po->po_error = 0;
 	po->po_sscount = 0;
 	po->po_file = NULL;
 	po->po_owner = p;
 	po->po_kthread = NULL;
 	LIST_INIT(&po->po_pmcs);
 
 	/* publish in the hash table */
 	LIST_INSERT_HEAD(poh, po, po_next);
 
 	TAILQ_INIT(&po->po_logbuffers);
 	mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN);
 
 	PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p",
 	    p, p->p_pid, p->p_comm, po);
 
 	return po;
 }
 
 /*
  * Destroy owner descriptor 'po': tear down its mutex and free its
  * memory.  The descriptor is not unlinked from any list here.
  */
 static void
 pmc_destroy_owner_descriptor(struct pmc_owner *po)
 {
 
 	PMCDBG(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)",
 	    po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm);
 
 	mtx_destroy(&po->po_mtx);
 	FREE(po, M_PMC);
 }
 
 /*
  * Find the target descriptor for process 'p' in the process hash
  * table.
  *
  * 'mode' may contain:
  *   PMC_FLAG_REMOVE	a descriptor that is found is also unhashed
  *			(but not freed);
  *   PMC_FLAG_ALLOCATE	if none is found, a new zero-filled descriptor
  *			is hashed and returned.
  *
  * Returns the descriptor, or NULL if there is none and none was
  * allocated.
  */
 
 static struct pmc_process *
 pmc_find_process_descriptor(struct proc *p, uint32_t mode)
 {
 	uint32_t hindex;
 	struct pmc_process *pp, *ppnew;
 	struct pmc_processhash *pph;
 
 	hindex = PMC_HASH_PTR(p, pmc_processhashmask);
 	pph = &pmc_processhash[hindex];
 
 	/*
 	 * The candidate descriptor, with room for 'pmd_npmc' target
 	 * state entries, has to be allocated up front: malloc(9) may
 	 * not be called once the spin lock is held.
 	 */
 	ppnew = NULL;
 	if (mode & PMC_FLAG_ALLOCATE) {
 		MALLOC(ppnew, struct pmc_process *,
 		    sizeof(struct pmc_process) + md->pmd_npmc *
 		    sizeof(struct pmc_targetstate), M_PMC, M_ZERO|M_WAITOK);
 	}
 
 	mtx_lock_spin(&pmc_processhash_mtx);
 
 	LIST_FOREACH(pp, pph, pp_next) {
 		if (pp->pp_proc == p)
 			break;
 	}
 
 	if (pp != NULL) {
 		if (mode & PMC_FLAG_REMOVE)
 			LIST_REMOVE(pp, pp_next);
 	} else if ((mode & PMC_FLAG_ALLOCATE) && ppnew != NULL) {
 		/* no existing entry: hash the pre-allocated one */
 		ppnew->pp_proc = p;
 		LIST_INSERT_HEAD(pph, ppnew, pp_next);
 		pp = ppnew;
 		ppnew = NULL;
 	}
 
 	mtx_unlock_spin(&pmc_processhash_mtx);
 
 	/* an existing entry was found: the spare is not needed */
 	if (pp != NULL && ppnew != NULL)
 		FREE(ppnew, M_PMC);
 
 	return pp;
 }
 
 /*
  * remove a process descriptor from the process hash table.
  *
  * The descriptor must no longer be referenced by any PMC
  * (pp_refcnt == 0).  It is only unlinked, not freed.
  */
 
 static void
 pmc_remove_process_descriptor(struct pmc_process *pp)
 {
 	KASSERT(pp->pp_refcnt == 0,
 	    ("[pmc,%d] Removing process descriptor %p with count %d",
 		__LINE__, pp, pp->pp_refcnt));
 
 	/* the hash chains are protected by the process-hash spin mutex */
 	mtx_lock_spin(&pmc_processhash_mtx);
 	LIST_REMOVE(pp, pp_next);
 	mtx_unlock_spin(&pmc_processhash_mtx);
 }
 
 
 /*
  * find an owner descriptor corresponding to proc 'p'
  */
 
 static struct pmc_owner *
 pmc_find_owner_descriptor(struct proc *p)
 {
 	uint32_t hindex;
 	struct pmc_owner *po;
 	struct pmc_ownerhash *poh;
 
 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
 	poh = &pmc_ownerhash[hindex];
 
 	po = NULL;
 	LIST_FOREACH(po, poh, po_next)
 	    if (po->po_owner == p)
 		    break;
 
 	PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> "
 	    "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po);
 
 	return po;
 }
 
 /*
  * pmc_allocate_pmc_descriptor
  *
  * Allocate a zero-filled pmc descriptor with no owner and an empty
  * target list, and return it.
  */
 
 static struct pmc *
 pmc_allocate_pmc_descriptor(void)
 {
 	struct pmc *pm;
 
 	MALLOC(pm, struct pmc *, sizeof(struct pmc), M_PMC, M_ZERO|M_WAITOK);
 
 	if (pm != NULL) {
 		LIST_INIT(&pm->pm_targets);
 		pm->pm_owner = NULL;
 	}
 
 	PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pm);
 
 	return pm;
 }
 
 /*
  * Destroy a pmc descriptor.
  *
  * This performs no teardown of its own and does not free 'pm'; under
  * DEBUG it only checks that the descriptor is in a destroyable state
  * (deleted or free, no targets, no owner, zero run count).
  */
 
 static void
 pmc_destroy_pmc_descriptor(struct pmc *pm)
 {
 	(void) pm;	/* 'pm' is otherwise unused without DEBUG */
 
 #ifdef	DEBUG
 	KASSERT(pm->pm_state == PMC_STATE_DELETED ||
 	    pm->pm_state == PMC_STATE_FREE,
 	    ("[pmc,%d] destroying non-deleted PMC", __LINE__));
 	KASSERT(LIST_EMPTY(&pm->pm_targets),
 	    ("[pmc,%d] destroying pmc with targets", __LINE__));
 	KASSERT(pm->pm_owner == NULL,
 	    ("[pmc,%d] destroying pmc attached to an owner", __LINE__));
 	KASSERT(pm->pm_runcount == 0,
 	    ("[pmc,%d] pmc has non-zero run count %d", __LINE__,
 		pm->pm_runcount));
 #endif
 }
 
 /*
  * Wait until PMC 'pm' is no longer in use on any CPU, i.e. until its
  * run count has dropped to zero, forcing a context switch between
  * polls.  Under DEBUG the number of polls is bounded by an assertion.
  */
 static void
 pmc_wait_for_pmc_idle(struct pmc *pm)
 {
 #ifdef	DEBUG
 	volatile int maxloop;
 
 	maxloop = 100 * mp_ncpus;
 #endif
 
 	for (;;) {
 		if (atomic_load_acq_32(&pm->pm_runcount) == 0)
 			break;		/* idle everywhere */
 #ifdef	DEBUG
 		maxloop--;
 		KASSERT(maxloop > 0,
 		    ("[pmc,%d] (ri%d, rc%d) waiting too long for "
 			"pmc to be free", __LINE__,
 			PMC_TO_ROWINDEX(pm), pm->pm_runcount));
 #endif
 		pmc_force_context_switch();
 	}
 }
 
 /*
  * This function does the following things:
  *
  *  - detaches the PMC from hardware
  *  - unlinks all target threads that were attached to it
  *  - removes the PMC from its owner's list
  *  - destroys the PMC private mutex
  *
  * Once this function completes, the given pmc pointer can be safely
  * FREE'd by the caller.
  */
 
 static void
 pmc_release_pmc_descriptor(struct pmc *pm)
 {
 	u_int ri, cpu;
 	enum pmc_mode mode;
 	struct pmc_hw *phw;
 	struct pmc_owner *po;
 	struct pmc_process *pp;
 	struct pmc_target *ptgt, *tmp;
 	struct pmc_binding pb;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
 
 	ri   = PMC_TO_ROWINDEX(pm);
 	mode = PMC_TO_MODE(pm);
 
 	PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
 	    mode);
 
 	/*
 	 * First, we take the PMC off hardware.
 	 */
 	cpu = 0;	/* used for pmd_release_pmc() if neither branch below runs */
 	if (PMC_IS_SYSTEM_MODE(mode)) {
 
 		/*
 		 * A system mode PMC runs on a specific CPU.  Switch
 		 * to this CPU and turn hardware off.
 		 */
 		pmc_save_cpu_binding(&pb);
 
 		cpu = PMC_TO_CPU(pm);
 
 		pmc_select_cpu(cpu);
 
 		/* switch off non-stalled CPUs */
 		if (pm->pm_state == PMC_STATE_RUNNING &&
 		    pm->pm_stalled == 0) {
 
 			phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
 
 			KASSERT(phw->phw_pmc == pm,
 			    ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
 				__LINE__, ri, phw->phw_pmc, pm));
 			PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);
 
 			critical_enter();
 			md->pmd_stop_pmc(cpu, ri);
 			critical_exit();
 		}
 
 		PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);
 
 		critical_enter();
 		md->pmd_config_pmc(cpu, ri, NULL);
 		critical_exit();
 
 		/*
 		 * adjust the global and process count of SS mode PMCs;
 		 * this is only done for a PMC that is still RUNNING.
 		 */
 		if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) {
 			po = pm->pm_owner;
 			po->po_sscount--;
 			if (po->po_sscount == 0) {
 				atomic_subtract_rel_int(&pmc_ss_count, 1);
 				LIST_REMOVE(po, po_ssnext);
 			}
 		}
 
 		pm->pm_state = PMC_STATE_DELETED;
 
 		pmc_restore_cpu_binding(&pb);
 
 		/*
 		 * We could have references to this PMC structure in
 		 * the per-cpu sample queues.  Wait for the queue to
 		 * drain.
 		 */
 		pmc_wait_for_pmc_idle(pm);
 
 	} else if (PMC_IS_VIRTUAL_MODE(mode)) {
 
 		/*
 		 * A virtual PMC could be running on multiple CPUs at
 		 * a given instant.
 		 *
 		 * By marking its state as DELETED, we ensure that
 		 * this PMC is never further scheduled on hardware.
 		 *
 		 * Then we wait till all CPUs are done with this PMC.
 		 */
 		pm->pm_state = PMC_STATE_DELETED;
 
 
 		/* Wait for the PMCs runcount to come to zero. */
 		pmc_wait_for_pmc_idle(pm);
 
 		/*
 		 * At this point the PMC is off all CPUs and cannot be
 		 * freshly scheduled onto a CPU.  It is now safe to
 		 * unlink all targets from this PMC.  If a
 		 * process-record's refcount falls to zero, we remove
 		 * it from the hash table.  The module-wide SX lock
 		 * protects us from races.
 		 */
 		LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
 			pp = ptgt->pt_process;
 			pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */
 
 			PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);
 
 			/*
 			 * If the target process record shows that no
 			 * PMCs are attached to it, reclaim its space.
 			 */
 
 			if (pp->pp_refcnt == 0) {
 				pmc_remove_process_descriptor(pp);
 				FREE(pp, M_PMC);
 			}
 		}
 
 		cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */
 
 	}
 
 	/*
 	 * Release any MD resources
 	 */
 
 	(void) md->pmd_release_pmc(cpu, ri, pm);
 
 	/*
 	 * Update row disposition
 	 */
 
 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
 		PMC_UNMARK_ROW_STANDALONE(ri);
 	else
 		PMC_UNMARK_ROW_THREAD(ri);
 
 	/* unlink from the owner's list */
 	if (pm->pm_owner) {
 		LIST_REMOVE(pm, pm_next);
 		pm->pm_owner = NULL;
 	}
 
 	/* sanity checks only; the caller frees 'pm' */
 	pmc_destroy_pmc_descriptor(pm);
 }
 
 /*
  * Register process 'p' as the owner of PMC 'pmc'.
  *
  * An owner descriptor is created for 'p' if it does not have one yet.
  * The PMC is put on the owner's list, P_HWPMC is set in the struct
  * proc and, if the owner has a log file, the allocation is logged.
  *
  * Returns 0, or ENOMEM if no owner descriptor could be obtained.
  */
 
 static int
 pmc_register_owner(struct proc *p, struct pmc *pmc)
 {
 	struct pmc_owner *po;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	po = pmc_find_owner_descriptor(p);
 	if (po == NULL) {
 		po = pmc_allocate_owner_descriptor(p);
 		if (po == NULL)
 			return ENOMEM;
 	}
 
 	KASSERT(pmc->pm_owner == NULL,
 	    ("[pmc,%d] attempting to own an initialized PMC", __LINE__));
 
 	/* tie the PMC and its owner together */
 	pmc->pm_owner = po;
 	LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next);
 
 	/* flag the process as a user of hwpmc */
 	PROC_LOCK(p);
 	p->p_flag |= P_HWPMC;
 	PROC_UNLOCK(p);
 
 	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		pmclog_process_pmcallocate(pmc);
 
 	PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p",
 	    po, pmc);
 
 	return 0;
 }
 
 /*
  * Report the current disposition of PMC row 'ri':
  *   zero	the row is FREE
  *   positive	the row is in PROCESS MODE use
  *   negative	the row is in SYSTEM MODE use
  *
  * 'ri' is used as an index without range checking.
  */
 
 int
 pmc_getrowdisp(int ri)
 {
 	return (pmc_pmcdisp[ri]);
 }
 
 /*
  * Check if a PMC at row index 'ri' can be allocated to the current
  * process.
  *
  * Allocation can fail if:
  *   - the current process is already being profiled by a PMC at index 'ri',
  *     attached to it via OP_PMCATTACH.
  *   - the current process has already allocated a PMC at index 'ri'
  *     via OP_ALLOCATE.
  */
 
 static int
 pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu)
 {
 	enum pmc_mode mode;
 	struct pmc *pm;
 	struct pmc_owner *po;
 	struct pmc_process *pp;
 
 	PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d "
 	    "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu);
 
 	/*
 	 * We shouldn't have already allocated a process-mode PMC at
 	 * row index 'ri'.
 	 *
 	 * We shouldn't have allocated a system-wide PMC on the same
 	 * CPU and same RI.
 	 */
 	if ((po = pmc_find_owner_descriptor(p)) != NULL)
 		LIST_FOREACH(pm, &po->po_pmcs, pm_next) {
 		    if (PMC_TO_ROWINDEX(pm) == ri) {
 			    mode = PMC_TO_MODE(pm);
 			    if (PMC_IS_VIRTUAL_MODE(mode))
 				    return EEXIST;
 			    if (PMC_IS_SYSTEM_MODE(mode) &&
 				(int) PMC_TO_CPU(pm) == cpu)
 				    return EEXIST;
 		    }
 	        }
 
 	/*
 	 * We also shouldn't be the target of any PMC at this index
 	 * since otherwise a PMC_ATTACH to ourselves will fail.
 	 */
 	if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
 		if (pp->pp_pmcs[ri].pp_pmc)
 			return EEXIST;
 
 	PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok",
 	    p, p->p_pid, p->p_comm, ri);
 
 	return 0;
 }
 
 /*
  * Check if the PMC row at index 'ri' can currently be used in mode
  * 'mode'.  Returns 0 if so and EBUSY otherwise.
  *
  * System modes want the row in STANDALONE disposition, all other
  * modes want THREAD disposition:
  *
  * Expected disposition		Row-disposition		Result
  *
  * STANDALONE			STANDALONE or FREE	proceed
  * STANDALONE			THREAD			fail
  * THREAD			THREAD or FREE		proceed
  * THREAD			STANDALONE		fail
  */
 
 static int
 pmc_can_allocate_row(int ri, enum pmc_mode mode)
 {
 	enum pmc_disp	disp;
 
 	sx_assert(&pmc_sx, SX_XLOCKED);
 
 	PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode);
 
 	disp = PMC_IS_SYSTEM_MODE(mode) ? PMC_DISP_STANDALONE :
 	    PMC_DISP_THREAD;
 
 	/* a free row, or one already in the wanted disposition, is fine */
 	if (PMC_ROW_DISP_IS_FREE(ri) ||
 	    (disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) ||
 	    (disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri))) {
 		PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri,
 		    mode);
 		return 0;
 	}
 
 	return EBUSY;
 }
 
 /*
  * Find the PMC descriptor with user handle 'pmcid' among the PMCs
  * owned by 'po'.  Returns NULL if the owner has no such PMC.
  */
 
 static struct pmc *
 pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
 {
 	struct pmc *pm;
 
 	KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc,
 	    ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__,
 		PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc));
 
 	/* 'pm' is left NULL when the list is exhausted without a match */
 	LIST_FOREACH(pm, &po->po_pmcs, pm_next) {
 		if (pm->pm_id == pmcid)
 			break;
 	}
 
 	return pm;
 }
 
 /*
  * Look up the PMC with handle 'pmcid' among those owned by the
  * current process.
  *
  * On success the descriptor is stored in '*pmc' and 0 is returned.
  * Returns ESRCH if the current process owns no PMCs and EINVAL if it
  * owns none with this handle; '*pmc' is left untouched in both cases.
  */
 static int
 pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc)
 {
 	struct pmc *pm;
 	struct pmc_owner *po;
 
 	PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid);
 
 	po = pmc_find_owner_descriptor(curthread->td_proc);
 	if (po == NULL)
 		return ESRCH;
 
 	pm = pmc_find_pmc_descriptor_in_process(po, pmcid);
 	if (pm == NULL)
 		return EINVAL;
 
 	PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm);
 
 	*pmc = pm;
 	return 0;
 }
 
 /*
  * Start a PMC.
  */
 
 static int
 pmc_start(struct pmc *pm)
 {
 	int error, cpu, ri;
 	enum pmc_mode mode;
 	struct pmc_owner *po;
 	struct pmc_binding pb;
 
 	KASSERT(pm != NULL,
 	    ("[pmc,%d] null pm", __LINE__));
 
 	mode = PMC_TO_MODE(pm);
 	ri   = PMC_TO_ROWINDEX(pm);
 	error = 0;
 
 	PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri);
 
 	po = pm->pm_owner;
 
 	/*
 	 * Disallow PMCSTART if a logfile is required but has not been
 	 * configured yet.
 	 */
 	if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) &&
 	    (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
 		return EDOOFUS;	/* programming error */
 
 	/*
 	 * If this is a sampling mode PMC, log mapping information for
 	 * the kernel modules that are currently loaded.
 	 */
 	if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 	    pmc_log_kernel_mappings(pm);
 
 	if (PMC_IS_VIRTUAL_MODE(mode)) {
 
 		/*
 		 * If a PMCATTACH has never been done on this PMC,
 		 * attach it to its owner process.
 		 */
 
 		if (LIST_EMPTY(&pm->pm_targets))
 			error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH :
 			    pmc_attach_process(po->po_owner, pm);
 
 		/*
 		 * If the PMC is attached to its owner, then force a context
 		 * switch to ensure that the MD state gets set correctly.
 		 */
 
 		if (error == 0) {
 			pm->pm_state = PMC_STATE_RUNNING;
 			if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER)
 				pmc_force_context_switch();
 		}
 
 		return error;
 	}
 
 
 	/*
 	 * A system-wide PMC.
 	 *
 	 * Add the owner to the global list if this is a system-wide
 	 * sampling PMC.
 	 */
 
 	if (mode == PMC_MODE_SS) {
 		if (po->po_sscount == 0) {
 			LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
 			atomic_add_rel_int(&pmc_ss_count, 1);
 			PMCDBG(PMC,OPS,1, "po=%p in global list", po);
 		}
 		po->po_sscount++;
 	}
 
 	/* Log mapping information for all processes in the system. */
 	pmc_log_all_process_mappings(po);
 
 	/*
 	 * Move to the CPU associated with this
 	 * PMC, and start the hardware.
 	 */
 
 	pmc_save_cpu_binding(&pb);
 
 	cpu = PMC_TO_CPU(pm);
 
 	if (pmc_cpu_is_disabled(cpu))
 		return ENXIO;
 
 	pmc_select_cpu(cpu);
 
 	/*
 	 * global PMCs are configured at allocation time
 	 * so write out the initial value and start the PMC.
 	 */
 
 	pm->pm_state = PMC_STATE_RUNNING;
 
 	critical_enter();
 	if ((error = md->pmd_write_pmc(cpu, ri,
 		 PMC_IS_SAMPLING_MODE(mode) ?
 		 pm->pm_sc.pm_reloadcount :
 		 pm->pm_sc.pm_initial)) == 0)
 		error = md->pmd_start_pmc(cpu, ri);
 	critical_exit();
 
 	pmc_restore_cpu_binding(&pb);
 
 	return error;
 }
 
 /*
  * Stop a PMC.
  *
  * Marks 'pm' as STOPPED.  For process-virtual PMCs that is all that is
  * done here.  For system-mode PMCs the thread is moved to the PMC's
  * CPU, the hardware is stopped and the current hardware count is read
  * back into 'pm_sc.pm_initial'; for system-wide sampling PMCs the
  * owner's SS reference count is then dropped.
  *
  * Returns 0 on success, ENXIO if the PMC's CPU is disabled, or the
  * error returned by the MD stop/read routines.
  */
 
 static int
 pmc_stop(struct pmc *pm)
 {
 	int cpu, error, ri;
 	struct pmc_owner *po;
 	struct pmc_binding pb;
 
 	KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
 
 	PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm,
 	    PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm));
 
 	/* The state change happens first, before any hardware access. */
 	pm->pm_state = PMC_STATE_STOPPED;
 
 	/*
 	 * If the PMC is a virtual mode one, changing the state to
 	 * non-RUNNING is enough to ensure that the PMC never gets
 	 * scheduled.
 	 *
 	 * If this PMC is currently running on a CPU, then it will be
 	 * handled correctly at the time its target process is context
 	 * switched out.
 	 */
 
 	if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
 		return 0;
 
 	/*
 	 * A system-mode PMC.  Move to the CPU associated with
 	 * this PMC, and stop the hardware.  We update the
 	 * 'initial count' so that a subsequent PMCSTART will
 	 * resume counting from the current hardware count.
 	 */
 
 	pmc_save_cpu_binding(&pb);
 
 	cpu = PMC_TO_CPU(pm);
 
 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
 	    ("[pmc,%d] illegal cpu=%d", __LINE__, cpu));
 
 	/*
 	 * NOTE(review): this early return happens after 'pm_state' was
 	 * set to STOPPED above and before the SS owner accounting at
 	 * the end of this function, so for a PMC_MODE_SS PMC
 	 * 'po_sscount' is not decremented on this path -- confirm that
 	 * this is intended.
 	 */
 	if (pmc_cpu_is_disabled(cpu))
 		return ENXIO;
 
 	pmc_select_cpu(cpu);
 
 	ri = PMC_TO_ROWINDEX(pm);
 
 	/* Stop the counter, then capture its value; skip the read on error. */
 	critical_enter();
 	if ((error = md->pmd_stop_pmc(cpu, ri)) == 0)
 		error = md->pmd_read_pmc(cpu, ri, &pm->pm_sc.pm_initial);
 	critical_exit();
 
 	pmc_restore_cpu_binding(&pb);
 
 	po = pm->pm_owner;
 
 	/*
 	 * Remove this owner from the global list of SS PMC owners once
 	 * its last system-wide sampling PMC has been stopped.  This is
 	 * done even if the MD stop/read above reported an error.
 	 */
 	if (PMC_TO_MODE(pm) == PMC_MODE_SS) {
 		po->po_sscount--;
 		if (po->po_sscount == 0) {
 			atomic_subtract_rel_int(&pmc_ss_count, 1);
 			LIST_REMOVE(po, po_ssnext);
 			PMCDBG(PMC,OPS,2,"po=%p removed from global list", po);
 		}
 	}
 
 	return error;
 }
 
 
 #ifdef	DEBUG
 /*
  * Table of PMC operation names, used only for debug output.  It is
  * generated by redefining __PMC_OP() to stringify its first argument
  * and expanding __PMC_OPS(); the list is terminated by a NULL entry.
  * The syscall handler indexes this table by operation code.
  */
 static const char *pmc_op_to_name[] = {
 #undef	__PMC_OP
 #define	__PMC_OP(N, D)	#N ,
 	__PMC_OPS()
 	NULL
 };
 #endif
 
 /*
  * The syscall interface
  */
 
 /*
  * Acquire 'pmc_sx' exclusively.  If 'pmc_hook' is NULL once the lock is
  * held, the lock is dropped again and the macro executes a 'return'
  * out of the *enclosing function*, using the macro arguments (if any)
  * as the return expression.  On normal completion the caller holds the
  * exclusive lock.
  */
 #define	PMC_GET_SX_XLOCK(...) do {		\
 	sx_xlock(&pmc_sx);			\
 	if (pmc_hook == NULL) {			\
 		sx_xunlock(&pmc_sx);		\
 		return __VA_ARGS__;		\
 	}					\
 } while (0)
 
 /*
  * Downgrade the exclusively held 'pmc_sx' to a shared lock and record
  * that fact in the local variable 'is_sx_downgraded', which must be in
  * scope at the point of use.  The syscall handler consults that flag
  * to pick the matching unlock routine.
  */
 #define	PMC_DOWNGRADE_SX() do {			\
 	sx_downgrade(&pmc_sx);			\
 	is_sx_downgraded = 1;			\
 } while (0)
 
 /*
  * Entry point for all PMC operations requested by userland.
  *
  * 'td' is the calling thread and 'syscall_args' points to a
  * 'struct pmc_syscall_args' carrying the operation code ('pmop_code')
  * and a user-space pointer to the operation's argument structure
  * ('pmop_data').
  *
  * The handler takes 'pmc_sx' exclusively on entry (returning ENOSYS if
  * 'pmc_hook' is NULL at that point).  Operations that do not need the
  * exclusive lock downgrade it with PMC_DOWNGRADE_SX(); the lock is
  * released in the matching mode on the way out.  Giant is dropped for
  * the duration of the call.
  *
  * Returns 0 on success or an errno value; every call is counted in
  * 'pmc_stats.pm_syscalls' and every failure in
  * 'pmc_stats.pm_syscall_errors'.
  */
 static int
 pmc_syscall_handler(struct thread *td, void *syscall_args)
 {
 	int error, is_sx_downgraded, op;
 	struct pmc_syscall_args *c;
 	void *arg;
 
 	PMC_GET_SX_XLOCK(ENOSYS);
 
 	DROP_GIANT();
 
 	is_sx_downgraded = 0;
 
 	c = (struct pmc_syscall_args *) syscall_args;
 
 	op = c->pmop_code;
 	arg = c->pmop_data;
 
 	PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
 	    pmc_op_to_name[op], arg);
 
 	error = 0;
 	atomic_add_int(&pmc_stats.pm_syscalls, 1);
 
 	switch(op)
 	{
 
 
 	/*
 	 * Configure a log file.
 	 *
 	 * XXX This OP will be reworked.
 	 */
 
 	case PMC_OP_CONFIGURELOG:
 	{
 		struct proc *p;
 		struct pmc *pm;
 		struct pmc_owner *po;
 		struct pmc_op_configurelog cl;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		if ((error = copyin(arg, &cl, sizeof(cl))) != 0)
 			break;
 
 		/* mark this process as owning a log file */
 		p = td->td_proc;
 		if ((po = pmc_find_owner_descriptor(p)) == NULL)
 			if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
 				error = ENOMEM;
 				break;
 			}
 
 		/*
 		 * If a valid fd was passed in, try to configure that,
 		 * otherwise if 'fd' was less than zero and there was
 		 * a log file configured, flush its buffers and
 		 * de-configure it.
 		 */
 		if (cl.pm_logfd >= 0)
 			error = pmclog_configure_log(po, cl.pm_logfd);
 		else if (po->po_flags & PMC_PO_OWNS_LOGFILE) {
 			pmclog_process_closelog(po);
 			error = pmclog_flush(po);
 			if (error == 0) {
 				/* stop running PMCs that depend on the log */
 				LIST_FOREACH(pm, &po->po_pmcs, pm_next)
 				    if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
 					pm->pm_state == PMC_STATE_RUNNING)
 					    pmc_stop(pm);
 				error = pmclog_deconfigure_log(po);
 			}
 		} else
 			error = EINVAL;
 
 		if (error)
 			break;
 	}
 	break;
 
 
 	/*
 	 * Flush a log file.
 	 */
 
 	case PMC_OP_FLUSHLOG:
 	{
 		struct pmc_owner *po;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		error = pmclog_flush(po);
 	}
 	break;
 
 	/*
 	 * Retrieve hardware configuration.
 	 */
 
 	case PMC_OP_GETCPUINFO:	/* CPU information */
 	{
 		struct pmc_op_getcpuinfo gci;
 
 		/*
 		 * The whole structure is copied out below, so clear it
 		 * first: any bytes not explicitly assigned here must not
 		 * carry stale kernel stack contents to userland.
 		 */
 		bzero(&gci, sizeof(gci));
 
 		gci.pm_cputype = md->pmd_cputype;
 		gci.pm_ncpu    = mp_ncpus;
 		gci.pm_npmc    = md->pmd_npmc;
 		gci.pm_nclass  = md->pmd_nclass;
 		bcopy(md->pmd_classes, &gci.pm_classes,
 		    sizeof(gci.pm_classes));
 		error = copyout(&gci, arg, sizeof(gci));
 	}
 	break;
 
 
 	/*
 	 * Get module statistics
 	 */
 
 	case PMC_OP_GETDRIVERSTATS:
 	{
 		struct pmc_op_getdriverstats gms;
 
 		bcopy(&pmc_stats, &gms, sizeof(gms));
 		error = copyout(&gms, arg, sizeof(gms));
 	}
 	break;
 
 
 	/*
 	 * Retrieve module version number
 	 */
 
 	case PMC_OP_GETMODULEVERSION:
 	{
 		uint32_t cv, modv;
 
 		/* retrieve the client's idea of the ABI version */
 		if ((error = copyin(arg, &cv, sizeof(cv))) != 0)
 			break;
 		/* don't service clients newer than our driver */
 		modv = PMC_VERSION;
 		if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) {
 			error = EPROGMISMATCH;
 			break;
 		}
 		/* size the copy by the object, matching the copyin above */
 		error = copyout(&modv, arg, sizeof(modv));
 	}
 	break;
 
 
 	/*
 	 * Retrieve the state of all the PMCs on a given
 	 * CPU.
 	 */
 
 	case PMC_OP_GETPMCINFO:
 	{
 		uint32_t cpu, n, npmc;
 		size_t pmcinfo_size;
 		struct pmc *pm;
 		struct pmc_info *p, *pmcinfo;
 		struct pmc_op_getpmcinfo *gpi;
 		struct pmc_owner *po;
 		struct pmc_binding pb;
 
 		PMC_DOWNGRADE_SX();
 
 		gpi = (struct pmc_op_getpmcinfo *) arg;
 
 		if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
 			break;
 
 		if (cpu >= (unsigned int) mp_ncpus) {
 			error = EINVAL;
 			break;
 		}
 
 		if (pmc_cpu_is_disabled(cpu)) {
 			error = ENXIO;
 			break;
 		}
 
 		/* switch to CPU 'cpu' */
 		pmc_save_cpu_binding(&pb);
 		pmc_select_cpu(cpu);
 
 		npmc = md->pmd_npmc;
 
 		/*
 		 * Zero the buffer: rows without an associated PMC only
 		 * get some of their fields filled in below, and the
 		 * whole array is copied out to userland.
 		 */
 		pmcinfo_size = npmc * sizeof(struct pmc_info);
 		MALLOC(pmcinfo, struct pmc_info *, pmcinfo_size, M_PMC,
 		    M_WAITOK | M_ZERO);
 
 		p = pmcinfo;
 
 		for (n = 0; n < md->pmd_npmc; n++, p++) {
 
 			if ((error = md->pmd_describe(cpu, n, p, &pm)) != 0)
 				break;
 
 			if (PMC_ROW_DISP_IS_STANDALONE(n))
 				p->pm_rowdisp = PMC_DISP_STANDALONE;
 			else if (PMC_ROW_DISP_IS_THREAD(n))
 				p->pm_rowdisp = PMC_DISP_THREAD;
 			else
 				p->pm_rowdisp = PMC_DISP_FREE;
 
 			p->pm_ownerpid = -1;
 
 			if (pm == NULL)	/* no PMC associated */
 				continue;
 
 			po = pm->pm_owner;
 
 			KASSERT(po->po_owner != NULL,
 			    ("[pmc,%d] pmc_owner had a null proc pointer",
 				__LINE__));
 
 			p->pm_ownerpid = po->po_owner->p_pid;
 			p->pm_mode     = PMC_TO_MODE(pm);
 			p->pm_event    = pm->pm_event;
 			p->pm_flags    = pm->pm_flags;
 
 			if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 				p->pm_reloadcount =
 				    pm->pm_sc.pm_reloadcount;
 		}
 
 		pmc_restore_cpu_binding(&pb);
 
 		/* now copy out the PMC info collected */
 		if (error == 0)
 			error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);
 
 		FREE(pmcinfo, M_PMC);
 	}
 	break;
 
 
 	/*
 	 * Set the administrative state of a PMC.  I.e. whether
 	 * the PMC is to be used or not.
 	 */
 
 	case PMC_OP_PMCADMIN:
 	{
 		int cpu, ri;
 		enum pmc_state request;
 		struct pmc_cpu *pc;
 		struct pmc_hw *phw;
 		struct pmc_op_pmcadmin pma;
 		struct pmc_binding pb;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		KASSERT(td == curthread,
 		    ("[pmc,%d] td != curthread", __LINE__));
 
 		error = priv_check(td, PRIV_PMC_MANAGE);
 		if (error)
 			break;
 
 		if ((error = copyin(arg, &pma, sizeof(pma))) != 0)
 			break;
 
 		cpu = pma.pm_cpu;
 
 		if (cpu < 0 || cpu >= mp_ncpus) {
 			error = EINVAL;
 			break;
 		}
 
 		if (pmc_cpu_is_disabled(cpu)) {
 			error = ENXIO;
 			break;
 		}
 
 		request = pma.pm_state;
 
 		if (request != PMC_STATE_DISABLED &&
 		    request != PMC_STATE_FREE) {
 			error = EINVAL;
 			break;
 		}
 
 		ri = pma.pm_pmc; /* pmc id == row index */
 		if (ri < 0 || ri >= (int) md->pmd_npmc) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * We can't disable a PMC with a row-index allocated
 		 * for process virtual PMCs.
 		 */
 
 		if (PMC_ROW_DISP_IS_THREAD(ri) &&
 		    request == PMC_STATE_DISABLED) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * otherwise, this PMC on this CPU is either free or
 		 * in system-wide mode.
 		 */
 
 		pmc_save_cpu_binding(&pb);
 		pmc_select_cpu(cpu);
 
 		pc  = pmc_pcpu[cpu];
 		phw = pc->pc_hwpmcs[ri];
 
 		/*
 		 * XXX do we need some kind of 'forced' disable?
 		 */
 
 		if (phw->phw_pmc == NULL) {
 			if (request == PMC_STATE_DISABLED &&
 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) {
 				phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED;
 				PMC_MARK_ROW_STANDALONE(ri);
 			} else if (request == PMC_STATE_FREE &&
 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) {
 				phw->phw_state |=  PMC_PHW_FLAG_IS_ENABLED;
 				PMC_UNMARK_ROW_STANDALONE(ri);
 			}
 			/* other cases are a no-op */
 		} else
 			error = EBUSY;
 
 		pmc_restore_cpu_binding(&pb);
 	}
 	break;
 
 
 	/*
 	 * Allocate a PMC.
 	 */
 
 	case PMC_OP_PMCALLOCATE:
 	{
 		uint32_t caps;
 		u_int cpu;
 		int n;
 		enum pmc_mode mode;
 		struct pmc *pmc;
 		struct pmc_hw *phw;
 		struct pmc_op_pmcallocate pa;
 		struct pmc_binding pb;
 
 		if ((error = copyin(arg, &pa, sizeof(pa))) != 0)
 			break;
 
 		caps = pa.pm_caps;
 		mode = pa.pm_mode;
 		cpu  = pa.pm_cpu;
 
 		if ((mode != PMC_MODE_SS  &&  mode != PMC_MODE_SC  &&
 		     mode != PMC_MODE_TS  &&  mode != PMC_MODE_TC) ||
 		    (cpu != (u_int) PMC_CPU_ANY && cpu >= (u_int) mp_ncpus)) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * Virtual PMCs should only ask for a default CPU.
 		 * System mode PMCs need to specify a non-default CPU.
 		 */
 
 		if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) ||
 		    (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * Check that a disabled CPU is not being asked for.
 		 */
 
 		if (PMC_IS_SYSTEM_MODE(mode) && pmc_cpu_is_disabled(cpu)) {
 			error = ENXIO;
 			break;
 		}
 
 		/*
 		 * Refuse an allocation for a system-wide PMC if this
 		 * process has been jailed, or if this process lacks
 		 * super-user credentials and the sysctl tunable
 		 * 'security.bsd.unprivileged_syspmcs' is zero.
 		 */
 
 		if (PMC_IS_SYSTEM_MODE(mode)) {
 			if (jailed(curthread->td_ucred)) {
 				error = EPERM;
 				break;
 			}
 			if (!pmc_unprivileged_syspmcs) {
 				error = priv_check(curthread,
 				    PRIV_PMC_SYSTEM);
 				if (error)
 					break;
 			}
 		}
 
 		if (error)
 			break;
 
 		/*
 		 * Look for valid values for 'pm_flags'
 		 */
 
 		if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW |
 		    PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* process logging options are not allowed for system PMCs */
 		if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags &
 		    (PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * All sampling mode PMCs need to be able to interrupt the
 		 * CPU.
 		 */
 		if (PMC_IS_SAMPLING_MODE(mode))
 			caps |= PMC_CAP_INTERRUPT;
 
 		/* A valid class specifier should have been passed in. */
 		for (n = 0; n < md->pmd_nclass; n++)
 			if (md->pmd_classes[n].pm_class == pa.pm_class)
 				break;
 		if (n == md->pmd_nclass) {
 			error = EINVAL;
 			break;
 		}
 
 		/* The requested PMC capabilities should be feasible. */
 		if ((md->pmd_classes[n].pm_caps & caps) != caps) {
 			error = EOPNOTSUPP;
 			break;
 		}
 
 		PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d",
 		    pa.pm_ev, caps, mode, cpu);
 
 		pmc = pmc_allocate_pmc_descriptor();
 		pmc->pm_id    = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class,
 		    PMC_ID_INVALID);
 		pmc->pm_event = pa.pm_ev;
 		pmc->pm_state = PMC_STATE_FREE;
 		pmc->pm_caps  = caps;
 		pmc->pm_flags = pa.pm_flags;
 
 		/* switch thread to CPU 'cpu' */
 		pmc_save_cpu_binding(&pb);
 
 #define	PMC_IS_SHAREABLE_PMC(cpu, n)				\
 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state &		\
 	 PMC_PHW_FLAG_IS_SHAREABLE)
 #define	PMC_IS_UNALLOCATED(cpu, n)				\
 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL)
 
 		/*
 		 * Pick the first row that passes all the checks; 'n'
 		 * ends up equal to 'pmd_npmc' if no row qualifies.
 		 */
 		if (PMC_IS_SYSTEM_MODE(mode)) {
 			pmc_select_cpu(cpu);
 			for (n = 0; n < (int) md->pmd_npmc; n++)
 				if (pmc_can_allocate_row(n, mode) == 0 &&
 				    pmc_can_allocate_rowindex(
 					    curthread->td_proc, n, cpu) == 0 &&
 				    (PMC_IS_UNALLOCATED(cpu, n) ||
 				     PMC_IS_SHAREABLE_PMC(cpu, n)) &&
 				    md->pmd_allocate_pmc(cpu, n, pmc,
 					&pa) == 0)
 					break;
 		} else {
 			/* Process virtual mode */
 			for (n = 0; n < (int) md->pmd_npmc; n++) {
 				if (pmc_can_allocate_row(n, mode) == 0 &&
 				    pmc_can_allocate_rowindex(
 					    curthread->td_proc, n,
 					    PMC_CPU_ANY) == 0 &&
 				    md->pmd_allocate_pmc(curthread->td_oncpu,
 					n, pmc, &pa) == 0)
 					break;
 			}
 		}
 
 #undef	PMC_IS_UNALLOCATED
 #undef	PMC_IS_SHAREABLE_PMC
 
 		pmc_restore_cpu_binding(&pb);
 
 		if (n == (int) md->pmd_npmc) {
 			pmc_destroy_pmc_descriptor(pmc);
 			FREE(pmc, M_PMC);
 			pmc = NULL;
 			error = EINVAL;
 			break;
 		}
 
 		/* Fill in the correct value in the ID field */
 		pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n);
 
 		PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x",
 		    pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id);
 
 		/* Process mode PMCs with logging enabled need log files */
 		if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW))
 			pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
 
 		/* All system mode sampling PMCs require a log file */
 		if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode))
 			pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
 
 		/*
 		 * Configure global pmc's immediately
 		 */
 
 		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) {
 
 			pmc_save_cpu_binding(&pb);
 			pmc_select_cpu(cpu);
 
 			phw = pmc_pcpu[cpu]->pc_hwpmcs[n];
 
 			if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 ||
 			    (error = md->pmd_config_pmc(cpu, n, pmc)) != 0) {
 				(void) md->pmd_release_pmc(cpu, n, pmc);
 				pmc_destroy_pmc_descriptor(pmc);
 				FREE(pmc, M_PMC);
 				pmc = NULL;
 				pmc_restore_cpu_binding(&pb);
 				error = EPERM;
 				break;
 			}
 
 			pmc_restore_cpu_binding(&pb);
 		}
 
 		pmc->pm_state    = PMC_STATE_ALLOCATED;
 
 		/*
 		 * mark row disposition
 		 */
 
 		if (PMC_IS_SYSTEM_MODE(mode))
 			PMC_MARK_ROW_STANDALONE(n);
 		else
 			PMC_MARK_ROW_THREAD(n);
 
 		/*
 		 * Register this PMC with the current thread as its owner.
 		 */
 
 		if ((error =
 		    pmc_register_owner(curthread->td_proc, pmc)) != 0) {
 			pmc_release_pmc_descriptor(pmc);
 			FREE(pmc, M_PMC);
 			pmc = NULL;
 			break;
 		}
 
 		/*
 		 * Return the allocated index.
 		 */
 
 		pa.pm_pmcid = pmc->pm_id;
 
 		error = copyout(&pa, arg, sizeof(pa));
 	}
 	break;
 
 
 	/*
 	 * Attach a PMC to a process.
 	 */
 
 	case PMC_OP_PMCATTACH:
 	{
 		struct pmc *pm;
 		struct proc *p;
 		struct pmc_op_pmcattach a;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
 			break;
 
 		/* a pid of zero means "the calling process" */
 		if (a.pm_pid < 0) {
 			error = EINVAL;
 			break;
 		} else if (a.pm_pid == 0)
 			a.pm_pid = td->td_proc->p_pid;
 
 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
 			break;
 
 		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
 			error = EINVAL;
 			break;
 		}
 
 		/* PMCs may be (re)attached only when allocated or stopped */
 		if (pm->pm_state == PMC_STATE_RUNNING) {
 			error = EBUSY;
 			break;
 		} else if (pm->pm_state != PMC_STATE_ALLOCATED &&
 		    pm->pm_state != PMC_STATE_STOPPED) {
 			error = EINVAL;
 			break;
 		}
 
 		/* lookup pid */
 		if ((p = pfind(a.pm_pid)) == NULL) {
 			error = ESRCH;
 			break;
 		}
 
 		/*
 		 * Ignore processes that are working on exiting.
 		 */
 		if (p->p_flag & P_WEXIT) {
 			error = ESRCH;
 			PROC_UNLOCK(p);	/* pfind() returns a locked process */
 			break;
 		}
 
 		/*
 		 * we are allowed to attach a PMC to a process if
 		 * we can debug it.
 		 */
 		error = p_candebug(curthread, p);
 
 		PROC_UNLOCK(p);
 
 		if (error == 0)
 			error = pmc_attach_process(p, pm);
 	}
 	break;
 
 
 	/*
 	 * Detach an attached PMC from a process.
 	 */
 
 	case PMC_OP_PMCDETACH:
 	{
 		struct pmc *pm;
 		struct proc *p;
 		struct pmc_op_pmcattach a;
 
 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
 			break;
 
 		/* a pid of zero means "the calling process" */
 		if (a.pm_pid < 0) {
 			error = EINVAL;
 			break;
 		} else if (a.pm_pid == 0)
 			a.pm_pid = td->td_proc->p_pid;
 
 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
 			break;
 
 		if ((p = pfind(a.pm_pid)) == NULL) {
 			error = ESRCH;
 			break;
 		}
 
 		/*
 		 * Treat processes that are in the process of exiting
 		 * as if they were not present.
 		 */
 
 		if (p->p_flag & P_WEXIT)
 			error = ESRCH;
 
 		PROC_UNLOCK(p);	/* pfind() returns a locked process */
 
 		if (error == 0)
 			error = pmc_detach_process(p, pm);
 	}
 	break;
 
 
 	/*
 	 * Retrieve the MSR number associated with the counter
 	 * 'pmc_id'.  This allows processes to directly use RDPMC
 	 * instructions to read their PMCs, without the overhead of a
 	 * system call.
 	 */
 
 	case PMC_OP_PMCGETMSR:
 	{
 		int ri;
 		struct pmc	*pm;
 		struct pmc_target *pt;
 		struct pmc_op_getmsr gm;
 
 		PMC_DOWNGRADE_SX();
 
 		/* CPU has no 'GETMSR' support */
 		if (md->pmd_get_msr == NULL) {
 			error = ENOSYS;
 			break;
 		}
 
 		if ((error = copyin(arg, &gm, sizeof(gm))) != 0)
 			break;
 
 		if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0)
 			break;
 
 		/*
 		 * The allocated PMC has to be a process virtual PMC,
 		 * i.e., of type MODE_T[CS].  Global PMCs can only be
 		 * read using the PMCREAD operation since they may be
 		 * allocated on a different CPU than the one we could
 		 * be running on at the time of the RDPMC instruction.
 		 *
 		 * The GETMSR operation is not allowed for PMCs that
 		 * are inherited across processes.
 		 */
 
 		if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) ||
 		    (pm->pm_flags & PMC_F_DESCENDANTS)) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
 		 * It only makes sense to use a RDPMC (or its
 		 * equivalent instruction on non-x86 architectures) on
 		 * a process that has allocated and attached a PMC to
 		 * itself.  Conversely the PMC is only allowed to have
 		 * one process attached to it -- its owner.
 		 */
 
 		if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL ||
 		    LIST_NEXT(pt, pt_next) != NULL ||
 		    pt->pt_process->pp_proc != pm->pm_owner->po_owner) {
 			error = EINVAL;
 			break;
 		}
 
 		ri = PMC_TO_ROWINDEX(pm);
 
 		/*
 		 * Errors are reported as non-zero (positive) errno
 		 * values, so test against zero: MSR access must not be
 		 * enabled below if either step failed.
 		 */
 		if ((error = (*md->pmd_get_msr)(ri, &gm.pm_msr)) != 0)
 			break;
 
 		if ((error = copyout(&gm, arg, sizeof(gm))) != 0)
 			break;
 
 		/*
 		 * Mark our process as using MSRs.  Update machine
 		 * state using a forced context switch.
 		 */
 
 		pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS;
 		pmc_force_context_switch();
 
 	}
 	break;
 
 	/*
 	 * Release an allocated PMC
 	 */
 
 	case PMC_OP_PMCRELEASE:
 	{
 		pmc_id_t pmcid;
 		struct pmc *pm;
 		struct pmc_owner *po;
 		struct pmc_op_simple sp;
 
 		/*
 		 * Find PMC pointer for the named PMC.
 		 *
 		 * Use pmc_release_pmc_descriptor() to switch off the
 		 * PMC, remove all its target threads, and remove the
 		 * PMC from its owner's list.
 		 *
 		 * Remove the owner record if this is the last PMC
 		 * owned.
 		 *
 		 * Free up space.
 		 */
 
 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
 			break;
 
 		pmcid = sp.pm_pmcid;
 
 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
 			break;
 
 		po = pm->pm_owner;
 		pmc_release_pmc_descriptor(pm);
 		pmc_maybe_remove_owner(po);
 
 		FREE(pm, M_PMC);
 	}
 	break;
 
 
 	/*
 	 * Read and/or write a PMC.
 	 */
 
 	case PMC_OP_PMCRW:
 	{
 		uint32_t cpu, ri;
 		struct pmc *pm;
 		struct pmc_op_pmcrw *pprw;
 		struct pmc_op_pmcrw prw;
 		struct pmc_binding pb;
 		pmc_value_t oldvalue;
 
 		PMC_DOWNGRADE_SX();
 
 		if ((error = copyin(arg, &prw, sizeof(prw))) != 0)
 			break;
 
 		ri = 0;
 		/* keep 'oldvalue' defined for write-only requests */
 		oldvalue = 0;
 		PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid,
 		    prw.pm_flags);
 
 		/* must have at least one flag set */
 		if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* locate pmc descriptor */
 		if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0)
 			break;
 
 		/* Can't read a PMC that hasn't been started. */
 		if (pm->pm_state != PMC_STATE_ALLOCATED &&
 		    pm->pm_state != PMC_STATE_STOPPED &&
 		    pm->pm_state != PMC_STATE_RUNNING) {
 			error = EINVAL;
 			break;
 		}
 
 		/* writing a new value is allowed only for 'STOPPED' pmcs */
 		if (pm->pm_state == PMC_STATE_RUNNING &&
 		    (prw.pm_flags & PMC_F_NEWVALUE)) {
 			error = EBUSY;
 			break;
 		}
 
 		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
 
 			/*
 			 * If this PMC is attached to its owner (i.e.,
 			 * the process requesting this operation) and
 			 * is running, then attempt to get an
 			 * upto-date reading from hardware for a READ.
 			 * Writes are only allowed when the PMC is
 			 * stopped, so only update the saved value
 			 * field.
 			 *
 			 * If the PMC is not running, or is not
 			 * attached to its owner, read/write to the
 			 * savedvalue field.
 			 */
 
 			ri = PMC_TO_ROWINDEX(pm);
 
 			mtx_pool_lock_spin(pmc_mtxpool, pm);
 			cpu = curthread->td_oncpu;
 
 			if (prw.pm_flags & PMC_F_OLDVALUE) {
 				if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) &&
 				    (pm->pm_state == PMC_STATE_RUNNING))
 					error = (*md->pmd_read_pmc)(cpu, ri,
 					    &oldvalue);
 				else
 					oldvalue = pm->pm_gv.pm_savedvalue;
 			}
 			if (prw.pm_flags & PMC_F_NEWVALUE)
 				pm->pm_gv.pm_savedvalue = prw.pm_value;
 
 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
 
 			/*
 			 * Report a failed hardware read instead of
 			 * letting the copyout below overwrite 'error'
 			 * and hand back a value that was never read.
 			 */
 			if (error)
 				break;
 
 		} else { /* System mode PMCs */
 			cpu = PMC_TO_CPU(pm);
 			ri  = PMC_TO_ROWINDEX(pm);
 
 			if (pmc_cpu_is_disabled(cpu)) {
 				error = ENXIO;
 				break;
 			}
 
 			/* move this thread to CPU 'cpu' */
 			pmc_save_cpu_binding(&pb);
 			pmc_select_cpu(cpu);
 
 			critical_enter();
 			/* save old value */
 			if (prw.pm_flags & PMC_F_OLDVALUE)
 				if ((error = (*md->pmd_read_pmc)(cpu, ri,
 					 &oldvalue)))
 					goto error;
 			/* write out new value */
 			if (prw.pm_flags & PMC_F_NEWVALUE)
 				error = (*md->pmd_write_pmc)(cpu, ri,
 				    prw.pm_value);
 		error:
 			critical_exit();
 			pmc_restore_cpu_binding(&pb);
 			if (error)
 				break;
 		}
 
 		pprw = (struct pmc_op_pmcrw *) arg;
 
 #ifdef	DEBUG
 		if (prw.pm_flags & PMC_F_NEWVALUE)
 			PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx",
 			    ri, prw.pm_value, oldvalue);
 		else if (prw.pm_flags & PMC_F_OLDVALUE)
 			PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue);
 #endif
 
 		/* return old value if requested */
 		if (prw.pm_flags & PMC_F_OLDVALUE)
 			if ((error = copyout(&oldvalue, &pprw->pm_value,
 				 sizeof(prw.pm_value))))
 				break;
 
 	}
 	break;
 
 
 	/*
 	 * Set the sampling rate for a sampling mode PMC and the
 	 * initial count for a counting mode PMC.
 	 */
 
 	case PMC_OP_PMCSETCOUNT:
 	{
 		struct pmc *pm;
 		struct pmc_op_pmcsetcount sc;
 
 		PMC_DOWNGRADE_SX();
 
 		if ((error = copyin(arg, &sc, sizeof(sc))) != 0)
 			break;
 
 		if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0)
 			break;
 
 		if (pm->pm_state == PMC_STATE_RUNNING) {
 			error = EBUSY;
 			break;
 		}
 
 		if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
 			pm->pm_sc.pm_reloadcount = sc.pm_count;
 		else
 			pm->pm_sc.pm_initial = sc.pm_count;
 	}
 	break;
 
 
 	/*
 	 * Start a PMC.
 	 */
 
 	case PMC_OP_PMCSTART:
 	{
 		pmc_id_t pmcid;
 		struct pmc *pm;
 		struct pmc_op_simple sp;
 
 		sx_assert(&pmc_sx, SX_XLOCKED);
 
 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
 			break;
 
 		pmcid = sp.pm_pmcid;
 
 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
 			break;
 
 		KASSERT(pmcid == pm->pm_id,
 		    ("[pmc,%d] pmcid %x != id %x", __LINE__,
 			pm->pm_id, pmcid));
 
 		if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
 			break;
 		else if (pm->pm_state != PMC_STATE_STOPPED &&
 		    pm->pm_state != PMC_STATE_ALLOCATED) {
 			error = EINVAL;
 			break;
 		}
 
 		error = pmc_start(pm);
 	}
 	break;
 
 
 	/*
 	 * Stop a PMC.
 	 */
 
 	case PMC_OP_PMCSTOP:
 	{
 		pmc_id_t pmcid;
 		struct pmc *pm;
 		struct pmc_op_simple sp;
 
 		PMC_DOWNGRADE_SX();
 
 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
 			break;
 
 		pmcid = sp.pm_pmcid;
 
 		/*
 		 * Mark the PMC as inactive and invoke the MD stop
 		 * routines if needed.
 		 */
 
 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
 			break;
 
 		KASSERT(pmcid == pm->pm_id,
 		    ("[pmc,%d] pmc id %x != pmcid %x", __LINE__,
 			pm->pm_id, pmcid));
 
 		if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
 			break;
 		else if (pm->pm_state != PMC_STATE_RUNNING) {
 			error = EINVAL;
 			break;
 		}
 
 		error = pmc_stop(pm);
 	}
 	break;
 
 
 	/*
 	 * Write a user supplied value to the log file.
 	 */
 
 	case PMC_OP_WRITELOG:
 	{
 		struct pmc_op_writelog wl;
 		struct pmc_owner *po;
 
 		PMC_DOWNGRADE_SX();
 
 		if ((error = copyin(arg, &wl, sizeof(wl))) != 0)
 			break;
 
 		if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
 			error = EINVAL;
 			break;
 		}
 
 		error = pmclog_process_userlog(po, &wl);
 	}
 	break;
 
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	/* release the lock in whichever mode it is currently held */
 	if (is_sx_downgraded)
 		sx_sunlock(&pmc_sx);
 	else
 		sx_xunlock(&pmc_sx);
 
 	if (error)
 		atomic_add_int(&pmc_stats.pm_syscall_errors, 1);
 
 	PICKUP_GIANT();
 
 	return error;
 }
 
 /*
  * Helper functions
  */
 
 
 /*
  * Request a deferred user-mode callchain capture for the current
  * thread: flag the thread with TDP_CALLCHAIN and post an AST so that
  * the capture itself runs later, in a context where taking page
  * faults is safe.
  */
 
 static void
 pmc_post_callchain_ast(void)
 {
 	struct thread *self = curthread;
 
 	/*
 	 * 'td_pflags' may be modified directly: the thread was
 	 * executing in user space when it was interrupted.
 	 */
 	self->td_pflags |= TDP_CALLCHAIN;
 
 	/*
 	 * 'td_flags' may be touched concurrently by other CPUs, so it
 	 * has to be updated under the thread lock.  Taking that lock
 	 * here, in NMI context, is safe because we came straight from
 	 * userland and this CPU therefore cannot already hold it.
 	 */
 	thread_lock(self);
 	self->td_flags |= TDF_ASTPENDING;
 	thread_unlock(self);
 }
 
 /*
  * Interrupt processing.
  *
  * Find a free slot in the per-cpu array of samples and capture the
  * current callchain there.  If a sample was successfully added, a bit
  * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook
  * needs to be invoked from the clock handler.
  *
  * This function is meant to be called from an NMI handler.  It cannot
  * use any of the locking primitives supplied by the OS.
  *
  * Arguments:
  *   cpu          index of the CPU whose sample buffer is used
  *   pm           the PMC that raised the interrupt
  *   tf           trap frame of the interrupted context
  *   inuserspace  non-zero if the interrupted context was in user mode
  *
  * Returns 0 if a sample slot was filled in, or ENOMEM if the next
  * write slot is still in use, in which case 'pm' is marked stalled.
  * The CPU's bit in 'pmc_cpumask' is set on both paths.
  */
 
 int
 pmc_process_interrupt(int cpu, struct pmc *pm, struct trapframe *tf,
     int inuserspace)
 {
 	int error, callchaindepth;
 	struct thread *td;
 	struct pmc_sample *ps;
 	struct pmc_samplebuffer *psb;
 
 	error = 0;
 
 	/*
 	 * Claim the next write slot of this CPU's sample buffer.
 	 */
 	psb = pmc_pcpu[cpu]->pc_sb;
 
 	ps = psb->ps_write;
 	if (ps->ps_nsamples) {	/* in use, reader hasn't caught up */
 		pm->pm_stalled = 1;
 		atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1);
 		PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
 		    cpu, pm, (void *) tf, inuserspace,
 		    (int) (psb->ps_write - psb->ps_samples),
 		    (int) (psb->ps_read - psb->ps_samples));
 		error = ENOMEM;
 		goto done;
 	}
 
 
 	/* Fill in entry. */
 	PMCDBG(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm,
 	    (void *) tf, inuserspace,
 	    (int) (psb->ps_write - psb->ps_samples),
 	    (int) (psb->ps_read - psb->ps_samples));
 
 	atomic_add_rel_32(&pm->pm_runcount, 1);	/* hold onto PMC */
 	ps->ps_pmc = pm;
 	/* Record the interrupted process, or -1 if there is none. */
 	if ((td = curthread) && td->td_proc)
 		ps->ps_pid = td->td_proc->p_pid;
 	else
 		ps->ps_pid = -1;
 	ps->ps_cpu = cpu;
 	ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0;
 
 	/* Without PMC_F_CALLCHAIN only the interrupted PC is recorded. */
 	callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ?
 	    pmc_callchaindepth : 1;
 
 	if (callchaindepth == 1)
 		ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf);
 	else {
 		/*
 		 * Kernel stack traversals can be done immediately,
 		 * while we defer to an AST for user space traversals.
 		 */
 		if (!inuserspace)
 			callchaindepth =
 			    pmc_save_kernel_callchain(ps->ps_pc,
 				callchaindepth, tf);
 		else {
 			pmc_post_callchain_ast();
 			/* placeholder until the AST fills in the chain */
 			callchaindepth = PMC_SAMPLE_INUSE;
 		}
 	}
 
 	/*
 	 * Written last: a non-zero 'ps_nsamples' is what marks the
 	 * slot as occupied (see the check at the top of this function).
 	 */
 	ps->ps_nsamples = callchaindepth;	/* mark entry as in use */
 
 	/* increment write pointer, modulo ring buffer size */
 	ps++;
 	if (ps == psb->ps_fence)
 		psb->ps_write = psb->ps_samples;
 	else
 		psb->ps_write = ps;
 
  done:
 	/* mark CPU as needing processing */
 	atomic_set_rel_int(&pmc_cpumask, (1 << cpu));
 
 	return (error);
 }
 
 /*
  * Fill in the user-mode callchains that the interrupt handler
  * deferred.  Meant to be called from ast(), before control returns to
  * userland and before the process gets rescheduled.
  *
  * Every slot of the sample buffer of 'cpu' that is still marked
  * PMC_SAMPLE_INUSE gets its callchain captured from trap frame 'tf'.
  */
 
 static void
 pmc_capture_user_callchain(int cpu, struct trapframe *tf)
 {
 	struct pmc_samplebuffer *buf;
 	struct pmc_sample *slot;
 	struct pmc *sampled;
 	int idx;
 
 	buf = pmc_pcpu[cpu]->pc_sb;
 
 	/* Walk the whole buffer looking for deferred requests. */
 	for (idx = 0; idx < pmc_nsamples; idx++) {
 		slot = &buf->ps_samples[idx];
 
 		if (slot->ps_nsamples == PMC_SAMPLE_INUSE) {
 			sampled = slot->ps_pmc;
 
 			KASSERT(sampled->pm_flags & PMC_F_CALLCHAIN,
 			    ("[pmc,%d] Retrieving callchain for PMC that doesn't "
 				"want it", __LINE__));
 
 			/*
 			 * Storing the captured depth replaces the
 			 * INUSE marker, which makes the entry
 			 * 'processable' by the timer tick sweep code.
 			 */
 			slot->ps_nsamples = pmc_save_user_callchain(
 			    slot->ps_pc, pmc_callchaindepth, tf);
 		}
 	}
 }
 
 
 /*
  * Process saved PC samples.
  *
  * Drains the sample ring buffer of 'cpu' starting at its read pointer.
  * The walk stops at the first free slot, or at a slot that is still
  * waiting for its user callchain (PMC_SAMPLE_INUSE), in which case the
  * CPU is re-flagged in pmc_cpumask so that it gets rescanned later.
  * Each consumed sample is either folded into the owner's profiling
  * statistics or logged to the owner's log file, after which the slot
  * is released and the PMC's run count is dropped.  If at least one
  * loop iteration completed, stalled sampling PMCs on this CPU are
  * restarted.
  *
  * Must be called on the CPU whose buffer is being processed.
  */
 
 static void
 pmc_process_samples(int cpu)
 {
 	int n, ri;
 	struct pmc *pm;
 	struct thread *td;
 	struct pmc_owner *po;
 	struct pmc_sample *ps;
 	struct pmc_samplebuffer *psb;
 
 	KASSERT(PCPU_GET(cpuid) == cpu,
 	    ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
 		PCPU_GET(cpuid), cpu));
 
 	psb = pmc_pcpu[cpu]->pc_sb;
 
 	for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
 
 		ps = psb->ps_read;
 		if (ps->ps_nsamples == PMC_SAMPLE_FREE)
 			break;
 		if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
 			/* Need a rescan at a later time. */
 			atomic_set_rel_int(&pmc_cpumask, (1 << cpu));
 			break;
 		}
 
 		pm = ps->ps_pmc;
 		po = pm->pm_owner;
 
 		KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
 		    ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
 			pm, PMC_TO_MODE(pm)));
 
 		/* Ignore PMCs that have been switched off */
 		if (pm->pm_state != PMC_STATE_RUNNING)
 			goto entrydone;
 
 		PMCDBG(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu,
 		    pm, ps->ps_nsamples, ps->ps_flags,
 		    (int) (psb->ps_write - psb->ps_samples),
 		    (int) (psb->ps_read - psb->ps_samples));
 
 		/*
 		 * If this is a process-mode PMC that is attached to
 		 * its owner, and if the PC is in user mode, update
 		 * profiling statistics like timer-based profiling
 		 * would have done.
 		 */
 		if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
 			if (ps->ps_flags & PMC_CC_F_USERSPACE) {
 				td = FIRST_THREAD_IN_PROC(po->po_owner);
 				addupc_intr(td, ps->ps_pc[0], 1);
 			}
 			goto entrydone;
 		}
 
 		/*
 		 * Otherwise, this is either a sampling mode PMC that
 		 * is attached to a different process than its owner,
 		 * or a system-wide sampling PMC.  Dispatch a log
 		 * entry to the PMC's owner process.
 		 */
 
 		pmclog_process_callchain(pm, ps);
 
 	entrydone:
 		/*
 		 * Release the slot using the same marker the loop above
 		 * tests for, rather than a bare constant.
 		 */
 		ps->ps_nsamples = PMC_SAMPLE_FREE;	/* mark entry as free */
 		atomic_subtract_rel_32(&pm->pm_runcount, 1);
 
 		/* increment read pointer, modulo sample size */
 		if (++ps == psb->ps_fence)
 			psb->ps_read = psb->ps_samples;
 		else
 			psb->ps_read = ps;
 	}
 
 	atomic_add_int(&pmc_stats.pm_log_sweeps, 1);
 
 	/* Do not re-enable stalled PMCs if we failed to process any samples */
 	if (n == 0)
 		return;
 
 	/*
 	 * Restart any stalled sampling PMCs on this CPU.
 	 *
 	 * If the NMI handler sets the pm_stalled field of a PMC after
 	 * the check below, we'll end up processing the stalled PMC at
 	 * the next hardclock tick.
 	 */
 	for (n = 0; n < md->pmd_npmc; n++) {
 		/*
 		 * The return value of pmd_get_config is ignored, so clear
 		 * 'pm' first: a lookup that stores nothing must be seen
 		 * as "not configured" and not as the PMC left over from
 		 * the sample loop or the previous row.
 		 */
 		pm = NULL;
 		(void) (*md->pmd_get_config)(cpu,n,&pm);
 		if (pm == NULL ||			 /* !cfg'ed */
 		    pm->pm_state != PMC_STATE_RUNNING || /* !active */
 		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
 		    pm->pm_stalled == 0) /* !stalled */
 			continue;
 
 		pm->pm_stalled = 0;
 		ri = PMC_TO_ROWINDEX(pm);
 		(*md->pmd_start_pmc)(cpu, ri);
 	}
 }
 
 /*
  * Event handlers.
  */
 
 /*
  * Handle a process exit.
  *
  * Remove this process from all hash tables.  If this process
  * owned any PMCs, turn off those PMCs and deallocate them,
  * removing any associations with target processes.
  *
  * A sysexit record is logged to every system-wide sampling owner
  * that has a log file, whether or not the exiting process itself
  * uses PMCs.
  *
  * This function will be called by the last 'thread' of a
  * process.
  *
  * XXX This eventhandler gets called early in the exit process.
  * Consider using a 'hook' invocation from thread_exit() or equivalent
  * spot.  Another negative is that kse_exit doesn't seem to call
  * exit1() [??].
  *
  */
 
 static void
 pmc_process_exit(void *arg __unused, struct proc *p)
 {
 	int is_using_hwpmcs;
 	int cpu;
 	unsigned int ri;
 	struct pmc *pm;
 	struct pmc_process *pp;
 	struct pmc_owner *po;
 	pmc_value_t newvalue, tmp;
 
 	PROC_LOCK(p);
 	is_using_hwpmcs = p->p_flag & P_HWPMC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Log a sysexit event to all SS PMC owners.
 	 */
 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
 		    pmclog_process_sysexit(po, p->p_pid);
 
 	if (!is_using_hwpmcs)
 		return;
 
 	PMC_GET_SX_XLOCK();
 	PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid,
 	    p->p_comm);
 
 	/*
 	 * Since this code is invoked by the last thread in an exiting
 	 * process, we would have context switched IN at some prior
 	 * point.  However, with PREEMPTION, kernel mode context
 	 * switches may happen any time, so we want to disable a
 	 * context switch OUT till we get any PMCs targeting this
 	 * process off the hardware.
 	 *
 	 * We also need to atomically remove this process'
 	 * entry from our target process hash table, using
 	 * PMC_FLAG_REMOVE.
 	 */
 	critical_enter(); /* no preemption */
 
 	cpu = curthread->td_oncpu;
 
 	if ((pp = pmc_find_process_descriptor(p,
 		 PMC_FLAG_REMOVE)) != NULL) {
 
 		PMCDBG(PRC,EXT,2,
 		    "process-exit proc=%p pmc-process=%p", p, pp);
 
 		/*
 		 * The exiting process could be the target of
 		 * some PMCs which will be running on the
 		 * currently executing CPU.
 		 *
 		 * We need to turn these PMCs off like we
 		 * would do at context switch OUT time.
 		 */
 		for (ri = 0; ri < md->pmd_npmc; ri++) {
 
 			/*
 			 * Pick up the pmc pointer from hardware
 			 * state similar to the CSW_OUT code.
 			 */
 			pm = NULL;
 			(void) (*md->pmd_get_config)(cpu, ri, &pm);
 
 			PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
 
 			if (pm == NULL ||
 			    !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
 				continue;
 
 			PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
 			    "state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
 			    pm, pm->pm_state);
 
 			KASSERT(PMC_TO_ROWINDEX(pm) == ri,
 			    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
 				__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
 			KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
 			    ("[pmc,%d] pm %p != pp_pmcs[%d] %p",
 				__LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc));
 
 			(void) md->pmd_stop_pmc(cpu, ri);
 
 			KASSERT(pm->pm_runcount > 0,
 			    ("[pmc,%d] bad runcount ri %d rc %d",
 				__LINE__, ri, pm->pm_runcount));
 
 			/*
 			 * Fold the count accumulated since the last
 			 * switch-in into the saved totals, but only if
 			 * the hardware was actually counting.
 			 */
 			if (pm->pm_state == PMC_STATE_RUNNING &&
 			    pm->pm_stalled == 0) {
 				md->pmd_read_pmc(cpu, ri, &newvalue);
 				tmp = newvalue -
 				    PMC_PCPU_SAVED(cpu,ri);
 
 				mtx_pool_lock_spin(pmc_mtxpool, pm);
 				pm->pm_gv.pm_savedvalue += tmp;
 				pp->pp_pmcs[ri].pp_pmcval += tmp;
 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
 			}
 
 			atomic_subtract_rel_32(&pm->pm_runcount,1);
 
 			/* Report the run count itself, not the row index. */
 			KASSERT((int) pm->pm_runcount >= 0,
 			    ("[pmc,%d] runcount is %d", __LINE__,
 				(int) pm->pm_runcount));
 
 			(void) md->pmd_config_pmc(cpu, ri, NULL);
 		}
 
 		/*
 		 * Inform the MD layer of this pseudo "context switch
 		 * out"
 		 */
 		(void) md->pmd_switch_out(pmc_pcpu[cpu], pp);
 
 		critical_exit(); /* ok to be pre-empted now */
 
 		/*
 		 * Unlink this process from the PMCs that are
 		 * targeting it.  This will send a signal to
 		 * all PMC owners whose PMCs are orphaned.
 		 *
 		 * Log PMC value at exit time if requested.
 		 */
 		for (ri = 0; ri < md->pmd_npmc; ri++)
 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
 				if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
 				    PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)))
 					pmclog_process_procexit(pm, pp);
 				pmc_unlink_target_process(pm, pp);
 			}
 		FREE(pp, M_PMC);
 
 	} else
 		critical_exit(); /* pp == NULL */
 
 
 	/*
 	 * If the process owned PMCs, free them up and free up
 	 * memory.
 	 */
 	if ((po = pmc_find_owner_descriptor(p)) != NULL) {
 		pmc_remove_owner(po);
 		pmc_destroy_owner_descriptor(po);
 	}
 
 	sx_xunlock(&pmc_sx);
 }
 
 /*
  * Handle a process fork.
  *
  * The fork is logged to every system-wide sampling owner that has a
  * log file.  Beyond that, if the parent 'p1' is under HWPMC
  * monitoring, every PMC attached to it with 'do_descendants'
  * semantics is also attached to the child, and the child is marked
  * as tracked by this driver.
  */
 
 static void
 pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
     int flags)
 {
 	struct pmc_process *parent_pp, *child_pp;
 	struct pmc_owner *owner;
 	struct pmc *pmc;
 	uint32_t inherit;
 	unsigned int row;
 	int tracked;
 
 	(void) flags;		/* unused parameter */
 
 	PROC_LOCK(p1);
 	tracked = p1->p_flag & P_HWPMC;
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Owners of system-wide sampling PMCs see every fork in
 	 * their logs, tracked parent or not.
 	 */
 	LIST_FOREACH(owner, &pmc_ss_owners, po_ssnext) {
 		if (owner->po_flags & PMC_PO_OWNS_LOGFILE)
 			pmclog_process_procfork(owner, p1->p_pid,
 			    newproc->p_pid);
 	}
 
 	if (!tracked)
 		return;
 
 	PMC_GET_SX_XLOCK();
 	PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1,
 	    p1->p_pid, p1->p_comm, newproc);
 
 	/*
 	 * Only a parent (curthread->td_proc) that is itself the
 	 * target of some PMC has anything to hand down.
 	 */
 	parent_pp = pmc_find_process_descriptor(curthread->td_proc,
 	    PMC_FLAG_NONE);
 	if (parent_pp == NULL)
 		goto done;		/* nothing to do */
 
 	/* Does any attached PMC ask to follow descendants? */
 	inherit = 0;
 	for (row = 0; row < md->pmd_npmc; row++) {
 		pmc = parent_pp->pp_pmcs[row].pp_pmc;
 		if (pmc != NULL)
 			inherit |= pmc->pm_flags & PMC_F_DESCENDANTS;
 	}
 	if (inherit == 0)
 		goto done;		/* nothing to do */
 
 	/* allocate a descriptor for the new process  */
 	child_pp = pmc_find_process_descriptor(newproc, PMC_FLAG_ALLOCATE);
 	if (child_pp == NULL)
 		goto done;
 
 	/*
 	 * Attach each inheritable PMC to the child.  Owners that have
 	 * no system-wide sampling PMCs were not covered by the logging
 	 * pass above, so the fork is logged to them here.
 	 */
 	for (row = 0; row < md->pmd_npmc; row++) {
 		pmc = parent_pp->pp_pmcs[row].pp_pmc;
 		if (pmc == NULL || (pmc->pm_flags & PMC_F_DESCENDANTS) == 0)
 			continue;
 
 		pmc_link_target_process(pmc, child_pp);
 
 		owner = pmc->pm_owner;
 		if (owner->po_sscount == 0 &&
 		    (owner->po_flags & PMC_PO_OWNS_LOGFILE))
 			pmclog_process_procfork(owner, p1->p_pid,
 			    newproc->p_pid);
 	}
 
 	/*
 	 * Now mark the new process as being tracked by this driver.
 	 */
 	PROC_LOCK(newproc);
 	newproc->p_flag |= P_HWPMC;
 	PROC_UNLOCK(newproc);
 
  done:
 	sx_xunlock(&pmc_sx);
 }
 
 
 /*
  * initialization
  */
 
 /*
  * Printable names of the PMC classes.  __PMC_CLASS() is redefined to
  * stringify its argument, so expanding __PMC_CLASSES() yields one
  * string per class.  pmc_initialize() indexes this table with a
  * class descriptor's pm_class value.
  */
 static const char *pmc_name_of_pmcclass[] = {
 #undef	__PMC_CLASS
 #define	__PMC_CLASS(N) #N ,
 	__PMC_CLASSES()
 };
 
 /*
  * Bring up the PMC subsystem at module load time.
  *
  * In order: checks the kernel/module version match, sanitizes the
  * tunables (falling back to the compiled-in defaults), initializes the
  * machine dependent layer on each enabled CPU, allocates the per-CPU
  * sample buffers, the row disposition array, the owner/process hash
  * tables and the spin mutex pool, registers the process exit/fork
  * event handlers, initializes logging and finally installs the
  * interrupt and hook function pointers.
  *
  * Returns 0 on success, EPROGMISMATCH on a version mismatch, ENOSYS
  * if no usable machine dependent layer is found, or the error
  * returned by the per-CPU pmd_init method.
  */
 static int
 pmc_initialize(void)
 {
 	int cpu, error, n;
 	struct pmc_binding pb;
 	struct pmc_sample *ps;
 	struct pmc_samplebuffer *sb;
 
 	md = NULL;
 	error = 0;
 
 #ifdef	DEBUG
 	/* parse debug flags first */
 	if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags",
 		pmc_debugstr, sizeof(pmc_debugstr)))
 		pmc_debugflags_parse(pmc_debugstr,
 		    pmc_debugstr+strlen(pmc_debugstr));
 #endif
 
 	PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION);
 
 	/* check kernel version */
 	if (pmc_kernel_version != PMC_VERSION) {
 		if (pmc_kernel_version == 0)
 			printf("hwpmc: this kernel has not been compiled with "
 			    "'options HWPMC_HOOKS'.\n");
 		else
 			printf("hwpmc: kernel version (0x%x) does not match "
 			    "module version (0x%x).\n", pmc_kernel_version,
 			    PMC_VERSION);
 		return EPROGMISMATCH;
 	}
 
 	/*
 	 * check sysctl parameters
 	 *
 	 * An out-of-range value is reported and replaced by the
 	 * default; it does not fail the load.
 	 */
 
 	if (pmc_hashsize <= 0) {
 		(void) printf("hwpmc: tunable \"hashsize\"=%d must be "
 		    "greater than zero.\n", pmc_hashsize);
 		pmc_hashsize = PMC_HASH_SIZE;
 	}
 
 	if (pmc_nsamples <= 0 || pmc_nsamples > 65535) {
 		(void) printf("hwpmc: tunable \"nsamples\"=%d out of "
 		    "range.\n", pmc_nsamples);
 		pmc_nsamples = PMC_NSAMPLES;
 	}
 
 	if (pmc_callchaindepth <= 0 ||
 	    pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) {
 		(void) printf("hwpmc: tunable \"callchaindepth\"=%d out of "
 		    "range.\n", pmc_callchaindepth);
 		pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
 	}
 
 	md = pmc_md_initialize();
 
 	if (md == NULL || md->pmd_init == NULL)
 		return ENOSYS;
 
 	/* allocate space for the per-cpu array */
 	MALLOC(pmc_pcpu, struct pmc_cpu **, mp_ncpus * sizeof(struct pmc_cpu *),
 	    M_PMC, M_WAITOK|M_ZERO);
 
 	/* per-cpu 'saved values' for managing process-mode PMCs */
 	MALLOC(pmc_pcpu_saved, pmc_value_t *,
 	    sizeof(pmc_value_t) * mp_ncpus * md->pmd_npmc, M_PMC, M_WAITOK);
 
 	/*
 	 * perform cpu dependent initialization
 	 *
 	 * The thread is bound to each enabled CPU in turn so that
 	 * pmd_init runs on the CPU it initializes; the original
 	 * binding is restored afterwards.
 	 */
 	pmc_save_cpu_binding(&pb);
 	for (cpu = 0; cpu < mp_ncpus; cpu++) {
 		if (pmc_cpu_is_disabled(cpu))
 			continue;
 		pmc_select_cpu(cpu);
 		if ((error = md->pmd_init(cpu)) != 0)
 			break;
 	}
 	pmc_restore_cpu_binding(&pb);
 
 	/*
 	 * NOTE(review): this return leaves 'md', pmc_pcpu and
 	 * pmc_pcpu_saved allocated, and pmc_hook has not been set yet,
 	 * so pmc_cleanup() presumably bails out early and does not
 	 * reclaim them -- confirm whether this path leaks.
 	 */
 	if (error != 0)
 		return error;
 
 	/* allocate space for the sample array */
 	for (cpu = 0; cpu < mp_ncpus; cpu++) {
 		if (pmc_cpu_is_disabled(cpu))
 			continue;
 		MALLOC(sb, struct pmc_samplebuffer *,
 		    sizeof(struct pmc_samplebuffer) +
 		    pmc_nsamples * sizeof(struct pmc_sample), M_PMC,
 		    M_WAITOK|M_ZERO);
 
 		/* empty ring: read == write; fence is one past the end */
 		sb->ps_read = sb->ps_write = sb->ps_samples;
 		sb->ps_fence = sb->ps_samples + pmc_nsamples;
 		KASSERT(pmc_pcpu[cpu] != NULL,
 		    ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
 
 		/* one contiguous callchain area, carved up per sample */
 		MALLOC(sb->ps_callchains, uintptr_t *,
 		    pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t),
 		    M_PMC, M_WAITOK|M_ZERO);
 
 		for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++)
 			ps->ps_pc = sb->ps_callchains +
 			    (n * pmc_callchaindepth);
 
 		pmc_pcpu[cpu]->pc_sb = sb;
 	}
 
 	/* allocate space for the row disposition array */
 	pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc,
 	    M_PMC, M_WAITOK|M_ZERO);
 
 	KASSERT(pmc_pmcdisp != NULL,
 	    ("[pmc,%d] pmcdisp allocation returned NULL", __LINE__));
 
 	/* mark all PMCs as available */
 	for (n = 0; n < (int) md->pmd_npmc; n++)
 		PMC_MARK_ROW_FREE(n);
 
 	/* allocate thread hash tables */
 	pmc_ownerhash = hashinit(pmc_hashsize, M_PMC,
 	    &pmc_ownerhashmask);
 
 	pmc_processhash = hashinit(pmc_hashsize, M_PMC,
 	    &pmc_processhashmask);
 	mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf",
 	    MTX_SPIN);
 
 	LIST_INIT(&pmc_ss_owners);
 	pmc_ss_count = 0;
 
 	/* allocate a pool of spin mutexes */
 	pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size,
 	    MTX_SPIN);
 
 	PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx "
 	    "targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask,
 	    pmc_processhash, pmc_processhashmask);
 
 	/* register process {exit,fork,exec} handlers */
 	pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit,
 	    pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY);
 	pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork,
 	    pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY);
 
 	/* initialize logging */
 	pmclog_initialize();
 
 	/* set hook functions */
 	pmc_intr = md->pmd_intr;
 	pmc_hook = pmc_hook_handler;
 
 	/*
 	 * Announce the module and, per PMC class, its name, number of
 	 * PMCs and capability bits (decoded by the %b format).  'error'
 	 * is necessarily 0 here: every failure above returned early.
 	 */
 	if (error == 0) {
 		printf(PMC_MODULE_NAME ":");
 		for (n = 0; n < (int) md->pmd_nclass; n++) {
 			printf(" %s/%d/0x%b",
 			    pmc_name_of_pmcclass[md->pmd_classes[n].pm_class],
 			    md->pmd_nclasspmcs[n],
 			    md->pmd_classes[n].pm_caps,
 			    "\20"
 			    "\1INT\2USR\3SYS\4EDG\5THR"
 			    "\6REA\7WRI\10INV\11QUA\12PRC"
 			    "\13TAG\14CSC");
 		}
 		printf("\n");
 	}
 
 	return error;
 }
 
 /*
  * Prepare to be unloaded.
  *
  * Stops sampling, unhooks the module from the kernel, removes every
  * PMC owner (sending each owning process a SIGBUS) and releases the
  * data structures set up by pmc_initialize().  Runs with pmc_sx held
  * exclusively and returns early if pmc_hook is already NULL.
  */
 static void
 pmc_cleanup(void)
 {
 	int cpu;
 	struct pmc_ownerhash *ph;
 	struct pmc_owner *po, *tmp;
 	struct pmc_binding pb;
 #ifdef	DEBUG
 	struct pmc_processhash *prh;
 #endif
 
 	PMCDBG(MOD,INI,0, "%s", "cleanup");
 
 	/* switch off sampling */
 	atomic_store_rel_int(&pmc_cpumask, 0);
 	pmc_intr = NULL;
 
 	sx_xlock(&pmc_sx);
 	if (pmc_hook == NULL) {	/* being unloaded already */
 		sx_xunlock(&pmc_sx);
 		return;
 	}
 
 	pmc_hook = NULL; /* prevent new threads from entering module */
 
 	/* deregister event handlers */
 	EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag);
 	EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag);
 
 	/* send SIGBUS to all owner threads, free up allocations */
 	if (pmc_ownerhash)
 		for (ph = pmc_ownerhash;
 		     ph <= &pmc_ownerhash[pmc_ownerhashmask];
 		     ph++) {
 			/* _SAFE variant: 'po' is destroyed inside the loop */
 			LIST_FOREACH_SAFE(po, ph, po_next, tmp) {
 				pmc_remove_owner(po);
 
 				/* send SIGBUS to owner processes */
 				PMCDBG(MOD,INI,2, "cleanup signal proc=%p "
 				    "(%d, %s)", po->po_owner,
 				    po->po_owner->p_pid,
 				    po->po_owner->p_comm);
 
 				PROC_LOCK(po->po_owner);
 				psignal(po->po_owner, SIGBUS);
 				PROC_UNLOCK(po->po_owner);
 
 				pmc_destroy_owner_descriptor(po);
 			}
 		}
 
 	/* reclaim allocated data structures */
 	if (pmc_mtxpool)
 		mtx_pool_destroy(&pmc_mtxpool);
 
 	mtx_destroy(&pmc_processhash_mtx);
 	if (pmc_processhash) {
 #ifdef	DEBUG
 		struct pmc_process *pp;
 
 		/* trace any process descriptors still in the table */
 		PMCDBG(MOD,INI,3, "%s", "destroy process hash");
 		for (prh = pmc_processhash;
 		     prh <= &pmc_processhash[pmc_processhashmask];
 		     prh++)
 			LIST_FOREACH(pp, prh, pp_next)
 			    PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid);
 #endif
 
 		hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask);
 		pmc_processhash = NULL;
 	}
 
 	if (pmc_ownerhash) {
 		PMCDBG(MOD,INI,3, "%s", "destroy owner hash");
 		hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask);
 		pmc_ownerhash = NULL;
 	}
 
 	KASSERT(LIST_EMPTY(&pmc_ss_owners),
 	    ("[pmc,%d] Global SS owner list not empty", __LINE__));
 	KASSERT(pmc_ss_count == 0,
 	    ("[pmc,%d] Global SS count not empty", __LINE__));
 
 	/*
 	 * free the per-cpu sample buffers
 	 *
 	 * NOTE(review): pmc_pcpu[cpu] is dereferenced here without the
 	 * NULL check that the MD cleanup loop below applies -- confirm
 	 * it cannot be NULL for an enabled CPU at this point.
 	 */
 	for (cpu = 0; cpu < mp_ncpus; cpu++) {
 		if (pmc_cpu_is_disabled(cpu))
 			continue;
 		KASSERT(pmc_pcpu[cpu]->pc_sb != NULL,
 		    ("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__,
 			cpu));
 		FREE(pmc_pcpu[cpu]->pc_sb->ps_callchains, M_PMC);
 		FREE(pmc_pcpu[cpu]->pc_sb, M_PMC);
 		pmc_pcpu[cpu]->pc_sb = NULL;
 	}
 
  	/*
 	 * do processor dependent cleanup
 	 *
 	 * As during initialization, the thread is bound to each
 	 * enabled CPU in turn before its pmd_cleanup method is called.
 	 */
 	PMCDBG(MOD,INI,3, "%s", "md cleanup");
 	if (md) {
 		pmc_save_cpu_binding(&pb);
 		for (cpu = 0; cpu < mp_ncpus; cpu++) {
 			PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p",
 			    cpu, pmc_pcpu[cpu]);
 			if (pmc_cpu_is_disabled(cpu))
 				continue;
 			pmc_select_cpu(cpu);
 			if (pmc_pcpu[cpu])
 				(void) md->pmd_cleanup(cpu);
 		}
 		FREE(md, M_PMC);
 		md = NULL;
 		pmc_restore_cpu_binding(&pb);
 	}
 
 	/* deallocate per-cpu structures */
 	FREE(pmc_pcpu, M_PMC);
 	pmc_pcpu = NULL;
 
 	FREE(pmc_pcpu_saved, M_PMC);
 	pmc_pcpu_saved = NULL;
 
 	if (pmc_pmcdisp) {
 		FREE(pmc_pmcdisp, M_PMC);
 		pmc_pmcdisp = NULL;
 	}
 
 	pmclog_shutdown();
 
 	sx_xunlock(&pmc_sx); 	/* we are done */
 }
 
 /*
  * Module event handler, called at load/unload.
  *
  * MOD_LOAD initializes the subsystem and returns the result of
  * pmc_initialize().  MOD_UNLOAD and MOD_SHUTDOWN both tear it down
  * and return 0.  Any other event yields EINVAL.
  */
 
 static int
 load (struct module *module __unused, int cmd, void *arg __unused)
 {
 	int rc;
 
 	if (cmd == MOD_LOAD) {
 		/* initialize the subsystem; trace only on success */
 		rc = pmc_initialize();
 		if (rc == 0) {
 			PMCDBG(MOD,INI,1, "syscall=%d ncpus=%d",
 			    pmc_syscall_num, mp_ncpus);
 		}
 		return rc;
 	}
 
 	if (cmd == MOD_UNLOAD || cmd == MOD_SHUTDOWN) {
 		pmc_cleanup();
 		PMCDBG(MOD,INI,1, "%s", "unloaded");
 		return 0;
 	}
 
 	return EINVAL;	/* XXX should panic(9) */
 }
 
 /* memory pool */
 MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");
Index: head/sys/dev/md/md.c
===================================================================
--- head/sys/dev/md/md.c	(revision 175201)
+++ head/sys/dev/md/md.c	(revision 175202)
@@ -1,1307 +1,1307 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 /*-
  * The following functions are based in the vn(4) driver: mdstart_swap(),
  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
  * and as such under the following copyright:
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah Hdr: vn.c 1.13 94/04/02
  *
  *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
  */
 
 #include "opt_geom.h"
 #include "opt_md.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mdioctl.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 #define MD_MODVER 1
 
 #define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
 #define	MD_EXITING	0x20000		/* Worker thread is exiting. */
 
 #ifndef MD_NSECT
 #define MD_NSECT (10000 * 2)
 #endif
 
 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
 
 static int md_debug;
 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
 
 #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
 /*
  * Preloaded image gets put here.
  * Applications that patch the object with the image can determine
  * the size looking at the start and end markers (strings),
  * so we want them contiguous.
  */
 static struct {
 	u_char start[MD_ROOT_SIZE*1024];
 	u_char end[128];
 } mfs_root = {
 	.start = "MFS Filesystem goes here",
 	.end = "MFS Filesystem had better STOP here",
 };
 #endif
 
 static g_init_t g_md_init;
 static g_fini_t g_md_fini;
 static g_start_t g_md_start;
 static g_access_t g_md_access;
 static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 
     struct g_consumer *cp __unused, struct g_provider *pp);
 
 static int	mdunits;
 static struct cdev *status_dev = 0;
 static struct sx md_sx;
 
 static d_ioctl_t mdctlioctl;
 
 /*
  * Character device switch for the md control device.  Only the ioctl
  * entry point is provided.
  */
 static struct cdevsw mdctl_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	mdctlioctl,
 	.d_name =	MD_NAME,
 };
 
 /*
  * GEOM class descriptor for memory disks, wiring the class name "MD"
  * to this driver's init/fini, I/O start, access and dumpconf methods.
  */
 struct g_class g_md_class = {
 	.name = "MD",
 	.version = G_VERSION,
 	.init = g_md_init,
 	.fini = g_md_fini,
 	.start = g_md_start,
 	.access = g_md_access,
 	.dumpconf = g_md_dumpconf,
 };
 
 DECLARE_GEOM_CLASS(g_md_class, g_md);
 
 
 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
 
 #define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
 #define NMASK	(NINDIR-1)
 static int nshift;
 
 /*
  * One node of the sector lookup tree used by malloc-backed disks.
  * A node with a non-zero 'shift' is an interior node whose slots
  * point to child nodes; a node with shift 0 is a leaf whose slots
  * hold the per-sector values.
  */
 struct indir {
 	uintptr_t	*array;		/* NINDIR slots */
 	u_int		total;		/* number of slots in 'array' */
 	u_int		used;		/* number of non-zero slots */
 	u_int		shift;		/* right shift applied to the sector
 					   number to index this level */
 };
 
 /*
  * Per-unit state (softc) of a memory disk.  The common fields come
  * first, followed by groups that are only meaningful for one backing
  * store type each.
  */
 struct md_s {
 	int unit;
 	LIST_ENTRY(md_s) list;		/* linkage on md_softc_list */
 	struct bio_queue_head bio_queue; /* pending requests */
 	struct mtx queue_mtx;		/* held around bio_queue updates */
 	struct cdev *dev;
 	enum md_types type;
 	off_t mediasize;
 	unsigned sectorsize;
 	unsigned opencount;		/* 1 while the provider is open,
 					   else 0; see g_md_access() */
 	unsigned fwheads;
 	unsigned fwsectors;
 	unsigned flags;			/* MD_* option and state flags */
 	char name[20];
 	struct proc *procp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	int (*start)(struct md_s *sc, struct bio *bp); /* type-specific
 							  I/O handler */
 	struct devstat *devstat;
 
 	/* MD_MALLOC related fields */
 	struct indir *indir;		/* root of the sector tree */
 	uma_zone_t uma;			/* zone the sector buffers come from */
 
 	/* MD_PRELOAD related fields */
 	u_char *pl_ptr;			/* start of the preloaded image */
 	size_t pl_len;
 
 	/* MD_VNODE related fields */
 	struct vnode *vnode;		/* backing vnode */
 	char file[PATH_MAX];
 	struct ucred *cred;		/* credentials used for vnode I/O */
 
 	/* MD_SWAP related fields */
 	vm_object_t object;		/* VM object holding the pages */
 };
 
 /*
  * Allocate an empty tree node for the given shift without sleeping.
  * Both the node and its zeroed slot array must be obtained; if either
  * allocation fails nothing is left allocated and NULL is returned.
  */
 static struct indir *
 new_indir(u_int shift)
 {
 	struct indir *node;
 
 	node = malloc(sizeof(*node), M_MD, M_NOWAIT | M_ZERO);
 	if (node != NULL) {
 		node->array = malloc(NINDIR * sizeof(node->array[0]),
 		    M_MDSECT, M_NOWAIT | M_ZERO);
 		if (node->array != NULL) {
 			node->shift = shift;
 			node->total = NINDIR;
 			return (node);
 		}
 		/* Slot array unavailable: give the node back. */
 		free(node, M_MD);
 	}
 	return (NULL);
 }
 
 static void
 del_indir(struct indir *ip)
 {
 
 	free(ip->array, M_MDSECT);
 	free(ip, M_MD);
 }
 
 static void
 destroy_indir(struct md_s *sc, struct indir *ip)
 {
 	int i;
 
 	for (i = 0; i < NINDIR; i++) {
 		if (!ip->array[i])
 			continue;
 		if (ip->shift)
 			destroy_indir(sc, (struct indir*)(ip->array[i]));
 		else if (ip->array[i] > 255)
 			uma_zfree(sc->uma, (void *)(ip->array[i]));
 	}
 	del_indir(ip);
 }
 
 /*
  * This function does the math and allocates the top level "indir" structure
  * for a device of "size" sectors.
  *
  * A tree with 'layer' levels above the leaves addresses
  * NINDIR^(layer + 1) sectors, so the smallest sufficient 'layer' is
  * found by repeatedly dividing the sector count by NINDIR.  The
  * division must round up: with truncation a size just above a power
  * of NINDIR (for instance NINDIR * NINDIR + 1) would get one level
  * too few, and its highest sector numbers would wrap around in the
  * "& NMASK" indexing and alias the lowest ones.
  *
  * As a side effect the global 'nshift' is set to log2(NINDIR).  The
  * allocations may sleep and do not fail.
  */
 
 static struct indir *
 dimension(off_t size)
 {
 	off_t rcnt;
 	struct indir *ip;
 	int i, layer;
 
 	rcnt = size;
 	layer = 0;
 	while (rcnt > NINDIR) {
 		/* round up so a partially filled level still counts */
 		rcnt = (rcnt + (NINDIR - 1)) / NINDIR;
 		layer++;
 	}
 	/* figure out log2(NINDIR) */
 	for (i = NINDIR, nshift = -1; i; nshift++)
 		i >>= 1;
 
 	/*
 	 * XXX: the top layer is probably not fully populated, so we allocate
 	 * too much space for ip->array in here.
 	 */
 	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
 	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
 	    M_MDSECT, M_WAITOK | M_ZERO);
 	ip->total = NINDIR;
 	ip->shift = layer * nshift;
 	return (ip);
 }
 
 /*
  * Look up the value stored for sector 'offset' in the tree rooted at
  * 'ip'.  Returns 0 when the path to the sector does not exist or the
  * leaf slot is empty.
  */
 
 static uintptr_t
 s_read(struct indir *ip, off_t offset)
 {
 	struct indir *node;
 
 	if (md_debug > 1)
 		printf("s_read(%jd)\n", (intmax_t)offset);
 
 	/* Descend through the interior levels; a zero slot ends the walk. */
 	node = ip;
 	while (node != NULL && node->shift != 0)
 		node = (struct indir *)
 		    node->array[(offset >> node->shift) & NMASK];
 
 	if (node == NULL)
 		return (0);
 	return (node->array[offset & NMASK]);
 }
 
 /*
  * Write a given sector, prune the tree if the value is 0
  *
  * Stores 'ptr' as the value of sector 'offset', creating missing
  * interior nodes on the way down.  Returns 0 on success or ENOSPC
  * if a node could not be allocated, in which case the leaf has not
  * been modified.  The previous value of the slot is overwritten and
  * not released here.
  */
 
 static int
 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
 {
 	/*
 	 * lip[] records the nodes visited from the root down so that the
 	 * prune pass can walk back up.
 	 * NOTE(review): 'li' is not bounds-checked against the 10 entries;
 	 * presumably the tree can never be that deep -- confirm.
 	 */
 	struct indir *cip, *lip[10];
 	int idx, li;
 	uintptr_t up;
 
 	if (md_debug > 1)
 		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
 	up = 0;
 	li = 0;
 	cip = ip;
 	for (;;) {
 		lip[li++] = cip;
 		if (cip->shift) {
 			/* interior node: follow or create the child */
 			idx = (offset >> cip->shift) & NMASK;
 			up = cip->array[idx];
 			if (up != 0) {
 				cip = (struct indir *)up;
 				continue;
 			}
 			/* Allocate branch */
 			cip->array[idx] =
 			    (uintptr_t)new_indir(cip->shift - nshift);
 			if (cip->array[idx] == 0)
 				return (ENOSPC);
 			cip->used++;
 			up = cip->array[idx];
 			cip = (struct indir *)up;
 			continue;
 		}
 		/* leafnode */
 		idx = offset & NMASK;
 		up = cip->array[idx];
 		/* keep 'used' equal to the number of non-zero slots */
 		if (up != 0)
 			cip->used--;
 		cip->array[idx] = ptr;
 		if (ptr != 0)
 			cip->used++;
 		break;
 	}
 	/* nothing to prune if the leaf is still in use or is the root */
 	if (cip->used != 0 || li == 1)
 		return (0);
 	li--;
 	/*
 	 * Walk back up the recorded path, freeing each node that has
 	 * become empty and clearing its slot in the parent.  The root
 	 * itself is never freed.
 	 */
 	while (cip->used == 0 && cip != ip) {
 		li--;
 		idx = (offset >> lip[li]->shift) & NMASK;
 		up = lip[li]->array[idx];
 		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
 		del_indir(cip);
 		lip[li]->array[idx] = 0;
 		lip[li]->used--;
 		cip = lip[li];
 	}
 	return (0);
 }
 
 
 /*
  * GEOM access method.  'r', 'w' and 'e' are deltas to the provider's
  * read, write and exclusive counts.  Fails with ENXIO when the geom
  * has no softc and with EROFS when the change would leave a read-only
  * unit with writers.  Otherwise opencount is set to 1 on the first
  * open and back to 0 on the last close.
  */
 static int
 g_md_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct md_s *sc;
 	int before, after;
 
 	sc = pp->geom->softc;
 	if (sc == NULL)
 		return (ENXIO);
 
 	/* Resulting writer count decides the read-only check. */
 	w += pp->acw;
 	if (w > 0 && (sc->flags & MD_READONLY) != 0)
 		return (EROFS);
 
 	before = pp->acr + pp->acw + pp->ace;
 	after = (r + pp->acr) + w + (e + pp->ace);
 
 	if (before == 0 && after > 0)
 		sc->opencount = 1;
 	else if (before > 0 && after == 0)
 		sc->opencount = 0;
 
 	return (0);
 }
 
 /*
  * GEOM start method: account reads and writes with devstat, put the
  * request on the unit's sorted queue under the queue mutex, and wake
  * up whoever sleeps on the softc.
  */
 static void
 g_md_start(struct bio *bp)
 {
 	struct md_s *sc;
 
 	sc = bp->bio_to->geom->softc;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 		devstat_start_transaction_bio(sc->devstat, bp);
 		break;
 	default:
 		/* other commands are not accounted */
 		break;
 	}
 
 	mtx_lock(&sc->queue_mtx);
 	bioq_disksort(&sc->bio_queue, bp);
 	mtx_unlock(&sc->queue_mtx);
 
 	wakeup(sc);
 }
 
 /*
  * I/O handler for malloc-backed units.
  *
  * The request is processed one sector at a time.  The tree value of a
  * sector is either 0 (reads back as zeroes), a value of 255 or less
  * (every byte of the sector has that value), or a pointer to a sector
  * buffer from the unit's UMA zone.  With MD_COMPRESS set, a written
  * sector whose bytes are all equal is stored as that byte value
  * instead of a buffer.
  *
  * Returns 0 on success, EOPNOTSUPP for commands other than read,
  * write and delete, or ENOSPC when a buffer or tree node cannot be
  * allocated.  Processing stops at the first failing sector.
  */
 static int
 mdstart_malloc(struct md_s *sc, struct bio *bp)
 {
 	int i, error;
 	u_char *dst;
 	off_t secno, nsec, uc;
 	uintptr_t sp, osp;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	nsec = bp->bio_length / sc->sectorsize;
 	secno = bp->bio_offset / sc->sectorsize;
 	dst = bp->bio_data;
 	error = 0;
 	while (nsec--) {
 		/*
 		 * osp is the sector's current value.  Whatever is left in
 		 * it at the bottom of the loop, if it is a buffer, has
 		 * been replaced in the tree and is freed there.
 		 */
 		osp = s_read(sc->indir, secno);
 		if (bp->bio_cmd == BIO_DELETE) {
 			if (osp != 0)
 				error = s_write(sc->indir, secno, 0);
 		} else if (bp->bio_cmd == BIO_READ) {
 			if (osp == 0)
 				bzero(dst, sc->sectorsize);
 			else if (osp <= 255)
 				for (i = 0; i < sc->sectorsize; i++)
 					dst[i] = osp;
 			else
 				bcopy((void *)osp, dst, sc->sectorsize);
 			osp = 0;	/* still referenced by the tree */
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (sc->flags & MD_COMPRESS) {
 				/* i == sectorsize iff all bytes equal uc */
 				uc = dst[0];
 				for (i = 1; i < sc->sectorsize; i++)
 					if (dst[i] != uc)
 						break;
 			} else {
 				i = 0;
 				uc = 0;
 			}
 			if (i == sc->sectorsize) {
 				if (osp != uc)
 					error = s_write(sc->indir, secno, uc);
 			} else {
 				if (osp <= 255) {
 					sp = (uintptr_t)uma_zalloc(sc->uma,
 					    M_NOWAIT);
 					if (sp == 0) {
 						error = ENOSPC;
 						break;
 					}
 					bcopy(dst, (void *)sp, sc->sectorsize);
 					error = s_write(sc->indir, secno, sp);
 					/*
 					 * A failed s_write() never reached
 					 * the leaf, so nothing references
 					 * the new buffer: release it rather
 					 * than leak it.
 					 */
 					if (error != 0)
 						uma_zfree(sc->uma, (void *)sp);
 				} else {
 					/* overwrite the existing buffer */
 					bcopy(dst, (void *)osp, sc->sectorsize);
 					osp = 0;
 				}
 			}
 		} else {
 			error = EOPNOTSUPP;
 		}
 		if (osp > 255)
 			uma_zfree(sc->uma, (void*)osp);
 		if (error != 0)
 			break;
 		secno++;
 		dst += sc->sectorsize;
 	}
 	bp->bio_resid = 0;
 	return (error);
 }
 
 /*
  * I/O handler for preloaded units: reads and writes are plain memory
  * copies between the request buffer and the image at pl_ptr.  Any
  * other command copies nothing.  The request is always reported as
  * fully completed and 0 is returned.
  */
 static int
 mdstart_preload(struct md_s *sc, struct bio *bp)
 {
 
 	if (bp->bio_cmd == BIO_READ) {
 		bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data,
 		    bp->bio_length);
 	} else if (bp->bio_cmd == BIO_WRITE) {
 		bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset,
 		    bp->bio_length);
 	}
 
 	bp->bio_resid = 0;
 	return (0);
 }
 
 /*
  * I/O handler for vnode-backed units.
  *
  * BIO_FLUSH is carried out as a synchronous VOP_FSYNC of the backing
  * vnode.  Reads and writes go through VOP_READ/VOP_WRITE using a
  * single-segment, kernel-space uio that describes the request buffer.
  * Any other command is rejected with EOPNOTSUPP.  Returns the error
  * from the vnode operation; for reads and writes bio_resid is set to
  * the part of the request that was not transferred.
  */
 static int
 mdstart_vnode(struct md_s *sc, struct bio *bp)
 {
 	int error, vfslocked;
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct vnode *vp;
 	struct thread *td;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_FLUSH:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	td = curthread;
 	vp = sc->vnode;
 
 	/*
 	 * VNODE I/O
 	 *
 	 * If an error occurs, we set BIO_ERROR but we do not set
 	 * B_INVAL because (for a write anyway), the buffer is
 	 * still valid.
 	 */
 
 	if (bp->bio_cmd == BIO_FLUSH) {
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		(void) vn_start_write(vp, &mp, V_WAIT);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(vp, MNT_WAIT, td);
 		VOP_UNLOCK(vp, 0, td);
 		vn_finished_write(mp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 
 	bzero(&auio, sizeof(auio));
 
 	/* one iovec covering the whole request buffer */
 	aiov.iov_base = bp->bio_data;
 	aiov.iov_len = bp->bio_length;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
 	auio.uio_segflg = UIO_SYSSPACE;
 	if (bp->bio_cmd == BIO_READ)
 		auio.uio_rw = UIO_READ;
 	else if (bp->bio_cmd == BIO_WRITE)
 		auio.uio_rw = UIO_WRITE;
 	else
 		panic("wrong BIO_OP in mdstart_vnode");
 	auio.uio_resid = bp->bio_length;
 	auio.uio_td = td;
 	/*
 	 * When reading set IO_DIRECT to try to avoid double-caching
 	 * the data.  When writing IO_DIRECT is not optimal.
 	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (bp->bio_cmd == BIO_READ) {
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
 		VOP_UNLOCK(vp, 0, td);
 	} else {
 		/* writes are synchronous unless the unit is MD_ASYNC */
 		(void) vn_start_write(vp, &mp, V_WAIT);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
 		    sc->cred);
 		VOP_UNLOCK(vp, 0, td);
 		vn_finished_write(mp);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	bp->bio_resid = auio.uio_resid;
 	return (error);
 }
 
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
 	struct sf_buf *sf;
 	int rv, offs, len, lastend;
 	vm_pindex_t i, lastp;
 	vm_page_t m;
 	u_char *p;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	p = bp->bio_data;
 
 	/*
 	 * offs is the offset at which to start operating on the
 	 * next (ie, first) page.  lastp is the last page on
 	 * which we're going to operate.  lastend is the ending
 	 * position within that last page (ie, PAGE_SIZE if
 	 * we're operating on complete aligned pages).
 	 */
 	offs = bp->bio_offset % PAGE_SIZE;
 	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 
 	rv = VM_PAGER_OK;
 	VM_OBJECT_LOCK(sc->object);
 	vm_object_pip_add(sc->object, 1);
 	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 
 		m = vm_page_grab(sc->object, i,
 		    VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 		VM_OBJECT_UNLOCK(sc->object);
 		sched_pin();
 		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 		VM_OBJECT_LOCK(sc->object);
 		if (bp->bio_cmd == BIO_READ) {
 			if (m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
 				sf_buf_free(sf);
 				sched_unpin();
 				vm_page_lock_queues();
 				vm_page_wakeup(m);
 				vm_page_unlock_queues();
 				break;
 			}
 			bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
 				sf_buf_free(sf);
 				sched_unpin();
 				vm_page_lock_queues();
 				vm_page_wakeup(m);
 				vm_page_unlock_queues();
 				break;
 			}
 			bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
 			m->valid = VM_PAGE_BITS_ALL;
 #if 0
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
 				sf_buf_free(sf);
 				sched_unpin();
 				vm_page_lock_queues();
 				vm_page_wakeup(m);
 				vm_page_unlock_queues();
 				break;
 			}
 			bzero((void *)(sf_buf_kva(sf) + offs), len);
 			vm_page_dirty(m);
 			m->valid = VM_PAGE_BITS_ALL;
 #endif
 		}
 		sf_buf_free(sf);
 		sched_unpin();
 		vm_page_lock_queues();
 		vm_page_wakeup(m);
 		vm_page_activate(m);
 		if (bp->bio_cmd == BIO_WRITE)
 			vm_page_dirty(m);
 		vm_page_unlock_queues();
 
 		/* Actions on further pages start at offset 0 */
 		p += PAGE_SIZE - offs;
 		offs = 0;
 #if 0
 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
 printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
     m->wire_count, m->busy, 
     m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
 #endif
 	}
 	vm_object_pip_subtract(sc->object, 1);
 	vm_object_set_writeable_dirty(sc->object);
 	VM_OBJECT_UNLOCK(sc->object);
 	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 }
 
 static void
 md_kthread(void *arg)
 {
 	struct md_s *sc;
 	struct bio *bp;
 	int error;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 	if (sc->type == MD_VNODE)
 		curthread->td_pflags |= TDP_NORUNNINGBUF;
 
 	for (;;) {
 		mtx_lock(&sc->queue_mtx);
 		if (sc->flags & MD_SHUTDOWN) {
 			sc->flags |= MD_EXITING;
 			mtx_unlock(&sc->queue_mtx);
 			kproc_exit(0);
 		}
 		bp = bioq_takefirst(&sc->bio_queue);
 		if (!bp) {
 			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 			continue;
 		}
 		mtx_unlock(&sc->queue_mtx);
 		if (bp->bio_cmd == BIO_GETATTR) {
 			if (sc->fwsectors && sc->fwheads &&
 			    (g_handleattr_int(bp, "GEOM::fwsectors",
 			    sc->fwsectors) ||
 			    g_handleattr_int(bp, "GEOM::fwheads",
 			    sc->fwheads)))
 				error = -1;
 			else
 				error = EOPNOTSUPP;
 		} else {
 			error = sc->start(sc, bp);
 		}
 
 		if (error != -1) {
 			bp->bio_completed = bp->bio_length;
 			g_io_deliver(bp, error);
 			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
 				devstat_end_transaction_bio(sc->devstat, bp);
 		}
 	}
 }
 
 static struct md_s *
 mdfind(int unit)
 {
 	struct md_s *sc;
 
 	LIST_FOREACH(sc, &md_softc_list, list) {
 		if (sc->unit == unit)
 			break;
 	}
 	return (sc);
 }
 
 static struct md_s *
 mdnew(int unit, int *errp, enum md_types type)
 {
 	struct md_s *sc, *sc2;
 	int error, max = -1;
 
 	*errp = 0;
 	LIST_FOREACH(sc2, &md_softc_list, list) {
 		if (unit == sc2->unit) {
 			*errp = EBUSY;
 			return (NULL);
 		}
 		if (unit == -1 && sc2->unit > max) 
 			max = sc2->unit;
 	}
 	if (unit == -1)
 		unit = max + 1;
 	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 	sc->type = type;
 	bioq_init(&sc->bio_queue);
 	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 	sc->unit = unit;
 	sprintf(sc->name, "md%d", unit);
 	LIST_INSERT_HEAD(&md_softc_list, sc, list);
 	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
 	if (error == 0)
 		return (sc);
 	LIST_REMOVE(sc, list);
 	mtx_destroy(&sc->queue_mtx);
 	free(sc, M_MD);
 	*errp = error;
 	return (NULL);
 }
 
 static void
 mdinit(struct md_s *sc)
 {
 
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_lock();
 	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 	gp->softc = sc;
 	pp = g_new_providerf(gp, "md%d", sc->unit);
 	pp->mediasize = sc->mediasize;
 	pp->sectorsize = sc->sectorsize;
 	sc->gp = gp;
 	sc->pp = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 }
 
 /*
  * XXX: we should check that the range they feed us is mapped.
  * XXX: we should implement read-only.
  */
 
 static int
 mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio)
 {
 
 	if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE))
 		return (EINVAL);
 	sc->flags = mdio->md_options & MD_FORCE;
 	/* Cast to pointer size, then to pointer to avoid warning */
 	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
 	sc->pl_len = (size_t)sc->mediasize;
 	return (0);
 }
 
 
 static int
 mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
 {
 	uintptr_t sp;
 	int error;
 	off_t u;
 
 	error = 0;
 	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 		return (EINVAL);
 	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
 		return (EINVAL);
 	/* Compression doesn't make sense if we have reserved space */
 	if (mdio->md_options & MD_RESERVE)
 		mdio->md_options &= ~MD_COMPRESS;
 	if (mdio->md_fwsectors != 0)
 		sc->fwsectors = mdio->md_fwsectors;
 	if (mdio->md_fwheads != 0)
 		sc->fwheads = mdio->md_fwheads;
 	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
 	sc->indir = dimension(sc->mediasize / sc->sectorsize);
 	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 	    0x1ff, 0);
 	if (mdio->md_options & MD_RESERVE) {
 		off_t nsectors;
 
 		nsectors = sc->mediasize / sc->sectorsize;
 		for (u = 0; u < nsectors; u++) {
 			sp = (uintptr_t)uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
 			if (sp != 0)
 				error = s_write(sc->indir, u, sp);
 			else
 				error = ENOMEM;
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 
 static int
 mdsetcred(struct md_s *sc, struct ucred *cred)
 {
 	char *tmpbuf;
 	int error = 0;
 
 	/*
 	 * Set credits in our softc
 	 */
 
 	if (sc->cred)
 		crfree(sc->cred);
 	sc->cred = crhold(cred);
 
 	/*
 	 * Horrible kludge to establish credentials for NFS  XXX.
 	 */
 
 	if (sc->vnode) {
 		struct uio auio;
 		struct iovec aiov;
 
 		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 		bzero(&auio, sizeof(auio));
 
 		aiov.iov_base = tmpbuf;
 		aiov.iov_len = sc->sectorsize;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_resid = aiov.iov_len;
-		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 		VOP_UNLOCK(sc->vnode, 0, curthread);
 		free(tmpbuf, M_TEMP);
 	}
 	return (error);
 }
 
 static int
 mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
 {
 	struct vattr vattr;
 	struct nameidata nd;
 	int error, flags, vfslocked;
 
 	error = copyinstr(mdio->md_file, sc->file, sizeof(sc->file), NULL);
 	if (error != 0)
 		return (error);
 	flags = FREAD|FWRITE;
 	/*
 	 * If the user specified that this is a read only device, unset the
 	 * FWRITE mask before trying to open the backing store.
 	 */
 	if ((mdio->md_options & MD_READONLY) != 0)
 		flags &= ~FWRITE;
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, sc->file, td);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG ||
 	    (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error ? error : EINVAL);
 	}
 	nd.ni_vp->v_vflag |= VV_MD;
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 
 	if (mdio->md_fwsectors != 0)
 		sc->fwsectors = mdio->md_fwsectors;
 	if (mdio->md_fwheads != 0)
 		sc->fwheads = mdio->md_fwheads;
 	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
 	if (!(flags & FWRITE))
 		sc->flags |= MD_READONLY;
 	sc->vnode = nd.ni_vp;
 
 	error = mdsetcred(sc, td->td_ucred);
 	if (error != 0) {
-		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 		nd.ni_vp->v_vflag &= ~VV_MD;
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 static int
 mddestroy(struct md_s *sc, struct thread *td)
 {
 	int vfslocked;
 
 	if (sc->gp) {
 		sc->gp->softc = NULL;
 		g_topology_lock();
 		g_wither_geom(sc->gp, ENXIO);
 		g_topology_unlock();
 		sc->gp = NULL;
 		sc->pp = NULL;
 	}
 	if (sc->devstat) {
 		devstat_remove_entry(sc->devstat);
 		sc->devstat = NULL;
 	}
 	mtx_lock(&sc->queue_mtx);
 	sc->flags |= MD_SHUTDOWN;
 	wakeup(sc);
 	while (!(sc->flags & MD_EXITING))
 		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 	mtx_unlock(&sc->queue_mtx);
 	mtx_destroy(&sc->queue_mtx);
 	if (sc->vnode != NULL) {
 		vfslocked = VFS_LOCK_GIANT(sc->vnode->v_mount);
-		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 		sc->vnode->v_vflag &= ~VV_MD;
 		VOP_UNLOCK(sc->vnode, 0, td);
 		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 		    FREAD : (FREAD|FWRITE), sc->cred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (sc->cred != NULL)
 		crfree(sc->cred);
 	if (sc->object != NULL)
 		vm_object_deallocate(sc->object);
 	if (sc->indir)
 		destroy_indir(sc, sc->indir);
 	if (sc->uma)
 		uma_zdestroy(sc->uma);
 
 	LIST_REMOVE(sc, list);
 	free(sc, M_MD);
 	return (0);
 }
 
 static int
 mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
 {
 	vm_ooffset_t npage;
 	int error;
 
 	/*
 	 * Range check.  Disallow negative sizes or any size less then the
 	 * size of a page.  Then round to a page.
 	 */
 	if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0)
 		return (EDOM);
 
 	/*
 	 * Allocate an OBJT_SWAP object.
 	 *
 	 * Note the truncation.
 	 */
 
 	npage = mdio->md_mediasize / PAGE_SIZE;
 	if (mdio->md_fwsectors != 0)
 		sc->fwsectors = mdio->md_fwsectors;
 	if (mdio->md_fwheads != 0)
 		sc->fwheads = mdio->md_fwheads;
 	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 	    VM_PROT_DEFAULT, 0);
 	if (sc->object == NULL)
 		return (ENOMEM);
 	sc->flags = mdio->md_options & MD_FORCE;
 	if (mdio->md_options & MD_RESERVE) {
 		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 			vm_object_deallocate(sc->object);
 			sc->object = NULL;
 			return (EDOM);
 		}
 	}
 	error = mdsetcred(sc, td->td_ucred);
 	if (error != 0) {
 		vm_object_deallocate(sc->object);
 		sc->object = NULL;
 	}
 	return (error);
 }
 
 
 static int
 xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 {
 	struct md_ioctl *mdio;
 	struct md_s *sc;
 	int error, i;
 
 	if (md_debug)
 		printf("mdctlioctl(%s %lx %p %x %p)\n",
 			devtoname(dev), cmd, addr, flags, td);
 
 	mdio = (struct md_ioctl *)addr;
 	if (mdio->md_version != MDIOVERSION)
 		return (EINVAL);
 
 	/*
 	 * We assert the version number in the individual ioctl
 	 * handlers instead of out here because (a) it is possible we
 	 * may add another ioctl in the future which doesn't read an
 	 * mdio, and (b) the correct return value for an unknown ioctl
 	 * is ENOIOCTL, not EINVAL.
 	 */
 	error = 0;
 	switch (cmd) {
 	case MDIOCATTACH:
 		switch (mdio->md_type) {
 		case MD_MALLOC:
 		case MD_PRELOAD:
 		case MD_VNODE:
 		case MD_SWAP:
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (mdio->md_options & MD_AUTOUNIT)
 			sc = mdnew(-1, &error, mdio->md_type);
 		else
 			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
 		if (sc == NULL)
 			return (error);
 		if (mdio->md_options & MD_AUTOUNIT)
 			mdio->md_unit = sc->unit;
 		sc->mediasize = mdio->md_mediasize;
 		if (mdio->md_sectorsize == 0)
 			sc->sectorsize = DEV_BSIZE;
 		else
 			sc->sectorsize = mdio->md_sectorsize;
 		error = EDOOFUS;
 		switch (sc->type) {
 		case MD_MALLOC:
 			sc->start = mdstart_malloc;
 			error = mdcreate_malloc(sc, mdio);
 			break;
 		case MD_PRELOAD:
 			sc->start = mdstart_preload;
 			error = mdcreate_preload(sc, mdio);
 			break;
 		case MD_VNODE:
 			sc->start = mdstart_vnode;
 			error = mdcreate_vnode(sc, mdio, td);
 			break;
 		case MD_SWAP:
 			sc->start = mdstart_swap;
 			error = mdcreate_swap(sc, mdio, td);
 			break;
 		}
 		if (error != 0) {
 			mddestroy(sc, td);
 			return (error);
 		}
 
 		/* Prune off any residual fractional sector */
 		i = sc->mediasize % sc->sectorsize;
 		sc->mediasize -= i;
 
 		mdinit(sc);
 		return (0);
 	case MDIOCDETACH:
 		if (mdio->md_mediasize != 0 || mdio->md_options != 0)
 			return (EINVAL);
 
 		sc = mdfind(mdio->md_unit);
 		if (sc == NULL)
 			return (ENOENT);
 		if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
 			return (EBUSY);
 		return (mddestroy(sc, td));
 	case MDIOCQUERY:
 		sc = mdfind(mdio->md_unit);
 		if (sc == NULL)
 			return (ENOENT);
 		mdio->md_type = sc->type;
 		mdio->md_options = sc->flags;
 		mdio->md_mediasize = sc->mediasize;
 		mdio->md_sectorsize = sc->sectorsize;
 		if (sc->type == MD_VNODE)
 			error = copyout(sc->file, mdio->md_file,
 			    strlen(sc->file) + 1);
 		return (error);
 	case MDIOCLIST:
 		i = 1;
 		LIST_FOREACH(sc, &md_softc_list, list) {
 			if (i == MDNPAD - 1)
 				mdio->md_pad[i] = -1;
 			else
 				mdio->md_pad[i++] = sc->unit;
 		}
 		mdio->md_pad[0] = i - 1;
 		return (0);
 	default:
 		return (ENOIOCTL);
 	};
 }
 
 static int
 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 {
 	int error; 
 
 	sx_xlock(&md_sx);
 	error = xmdctlioctl(dev, cmd, addr, flags, td);
 	sx_xunlock(&md_sx);
 	return (error);
 }
 
 static void
 md_preloaded(u_char *image, size_t length)
 {
 	struct md_s *sc;
 	int error;
 
 	sc = mdnew(-1, &error, MD_PRELOAD);
 	if (sc == NULL)
 		return;
 	sc->mediasize = length;
 	sc->sectorsize = DEV_BSIZE;
 	sc->pl_ptr = image;
 	sc->pl_len = length;
 	sc->start = mdstart_preload;
 #ifdef MD_ROOT
 	if (sc->unit == 0)
 		rootdevnames[0] = "ufs:/dev/md0";
 #endif
 	mdinit(sc);
 }
 
 static void
 g_md_init(struct g_class *mp __unused)
 {
 
 	caddr_t mod;
 	caddr_t c;
 	u_char *ptr, *name, *type;
 	unsigned len;
 
 	mod = NULL;
 	sx_init(&md_sx, "MD config lock");
 	g_topology_unlock();
 #ifdef MD_ROOT_SIZE
 	sx_xlock(&md_sx);
 	md_preloaded(mfs_root.start, sizeof(mfs_root.start));
 	sx_xunlock(&md_sx);
 #endif
 	/* XXX: are preload_* static or do they need Giant ? */
 	while ((mod = preload_search_next_name(mod)) != NULL) {
 		name = (char *)preload_search_info(mod, MODINFO_NAME);
 		if (name == NULL)
 			continue;
 		type = (char *)preload_search_info(mod, MODINFO_TYPE);
 		if (type == NULL)
 			continue;
 		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 			continue;
 		c = preload_search_info(mod, MODINFO_ADDR);
 		ptr = *(u_char **)c;
 		c = preload_search_info(mod, MODINFO_SIZE);
 		len = *(size_t *)c;
 		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
 		    MD_NAME, mdunits, name, len, ptr);
 		sx_xlock(&md_sx);
 		md_preloaded(ptr, len);
 		sx_xunlock(&md_sx);
 	}
 	status_dev = make_dev(&mdctl_cdevsw, MAXMINOR, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
 }
 
 static void
 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 
     struct g_consumer *cp __unused, struct g_provider *pp)
 {
 	struct md_s *mp;
 	char *type;
 
 	mp = gp->softc;
 	if (mp == NULL)
 		return;
 
 	switch (mp->type) {
 	case MD_MALLOC:
 		type = "malloc";
 		break;
 	case MD_PRELOAD:
 		type = "preload";
 		break;
 	case MD_VNODE:
 		type = "vnode";
 		break;
 	case MD_SWAP:
 		type = "swap";
 		break;
 	default:
 		type = "unknown";
 		break;
 	}
 
 	if (pp != NULL) {
 		if (indent == NULL) {
 			sbuf_printf(sb, " u %d", mp->unit);
 			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, " t %s", type);
 			if (mp->type == MD_VNODE && mp->vnode != NULL)
 				sbuf_printf(sb, " file %s", mp->file);
 		} else {
 			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 			    mp->unit);
 			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 			    indent, (uintmax_t) mp->sectorsize);
 			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 			    indent, (uintmax_t) mp->fwheads);
 			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 			    indent, (uintmax_t) mp->fwsectors);
 			sbuf_printf(sb, "%s<length>%ju</length>\n",
 			    indent, (uintmax_t) mp->mediasize);
 			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 			    type);
 			if (mp->type == MD_VNODE && mp->vnode != NULL)
 				sbuf_printf(sb, "%s<file>%s</file>\n",
 				    indent, mp->file);
 		}
 	}
 }
 
 static void
 g_md_fini(struct g_class *mp __unused)
 {
 
 	sx_destroy(&md_sx);
 	if (status_dev != NULL)
 		destroy_dev(status_dev);
 }
Index: head/sys/fs/cd9660/cd9660_lookup.c
===================================================================
--- head/sys/fs/cd9660/cd9660_lookup.c	(revision 175201)
+++ head/sys/fs/cd9660/cd9660_lookup.c	(revision 175202)
@@ -1,430 +1,430 @@
 /*-
  * Copyright (c) 1989, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley
  * by Pace Willisson (pace@blitz.com).  The Rock Ridge Extension
  * Support code is derived from software contributed to Berkeley
  * by Atsushi Murai (amurai@spec.co.jp).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)ufs_lookup.c	7.33 (Berkeley) 5/19/91
  *	@(#)cd9660_lookup.c	8.2 (Berkeley) 1/23/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 
 #include <fs/cd9660/iso.h>
 #include <fs/cd9660/cd9660_node.h>
 #include <fs/cd9660/iso_rrip.h>
 
 /*
  * Convert a component of a pathname into a pointer to a locked inode.
  * This is a very central and rather complicated routine.
  * If the filesystem is not maintained in a strict tree hierarchy,
  * this can result in a deadlock situation (see comments in code below).
  *
  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
  * whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it and the target of the pathname
  * exists, lookup returns both the target and its parent directory locked.
  * When creating or renaming and LOCKPARENT is specified, the target may
  * not be ".".  When deleting and LOCKPARENT is specified, the target may
  * be "."., but the caller must check to ensure it does an vrele and iput
  * instead of two iputs.
  *
  * Overall outline of ufs_lookup:
  *
  *	search for name in directory, to found or notfound
  * notfound:
  *	if creating, return locked directory, leaving info on available slots
  *	else return error
  * found:
  *	if at end of path and deleting, return information to allow delete
  *	if at end of path and rewriting (RENAME and LOCKPARENT), lock target
  *	  inode and return info to allow rewrite
  *	if not at end, add name to cache; if at end and neither creating
  *	  nor deleting, add name to cache
  *
  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked.
  */
 int
 cd9660_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vdp;		/* vnode for directory being searched */
 	struct iso_node *dp;		/* inode for directory being searched */
 	struct iso_mnt *imp;		/* filesystem that directory is in */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct iso_directory_record *ep = 0;/* the current directory entry */
 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
 	int saveoffset = 0;		/* offset of last directory entry in dir */
 	int numdirpasses;		/* strategy for directory search */
 	doff_t endsearch;		/* offset to end directory search */
 	struct vnode *pdp;		/* saved dp during symlink work */
 	struct vnode *tdp;		/* returned by cd9660_vget_internal */
 	u_long bmask;			/* block offset mask */
 	int error;
 	ino_t ino = 0, saved_ino;
 	int reclen;
 	u_short namelen;
 	int isoflags;
 	char altname[NAME_MAX];
 	int res;
 	int assoc, len;
 	char *name;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	struct thread *td = cnp->cn_thread;
 
 	bp = NULL;
 	*vpp = NULL;
 	vdp = ap->a_dvp;
 	dp = VTOI(vdp);
 	imp = dp->i_mnt;
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 
 	len = cnp->cn_namelen;
 	name = cnp->cn_nameptr;
 	/*
 	 * A leading `=' means, we are looking for an associated file
 	 */
 	if ((assoc = (imp->iso_ftype != ISO_FTYPE_RRIP && *name == ASSOCCHAR)))
 	{
 		len--;
 		name++;
 	}
 
 	/*
 	 * If there is cached information on a previous search of
 	 * this directory, pick up where we last left off.
 	 * We cache only lookups as these are the most common
 	 * and have the greatest payoff. Caching CREATE has little
 	 * benefit as it usually must search the entire directory
 	 * to determine that the entry does not exist. Caching the
 	 * location of the last DELETE or RENAME has not reduced
 	 * profiling time and hence has been removed in the interest
 	 * of simplicity.
 	 */
 	bmask = imp->im_bmask;
 	if (nameiop != LOOKUP || dp->i_diroff == 0 ||
 	    dp->i_diroff > dp->i_size) {
 		entryoffsetinblock = 0;
 		dp->i_offset = 0;
 		numdirpasses = 1;
 	} else {
 		dp->i_offset = dp->i_diroff;
 		if ((entryoffsetinblock = dp->i_offset & bmask) &&
 		    (error = cd9660_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp)))
 				return (error);
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 	endsearch = dp->i_size;
 
 searchloop:
 	while (dp->i_offset < endsearch) {
 		/*
 		 * If offset is on a block boundary,
 		 * read the next directory block.
 		 * Release previous if it exists.
 		 */
 		if ((dp->i_offset & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			if ((error =
 			    cd9660_blkatoff(vdp, (off_t)dp->i_offset, NULL, &bp)) != 0)
 				return (error);
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 */
 		ep = (struct iso_directory_record *)
 			((char *)bp->b_data + entryoffsetinblock);
 
 		reclen = isonum_711(ep->length);
 		if (reclen == 0) {
 			/* skip to next block, if any */
 			dp->i_offset =
 			    (dp->i_offset & ~bmask) + imp->logical_block_size;
 			continue;
 		}
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE)
 			/* illegal entry, stop */
 			break;
 
 		if (entryoffsetinblock + reclen > imp->logical_block_size)
 			/* entries are not allowed to cross boundaries */
 			break;
 
 		namelen = isonum_711(ep->name_len);
 		isoflags = isonum_711(imp->iso_ftype == ISO_FTYPE_HIGH_SIERRA?
 				      &ep->date[6]: ep->flags);
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen)
 			/* illegal entry, stop */
 			break;
 
 		/*
 		 * Check for a name match.
 		 */
 		switch (imp->iso_ftype) {
 		default:
 			if (!(isoflags & 4) == !assoc) {
 				if ((len == 1
 				     && *name == '.')
 				    || (flags & ISDOTDOT)) {
 					if (namelen == 1
 					    && ep->name[0] == ((flags & ISDOTDOT) ? 1 : 0)) {
 						/*
 						 * Save directory entry's inode number and
 						 * release directory buffer.
 						 */
 						dp->i_ino = isodirino(ep, imp);
 						goto found;
 					}
 					if (namelen != 1
 					    || ep->name[0] != 0)
 						goto notfound;
 				} else if (!(res = isofncmp(name, len,
 							    ep->name, namelen,
 							    imp->joliet_level,
 							    imp->im_flags,
 							    imp->im_d2l,
 							    imp->im_l2d))) {
 					if (isoflags & 2)
 						ino = isodirino(ep, imp);
 					else
 						ino = dbtob(bp->b_blkno)
 							+ entryoffsetinblock;
 					saveoffset = dp->i_offset;
 				} else if (ino)
 					goto foundino;
 #ifdef	NOSORTBUG	/* On some CDs directory entries are not sorted correctly */
 				else if (res < 0)
 					goto notfound;
 				else if (res > 0 && numdirpasses == 2)
 					numdirpasses++;
 #endif
 			}
 			break;
 		case ISO_FTYPE_RRIP:
 			if (isonum_711(ep->flags)&2)
 				ino = isodirino(ep, imp);
 			else
 				ino = dbtob(bp->b_blkno) + entryoffsetinblock;
 			dp->i_ino = ino;
 			cd9660_rrip_getname(ep,altname,&namelen,&dp->i_ino,imp);
 			if (namelen == cnp->cn_namelen
 			    && !bcmp(name,altname,namelen))
 				goto found;
 			ino = 0;
 			break;
 		}
 		dp->i_offset += reclen;
 		entryoffsetinblock += reclen;
 	}
 	if (ino) {
 foundino:
 		dp->i_ino = ino;
 		if (saveoffset != dp->i_offset) {
 			if (lblkno(imp, dp->i_offset) !=
 			    lblkno(imp, saveoffset)) {
 				if (bp != NULL)
 					brelse(bp);
 				if ((error = cd9660_blkatoff(vdp,
 				    (off_t)saveoffset, NULL, &bp)) != 0)
 					return (error);
 			}
 			entryoffsetinblock = saveoffset & bmask;
 			ep = (struct iso_directory_record *)
 				((char *)bp->b_data + entryoffsetinblock);
 			dp->i_offset = saveoffset;
 		}
 		goto found;
 	}
 notfound:
 	/*
 	 * If we started in the middle of the directory and failed
 	 * to find our target, we must check the beginning as well.
 	 */
 	if (numdirpasses == 2) {
 		numdirpasses--;
 		dp->i_offset = 0;
 		endsearch = dp->i_diroff;
 		goto searchloop;
 	}
 	if (bp != NULL)
 		brelse(bp);
 
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	if (nameiop == CREATE || nameiop == RENAME)
 		return (EROFS);
 	return (ENOENT);
 
 found:
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 
 	/*
 	 * Found component in pathname.
 	 * If the final component of path name, save information
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
 		dp->i_diroff = dp->i_offset;
 
 	/*
 	 * Step through the translation in the name.  We do not `iput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to insure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 * There is a potential race condition here if both the current
 	 * and parent directories are removed before the `iget' for the
 	 * inode associated with ".." returns.  We hope that this occurs
 	 * infrequently since we cannot avoid this race condition without
 	 * implementing a sophisticated deadlock detection algorithm.
 	 * Note also that this simple deadlock detection scheme will not
 	 * work if the filesystem has any hard links other than ".."
 	 * that point backwards in the directory structure.
 	 */
 	pdp = vdp;
 	/*
 	 * If ino is different from dp->i_ino,
 	 * it's a relocated directory.
 	 */
 	if (flags & ISDOTDOT) {
 		saved_ino = dp->i_ino;
 		VOP_UNLOCK(pdp, 0, td);	/* race to get the inode */
 		error = cd9660_vget_internal(vdp->v_mount, saved_ino,
 					     LK_EXCLUSIVE, &tdp,
 					     saved_ino != ino, ep);
 		brelse(bp);
-		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
 		if (error)
 			return (error);
 		*vpp = tdp;
 	} else if (dp->i_number == dp->i_ino) {
 		brelse(bp);
 		VREF(vdp);	/* we want ourself, ie "." */
 		*vpp = vdp;
 	} else {
 		error = cd9660_vget_internal(vdp->v_mount, dp->i_ino,
 					     LK_EXCLUSIVE, &tdp,
 					     dp->i_ino != ino, ep);
 		brelse(bp);
 		if (error)
 			return (error);
 		*vpp = tdp;
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 /*
  * Return buffer with the contents of block "offset" from the beginning of
  * directory "ip".  If "res" is non-zero, fill it in with a pointer to the
  * remaining space in the directory.
  */
 int
 cd9660_blkatoff(vp, offset, res, bpp)
 	struct vnode *vp;
 	off_t offset;
 	char **res;
 	struct buf **bpp;
 {
 	struct iso_node *ip;
 	struct iso_mnt *imp;
 	struct buf *bp;
 	daddr_t lbn;
 	int bsize, bshift, error;
 
 	ip = VTOI(vp);
 	imp = ip->i_mnt;
 	lbn = lblkno(imp, offset);
 	bsize = blksize(imp, ip, lbn);
 	bshift = imp->im_bshift;
 
 	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 		brelse(bp);
 		*bpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * We must BMAP the buffer because the directory code may use b_blkno
 	 * to calculate the inode for certain types of directory entries.
 	 * We could get away with not doing it before we VMIO-backed the
 	 * directories because the buffers would get freed atomically with
 	 * the invalidation of their data.  But with VMIO-backed buffers
 	 * the buffers may be freed and then later reconstituted - and the
 	 * reconstituted buffer will have no knowledge of b_blkno.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 	        bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (bshift - DEV_BSHIFT);
         }
 
 	if (res)
 		*res = (char *)bp->b_data + blkoff(imp, offset);
 	*bpp = bp;
 	return (0);
 }
Index: head/sys/fs/cd9660/cd9660_vfsops.c
===================================================================
--- head/sys/fs/cd9660/cd9660_vfsops.c	(revision 175201)
+++ head/sys/fs/cd9660/cd9660_vfsops.c	(revision 175202)
@@ -1,823 +1,823 @@
 /*-
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley
  * by Pace Willisson (pace@blitz.com).  The Rock Ridge Extension
  * Support code is derived from software contributed to Berkeley
  * by Atsushi Murai (amurai@spec.co.jp).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vfsops.c	8.18 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/cdio.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/syslog.h>
 #include <sys/iconv.h>
 
 #include <fs/cd9660/iso.h>
 #include <fs/cd9660/iso_rrip.h>
 #include <fs/cd9660/cd9660_node.h>
 #include <fs/cd9660/cd9660_mount.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure");
 MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part");
 
 struct iconv_functions *cd9660_iconv = NULL;
 
 static vfs_mount_t	cd9660_mount;
 static vfs_cmount_t	cd9660_cmount;
 static vfs_unmount_t	cd9660_unmount;
 static vfs_root_t	cd9660_root;
 static vfs_statfs_t	cd9660_statfs;
 static vfs_vget_t	cd9660_vget;
 static vfs_fhtovp_t	cd9660_fhtovp;
 
 static struct vfsops cd9660_vfsops = {
 	.vfs_fhtovp =		cd9660_fhtovp,
 	.vfs_mount =		cd9660_mount,
 	.vfs_cmount =		cd9660_cmount,
 	.vfs_root =		cd9660_root,
 	.vfs_statfs =		cd9660_statfs,
 	.vfs_unmount =		cd9660_unmount,
 	.vfs_vget =		cd9660_vget,
 };
 VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY);
 MODULE_VERSION(cd9660, 1);
 
 static int iso_mountfs(struct vnode *devvp, struct mount *mp,
 		       struct thread *td);
 
 /*
  * VFS Operations.
  */
 
 static int
 cd9660_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
 {
 	struct iso_args args;
 	int error;
 
 	error = copyin(data, &args, sizeof args);
 	if (error)
 		return (error);
 
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
 	ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64);
 	ma = mount_argsu(ma, "cs_local", args.cs_local, 64);
 	ma = mount_argf(ma, "ssector", "%u", args.ssector);
 	ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip");
 	ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens");
 	ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt");
 	ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet");
 	ma = mount_argb(ma,
 	    args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet");
 	ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv");
 
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 static int
 cd9660_mount(struct mount *mp, struct thread *td)
 {
 	struct vnode *devvp;
 	char *fspec;
 	int error;
 	mode_t accessmode;
 	struct nameidata ndp;
 	struct iso_mnt *imp = 0;
 
 	/*
 	 * Unconditionally mount as read-only.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_RDONLY;
 	MNT_IUNLOCK(mp);
 
 	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)
 		return (error);
 
 	imp = VFSTOISOFS(mp);
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
 			return (0);
 	}
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td);
 	if ((error = namei(&ndp)))
 		return (error);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 
 	if (!vn_isdisk(devvp, &error)) {
 		vrele(devvp);
 		return (error);
 	}
 
 	/*
 	 * Verify that user has necessary permissions on the device,
 	 * or has superuser abilities
 	 */
 	accessmode = VREAD;
-	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 	VOP_UNLOCK(devvp, 0, td);
 
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		error = iso_mountfs(devvp, mp, td);
 	} else {
 		if (devvp != imp->im_devvp)
 			error = EINVAL;	/* needs translation */
 		else
 			vrele(devvp);
 	}
 	if (error) {
 		vrele(devvp);
 		return error;
 	}
 	vfs_mountedfrom(mp, fspec);
 	return 0;
 }
 
 /*
  * Common code for mount and mountroot
  */
 static int
 iso_mountfs(devvp, mp, td)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct thread *td;
 {
 	struct iso_mnt *isomp = (struct iso_mnt *)0;
 	struct buf *bp = NULL;
 	struct buf *pribp = NULL, *supbp = NULL;
 	struct cdev *dev = devvp->v_rdev;
 	int error = EINVAL;
 	int high_sierra = 0;
 	int iso_bsize;
 	int iso_blknum;
 	int joliet_level;
 	struct iso_volume_descriptor *vdp = 0;
 	struct iso_primary_descriptor *pri = NULL;
 	struct iso_sierra_primary_descriptor *pri_sierra = NULL;
 	struct iso_supplementary_descriptor *sup = NULL;
 	struct iso_directory_record *rootp;
 	int logical_block_size, ssector;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 	char *cs_local, *cs_disk;
 
-	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "cd9660", 0);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return error;
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	bo = &devvp->v_bufobj;
 	bo->bo_private = cp;
 	bo->bo_ops = g_vfs_bufops;
 
 	/* This is the "logical sector size".  The standard says this
 	 * should be 2048 or the physical sector size on the device,
 	 * whichever is greater.
 	 */
 	if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
                 PICKUP_GIANT();
 		return (EINVAL);
 	}
 
 	iso_bsize = cp->provider->sectorsize;
 
 	joliet_level = 0;
 	if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector))
 		ssector = 0;
 	for (iso_blknum = 16 + ssector;
 	     iso_blknum < 100 + ssector;
 	     iso_blknum++) {
 		if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE),
 				  iso_bsize, NOCRED, &bp)) != 0)
 			goto out;
 
 		vdp = (struct iso_volume_descriptor *)bp->b_data;
 		if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) {
 			if (bcmp (vdp->id_sierra, ISO_SIERRA_ID,
 				  sizeof vdp->id) != 0) {
 				error = EINVAL;
 				goto out;
 			} else
 				high_sierra = 1;
 		}
 		switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){
 		case ISO_VD_PRIMARY:
 			if (pribp == NULL) {
 				pribp = bp;
 				bp = NULL;
 				pri = (struct iso_primary_descriptor *)vdp;
 				pri_sierra =
 				  (struct iso_sierra_primary_descriptor *)vdp;
 			}
 			break;
 
 		case ISO_VD_SUPPLEMENTARY:
 			if (supbp == NULL) {
 				supbp = bp;
 				bp = NULL;
 				sup = (struct iso_supplementary_descriptor *)vdp;
 
 				if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) {
 					if (bcmp(sup->escape, "%/@", 3) == 0)
 						joliet_level = 1;
 					if (bcmp(sup->escape, "%/C", 3) == 0)
 						joliet_level = 2;
 					if (bcmp(sup->escape, "%/E", 3) == 0)
 						joliet_level = 3;
 
 					if ((isonum_711 (sup->flags) & 1) &&
 					    !vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0))
 						joliet_level = 0;
 				}
 			}
 			break;
 
 		case ISO_VD_END:
 			goto vd_end;
 
 		default:
 			break;
 		}
 		if (bp) {
 			brelse(bp);
 			bp = NULL;
 		}
 	}
  vd_end:
 	if (bp) {
 		brelse(bp);
 		bp = NULL;
 	}
 
 	if (pri == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	logical_block_size =
 		isonum_723 (high_sierra?
 			    pri_sierra->logical_block_size:
 			    pri->logical_block_size);
 
 	if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE
 	    || (logical_block_size & (logical_block_size - 1)) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	rootp = (struct iso_directory_record *)
 		(high_sierra?
 		 pri_sierra->root_directory_record:
 		 pri->root_directory_record);
 
 	isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO);
 	isomp->im_cp = cp;
 	isomp->im_bo = bo;
 	isomp->logical_block_size = logical_block_size;
 	isomp->volume_space_size =
 		isonum_733 (high_sierra?
 			    pri_sierra->volume_space_size:
 			    pri->volume_space_size);
 	isomp->joliet_level = 0;
 	/*
 	 * Since an ISO9660 multi-session CD can also access previous
 	 * sessions, we have to include them into the space consider-
 	 * ations.  This doesn't yield a very accurate number since
 	 * parts of the old sessions might be inaccessible now, but we
 	 * can't do much better.  This is also important for the NFS
 	 * filehandle validation.
 	 */
 	isomp->volume_space_size += ssector;
 	bcopy (rootp, isomp->root, sizeof isomp->root);
 	isomp->root_extent = isonum_733 (rootp->extent);
 	isomp->root_size = isonum_733 (rootp->size);
 
 	isomp->im_bmask = logical_block_size - 1;
 	isomp->im_bshift = ffs(logical_block_size) - 1;
 
 	pribp->b_flags |= B_AGE;
 	brelse(pribp);
 	pribp = NULL;
 
 	mp->mnt_data = isomp;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_maxsymlinklen = 0;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	isomp->im_mountp = mp;
 	isomp->im_dev = dev;
 	isomp->im_devvp = devvp;
 
 	vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP);
 	vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS);
 	vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT);
 	vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET);
 	vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV);
 
 	/* Check the Rock Ridge Extension support */
 	if (!(isomp->im_flags & ISOFSMNT_NORRIP)) {
 		if ((error = bread(isomp->im_devvp,
 				  (isomp->root_extent + isonum_711(rootp->ext_attr_length)) <<
 				  (isomp->im_bshift - DEV_BSHIFT),
 				  isomp->logical_block_size, NOCRED, &bp)) != 0)
 		    goto out;
 
 		rootp = (struct iso_directory_record *)bp->b_data;
 
 		if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) {
 		    isomp->im_flags |= ISOFSMNT_NORRIP;
 		} else {
 		    isomp->im_flags &= ~ISOFSMNT_GENS;
 		}
 
 		/*
 		 * The contents are valid,
 		 * but they will get reread as part of another vnode, so...
 		 */
 		bp->b_flags |= B_AGE;
 		brelse(bp);
 		bp = NULL;
 	}
 
 	if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
 		cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error);
 		if (error)
 			goto out;
 		cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error);
 		if (error)
 			goto out;
 		cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l);
 		cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d);
 	} else {
 		isomp->im_d2l = NULL;
 		isomp->im_l2d = NULL;
 	}
 
 	if (high_sierra) {
 		/* this effectively ignores all the mount flags */
 		if (bootverbose)
 			log(LOG_INFO, "cd9660: High Sierra Format\n");
 		isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA;
 	} else
 		switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) {
 		  default:
 			  isomp->iso_ftype = ISO_FTYPE_DEFAULT;
 			  break;
 		  case ISOFSMNT_GENS|ISOFSMNT_NORRIP:
 			  isomp->iso_ftype = ISO_FTYPE_9660;
 			  break;
 		  case 0:
 			  if (bootverbose)
 			  	  log(LOG_INFO, "cd9660: RockRidge Extension\n");
 			  isomp->iso_ftype = ISO_FTYPE_RRIP;
 			  break;
 		}
 
 	/* Decide whether to use the Joliet descriptor */
 
 	if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) {
 		if (bootverbose)
 			log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n",
 			    joliet_level);
 		rootp = (struct iso_directory_record *)
 			sup->root_directory_record;
 		bcopy (rootp, isomp->root, sizeof isomp->root);
 		isomp->root_extent = isonum_733 (rootp->extent);
 		isomp->root_size = isonum_733 (rootp->size);
 		isomp->joliet_level = joliet_level;
 		supbp->b_flags |= B_AGE;
 	}
 
 	if (supbp) {
 		brelse(supbp);
 		supbp = NULL;
 	}
 
 	return 0;
 out:
 	if (bp)
 		brelse(bp);
 	if (pribp)
 		brelse(pribp);
 	if (supbp)
 		brelse(supbp);
 	if (cp != NULL) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
 		PICKUP_GIANT();
 	}
 	if (isomp) {
 		free((caddr_t)isomp, M_ISOFSMNT);
 		mp->mnt_data = NULL;
 	}
 	return error;
 }
 
 /*
  * unmount system call
  */
 static int
 cd9660_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	struct iso_mnt *isomp;
 	int error, flags = 0;
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 #if 0
 	mntflushbuf(mp, 0);
 	if (mntinvalbuf(mp))
 		return EBUSY;
 #endif
 	if ((error = vflush(mp, 0, flags, td)))
 		return (error);
 
 	isomp = VFSTOISOFS(mp);
 
 	if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) {
 		if (isomp->im_d2l)
 			cd9660_iconv->close(isomp->im_d2l);
 		if (isomp->im_l2d)
 			cd9660_iconv->close(isomp->im_l2d);
 	}
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(isomp->im_cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	vrele(isomp->im_devvp);
 	free((caddr_t)isomp, M_ISOFSMNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Return root of a filesystem
  */
 static int
 cd9660_root(mp, flags, vpp, td)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct iso_mnt *imp = VFSTOISOFS(mp);
 	struct iso_directory_record *dp =
 	    (struct iso_directory_record *)imp->root;
 	ino_t ino = isodirino(dp, imp);
 
 	/*
 	 * With RRIP we must use the `.' entry of the root directory.
 	 * Simply tell vget, that it's a relocated directory.
 	 */
 	return (cd9660_vget_internal(mp, ino, LK_EXCLUSIVE, vpp,
 	    imp->iso_ftype == ISO_FTYPE_RRIP, dp));
 }
 
 /*
  * Get filesystem statistics.
  */
 static int
 cd9660_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
 	struct iso_mnt *isomp;
 
 	isomp = VFSTOISOFS(mp);
 
 	sbp->f_bsize = isomp->logical_block_size;
 	sbp->f_iosize = sbp->f_bsize;	/* XXX */
 	sbp->f_blocks = isomp->volume_space_size;
 	sbp->f_bfree = 0; /* total free blocks */
 	sbp->f_bavail = 0; /* blocks free for non superuser */
 	sbp->f_files =	0; /* total files */
 	sbp->f_ffree = 0; /* free file nodes */
 	return 0;
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is in range
  * - call iget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the generation number matches
  */
 
 /* ARGSUSED */
 static int
 cd9660_fhtovp(mp, fhp, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	struct vnode **vpp;
 {
 	struct ifid *ifhp = (struct ifid *)fhp;
 	struct iso_node *ip;
 	struct vnode *nvp;
 	int error;
 
 #ifdef	ISOFS_DBG
 	printf("fhtovp: ino %d, start %ld\n",
 	       ifhp->ifid_ino, ifhp->ifid_start);
 #endif
 
 	if ((error = VFS_VGET(mp, ifhp->ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	ip = VTOI(nvp);
 	if (ip->inode.iso_mode == 0) {
 		vput(nvp);
 		*vpp = NULLVP;
 		return (ESTALE);
 	}
 	*vpp = nvp;
 	vnode_create_vobject(*vpp, ip->i_size, curthread);
 	return (0);
 }
 
 static int
 cd9660_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 
 	/*
 	 * XXXX
 	 * It would be nice if we didn't always set the `relocated' flag
 	 * and force the extra read, but I don't want to think about fixing
 	 * that right now.
 	 */
 	return (cd9660_vget_internal(mp, ino, flags, vpp,
 #if 0
 	    VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP,
 #else
 	    0,
 #endif
 	    (struct iso_directory_record *)0));
 }
 
 int
 cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 	int relocated;
 	struct iso_directory_record *isodir;
 {
 	struct iso_mnt *imp;
 	struct iso_node *ip;
 	struct buf *bp;
 	struct vnode *vp;
 	struct cdev *dev;
 	int error;
 	struct thread *td;
 
 	td = curthread;
 	error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	imp = VFSTOISOFS(mp);
 	dev = imp->im_dev;
 
 	/* Allocate a new vnode/iso_node. */
 	if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	MALLOC(ip, struct iso_node *, sizeof(struct iso_node), M_ISOFSNODE,
 	    M_WAITOK | M_ZERO);
 	vp->v_data = ip;
 	ip->i_vnode = vp;
 	ip->i_number = ino;
 
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		free(ip, M_ISOFSNODE);
 		*vpp = NULLVP;
 		return (error);
 	}
 	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	if (isodir == 0) {
 		int lbn, off;
 
 		lbn = lblkno(imp, ino);
 		if (lbn >= imp->volume_space_size) {
 			vput(vp);
 			printf("fhtovp: lbn exceed volume space %d\n", lbn);
 			return (ESTALE);
 		}
 
 		off = blkoff(imp, ino);
 		if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
 			vput(vp);
 			printf("fhtovp: crosses block boundary %d\n",
 			       off + ISO_DIRECTORY_RECORD_SIZE);
 			return (ESTALE);
 		}
 
 		error = bread(imp->im_devvp,
 			      lbn << (imp->im_bshift - DEV_BSHIFT),
 			      imp->logical_block_size, NOCRED, &bp);
 		if (error) {
 			vput(vp);
 			brelse(bp);
 			printf("fhtovp: bread error %d\n",error);
 			return (error);
 		}
 		isodir = (struct iso_directory_record *)(bp->b_data + off);
 
 		if (off + isonum_711(isodir->length) >
 		    imp->logical_block_size) {
 			vput(vp);
 			if (bp != 0)
 				brelse(bp);
 			printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
 			       off +isonum_711(isodir->length), off,
 			       isonum_711(isodir->length));
 			return (ESTALE);
 		}
 
 #if 0
 		if (isonum_733(isodir->extent) +
 		    isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) {
 			if (bp != 0)
 				brelse(bp);
 			printf("fhtovp: file start miss %d vs %d\n",
 			       isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length),
 			       ifhp->ifid_start);
 			return (ESTALE);
 		}
 #endif
 	} else
 		bp = 0;
 
 	ip->i_mnt = imp;
 	VREF(imp->im_devvp);
 
 	if (relocated) {
 		/*
 		 * On relocated directories we must
 		 * read the `.' entry out of a dir.
 		 */
 		ip->iso_start = ino >> imp->im_bshift;
 		if (bp != 0)
 			brelse(bp);
 		if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) {
 			vput(vp);
 			return (error);
 		}
 		isodir = (struct iso_directory_record *)bp->b_data;
 	}
 
 	ip->iso_extent = isonum_733(isodir->extent);
 	ip->i_size = isonum_733(isodir->size);
 	ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent;
 
 	/*
 	 * Setup time stamp, attribute
 	 */
 	vp->v_type = VNON;
 	switch (imp->iso_ftype) {
 	default:	/* ISO_FTYPE_9660 */
 	    {
 		struct buf *bp2;
 		int off;
 		if ((imp->im_flags & ISOFSMNT_EXTATT)
 		    && (off = isonum_711(isodir->ext_attr_length)))
 			cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL,
 				     &bp2);
 		else
 			bp2 = NULL;
 		cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660);
 		cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660);
 		if (bp2)
 			brelse(bp2);
 		break;
 	    }
 	case ISO_FTYPE_RRIP:
 		cd9660_rrip_analyze(isodir, ip, imp);
 		break;
 	}
 
 	if (bp != 0)
 		brelse(bp);
 
 	/*
 	 * Initialize the associated vnode
 	 */
 	switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) {
 	case VFIFO:
 		vp->v_op = &cd9660_fifoops;
 		break;
 	default:
 		break;
 	}
 
 	if (ip->iso_extent == imp->root_extent)
 		vp->v_vflag |= VV_ROOT;
 
 	/*
 	 * XXX need generation number?
 	 */
 
 	*vpp = vp;
 	return (0);
 }
Index: head/sys/fs/coda/coda_vfsops.c
===================================================================
--- head/sys/fs/coda/coda_vfsops.c	(revision 175201)
+++ head/sys/fs/coda/coda_vfsops.c	(revision 175202)
@@ -1,521 +1,521 @@
 /*-
  *             Coda: an Experimental Distributed File System
  *                              Release 3.1
  * 
  *           Copyright (c) 1987-1998 Carnegie Mellon University
  *                          All Rights Reserved
  * 
  * Permission  to  use, copy, modify and distribute this software and its
  * documentation is hereby granted,  provided  that  both  the  copyright
  * notice  and  this  permission  notice  appear  in  all  copies  of the
  * software, derivative works or  modified  versions,  and  any  portions
  * thereof, and that both notices appear in supporting documentation, and
  * that credit is given to Carnegie Mellon University  in  all  documents
  * and publicity pertaining to direct or indirect use of this code or its
  * derivatives.
  * 
  * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS  KNOWN  TO  HAVE  BUGS,
  * SOME  OF  WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON ALLOWS
  * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.   CARNEGIE  MELLON
  * DISCLAIMS  ANY  LIABILITY  OF  ANY  KIND  FOR  ANY  DAMAGES WHATSOEVER
  * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE  OR  OF
  * ANY DERIVATIVE WORK.
  * 
  * Carnegie  Mellon  encourages  users  of  this  software  to return any
  * improvements or extensions that  they  make,  and  to  grant  Carnegie
  * Mellon the rights to redistribute these changes without encumbrance.
  * 
  *  	@(#) src/sys/cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $
  */
 /*-
  * Mach Operating System
  * Copyright (c) 1989 Carnegie-Mellon University
  * All rights reserved.  The CMU software License Agreement specifies
  * the terms and conditions for use and redistribution.
  */
 
 /*
  * This code was written for the Coda filesystem at Carnegie Mellon
  * University.  Contributers include David Steere, James Kistler, and
  * M. Satyanarayanan.  
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 
 #include <fs/coda/coda.h>
 #include <fs/coda/cnode.h>
 #include <fs/coda/coda_vfsops.h>
 #include <fs/coda/coda_venus.h>
 #include <fs/coda/coda_subr.h>
 #include <fs/coda/coda_opstats.h>
 
 MALLOC_DEFINE(M_CODA, "coda", "Various Coda Structures");
 
 int codadebug = 0;
 int coda_vfsop_print_entry = 0;
 #define ENTRY    if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__))
 
 struct vnode *coda_ctlvp;
 
 /* structure to keep statistics of internally generated/satisfied calls */
 
 struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE];
 
 #define MARK_ENTRY(op) (coda_vfsopstats[op].entries++)
 #define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++)
 #define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++)
 #define MARK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++)
 
 extern int coda_nc_initialized;     /* Set if cache has been initialized */
 extern int vc_nb_open(struct cdev *, int, int, struct thread *);
 
 int
 coda_vfsopstats_init(void)
 {
 	register int i;
 	
 	for (i=0;i<CODA_VFSOPS_SIZE;i++) {
 		coda_vfsopstats[i].opcode = i;
 		coda_vfsopstats[i].entries = 0;
 		coda_vfsopstats[i].sat_intrn = 0;
 		coda_vfsopstats[i].unsat_intrn = 0;
 		coda_vfsopstats[i].gen_intrn = 0;
 	}
 	
 	return 0;
 }
 
 static const char *coda_opts[] = { "from", NULL };
 /*
  * cfs mount vfsop
  * Set up mount info record and attach it to vfs struct.
  */
 /*ARGSUSED*/
 int
 coda_mount(struct mount *vfsp, struct thread *td)
 {
     struct vnode *dvp;
     struct cnode *cp;
     struct cdev *dev;
     struct coda_mntinfo *mi;
     struct vnode *rootvp;
     CodaFid rootfid = INVAL_FID;
     CodaFid ctlfid = CTL_FID;
     int error;
     struct nameidata ndp;
     ENTRY;
     char *from;
 
     if (vfs_filteropt(vfsp->mnt_optnew, coda_opts))
 	return (EINVAL);
 
     from = vfs_getopts(vfsp->mnt_optnew, "from", &error);
     if (error)
 	return (error);
 
     coda_vfsopstats_init();
     coda_vnodeopstats_init();
     
     MARK_ENTRY(CODA_MOUNT_STATS);
     if (CODA_MOUNTED(vfsp)) {
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
 	return(EBUSY);
     }
     
     /* Validate mount device.  Similar to getmdev(). */
     NDINIT(&ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, from, td);
     error = namei(&ndp);
     dvp = ndp.ni_vp;
 
     if (error) {
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
 	return (error);
     }
     if (dvp->v_type != VCHR) {
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
 	vrele(dvp);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	return(ENXIO);
     }
     dev = dvp->v_rdev;
     vrele(dvp);
     NDFREE(&ndp, NDF_ONLY_PNBUF);
 
     /*
      * Initialize the mount record and link it to the vfs struct
      */
     mi = dev2coda_mntinfo(dev);
     if (!mi) {
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
 	printf("Coda mount: %s is not a cfs device\n", from);
 	return(ENXIO);
     }
     
     if (!VC_OPEN(&mi->mi_vcomm)) {
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
 	return(ENODEV);
     }
     
     /* No initialization (here) of mi_vcomm! */
     vfsp->mnt_data = mi;
     vfs_getnewfsid (vfsp);
 
     mi->mi_vfsp = vfsp;
     mi->mi_started = 0;			/* XXX See coda_root() */
     
     /*
      * Make a root vnode to placate the Vnode interface, but don't
      * actually make the CODA_ROOT call to venus until the first call
      * to coda_root in case a server is down while venus is starting.
      */
     cp = make_coda_node(&rootfid, vfsp, VDIR);
     rootvp = CTOV(cp);
     rootvp->v_vflag |= VV_ROOT;
 	
     cp = make_coda_node(&ctlfid, vfsp, VREG);
     coda_ctlvp = CTOV(cp);
 
     /* Add vfs and rootvp to chain of vfs hanging off mntinfo */
     mi->mi_vfsp = vfsp;
     mi->mi_rootvp = rootvp;
     
     vfs_mountedfrom(vfsp, from);
     /* error is currently guaranteed to be zero, but in case some
        code changes... */
     CODADEBUG(1,
 	     myprintf(("coda_omount returned %d\n",error)););
     if (error)
 	MARK_INT_FAIL(CODA_MOUNT_STATS);
     else
 	MARK_INT_SAT(CODA_MOUNT_STATS);
     
     return(error);
 }
 
 int
 coda_unmount(vfsp, mntflags, td)
     struct mount *vfsp;
     int mntflags;
     struct thread *td;
 {
     struct coda_mntinfo *mi = vftomi(vfsp);
     int active, error = 0;
     
     ENTRY;
     MARK_ENTRY(CODA_UMOUNT_STATS);
     if (!CODA_MOUNTED(vfsp)) {
 	MARK_INT_FAIL(CODA_UMOUNT_STATS);
 	return(EINVAL);
     }
     
     if (mi->mi_vfsp == vfsp) {	/* We found the victim */
 	if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp)))
 	    return (EBUSY); 	/* Venus is still running */
 
 #ifdef	DEBUG
 	printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp));
 #endif
 	vrele(mi->mi_rootvp);
 	vrele(coda_ctlvp);
 	active = coda_kill(vfsp, NOT_DOWNCALL);
 	ASSERT_VOP_LOCKED(mi->mi_rootvp, "coda_unmount");
 	mi->mi_rootvp->v_vflag &= ~VV_ROOT;
 	error = vflush(mi->mi_vfsp, 0, FORCECLOSE, td);
 #ifdef CODA_VERBOSE
 	printf("coda_unmount: active = %d, vflush active %d\n", active, error);
 #endif
 	error = 0;
 	/* I'm going to take this out to allow lookups to go through. I'm
 	 * not sure it's important anyway. -- DCS 2/2/94
 	 */
 	/* vfsp->VFS_DATA = NULL; */
 
 	/* No more vfsp's to hold onto */
 	mi->mi_vfsp = NULL;
 	mi->mi_rootvp = NULL;
 
 	if (error)
 	    MARK_INT_FAIL(CODA_UMOUNT_STATS);
 	else
 	    MARK_INT_SAT(CODA_UMOUNT_STATS);
 
 	return(error);
     }
     return (EINVAL);
 }
 
 /*
  * find root of cfs
  */
 int
 coda_root(vfsp, flags, vpp, td)
 	struct mount *vfsp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
     struct coda_mntinfo *mi = vftomi(vfsp);
     struct vnode **result;
     int error;
     struct proc *p = td->td_proc;
     CodaFid VFid;
     static const CodaFid invalfid = INVAL_FID;
  
     ENTRY;
     MARK_ENTRY(CODA_ROOT_STATS);
     result = NULL;
     
     if (vfsp == mi->mi_vfsp) {
 	/*
 	 * Cache the root across calls. We only need to pass the request
 	 * on to Venus if the root vnode is the dummy we installed in
 	 * coda_omount() with all c_fid members zeroed.
 	 *
 	 * XXX In addition, we assume that the first call to coda_root()
 	 * is from vfs_omount()
 	 * (before the call to checkdirs()) and return the dummy root
 	 * node to avoid a deadlock. This bug is fixed in the Coda CVS
 	 * repository but not in any released versions as of 6 Mar 2003.
 	 */
 	if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid,
 	    sizeof(CodaFid)) != 0 || mi->mi_started == 0)
 	    { /* Found valid root. */
 		*vpp = mi->mi_rootvp;
 		mi->mi_started = 1;
 
 		/* On Mach, this is vref.  On NetBSD, VOP_LOCK */
 #if	1
 		vref(*vpp);
-		vn_lock(*vpp, LK_EXCLUSIVE, td);
+		vn_lock(*vpp, LK_EXCLUSIVE);
 #else
 		vget(*vpp, LK_EXCLUSIVE, td);
 #endif
 		MARK_INT_SAT(CODA_ROOT_STATS);
 		return(0);
 	    }
     }
 
     error = venus_root(vftomi(vfsp), td->td_ucred, p, &VFid);
 
     if (!error) {
 	/*
 	 * Save the new rootfid in the cnode, and rehash the cnode into the
 	 * cnode hash with the new fid key.
 	 */
 	coda_unsave(VTOC(mi->mi_rootvp));
 	VTOC(mi->mi_rootvp)->c_fid = VFid;
 	coda_save(VTOC(mi->mi_rootvp));
 
 	*vpp = mi->mi_rootvp;
 #if	1
 	vref(*vpp);
-	vn_lock(*vpp, LK_EXCLUSIVE, td);
+	vn_lock(*vpp, LK_EXCLUSIVE);
 #else
 	vget(*vpp, LK_EXCLUSIVE, td);
 #endif
 
 	MARK_INT_SAT(CODA_ROOT_STATS);
 	goto exit;
     } else if (error == ENODEV || error == EINTR) {
 	/* Gross hack here! */
 	/*
 	 * If Venus fails to respond to the CODA_ROOT call, coda_call returns
 	 * ENODEV. Return the uninitialized root vnode to allow vfs
 	 * operations such as unmount to continue. Without this hack,
 	 * there is no way to do an unmount if Venus dies before a 
 	 * successful CODA_ROOT call is done. All vnode operations 
 	 * will fail.
 	 */
 	*vpp = mi->mi_rootvp;
 #if	1
 	vref(*vpp);
-	vn_lock(*vpp, LK_EXCLUSIVE, td);
+	vn_lock(*vpp, LK_EXCLUSIVE);
 #else
 	vget(*vpp, LK_EXCLUSIVE, td);
 #endif
 
 	MARK_INT_FAIL(CODA_ROOT_STATS);
 	error = 0;
 	goto exit;
     } else {
 	CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); );
 	MARK_INT_FAIL(CODA_ROOT_STATS);
 		
 	goto exit;
     }
 
  exit:
     return(error);
 }
 
 /*
  * VFS statfs entry point: report synthetic statistics for a Coda mount.
  *
  * Every capacity counter is filled with the same fixed placeholder
  * value.  Returns EINVAL when CODA_MOUNTED(vfsp) is false, 0 otherwise.
  * The td argument is not consulted.
  */
 int
 coda_nb_statfs(vfsp, sbp, td)
     register struct mount *vfsp;
     struct statfs *sbp;
     struct thread *td;
 {
     ENTRY;
 
     /* No per-operation statistics are collected for statfs. */
     if (!CODA_MOUNTED(vfsp))
 	return (EINVAL);
 
     /* Start from an all-zero record; f_flags and others stay zero (XXX). */
     bzero(sbp, sizeof(*sbp));
 
     sbp->f_type = vfsp->mnt_vfc->vfc_typenum;
 
     /* Normal filesystems have a bsize of 0x400 == 1024; we report 8k (XXX). */
     sbp->f_bsize = 8192;
     sbp->f_iosize = 8192;
 
     /* Placeholder capacity; AFS uses 0x895440 for the same purpose. */
 #define NB_SFS_SIZ 0x8AB75D
     sbp->f_blocks = NB_SFS_SIZ;
     sbp->f_bfree = NB_SFS_SIZ;
     sbp->f_bavail = NB_SFS_SIZ;
     sbp->f_files = NB_SFS_SIZ;
     sbp->f_ffree = NB_SFS_SIZ;
 
     /* Carry the filesystem id over from the mount's own statfs record. */
     sbp->f_fsid = vfsp->mnt_stat.f_fsid;
 
     snprintf(sbp->f_mntonname, sizeof(sbp->f_mntonname), "/coda");
     snprintf(sbp->f_fstypename, sizeof(sbp->f_fstypename), "coda");
 
     return (0);
 }
 
 /*
  * VFS sync entry point.
  *
  * Nothing is flushed here: the call is only recorded through
  * MARK_ENTRY / MARK_INT_SAT and then reports success.  None of the
  * arguments is consulted.
  */
 int
 coda_sync(vfsp, waitfor, td)
     struct mount *vfsp;
     int    waitfor;
     struct thread *td;
 {
     ENTRY;
 
     /* Statistics book-keeping only. */
     MARK_ENTRY(CODA_SYNC_STATS);
     MARK_INT_SAT(CODA_SYNC_STATS);
 
     return (0);
 }
 
 /* 
  * fhtovp is now what vget used to be in 4.3-derived systems.  For
  * some silly reason, vget is now keyed by a 32 bit ino_t, rather than
  * a type-specific fid.  
  */
 int
 coda_fhtovp(vfsp, fhp, nam, vpp, exflagsp, creadanonp)
     register struct mount *vfsp;    
     struct fid *fhp;
     struct mbuf *nam;
     struct vnode **vpp;
     int *exflagsp;
     struct ucred **creadanonp;
 {
     struct cfid *cfid = (struct cfid *)fhp;
     struct cnode *cp = 0;
     int error;
     struct thread *td = curthread; /* XXX -mach */
     struct proc *p = td->td_proc;
     CodaFid VFid;
     int vtype;
 
     ENTRY;
     
     MARK_ENTRY(CODA_VGET_STATS);
     /* Check for vget of control object. */
     if (IS_CTL_FID(&cfid->cfid_fid)) {
 	*vpp = coda_ctlvp;
 	vref(coda_ctlvp);
 	MARK_INT_SAT(CODA_VGET_STATS);
 	return(0);
     }
     
     error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, td->td_ucred, p, &VFid, &vtype);
     
     if (error) {
 	CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));)
 	    *vpp = (struct vnode *)0;
     } else {
 	CODADEBUG(CODA_VGET, 
 		 myprintf(("vget: %s type %d result %d\n",
 			coda_f2s(&VFid), vtype, error)); )	    
 	cp = make_coda_node(&VFid, vfsp, vtype);
 	*vpp = CTOV(cp);
     }
     return(error);
 }
 
 /*
  * To allow for greater ease of use, some vnodes may be orphaned when
  * Venus dies.  Certain operations should still be allowed to go
  * through, but without propagating ophan-ness.  So this function will
  * get a new vnode for the file from the current run of Venus.  */
  
 int
 getNewVnode(vpp)
      struct vnode **vpp;
 {
     struct cfid cfid;
     struct coda_mntinfo *mi = vftomi((*vpp)->v_mount);
     
     ENTRY;
 
     cfid.cfid_len = (short)sizeof(CodaFid);
     cfid.cfid_fid = VTOC(*vpp)->c_fid;	/* Structure assignment. */
     /* XXX ? */
 
     /* We're guessing that if set, the 1st element on the list is a
      * valid vnode to use. If not, return ENODEV as venus is dead.
      */
     if (mi->mi_vfsp == NULL)
 	return ENODEV;
     
     return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp,
 		      NULL, NULL);
 }
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 /*
  * Find the mount structure that corresponds to a given device.
  *
  * Walks the global mount list and returns the first mount whose
  * VFSTOUFS(mp)->um_dev equals dev, or NULL if none matches.
  *
  * NOTE(review): every mount on the list is viewed through VFSTOUFS()
  * without first checking that it is a UFS mount, and this function
  * takes no lock around the walk -- confirm that callers make both safe.
  */
 struct mount *devtomp(dev)
     struct cdev *dev;
 {
     struct mount *mp;
 
     TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 	/* Keep scanning until the device matches the one we want. */
 	if (VFSTOUFS(mp)->um_dev != dev)
 	    continue;
 	return (mp);
     }
 
     /* No mount structure matched. */
     return (NULL);
 }
 
 /*
  * VFS operations vector for Coda.  Only the entry points named here are
  * set explicitly; members omitted from the designated initializer are
  * zero-initialized by the language.
  */
 struct vfsops coda_vfsops = {
     .vfs_mount =		coda_mount,
     .vfs_root = 		coda_root,
     .vfs_statfs =		coda_nb_statfs,
     .vfs_sync = 		coda_sync,
     .vfs_unmount =		coda_unmount,
 };
 
 /* Hand the vector to the VFS layer as filesystem "coda", flagged VFCF_NETWORK. */
 VFS_SET(coda_vfsops, coda, VFCF_NETWORK);
Index: head/sys/fs/coda/coda_vnops.c
===================================================================
--- head/sys/fs/coda/coda_vnops.c	(revision 175201)
+++ head/sys/fs/coda/coda_vnops.c	(revision 175202)
@@ -1,1743 +1,1743 @@
 /*-
  *             Coda: an Experimental Distributed File System
  *                              Release 3.1
  * 
  *           Copyright (c) 1987-1998 Carnegie Mellon University
  *                          All Rights Reserved
  * 
  * Permission  to  use, copy, modify and distribute this software and its
  * documentation is hereby granted,  provided  that  both  the  copyright
  * notice  and  this  permission  notice  appear  in  all  copies  of the
  * software, derivative works or  modified  versions,  and  any  portions
  * thereof, and that both notices appear in supporting documentation, and
  * that credit is given to Carnegie Mellon University  in  all  documents
  * and publicity pertaining to direct or indirect use of this code or its
  * derivatives.
  * 
  * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS  KNOWN  TO  HAVE  BUGS,
  * SOME  OF  WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON ALLOWS
  * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.   CARNEGIE  MELLON
  * DISCLAIMS  ANY  LIABILITY  OF  ANY  KIND  FOR  ANY  DAMAGES WHATSOEVER
  * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE  OR  OF
  * ANY DERIVATIVE WORK.
  * 
  * Carnegie  Mellon  encourages  users  of  this  software  to return any
  * improvements or extensions that  they  make,  and  to  grant  Carnegie
  * Mellon the rights to redistribute these changes without encumbrance.
  * 
  *  	@(#) src/sys/coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $
  */
 /* 
  * Mach Operating System
  * Copyright (c) 1990 Carnegie-Mellon University
  * Copyright (c) 1989 Carnegie-Mellon University
  * All rights reserved.  The CMU software License Agreement specifies
  * the terms and conditions for use and redistribution.
  */
 
 /*
  * This code was written for the Coda filesystem at Carnegie Mellon
  * University.  Contributers include David Steere, James Kistler, and
  * M. Satyanarayanan.  
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <fs/coda/coda.h>
 #include <fs/coda/cnode.h>
 #include <fs/coda/coda_vnops.h>
 #include <fs/coda/coda_venus.h>
 #include <fs/coda/coda_opstats.h>
 #include <fs/coda/coda_subr.h>
 #include <fs/coda/coda_namecache.h>
 #include <fs/coda/coda_pioctl.h>
 
 /* 
  * These flags select various performance enhancements.
  * All three are enabled (1) by default.
  */
 int coda_attr_cache  = 1;       /* Set to cache attributes in the kernel */
 int coda_symlink_cache = 1;     /* Set to cache symbolic link information */
 int coda_access_cache = 1;      /* Set to handle some access checks directly */
 
 /* structure to keep track of vnode operation calls, one slot per opcode */
 
 struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE];
 
 /*
  * Counter helpers: each bumps one field of the slot for `op'.
  *   MARK_ENTRY    - the operation was entered
  *   MARK_INT_SAT  - counted in sat_intrn
  *   MARK_INT_FAIL - counted in unsat_intrn
  *   MARK_INT_GEN  - counted in gen_intrn
  */
 #define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++)
 #define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++)
 #define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++)
 #define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++)
 
 /* What we are delaying for in printf */
 int coda_printf_delay = 0;  /* in microseconds */
 /* When non-zero, ENTRY prints the name of each function it is used in. */
 int coda_vnop_print_entry = 0;
 static int coda_lockdebug = 0;
 
 /*
  * Some NetBSD details:
  * 
  *   coda_start is called at the end of the mount syscall.
  *   coda_init is called at boot time.
  */
 
 /*
  * Function-entry trace.  Expands to a bare `if' statement (no braces,
  * no do/while wrapper), so it must be used as a statement of its own.
  */
 #define ENTRY  if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__))
 
 /*
  * Definition of the vnode operation vector.
  *
  * Operations without an explicit entry fall through to .vop_default,
  * which is VOP_PANIC.  mknod and print are also set to VOP_PANIC
  * explicitly; advlock and lease are VOP_NULL.
  */
 
 struct vop_vector coda_vnodeops = {
     .vop_default = VOP_PANIC,
     .vop_lookup = coda_lookup,		/* lookup */
     .vop_create = coda_create,		/* create */
     .vop_mknod = VOP_PANIC,	/* mknod */
     .vop_open = coda_open,		/* open */
     .vop_close = coda_close,		/* close */
     .vop_access = coda_access,		/* access */
     .vop_getattr = coda_getattr,	/* getattr */
     .vop_setattr = coda_setattr,	/* setattr */
     .vop_read = coda_read,		/* read */
     .vop_write = coda_write,		/* write */
     .vop_ioctl = coda_ioctl,		/* ioctl */
     .vop_fsync = coda_fsync,		/* fsync */
     .vop_remove = coda_remove,		/* remove */
     .vop_link = coda_link,		/* link */
     .vop_rename = coda_rename,		/* rename */
     .vop_mkdir = coda_mkdir,		/* mkdir */
     .vop_rmdir = coda_rmdir,		/* rmdir */
     .vop_symlink = coda_symlink,	/* symlink */
     .vop_readdir = coda_readdir,	/* readdir */
     .vop_readlink = coda_readlink,	/* readlink */
     .vop_inactive = coda_inactive,	/* inactive */
     .vop_reclaim = coda_reclaim,	/* reclaim */
     .vop_lock1 = coda_lock,		/* lock */
     .vop_unlock = coda_unlock,		/* unlock */
     .vop_bmap = coda_bmap,		/* bmap */
     .vop_print = VOP_PANIC,	/* print */
     .vop_islocked = coda_islocked,	/* islocked */
     .vop_pathconf = coda_pathconf,	/* pathconf */
     .vop_advlock = VOP_NULL,	/* advlock */
     .vop_lease = VOP_NULL,		/* lease */
     .vop_poll = vop_stdpoll,
     .vop_getpages = vop_stdgetpages,	/* pager intf.*/
     .vop_putpages = vop_stdputpages,	/* pager intf.*/
     .vop_getwritemount =	vop_stdgetwritemount,
 
     /* Compiled out: operations noted as missing from this vector. */
 #if 0
     missing
     .vop_cachedlookup =	ufs_lookup,
     .vop_whiteout =	ufs_whiteout,
 #endif
 
 };
 
 /*
  * A generic do-nothing vnode operation (for lease_check, advlock).
  *
  * Always returns 0.  When codadebug is set, the argument is read as a
  * pointer to a vnodeop_desc pointer and the operation name is logged.
  */
 int
 coda_vop_nop(void *anon) {
     if (codadebug != 0) {
 	/* Only look at the argument when it is actually going to be printed. */
 	struct vnodeop_desc *op = *(struct vnodeop_desc **)anon;
 
 	myprintf(("Vnode operation %s called, but unsupported\n",
 		  op->vdesc_name));
     }
 
     return (0);
 }
 
 int
 coda_vnodeopstats_init(void)
 {
 	register int i;
 	
 	for(i=0;i<CODA_VNODEOPS_SIZE;i++) {
 		coda_vnodeopstats[i].opcode = i;
 		coda_vnodeopstats[i].entries = 0;
 		coda_vnodeopstats[i].sat_intrn = 0;
 		coda_vnodeopstats[i].unsat_intrn = 0;
 		coda_vnodeopstats[i].gen_intrn = 0;
 	}
 	return 0;
 }
 		
 /* 
  * coda_open calls Venus which returns an open file descriptor the cache
  * file holding the data. We get the vnode while we are still in the
  * context of the venus process in coda_psdev.c. This vnode is then
  * passed back to the caller and opened.
  */
 int
 coda_open(struct vop_open_args *ap)
 {
     /* 
      * NetBSD can pass the O_EXCL flag in mode, even though the check
      * has already happened.  Venus defensively assumes that if open
      * is passed the EXCL, it must be a bug.  We strip the flag here.
      */
 /* true args */
     register struct vnode **vpp = &(ap->a_vp);
     struct cnode *cp = VTOC(*vpp);
     int flag = ap->a_mode & (~O_EXCL);
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
 /* locals */
     int error;
     struct vnode *vp;
 
     MARK_ENTRY(CODA_OPEN_STATS);
 
     /* Check for open of control file. */
     if (IS_CTL_VP(*vpp)) {
 	/* XXX */
 	/* if (WRITEABLE(flag)) */ 
 	if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) {
 	    MARK_INT_FAIL(CODA_OPEN_STATS);
 	    return(EACCES);
 	}
 	MARK_INT_SAT(CODA_OPEN_STATS);
 	return(0);
     }
 
     error = venus_open(vtomi((*vpp)), &cp->c_fid, flag, cred, td->td_proc, &vp);
     if (error)
 	return (error);
 
     CODADEBUG( CODA_OPEN,myprintf(("open: vp %p result %d\n", vp, error));)
 
     /* Keep a reference until the close comes in. */
     vref(*vpp);                
 
     /* Save the vnode pointer for the cache file. */
     if (cp->c_ovp == NULL) {
 	cp->c_ovp = vp;
     } else {
 	if (cp->c_ovp != vp)
 	    panic("coda_open:  cp->c_ovp != ITOV(ip)");
     }
     cp->c_ocount++;
 
     /* Flush the attribute cached if writing the file. */
     if (flag & FWRITE) {
 	cp->c_owrite++;
 	cp->c_flags &= ~C_VATTR;
     }
 
     /* Open the cache file. */
     error = VOP_OPEN(vp, flag, cred, td, NULL); 
     if (error) {
     	printf("coda_open: VOP_OPEN on container failed %d\n", error);
 	return (error);
     }
 /* grab (above) does this when it calls newvnode unless it's in the cache*/
 
     return(error);
 }
 
 /*
  * Close the cache file used for I/O and notify Venus.
  */
 int
 coda_close(struct vop_close_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     int flag = ap->a_fflag;
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
 /* locals */
     int error;
 
     MARK_ENTRY(CODA_CLOSE_STATS);
 
     /* Check for close of control file. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_SAT(CODA_CLOSE_STATS);
 	return(0);
     }
 
     if (cp->c_ovp) {
 	VOP_CLOSE(cp->c_ovp, flag, cred, td); /* Do errors matter here? */
 	vrele(cp->c_ovp);
     }
 #ifdef CODA_VERBOSE
     else printf("coda_close: NO container vp %p/cp %p\n", vp, cp);
 #endif
 
     if (--cp->c_ocount == 0)
 	cp->c_ovp = NULL;
 
     if (flag & FWRITE)                    /* file was opened for write */
 	--cp->c_owrite;
 
     if (!IS_UNMOUNTING(cp))
          error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, td->td_proc);
     else error = ENODEV;
 
     vrele(vp);
 
     CODADEBUG(CODA_CLOSE, myprintf(("close: result %d\n",error)); )
     return(error);
 }
 
 int
 coda_read(struct vop_read_args *ap)
 {
 
     ENTRY;
     return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ,
 		    ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td));
 }
 
 int
 coda_write(struct vop_write_args *ap)
 {
 
     ENTRY;
     return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE,
 		    ap->a_ioflag, ap->a_cred, ap->a_uio->uio_td));
 }
 
 int
 coda_rdwr(struct vnode *vp, struct uio *uiop, enum uio_rw rw, int ioflag,
     struct ucred *cred, struct thread *td)
 { 
 /* upcall decl */
   /* NOTE: container file operation!!! */
 /* locals */
     struct cnode *cp = VTOC(vp);
     struct vnode *cfvp = cp->c_ovp;
     int opened_internally = 0;
     int error = 0;
 
     MARK_ENTRY(CODA_RDWR_STATS);
 
     CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %d, %lld, %d)\n", rw, 
 			      (void *)uiop->uio_iov->iov_base, uiop->uio_resid, 
 			      (long long)uiop->uio_offset, uiop->uio_segflg)); )
 	
     /* Check for rdwr of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_RDWR_STATS);
 	return(EINVAL);
     }
 
     /* 
      * If file is not already open this must be a page {read,write} request
      * and we should open it internally.
      */
     if (cfvp == NULL) {
 	opened_internally = 1;
 	MARK_INT_GEN(CODA_OPEN_STATS);
 	error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td, NULL);
 	printf("coda_rdwr: Internally Opening %p\n", vp);
 	if (error) {
 		printf("coda_rdwr: VOP_OPEN on container failed %d\n", error);
 		return (error);
 	}
 	cfvp = cp->c_ovp;
     }
 
     /* Have UFS handle the call. */
     CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = %s, refcnt = %d\n",
 			     coda_f2s(&cp->c_fid), CTOV(cp)->v_usecount)); )
     if (rw == UIO_READ) {
 	error = VOP_READ(cfvp, uiop, ioflag, cred);
     } else {
 	error = VOP_WRITE(cfvp, uiop, ioflag, cred);
 	/* ufs_write updates the vnode_pager_setsize for the vnode/object */
 
 	{   struct vattr attr;
 
 	    if (VOP_GETATTR(cfvp, &attr, cred, td) == 0) {
 		vnode_pager_setsize(vp, attr.va_size);
 	    }
 	}
     }
 
     if (error)
 	MARK_INT_FAIL(CODA_RDWR_STATS);
     else
 	MARK_INT_SAT(CODA_RDWR_STATS);
 
     /* Do an internal close if necessary. */
     if (opened_internally) {
 	MARK_INT_GEN(CODA_CLOSE_STATS);
 	(void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred, td);
     }
 
     /* Invalidate cached attributes if writing. */
     if (rw == UIO_WRITE)
 	cp->c_flags &= ~C_VATTR;
     return(error);
 }
 
 
 
 /*
  * ioctl on the Coda control object.
  *
  * The ioctl data is a struct PioctlData naming a path.  The path is
  * looked up with namei(), the resulting vnode must belong to Coda
  * (its v_op is coda_vnodeops), and the request is then forwarded to
  * Venus via venus_ioctl() against that vnode's fid.
  *
  * Returns EOPNOTSUPP when vp is not the control vnode, the namei()
  * error when the lookup fails, EINVAL when the target is not a Coda
  * object or the input size exceeds VC_MAXDATASIZE, and otherwise the
  * result of venus_ioctl().
  */
 int
 coda_ioctl(struct vop_ioctl_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     int com = ap->a_command;
     caddr_t data = ap->a_data;
     int flag = ap->a_fflag;
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
 /* locals */
     int error;
     struct vnode *tvp;
     struct nameidata ndp;
     struct PioctlData *iap = (struct PioctlData *)data;
 
     MARK_ENTRY(CODA_IOCTL_STATS);
 
     CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));)
 	
     /* Don't check for operation on a dying object, for ctlvp it
        shouldn't matter */
 	
     /* Must be control object to succeed. */
     if (!IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_IOCTL_STATS);
 	CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: vp != ctlvp"));)
 	    return (EOPNOTSUPP);
     }
     /* Look up the pathname. */
 
     /* Should we use the name cache here? It would get it from
        lookupname sooner or later anyway, right? */
 
     /* iap->follow selects whether a trailing symlink is followed. */
     NDINIT(&ndp, LOOKUP, (iap->follow ? FOLLOW : NOFOLLOW), UIO_USERSPACE, iap->path, td);
     error = namei(&ndp);
     tvp = ndp.ni_vp;
 
     if (error) {
 	MARK_INT_FAIL(CODA_IOCTL_STATS);
 	CODADEBUG(CODA_IOCTL, myprintf(("coda_ioctl error: lookup returns %d\n",
 				   error));)
 	return(error);
     }
 
     /* 
      * Make sure this is a coda style cnode, but it may be a
      * different vfsp 
      */
     if (tvp->v_op != &coda_vnodeops) {
 	/* Drop the vnode reference and the pathname buffer separately. */
 	vrele(tvp);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	MARK_INT_FAIL(CODA_IOCTL_STATS);
 	CODADEBUG(CODA_IOCTL, 
 		 myprintf(("coda_ioctl error: %s not a coda object\n", 
 			iap->path));)
 	return(EINVAL);
     }
 
     /*
      * Reject oversized input before talking to Venus.
      * NOTE(review): this path relies on NDFREE(&ndp, 0) alone for cleanup
      * and does not bump the failure counter -- confirm that is intended.
      */
     if (iap->vi.in_size > VC_MAXDATASIZE) {
 	NDFREE(&ndp, 0);
 	return(EINVAL);
     }
     error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data, cred, td->td_proc);
 
     if (error)
 	MARK_INT_FAIL(CODA_IOCTL_STATS);
     else
 	CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); )
 
     /* Same two-step cleanup as on the "not a coda object" path. */
     vrele(tvp);
     NDFREE(&ndp, NDF_ONLY_PNBUF);
     return(error);
 }
 
 /*
  * To reduce the cost of a user-level venus;we cache attributes in
  * the kernel.  Each cnode has storage allocated for an attribute. If
  * c_vattr is valid, return a reference to it. Otherwise, get the
  * attributes from venus and store them in the cnode.  There is some
  * question if this method is a security leak. But I think that in
  * order to make this call, the user must have done a lookup and
  * opened the file, and therefore should already have access.  
  */
 int
 coda_getattr(struct vop_getattr_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct vattr *vap = ap->a_vap;
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
 /* locals */
     int error;
 
     MARK_ENTRY(CODA_GETATTR_STATS);
 
     if (IS_UNMOUNTING(cp))
 	return ENODEV;
 
     /* Check for getattr of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_GETATTR_STATS);
 	return(ENOENT);
     }
 
     /* Check to see if the attributes have already been cached */
     if (VALID_VATTR(cp)) { 
 	CODADEBUG(CODA_GETATTR, { myprintf(("attr cache hit: %s\n",
 					coda_f2s(&cp->c_fid)));});
 	CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
 		 print_vattr(&cp->c_vattr); );
 	
 	*vap = cp->c_vattr;
 	MARK_INT_SAT(CODA_GETATTR_STATS);
 	return(0);
     }
 
     error = venus_getattr(vtomi(vp), &cp->c_fid, cred, td->td_proc, vap);
 
     if (!error) {
 	CODADEBUG(CODA_GETATTR, myprintf(("getattr miss %s: result %d\n",
 				     coda_f2s(&cp->c_fid), error)); )	       
 	    
 	CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
 		 print_vattr(vap);	);
 	
     {	int size = vap->va_size;
     	struct vnode *convp = cp->c_ovp;
 	if (convp != (struct vnode *)0) {
 	    vnode_pager_setsize(convp, size);
 	}
     }
 	/* If not open for write, store attributes in cnode */   
 	if ((cp->c_owrite == 0) && (coda_attr_cache)) {  
 	    cp->c_vattr = *vap;
 	    cp->c_flags |= C_VATTR; 
 	}
 	
     }
     return(error);
 }
 
 int
 coda_setattr(struct vop_setattr_args *ap)
 {
 /* true args */
     register struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     register struct vattr *vap = ap->a_vap;
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
 /* locals */
     int error;
 
     MARK_ENTRY(CODA_SETATTR_STATS);
 
     /* Check for setattr of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_SETATTR_STATS);
 	return(ENOENT);
     }
 
     if (codadebug & CODADBGMSK(CODA_SETATTR)) {
 	print_vattr(vap);
     }
     error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, td->td_proc);
 
     if (!error)
 	cp->c_flags &= ~C_VATTR;
 
     {	int size = vap->va_size;
     	struct vnode *convp = cp->c_ovp;
 	if (size != VNOVAL && convp != (struct vnode *)0) {
 	    vnode_pager_setsize(convp, size);
 	}
     }
     CODADEBUG(CODA_SETATTR,	myprintf(("setattr %d\n", error)); )
     return(error);
 }
 
 /*
  * Access check for a Coda vnode.
  *
  * The control object allows read access only.  A directory exec
  * (lookup) check is granted locally when "." for this directory is in
  * the name cache for the given credentials; everything else is decided
  * by venus_access().
  */
 int
 coda_access(struct vop_access_args *ap)
 {
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_td;
     int mode = ap->a_mode;
 
     MARK_ENTRY(CODA_ACCESS_STATS);
 
     /* Control object: only pure read requests succeed. */
     if (IS_CTL_VP(vp)) {
 	/* bogus hack - all will be marked as successes */
 	MARK_INT_SAT(CODA_ACCESS_STATS);
 	if ((mode & VREAD) && !(mode & (VWRITE | VEXEC)))
 	    return (0);
 	return (EACCES);
     }
 
     /*
      * If the file is a directory, and we are checking exec (eg lookup)
      * access, and the file is in the namecache, then the user must have
      * lookup access to it.
      */
     if (coda_access_cache && (vp->v_type == VDIR) && (mode & VEXEC) &&
 	coda_nc_lookup(cp, ".", 1, cred)) {
 	MARK_INT_SAT(CODA_ACCESS_STATS);
 	return (0);			/* it was in the cache */
     }
 
     return (venus_access(vtomi(vp), &cp->c_fid, mode, cred, td->td_proc));
 }
 
 int
 coda_readlink(struct vop_readlink_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct uio *uiop = ap->a_uio;
     struct ucred *cred = ap->a_cred;
     struct thread *td = ap->a_uio->uio_td;
 /* locals */
     int error;
     char *str;
     int len;
 
     MARK_ENTRY(CODA_READLINK_STATS);
 
     /* Check for readlink of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_READLINK_STATS);
 	return(ENOENT);
     }
 
     if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */
 	uiop->uio_rw = UIO_READ;
 	error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop);
 	if (error)
 	    MARK_INT_FAIL(CODA_READLINK_STATS);
 	else
 	    MARK_INT_SAT(CODA_READLINK_STATS);
 	return(error);
     }
 
     error = venus_readlink(vtomi(vp), &cp->c_fid, cred,
         td != NULL ? td->td_proc : NULL, &str, &len);
 
     if (!error) {
 	uiop->uio_rw = UIO_READ;
 	error = uiomove(str, len, uiop);
 
 	if (coda_symlink_cache) {
 	    cp->c_symlink = str;
 	    cp->c_symlen = len;
 	    cp->c_flags |= C_SYMLINK;
 	} else
 	    CODA_FREE(str, len);
     }
 
     CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));)
     return(error);
 }
 
 /*
  * fsync for a Coda vnode.
  *
  * Returns ENODEV while the cnode is being unmounted and 0 for the
  * control object.  Otherwise the container file, if any, is synced with
  * VOP_FSYNC(MNT_WAIT) (its result is ignored) and 0 is returned.
  *
  * Venus is currently never contacted: the unconditional "return 0"
  * marked "needs research" makes the venus_fsync() call below it
  * unreachable.
  */
 int
 coda_fsync(struct vop_fsync_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct thread *td = ap->a_td;
 /* locals */
     struct vnode *convp = cp->c_ovp;
     int error;
    
     MARK_ENTRY(CODA_FSYNC_STATS);
 
     /* Check for fsync on an unmounting object */
     /* The NetBSD kernel, in its infinite wisdom, can try to fsync
      * after an unmount has been initiated.  This is a Bad Thing,
      * which we have to avoid.  Not a legitimate failure for stats.
      */
     if (IS_UNMOUNTING(cp)) {
 	return(ENODEV);
     }
 
     /* Check for fsync of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_SAT(CODA_FSYNC_STATS);
 	return(0);
     }
 
     /* Flush the container file synchronously; errors are not reported. */
     if (convp)
     	VOP_FSYNC(convp, MNT_WAIT, td);
 
     /*
      * We see fsyncs with usecount == 1 then usecount == 0.
      * For now we ignore them.
      */
     /*
     VI_LOCK(vp);
     if (!vp->v_usecount) {
     	printf("coda_fsync on vnode %p with %d usecount.  c_flags = %x (%x)\n",
 		vp, vp->v_usecount, cp->c_flags, cp->c_flags&C_PURGING);
     }
     VI_UNLOCK(vp);
     */
 
     /*
      * We can expect fsync on any vnode at all if venus is purging it.
      * Venus can't very well answer the fsync request, now can it?
      * Hopefully, it won't have to, because hopefully, venus preserves
      * the (possibly untrue) invariant that it never purges an open
      * vnode.  Hopefully.
      */
     if (cp->c_flags & C_PURGING) {
 	return(0);
     }
 
     /* needs research */
     /* Everything past this return is dead code, kept for reference. */
     return 0;
     error = venus_fsync(vtomi(vp), &cp->c_fid, td->td_proc);
 
     CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); );
     return(error);
 }
 
 /*
  * Called when a Coda vnode becomes inactive.
  *
  * Frees a cached symlink string, removes the cnode from the lookup
  * table with coda_unsave(), and -- unless the cnode is being unmounted
  * -- disposes of the vnode with vgone().  Venus is not notified.
  * Panics if the mount has no private data.  Always returns 0.
  */
 int
 coda_inactive(struct vop_inactive_args *ap)
 {
     /* XXX - at the moment, inactive doesn't look at cred, and doesn't
        have a proc pointer.  Oops. */
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct ucred *cred __attribute__((unused)) = NULL;
     struct thread *td __attribute__((unused)) = curthread;
 /* upcall decl */
 /* locals */
 
     /* We don't need to send inactive to venus - DCS */
     MARK_ENTRY(CODA_INACTIVE_STATS);
 
     CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n",
 				  coda_f2s(&cp->c_fid), vp->v_mount));)
  
     /* If an array has been allocated to hold the symlink, deallocate it */
     if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) {
 	if (cp->c_symlink == NULL)
 	    panic("coda_inactive: null symlink pointer in cnode");
 	
 	CODA_FREE(cp->c_symlink, cp->c_symlen);
 	cp->c_flags &= ~C_SYMLINK;
 	cp->c_symlen = 0;
     }
 
     /* Remove it from the table so it can't be found. */
     coda_unsave(cp);
     if ((struct coda_mntinfo *)(vp->v_mount->mnt_data) == NULL) {
 	myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp));
 	panic("badness in coda_inactive\n");
     }
 
     if (IS_UNMOUNTING(cp)) {
 	/* Unmount in progress: leave the vnode alone, optionally log it. */
 #ifdef	DEBUG
 	printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vrefcnt(vp), vp, cp);
 	if (cp->c_ovp != NULL)
 	    printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n",
 	    	   vrefcnt(vp), vp, cp);
 #endif
     } else {
 	/* Sanity checks that are only compiled in with OLD_DIAGNOSTIC. */
 #ifdef OLD_DIAGNOSTIC
 	if (vrefcnt(CTOV(cp))) {
 	    panic("coda_inactive: nonzero reference count");
 	}
 	if (cp->c_ovp != NULL) {
 	    panic("coda_inactive:  cp->ovp != NULL");
 	}
 #endif
 	vgone(vp);
     }
 
     MARK_INT_SAT(CODA_INACTIVE_STATS);
     return(0);
 }
 
 /*
  * Remote filesystem operations having to do with directory manipulation.
  */
 
 /* 
  * Look up one pathname component in a Coda directory.
  *
  * It appears that in NetBSD, lookup is supposed to return the vnode locked.
  *
  * Resolution order: the control-object name, then the Coda name cache,
  * then an upcall to Venus (venus_lookup).  A result obtained from Venus
  * is entered in the name cache unless its type carries CODA_NOCACHE.
  *
  * For CREATE/RENAME of the last component, ENOENT is converted to
  * EJUSTRETURN with SAVENAME set.  For a successful DELETE of the last
  * component SAVENAME is set as well.  On success (or EJUSTRETURN) the
  * child vnode, if any and if distinct from the parent, is locked
  * exclusively; on any other error *ap->a_vpp is NULL.
  */
 int
 coda_lookup(struct vop_lookup_args *ap)
 {
 /* true args */
     struct vnode *dvp = ap->a_dvp;
     struct cnode *dcp = VTOC(dvp);
     struct vnode **vpp = ap->a_vpp;
     /* 
      * It looks as though ap->a_cnp->ni_cnd->cn_nameptr holds the rest
      * of the string to xlate, and that we must try to get at least
      * ap->a_cnp->ni_cnd->cn_namelen of those characters to match.  I
      * could be wrong. 
      */
     struct componentname  *cnp = ap->a_cnp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* locals */
     struct cnode *cp;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     CodaFid VFid;
     int	vtype;
     int error = 0;
 
     MARK_ENTRY(CODA_LOOKUP_STATS);
 
     CODADEBUG(CODA_LOOKUP, myprintf(("lookup: %s in %s\n",
 				   nm, coda_f2s(&dcp->c_fid))););
 
     /* Check for lookup of control object. */
     if (IS_CTL_NAME(dvp, nm, len)) {
 	*vpp = coda_ctlvp;
 	vref(*vpp);
 	MARK_INT_SAT(CODA_LOOKUP_STATS);
 	goto exit;
     }
 
     /* Names that do not fit (with terminator) in CODA_MAXNAMLEN are rejected. */
     if (len+1 > CODA_MAXNAMLEN) {
 	MARK_INT_FAIL(CODA_LOOKUP_STATS);
 
 	CODADEBUG(CODA_LOOKUP, myprintf(("name too long: lookup, %s (%s)\n",
 					 coda_f2s(&dcp->c_fid), nm)););
 	*vpp = (struct vnode *)0;
 	error = EINVAL;
 	goto exit;
     }
     /* First try to look the file up in the cfs name cache */
     /* lock the parent vnode? */
     cp = coda_nc_lookup(dcp, nm, len, cred);
     if (cp) {
 	*vpp = CTOV(cp);
 	vref(*vpp);
 	CODADEBUG(CODA_LOOKUP, 
 		 myprintf(("lookup result %d vpp %p\n",error,*vpp));)
     } else {
 	
 	/* The name wasn't cached, so we need to contact Venus */
 	error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc, &VFid, &vtype);
 	
 	if (error) {
 	    MARK_INT_FAIL(CODA_LOOKUP_STATS);
 
 	    CODADEBUG(CODA_LOOKUP, myprintf(("lookup error on %s (%s)%d\n",
 					     coda_f2s(&dcp->c_fid), nm, error));)
 	    *vpp = (struct vnode *)0;
 	} else {
 	    MARK_INT_SAT(CODA_LOOKUP_STATS);
 	    CODADEBUG(CODA_LOOKUP, 
 		     myprintf(("lookup: %s type %o result %d\n",
 			       coda_f2s(&VFid), vtype, error)); )
 	    cp = make_coda_node(&VFid, dvp->v_mount, vtype);
 	    *vpp = CTOV(cp);
 	    
 	    /* enter the new vnode in the Name Cache only if the top bit isn't set */
 	    /* And don't enter a new vnode for an invalid one! */
 	    if (!(vtype & CODA_NOCACHE))
 		coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
 	}
     }
 
  exit:
     /* 
      * If we are creating, and this was the last name to be looked up,
      * and the error was ENOENT, then there really shouldn't be an
      * error and we can make the leaf NULL and return success.  Since
      * this is supposed to work under Mach as well as NetBSD, we're
      * leaving this fn wrapped.  We also must tell lookup/namei that
      * we need to save the last component of the name.  (Create will
      * have to free the name buffer later...lucky us...)
      */
     if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))
 	&& (cnp->cn_flags & ISLASTCN)
 	&& (error == ENOENT))
     {
 	error = EJUSTRETURN;
 	cnp->cn_flags |= SAVENAME;
 	*ap->a_vpp = NULL;
     }
 
     /* 
      * If we are removing, and we are at the last element, and we
      * found it, then we need to keep the name around so that the
      * removal will go ahead as planned.  Unfortunately, this will
      * probably also lock the to-be-removed vnode, which may or may
      * not be a good idea.  I'll have to look at the bits of
      * coda_remove to make sure.  We'll only save the name if we did in
      * fact find the name, otherwise coda_remove won't have a chance
      * to free the pathname.  
      */
     if ((cnp->cn_nameiop == DELETE)
 	&& (cnp->cn_flags & ISLASTCN)
 	&& !error)
     {
 	cnp->cn_flags |= SAVENAME;
     }
 
     /* 
      * If the lookup went well, we need to (potentially?) unlock the
      * parent, and lock the child.  We are only responsible for
      * checking to see if the parent is supposed to be unlocked before
      * we return.  We must always lock the child (provided there is
      * one, and (the parent isn't locked or it isn't the same as the
      * parent.)  Simple, huh?  We can never leave the parent locked unless
      * we are ISLASTCN
      */
     if (!error || (error == EJUSTRETURN)) {
 	if (cnp->cn_flags & ISDOTDOT) {
 	    /* "..": drop the parent lock before taking the child's. */
 	    if ((error = VOP_UNLOCK(dvp, 0, td))) {
 		return error; 
 	    }	    
 	    /* 
 	     * The parent is unlocked.  As long as there is a child,
 	     * lock it without bothering to check anything else. 
 	     */
 	    if (*ap->a_vpp) {
 		if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) {
 		    /* Re-take the parent lock before reporting the failure. */
-		    vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td);
+		    vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE);
 		    return (error);
 		}
 	    }
 	    /* Child (if any) is locked; re-lock the parent. */
-	    vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE, td);
+	    vn_lock(dvp, LK_RETRY|LK_EXCLUSIVE);
 	} else {
 	    /* The parent is locked, and may be the same as the child */
 	    if (*ap->a_vpp && (*ap->a_vpp != dvp)) {
 		/* Different, go ahead and lock it. */
 		if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) {
 		    return (error);
 		}
 	    }
 	}
     } else {
 	/* If the lookup failed, we need to ensure that the leaf is NULL */
 	/* Don't change any locking? */
 	*ap->a_vpp = NULL;
     }
     return(error);
 }
 
 /*ARGSUSED*/
 int
 coda_create(struct vop_create_args *ap)
 {
 /* true args */
     struct vnode *dvp = ap->a_dvp;
     struct cnode *dcp = VTOC(dvp);
     struct vattr *va = ap->a_vap;
     int exclusive = 1;
     int mode = ap->a_vap->va_mode;
     struct vnode **vpp = ap->a_vpp;
     struct componentname  *cnp = ap->a_cnp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* locals */
     int error;
     struct cnode *cp;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     CodaFid VFid;
     struct vattr attr;
 
     MARK_ENTRY(CODA_CREATE_STATS);
 
     /* All creates are exclusive XXX */
     /* I'm assuming the 'mode' argument is the file mode bits XXX */
 
     /* Check for create of control object. */
     if (IS_CTL_NAME(dvp, nm, len)) {
 	*vpp = (struct vnode *)0;
 	MARK_INT_FAIL(CODA_CREATE_STATS);
 	return(EACCES);
     }
 
     error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, td->td_proc, &VFid, &attr);
 
     if (!error) {
 	
 	/* If this is an exclusive create, panic if the file already exists. */
 	/* Venus should have detected the file and reported EEXIST. */
 
 	if ((exclusive == 1) &&
 	    (coda_find(&VFid) != NULL))
 	    panic("cnode existed for newly created file!");
 	
 	cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type);
 	*vpp = CTOV(cp);
 	
 	/* Update va to reflect the new attributes. */
 	(*va) = attr;
 	
 	/* Update the attribute cache and mark it as valid */
 	if (coda_attr_cache) {
 	    VTOC(*vpp)->c_vattr = attr;
 	    VTOC(*vpp)->c_flags |= C_VATTR;       
 	}
 
 	/* Invalidate the parent's attr cache, the modification time has changed */
 	VTOC(dvp)->c_flags &= ~C_VATTR;
 	
 	/* enter the new vnode in the Name Cache */
 	coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
 	
 	CODADEBUG(CODA_CREATE, 
 		  myprintf(("create: %s, result %d\n",
 			   coda_f2s(&VFid), error)); )
     } else {
 	*vpp = (struct vnode *)0;
 	CODADEBUG(CODA_CREATE, myprintf(("create error %d\n", error));)
     }
 
     if (!error) {
 	if (cnp->cn_flags & LOCKLEAF) {
 	    if ((error = VOP_LOCK(*ap->a_vpp, LK_EXCLUSIVE, td))) {
 		printf("coda_create: ");
 		panic("unlocked parent but couldn't lock child");
 	    }
 	}
 #ifdef OLD_DIAGNOSTIC
 	else {
 	    printf("coda_create: LOCKLEAF not set!\n");
 	}
 #endif
     }
     return(error);
 }
 
 int
 coda_remove(struct vop_remove_args *ap)
 {
 /* true args */
     struct vnode *dvp = ap->a_dvp;
     struct cnode *cp = VTOC(dvp);
     struct componentname  *cnp = ap->a_cnp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* locals */
     int error;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     struct cnode *tp;
 
     MARK_ENTRY(CODA_REMOVE_STATS);
 
     CODADEBUG(CODA_REMOVE, myprintf(("remove: %s in %s\n",
 				     nm, coda_f2s(&cp->c_fid))););
     /* Remove the file's entry from the CODA Name Cache */
     /* We're being conservative here, it might be that this person
      * doesn't really have sufficient access to delete the file
      * but we feel zapping the entry won't really hurt anyone -- dcs
      */
     /* I'm gonna go out on a limb here. If a file and a hardlink to it
      * exist, and one is removed, the link count on the other will be
      * off by 1. We could either invalidate the attrs if cached, or
      * fix them. I'll try to fix them. DCS 11/8/94
      */
     tp = coda_nc_lookup(VTOC(dvp), nm, len, cred);
     if (tp) {
 	if (VALID_VATTR(tp)) {	/* If attrs are cached */
 	    if (tp->c_vattr.va_nlink > 1) {	/* If it's a hard link */
 		tp->c_vattr.va_nlink--;
 	    }
 	}
 	
 	coda_nc_zapfile(VTOC(dvp), nm, len); 
 	/* No need to flush it if it doesn't exist! */
     }
     /* Invalidate the parent's attr cache, the modification time has changed */
     VTOC(dvp)->c_flags &= ~C_VATTR;
 
     /* Check for remove of control object. */
     if (IS_CTL_NAME(dvp, nm, len)) {
 	MARK_INT_FAIL(CODA_REMOVE_STATS);
 	return(ENOENT);
     }
 
     error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, td->td_proc);
 
     CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); )
 
     return(error);
 }
 
 int
 coda_link(struct vop_link_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     struct vnode *tdvp = ap->a_tdvp;
     struct cnode *tdcp = VTOC(tdvp);
     struct componentname *cnp = ap->a_cnp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* locals */
     int error;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
 
     MARK_ENTRY(CODA_LINK_STATS);
 
     if (codadebug & CODADBGMSK(CODA_LINK)) {
 	myprintf(("nb_link:   vp fid: %s\n",
 		  coda_f2s(&cp->c_fid)));
 	myprintf(("nb_link: tdvp fid: %s)\n",
 		  coda_f2s(&tdcp->c_fid)));	
     }
     if (codadebug & CODADBGMSK(CODA_LINK)) {
 	myprintf(("link:   vp fid: %s\n",
 		  coda_f2s(&cp->c_fid)));
 	myprintf(("link: tdvp fid: %s\n",
 		  coda_f2s(&tdcp->c_fid)));
     }
 
     /* Check for link to/from control object. */
     if (IS_CTL_NAME(tdvp, nm, len) || IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_LINK_STATS);
 	return(EACCES);
     }
 
     error = venus_link(vtomi(vp), &cp->c_fid, &tdcp->c_fid, nm, len, cred, td->td_proc);
 
     /* Invalidate the parent's attr cache, the modification time has changed */
     VTOC(tdvp)->c_flags &= ~C_VATTR;
     VTOC(vp)->c_flags &= ~C_VATTR;
 
     CODADEBUG(CODA_LINK,	myprintf(("in link result %d\n",error)); )
 
     return(error);
 }
 
 int
 coda_rename(struct vop_rename_args *ap)
 {
 /* true args */
     struct vnode *odvp = ap->a_fdvp;
     struct cnode *odcp = VTOC(odvp);
     struct componentname  *fcnp = ap->a_fcnp;
     struct vnode *ndvp = ap->a_tdvp;
     struct cnode *ndcp = VTOC(ndvp);
     struct componentname  *tcnp = ap->a_tcnp;
     struct ucred *cred = fcnp->cn_cred;
     struct thread *td = fcnp->cn_thread;
 /* true args */
     int error;
     const char *fnm = fcnp->cn_nameptr;
     int flen = fcnp->cn_namelen;
     const char *tnm = tcnp->cn_nameptr;
     int tlen = tcnp->cn_namelen;
 
     MARK_ENTRY(CODA_RENAME_STATS);
 
     /* Hmmm.  The vnodes are already looked up.  Perhaps they are locked?
        This could be Bad. XXX */
 #ifdef OLD_DIAGNOSTIC
     if ((fcnp->cn_cred != tcnp->cn_cred)
 	|| (fcnp->cn_thread != tcnp->cn_thread))
     {
 	panic("coda_rename: component names don't agree");
     }
 #endif
 
     /* Check for rename involving control object. */ 
     if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) {
 	MARK_INT_FAIL(CODA_RENAME_STATS);
 	return(EACCES);
     }
 
     /* Problem with moving directories -- need to flush entry for .. */
     if (odvp != ndvp) {
 	struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred);
 	if (ovcp) {
 	    struct vnode *ovp = CTOV(ovcp);
 	    if ((ovp) &&
 		(ovp->v_type == VDIR)) /* If it's a directory */
 		coda_nc_zapfile(VTOC(ovp),"..", 2);
 	}
     }
 
     /* Remove the entries for both source and target files */
     coda_nc_zapfile(VTOC(odvp), fnm, flen);
     coda_nc_zapfile(VTOC(ndvp), tnm, tlen);
 
     /* Invalidate the parent's attr cache, the modification time has changed */
     VTOC(odvp)->c_flags &= ~C_VATTR;
     VTOC(ndvp)->c_flags &= ~C_VATTR;
 
     if (flen+1 > CODA_MAXNAMLEN) {
 	MARK_INT_FAIL(CODA_RENAME_STATS);
 	error = EINVAL;
 	goto exit;
     }
 
     if (tlen+1 > CODA_MAXNAMLEN) {
 	MARK_INT_FAIL(CODA_RENAME_STATS);
 	error = EINVAL;
 	goto exit;
     }
 
     error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, td->td_proc);
 
  exit:
     CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));)
     /* XXX - do we need to call cache pureg on the moved vnode? */
     cache_purge(ap->a_fvp);
 
     /* Release parents first, then children. */
     vrele(odvp);
     if (ap->a_tvp) {
 	if (ap->a_tvp == ndvp)
 	    vrele(ndvp);
 	else
 	    vput(ndvp);
 	vput(ap->a_tvp);
     } else
 	vput(ndvp);
     vrele(ap->a_fvp);
 
     return(error);
 }
 
 int
 coda_mkdir(struct vop_mkdir_args *ap)
 {
 /* true args */
     struct vnode *dvp = ap->a_dvp;
     struct cnode *dcp = VTOC(dvp);	
     struct componentname  *cnp = ap->a_cnp;
     register struct vattr *va = ap->a_vap;
     struct vnode **vpp = ap->a_vpp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* locals */
     int error;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     struct cnode *cp;
     CodaFid VFid;
     struct vattr ova;
 
     MARK_ENTRY(CODA_MKDIR_STATS);
 
     /* Check for mkdir of target object. */
     if (IS_CTL_NAME(dvp, nm, len)) {
 	*vpp = (struct vnode *)0;
 	MARK_INT_FAIL(CODA_MKDIR_STATS);
 	return(EACCES);
     }
 
     if (len+1 > CODA_MAXNAMLEN) {
 	*vpp = (struct vnode *)0;
 	MARK_INT_FAIL(CODA_MKDIR_STATS);
 	return(EACCES);
     }
 
     error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, td->td_proc, &VFid, &ova);
 
     if (!error) {
 	if (coda_find(&VFid) != NULL)
 	    panic("cnode existed for newly created directory!");
 	
 	
 	cp =  make_coda_node(&VFid, dvp->v_mount, va->va_type);
 	*vpp = CTOV(cp);
 	
 	/* enter the new vnode in the Name Cache */
 	coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
 
 	/* as a side effect, enter "." and ".." for the directory */
 	coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp));
 	coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp));
 
 	if (coda_attr_cache) {
 	    VTOC(*vpp)->c_vattr = ova;		/* update the attr cache */
 	    VTOC(*vpp)->c_flags |= C_VATTR;	/* Valid attributes in cnode */
 	}
 
 	/* Invalidate the parent's attr cache, the modification time has changed */
 	VTOC(dvp)->c_flags &= ~C_VATTR;
 	
 	CODADEBUG( CODA_MKDIR, myprintf(("mkdir: %s result %d\n",
 					 coda_f2s(&VFid), error)); )
 	} else {
 	*vpp = (struct vnode *)0;
 	CODADEBUG(CODA_MKDIR, myprintf(("mkdir error %d\n",error));)
     }
 
     return(error);
 }
 
 int
 coda_rmdir(struct vop_rmdir_args *ap)
 {
 /* true args */
     struct vnode *dvp = ap->a_dvp;
     struct cnode *dcp = VTOC(dvp);
     struct componentname  *cnp = ap->a_cnp;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
 /* true args */
     int error;
     const char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     struct cnode *cp;
    
     MARK_ENTRY(CODA_RMDIR_STATS);
 
     /* Check for rmdir of control object. */
     if (IS_CTL_NAME(dvp, nm, len)) {
 	MARK_INT_FAIL(CODA_RMDIR_STATS);
 	return(ENOENT);
     }
 
     /* We're being conservative here, it might be that this person
      * doesn't really have sufficient access to delete the file
      * but we feel zapping the entry won't really hurt anyone -- dcs
      */
     /*
      * As a side effect of the rmdir, remove any entries for children of
      * the directory, especially "." and "..".
      */
     cp = coda_nc_lookup(dcp, nm, len, cred);
     if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL);
 
     /* Remove the file's entry from the CODA Name Cache */
     coda_nc_zapfile(dcp, nm, len);
 
     /* Invalidate the parent's attr cache, the modification time has changed */
     dcp->c_flags &= ~C_VATTR;
 
     error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, td->td_proc);
 
     CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); )
 
     return(error);
 }
 
 int
 coda_symlink(struct vop_symlink_args *ap)
 {
 /* true args */
     struct vnode *tdvp = ap->a_dvp;
     struct cnode *tdcp = VTOC(tdvp);	
     struct componentname *cnp = ap->a_cnp;
     struct vattr *tva = ap->a_vap;
     char *path = ap->a_target;
     struct ucred *cred = cnp->cn_cred;
     struct thread *td = cnp->cn_thread;
     struct vnode **vpp = ap->a_vpp;
 /* locals */
     int error;
     /* 
      * XXX I'm assuming the following things about coda_symlink's
      * arguments: 
      *       t(foo) is the new name/parent/etc being created.
      *       lname is the contents of the new symlink. 
      */
     char *nm = cnp->cn_nameptr;
     int len = cnp->cn_namelen;
     int plen = strlen(path);
 
     /* 
      * Here's the strategy for the moment: perform the symlink, then
      * do a lookup to grab the resulting vnode.  I know this requires
      * two communications with Venus for a new sybolic link, but
      * that's the way the ball bounces.  I don't yet want to change
      * the way the Mach symlink works.  When Mach support is
      * deprecated, we should change symlink so that the common case
      * returns the resultant vnode in a vpp argument.
      */
 
     MARK_ENTRY(CODA_SYMLINK_STATS);
 
     /* Check for symlink of control object. */
     if (IS_CTL_NAME(tdvp, nm, len)) {
 	MARK_INT_FAIL(CODA_SYMLINK_STATS);
 	return(EACCES);
     }
 
     if (plen+1 > CODA_MAXPATHLEN) {
 	MARK_INT_FAIL(CODA_SYMLINK_STATS);
 	return(EINVAL);
     }
 
     if (len+1 > CODA_MAXNAMLEN) {
 	MARK_INT_FAIL(CODA_SYMLINK_STATS);
 	error = EINVAL;
 	goto exit;
     }
 
     error = venus_symlink(vtomi(tdvp), &tdcp->c_fid, path, plen, nm, len, tva, cred, td->td_proc);
 
     /* Invalidate the parent's attr cache, the modification time has changed */
     tdcp->c_flags &= ~C_VATTR;
 
     if (error == 0)
 	error = VOP_LOOKUP(tdvp, vpp, cnp);
 
  exit:    
     CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); )
     return(error);
 }
 
 /*
  * Read directory entries.
  */
 int
 coda_readdir(struct vop_readdir_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
     register struct uio *uiop = ap->a_uio;
     struct ucred *cred = ap->a_cred;
     int *eofflag = ap->a_eofflag;
     u_long **cookies = ap->a_cookies;
     int *ncookies = ap->a_ncookies;
     struct thread *td = ap->a_uio->uio_td;
 /* upcall decl */
 /* locals */
     int error = 0;
 
     MARK_ENTRY(CODA_READDIR_STATS);
 
     CODADEBUG(CODA_READDIR, myprintf(("coda_readdir(%p, %d, %lld, %d)\n",
 				      (void *)uiop->uio_iov->iov_base,
 				      uiop->uio_resid,
 				      (long long)uiop->uio_offset,
 				      uiop->uio_segflg)); )
 	
     /* Check for readdir of control object. */
     if (IS_CTL_VP(vp)) {
 	MARK_INT_FAIL(CODA_READDIR_STATS);
 	return(ENOENT);
     }
 
     {
 	/* If directory is not already open do an "internal open" on it. */
 	int opened_internally = 0;
 	if (cp->c_ovp == NULL) {
 	    opened_internally = 1;
 	    MARK_INT_GEN(CODA_OPEN_STATS);
 	    error = VOP_OPEN(vp, FREAD, cred, td, NULL);
 	    printf("coda_readdir: Internally Opening %p\n", vp);
 	    if (error) {
 		printf("coda_readdir: VOP_OPEN on container failed %d\n", error);
 		return (error);
 	    }
 	}
 	
 	/* Have UFS handle the call. */
 	CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = %s, refcnt = %d\n", coda_f2s(&cp->c_fid), vp->v_usecount)); )
 	error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, ncookies,
 			       cookies);
 	
 	if (error)
 	    MARK_INT_FAIL(CODA_READDIR_STATS);
 	else
 	    MARK_INT_SAT(CODA_READDIR_STATS);
 	
 	/* Do an "internal close" if necessary. */ 
 	if (opened_internally) {
 	    MARK_INT_GEN(CODA_CLOSE_STATS);
 	    (void)VOP_CLOSE(vp, FREAD, cred, td);
 	}
     }
 
     return(error);
 }
 
 /*
  * Convert from filesystem blocks to device blocks
  */
 int
 coda_bmap(struct vop_bmap_args *ap)
 {
     /* XXX on the global proc */
 /* true args */
     struct vnode *vp __attribute__((unused)) = ap->a_vp;	/* file's vnode */
     daddr_t bn __attribute__((unused)) = ap->a_bn;	/* fs block number */
     struct bufobj **bop = ap->a_bop;			/* RETURN bufobj of device */
     daddr_t *bnp __attribute__((unused)) = ap->a_bnp;	/* RETURN device block number */
     struct thread *td __attribute__((unused)) = curthread;
 /* upcall decl */
 /* locals */
 
 	int ret = 0;
 	struct cnode *cp;
 
 	cp = VTOC(vp);
 	if (cp->c_ovp) {
 		return EINVAL;
 		ret =  VOP_BMAP(cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb);
 #if	0
 		printf("VOP_BMAP(cp->c_ovp %p, bn %p, bop %p, bnp %lld, ap->a_runp %p, ap->a_runb %p) = %d\n",
 			cp->c_ovp, bn, bop, bnp, ap->a_runp, ap->a_runb, ret);
 #endif
 		return ret;
 	} else {
 #if	0
 		printf("coda_bmap: no container\n");
 #endif
 		return(EOPNOTSUPP);
 	}
 }
 
 int
 coda_reclaim(struct vop_reclaim_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
 /* upcall decl */
 /* locals */
 
 /*
  * Forced unmount/flush will let vnodes with non zero use be destroyed!
  */
     ENTRY;
 
     if (IS_UNMOUNTING(cp)) {
 #ifdef	DEBUG
 	if (VTOC(vp)->c_ovp) {
 	    if (IS_UNMOUNTING(cp))
 		printf("coda_reclaim: c_ovp not void: vp %p, cp %p\n", vp, cp);
 	}
 #endif
     } else {
 #ifdef OLD_DIAGNOSTIC
 	if (vrefcnt(vp) != 0) 
 	    print("coda_reclaim: pushing active %p\n", vp);
 	if (VTOC(vp)->c_ovp) {
 	    panic("coda_reclaim: c_ovp not void");
     }
 #endif
     }	
     cache_purge(vp);
     coda_free(VTOC(vp));
     vp->v_data = NULL;
     vnode_destroy_vobject(vp);
     return (0);
 }
 
 int
 coda_lock(struct vop_lock1_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
 /* upcall decl */
 /* locals */
 
     ENTRY;
 
     if ((ap->a_flags & LK_INTERLOCK) == 0) {
 	VI_LOCK(vp);
 	ap->a_flags |= LK_INTERLOCK;
     }
 
     if (coda_lockdebug) {
 	myprintf(("Attempting lock on %s\n",
 		  coda_f2s(&cp->c_fid)));
     }
 
     return (vop_stdlock(ap));
 }
 
 int
 coda_unlock(struct vop_unlock_args *ap)
 {
 /* true args */
     struct vnode *vp = ap->a_vp;
     struct cnode *cp = VTOC(vp);
 /* upcall decl */
 /* locals */
 
     ENTRY;
     if (coda_lockdebug) {
 	myprintf(("Attempting unlock on %s\n",
 		  coda_f2s(&cp->c_fid)));
     }
 
     return (vop_stdunlock(ap));
 }
 
 int
 coda_islocked(struct vop_islocked_args *ap)
 {
 /* true args */
     ENTRY;
 
     return (vop_stdislocked(ap));
 }
 
 void
 print_vattr(struct vattr *attr)
 {
     char *typestr;
 
     switch (attr->va_type) {
     case VNON:
 	typestr = "VNON";
 	break;
     case VREG:
 	typestr = "VREG";
 	break;
     case VDIR:
 	typestr = "VDIR";
 	break;
     case VBLK:
 	typestr = "VBLK";
 	break;
     case VCHR:
 	typestr = "VCHR";
 	break;
     case VLNK:
 	typestr = "VLNK";
 	break;
     case VSOCK:
 	typestr = "VSCK";
 	break;
     case VFIFO:
 	typestr = "VFFO";
 	break;
     case VBAD:
 	typestr = "VBAD";
 	break;
     default:
 	typestr = "????";
 	break;
     }
 
 
     myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n",
 	      typestr, (int)attr->va_mode, (int)attr->va_uid,
 	      (int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev));
 
     myprintf(("      fileid %d nlink %d size %d blocksize %d bytes %d\n",
 	      (int)attr->va_fileid, (int)attr->va_nlink, 
 	      (int)attr->va_size,
 	      (int)attr->va_blocksize,(int)attr->va_bytes));
     myprintf(("      gen %ld flags %ld vaflags %d\n",
 	      attr->va_gen, attr->va_flags, attr->va_vaflags));
     myprintf(("      atime sec %d nsec %d\n",
 	      (int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec));
     myprintf(("      mtime sec %d nsec %d\n",
 	      (int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec));
     myprintf(("      ctime sec %d nsec %d\n",
 	      (int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec));
 }
 
 /* How to print a ucred */
 void
 print_cred(struct ucred *cred)
 {
 
 	int i;
 
 	myprintf(("ref %d\tuid %d\n",cred->cr_ref,cred->cr_uid));
 
 	for (i=0; i < cred->cr_ngroups; i++)
 		myprintf(("\tgroup %d: (%d)\n",i,cred->cr_groups[i]));
 	myprintf(("\n"));
 
 }
 
 /*
  * Return a vnode for the given fid.
  * If no cnode exists for this fid create one and put it
  * in a table hashed by coda_f2i().  If the cnode for
  * this fid is already in the table return it (ref count is
  * incremented by coda_find.  The cnode will be flushed from the
  * table when coda_inactive calls coda_unsave.
  */
 struct cnode *
 make_coda_node(CodaFid *fid, struct mount *vfsp, short type)
 {
     struct cnode *cp;
     int          err;
 
     if ((cp = coda_find(fid)) == NULL) {
 	struct vnode *vp;
 	
 	cp = coda_alloc();
 	cp->c_fid = *fid;
 	
 	err = getnewvnode("coda", vfsp, &coda_vnodeops, &vp);  
 	if (err) {                                                
 	    panic("coda: getnewvnode returned error %d\n", err);   
 	}                                                         
 	err = insmntque1(vp, vfsp, NULL, NULL);	/* XXX: Too early for mpsafe fs */
 	if (err != 0)
 		panic("coda: insmntque failed: error %d", err);
 	vp->v_data = cp;                                          
 	vp->v_type = type;                                      
 	cp->c_vnode = vp;                                         
 	coda_save(cp);
 	
     } else {
 	vref(CTOV(cp));
     }
 
     return cp;
 }
 
 int
 coda_pathconf(struct vop_pathconf_args *ap)
 {
 	int error;
 	register_t *retval;
 
 	retval = ap->a_retval;
 	error = 0;
 
 	switch (ap->a_name) {
 	case _PC_NAME_MAX:
 		*retval = CODA_MAXNAMLEN;
 		break;
 	case _PC_PATH_MAX:
 		*retval = CODA_MAXPATHLEN;
 		break;
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 
 	return (error);
 }
Index: head/sys/fs/devfs/devfs_devs.c
===================================================================
--- head/sys/fs/devfs/devfs_devs.c	(revision 175201)
+++ head/sys/fs/devfs/devfs_devs.c	(revision 175202)
@@ -1,541 +1,541 @@
 /*-
  * Copyright (c) 2000,2004
  *	Poul-Henning Kamp.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vfsops.c 1.36
  *
  * $FreeBSD$
  */
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <sys/kdb.h>
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The one true (but secret) list of active devices in the system.
  * Locked by dev_lock()/devmtx
  */
 struct cdev_priv_list cdevp_list = TAILQ_HEAD_INITIALIZER(cdevp_list);
 
 struct unrhdr *devfs_inos;
 
 
 static MALLOC_DEFINE(M_DEVFS2, "DEVFS2", "DEVFS data 2");
 static MALLOC_DEFINE(M_DEVFS3, "DEVFS3", "DEVFS data 3");
 static MALLOC_DEFINE(M_CDEVP, "DEVFS1", "DEVFS cdev_priv storage");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, devfs, CTLFLAG_RW, 0, "DEVFS filesystem");
 
 static unsigned devfs_generation;
 SYSCTL_UINT(_vfs_devfs, OID_AUTO, generation, CTLFLAG_RD,
 	&devfs_generation, 0, "DEVFS generation number");
 
 unsigned devfs_rule_depth = 1;
 SYSCTL_UINT(_vfs_devfs, OID_AUTO, rule_depth, CTLFLAG_RW,
 	&devfs_rule_depth, 0, "Max depth of ruleset include");
 
 /*
  * Helper sysctl for devname(3).  We're given a struct cdev * and return
  * the name, if any, registered by the device driver.
  */
 static int
 sysctl_devname(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	dev_t ud;
 	struct cdev_priv *cdp;
 
 	error = SYSCTL_IN(req, &ud, sizeof (ud));
 	if (error)
 		return (error);
 	if (ud == NODEV)
 		return(EINVAL);
 /*
 	ud ^ devfs_random();
 */
 	dev_lock();
 	TAILQ_FOREACH(cdp, &cdevp_list, cdp_list)
 		if (cdp->cdp_inode == ud)
 			break;
 	dev_unlock();
 	if (cdp == NULL)
 		return(ENOENT);
 	return(SYSCTL_OUT(req, cdp->cdp_c.si_name, strlen(cdp->cdp_c.si_name) + 1));
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY,
 	NULL, 0, sysctl_devname, "", "devname(3) handler");
 
 SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev, CTLFLAG_RD,
     0, sizeof(struct cdev), "sizeof(struct cdev)");
 
 SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev_priv, CTLFLAG_RD,
     0, sizeof(struct cdev_priv), "sizeof(struct cdev_priv)");
 
 struct cdev *
 devfs_alloc(void)
 {
 	struct cdev_priv *cdp;
 	struct cdev *cdev;
 
 	cdp = malloc(sizeof *cdp, M_CDEVP, M_USE_RESERVE | M_ZERO | M_WAITOK);
 
 	cdp->cdp_dirents = &cdp->cdp_dirent0;
 	cdp->cdp_dirent0 = NULL;
 	cdp->cdp_maxdirent = 0;
 
 	cdev = &cdp->cdp_c;
 	cdev->si_priv = cdp;
 
 	cdev->si_name = cdev->__si_namebuf;
 	LIST_INIT(&cdev->si_children);
 	return (cdev);
 }
 
 void
 devfs_free(struct cdev *cdev)
 {
 	struct cdev_priv *cdp;
 
 	cdp = cdev->si_priv;
 	if (cdev->si_cred != NULL)
 		crfree(cdev->si_cred);
 	if (cdp->cdp_inode > 0)
 		free_unr(devfs_inos, cdp->cdp_inode);
 	if (cdp->cdp_maxdirent > 0) 
 		free(cdp->cdp_dirents, M_DEVFS2);
 	free(cdp, M_CDEVP);
 }
 
 struct devfs_dirent *
 devfs_find(struct devfs_dirent *dd, const char *name, int namelen)
 {
 	struct devfs_dirent *de;
 
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (namelen != de->de_dirent->d_namlen)
 			continue;
 		if (bcmp(name, de->de_dirent->d_name, namelen) != 0)
 			continue;
 		break;
 	}
 	return (de);
 }
 
 struct devfs_dirent *
 devfs_newdirent(char *name, int namelen)
 {
 	int i;
 	struct devfs_dirent *de;
 	struct dirent d;
 
 	d.d_namlen = namelen;
 	i = sizeof (*de) + GENERIC_DIRSIZ(&d); 
 	de = malloc(i, M_DEVFS3, M_WAITOK | M_ZERO);
 	de->de_dirent = (struct dirent *)(de + 1);
 	de->de_dirent->d_namlen = namelen;
 	de->de_dirent->d_reclen = GENERIC_DIRSIZ(&d);
 	bcopy(name, de->de_dirent->d_name, namelen);
 	de->de_dirent->d_name[namelen] = '\0';
 	vfs_timestamp(&de->de_ctime);
 	de->de_mtime = de->de_atime = de->de_ctime;
 	de->de_links = 1;
 	de->de_holdcnt = 1;
 #ifdef MAC
 	mac_devfs_init(de);
 #endif
 	return (de);
 }
 
 struct devfs_dirent *
 devfs_vmkdir(struct devfs_mount *dmp, char *name, int namelen, struct devfs_dirent *dotdot, u_int inode)
 {
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 
 	/* Create the new directory */
 	dd = devfs_newdirent(name, namelen);
 	TAILQ_INIT(&dd->de_dlist);
 	dd->de_dirent->d_type = DT_DIR;
 	dd->de_mode = 0555;
 	dd->de_links = 2;
 	dd->de_dir = dd;
 	if (inode != 0)
 		dd->de_inode = inode;
 	else
 		dd->de_inode = alloc_unr(devfs_inos);
 
 	/* Create the "." entry in the new directory */
 	de = devfs_newdirent(".", 1);
 	de->de_dirent->d_type = DT_DIR;
 	de->de_flags |= DE_DOT;
 	TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list);
 	de->de_dir = dd;
 
 	/* Create the ".." entry in the new directory */
 	de = devfs_newdirent("..", 2);
 	de->de_dirent->d_type = DT_DIR;
 	de->de_flags |= DE_DOTDOT;
 	TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list);
 	if (dotdot == NULL) {
 		de->de_dir = dd;
 	} else {
 		de->de_dir = dotdot;
 		TAILQ_INSERT_TAIL(&dotdot->de_dlist, dd, de_list);
 		dotdot->de_links++;
 	}
 
 #ifdef MAC
 	mac_devfs_create_directory(dmp->dm_mount, name, namelen, dd);
 #endif
 	return (dd);
 }
 
 void
 devfs_dirent_free(struct devfs_dirent *de)
 {
 	free(de, M_DEVFS3);
 }
 
 /*
  * The caller needs to hold the dm for the duration of the call since
  * dm->dm_lock may be temporary dropped.
  */
 void
 devfs_delete(struct devfs_mount *dm, struct devfs_dirent *de, int vp_locked)
 {
 	struct vnode *vp;
 	struct thread *td;
 
 	KASSERT((de->de_flags & DE_DOOMED) == 0,
 		("devfs_delete doomed dirent"));
 	td = curthread;
 	de->de_flags |= DE_DOOMED;
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(vp);
 		sx_unlock(&dm->dm_lock);
 		if (!vp_locked)
-			vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
 		else
 			VI_UNLOCK(vp);
 		vgone(vp);
 		if (!vp_locked)
 			VOP_UNLOCK(vp, 0, td);
 		vdrop(vp);
 		sx_xlock(&dm->dm_lock);
 	} else
 		mtx_unlock(&devfs_de_interlock);
 	if (de->de_symlink) {
 		free(de->de_symlink, M_DEVFS);
 		de->de_symlink = NULL;
 	}
 #ifdef MAC
 	mac_devfs_destroy(de);
 #endif
 	if (de->de_inode > DEVFS_ROOTINO) {
 		free_unr(devfs_inos, de->de_inode);
 		de->de_inode = 0;
 	}
 	if (DEVFS_DE_DROP(de))
 		devfs_dirent_free(de);
 }
 
 /*
  * Called on unmount.
  * Recursively removes the entire tree.
  * The caller needs to hold the dm for the duration of the call.
  */
 
 static void
 devfs_purge(struct devfs_mount *dm, struct devfs_dirent *dd)
 {
 	struct devfs_dirent *de;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	for (;;) {
 		de = TAILQ_FIRST(&dd->de_dlist);
 		if (de == NULL)
 			break;
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_flags & (DE_DOT|DE_DOTDOT))
 			devfs_delete(dm, de, 0);
 		else if (de->de_dirent->d_type == DT_DIR)
 			devfs_purge(dm, de);
 		else 
 			devfs_delete(dm, de, 0);
 	}
 	devfs_delete(dm, dd, 0);
 }
 
 /*
  * Each cdev_priv has an array of pointers to devfs_dirent which is indexed
  * by the mount points dm_idx.
  * This function extends the array when necessary, taking into account that
  * the default array is 1 element and not malloc'ed.
  */
 static void
 devfs_metoo(struct cdev_priv *cdp, struct devfs_mount *dm)
 {
 	struct devfs_dirent **dep;
 	int siz;
 
 	siz = (dm->dm_idx + 1) * sizeof *dep;
 	dep = malloc(siz, M_DEVFS2, M_WAITOK | M_ZERO);
 	dev_lock();
 	if (dm->dm_idx <= cdp->cdp_maxdirent) {
 		/* We got raced */
 		dev_unlock();
 		free(dep, M_DEVFS2);
 		return;
 	} 
 	memcpy(dep, cdp->cdp_dirents, (cdp->cdp_maxdirent + 1) * sizeof *dep);
 	if (cdp->cdp_maxdirent > 0)
 		free(cdp->cdp_dirents, M_DEVFS2);
 	cdp->cdp_dirents = dep;
 	/*
 	 * XXX: if malloc told us how much we actually got this could
 	 * XXX: be optimized.
 	 */
 	cdp->cdp_maxdirent = dm->dm_idx;
 	dev_unlock();
 }
 
 /*
  * The caller needs to hold the dm for the duration of the call.
  */
 static int
 devfs_populate_loop(struct devfs_mount *dm, int cleanup)
 {
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	struct devfs_dirent *dd;
 	struct cdev *pdev;
 	int j;
 	char *q, *s;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	dev_lock();
 	TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) {
 
 		KASSERT(cdp->cdp_dirents != NULL, ("NULL cdp_dirents"));
 
 		/*
 		 * If we are unmounting, or the device has been destroyed,
 		 * clean up our dirent.
 		 */
 		if ((cleanup || !(cdp->cdp_flags & CDP_ACTIVE)) &&
 		    dm->dm_idx <= cdp->cdp_maxdirent &&
 		    cdp->cdp_dirents[dm->dm_idx] != NULL) {
 			de = cdp->cdp_dirents[dm->dm_idx];
 			cdp->cdp_dirents[dm->dm_idx] = NULL;
 			KASSERT(cdp == de->de_cdp,
 			    ("%s %d %s %p %p", __func__, __LINE__,
 			    cdp->cdp_c.si_name, cdp, de->de_cdp));
 			KASSERT(de->de_dir != NULL, ("Null de->de_dir"));
 			dev_unlock();
 
 			TAILQ_REMOVE(&de->de_dir->de_dlist, de, de_list);
 			de->de_cdp = NULL;
 			de->de_inode = 0;
 			devfs_delete(dm, de, 0);
 			dev_lock();
 			cdp->cdp_inuse--;
 			dev_unlock();
 			return (1);
 		}
 		/*
 	 	 * GC any lingering devices
 		 */
 		if (!(cdp->cdp_flags & CDP_ACTIVE)) {
 			if (cdp->cdp_inuse > 0)
 				continue;
 			TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 			dev_unlock();
 			dev_rel(&cdp->cdp_c);
 			return (1);
 		}
 		/*
 		 * Don't create any new dirents if we are unmounting
 		 */
 		if (cleanup)
 			continue;
 		KASSERT((cdp->cdp_flags & CDP_ACTIVE), ("Bogons, I tell ya'!"));
 
 		if (dm->dm_idx <= cdp->cdp_maxdirent &&
 		    cdp->cdp_dirents[dm->dm_idx] != NULL) {
 			de = cdp->cdp_dirents[dm->dm_idx];
 			KASSERT(cdp == de->de_cdp, ("inconsistent cdp"));
 			continue;
 		}
 
 
 		cdp->cdp_inuse++;
 		dev_unlock();
 
 		if (dm->dm_idx > cdp->cdp_maxdirent)
 		        devfs_metoo(cdp, dm);
 
 		dd = dm->dm_rootdir;
 		s = cdp->cdp_c.si_name;
 		for (;;) {
 			for (q = s; *q != '/' && *q != '\0'; q++)
 				continue;
 			if (*q != '/')
 				break;
 			de = devfs_find(dd, s, q - s);
 			if (de == NULL)
 				de = devfs_vmkdir(dm, s, q - s, dd, 0);
 			s = q + 1;
 			dd = de;
 		}
 
 		de = devfs_newdirent(s, q - s);
 		if (cdp->cdp_c.si_flags & SI_ALIAS) {
 			de->de_uid = 0;
 			de->de_gid = 0;
 			de->de_mode = 0755;
 			de->de_dirent->d_type = DT_LNK;
 			pdev = cdp->cdp_c.si_parent;
 			j = strlen(pdev->si_name) + 1;
 			de->de_symlink = malloc(j, M_DEVFS, M_WAITOK);
 			bcopy(pdev->si_name, de->de_symlink, j);
 		} else {
 			de->de_uid = cdp->cdp_c.si_uid;
 			de->de_gid = cdp->cdp_c.si_gid;
 			de->de_mode = cdp->cdp_c.si_mode;
 			de->de_dirent->d_type = DT_CHR;
 		}
 		de->de_inode = cdp->cdp_inode;
 		de->de_cdp = cdp;
 #ifdef MAC
 		mac_devfs_create_device(cdp->cdp_c.si_cred, dm->dm_mount,
 		    &cdp->cdp_c, de);
 #endif
 		de->de_dir = dd;
 		TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list);
 		devfs_rules_apply(dm, de);
 		dev_lock();
 		/* XXX: could check that cdp is still active here */
 		KASSERT(cdp->cdp_dirents[dm->dm_idx] == NULL,
 		    ("%s %d\n", __func__, __LINE__));
 		cdp->cdp_dirents[dm->dm_idx] = de;
 		KASSERT(de->de_cdp != (void *)0xdeadc0de,
 		    ("%s %d\n", __func__, __LINE__));
 		dev_unlock();
 		return (1);
 	}
 	dev_unlock();
 	return (0);
 }
 
 /*
  * The caller needs to hold the dm for the duration of the call.
  */
 void
 devfs_populate(struct devfs_mount *dm)
 {
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	if (dm->dm_generation == devfs_generation)
 		return;
 	while (devfs_populate_loop(dm, 0))
 		continue;
 	dm->dm_generation = devfs_generation;
 }
 
 /*
  * The caller needs to hold the dm for the duration of the call.
  */
 void
 devfs_cleanup(struct devfs_mount *dm)
 {
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	while (devfs_populate_loop(dm, 1))
 		continue;
 	devfs_purge(dm, dm->dm_rootdir);
 }
 
 /*
  * devfs_create() and devfs_destroy() are called from kern_conf.c and
  * in both cases the devlock() mutex is held, so no further locking
  * is necesary and no sleeping allowed.
  */
 
 void
 devfs_create(struct cdev *dev)
 {
 	struct cdev_priv *cdp;
 
 	mtx_assert(&devmtx, MA_OWNED);
 	cdp = dev->si_priv;
 	cdp->cdp_flags |= CDP_ACTIVE;
 	cdp->cdp_inode = alloc_unrl(devfs_inos);
 	dev_refl(dev);
 	TAILQ_INSERT_TAIL(&cdevp_list, cdp, cdp_list);
 	devfs_generation++;
 }
 
 void
 devfs_destroy(struct cdev *dev)
 {
 	struct cdev_priv *cdp;
 
 	mtx_assert(&devmtx, MA_OWNED);
 	cdp = dev->si_priv;
 	cdp->cdp_flags &= ~CDP_ACTIVE;
 	devfs_generation++;
 }
 
 static void
 devfs_devs_init(void *junk __unused)
 {
 
 	devfs_inos = new_unrhdr(DEVFS_ROOTINO + 1, INT_MAX, &devmtx);
 }
 
 SYSINIT(devfs_devs, SI_SUB_DEVFS, SI_ORDER_FIRST, devfs_devs_init, NULL);
Index: head/sys/fs/devfs/devfs_vnops.c
===================================================================
--- head/sys/fs/devfs/devfs_vnops.c	(revision 175201)
+++ head/sys/fs/devfs/devfs_vnops.c	(revision 175202)
@@ -1,1412 +1,1412 @@
 /*-
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	remove empty directories
  *	mkdir: want it ?
  */
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct vop_vector devfs_specops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp)
 {
 
 	*dswp = devvn_refthread(fp->f_vnode, devp);
 	if (*devp != fp->f_data) {
 		if (*dswp != NULL)
 			dev_relthread(*devp);
 		return (ENXIO);
 	}
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 	if (*dswp == NULL)
 		return (ENXIO);
 	return (0);
 }
 
 /*
  * Construct the fully qualified path name relative to the mountpoint
  */
 static char *
 devfs_fqpn(char *buf, struct vnode *dvp, struct componentname *cnp)
 {
 	int i;
 	struct devfs_dirent *de, *dd;
 	struct devfs_mount *dmp;
 
 	dmp = VFSTODEVFS(dvp->v_mount);
 	dd = dvp->v_data;
 	i = SPECNAMELEN;
 	buf[i] = '\0';
 	i -= cnp->cn_namelen;
 	if (i < 0)
 		 return (NULL);
 	bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen);
 	de = dd;
 	while (de != dmp->dm_rootdir) {
 		i--;
 		if (i < 0)
 			 return (NULL);
 		buf[i] = '/';
 		i -= de->de_dirent->d_namlen;
 		if (i < 0)
 			 return (NULL);
 		bcopy(de->de_dirent->d_name, buf + i,
 		    de->de_dirent->d_namlen);
 		de = TAILQ_FIRST(&de->de_dlist);	/* "." */
 		de = TAILQ_NEXT(de, de_list);		/* ".." */
 		de = de->de_dir;
 	}
 	return (buf + i);
 }
 
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *de;
 
 	de = (struct devfs_dirent *)arg;
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	de->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, struct vnode **vpp, struct thread *td)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 
 	KASSERT(td == curthread, ("devfs_allocv: td != curthread"));
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
  loop:
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			if (error == 0)
 				vput(vp);
 			return (ENOENT);
 		}
 		else if (error)
 			goto loop;
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	if (de->de_dirent->d_type == DT_CHR) {
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protect by vnode lock */
 		vp->v_rdev = dev;
 		KASSERT(vp->v_usecount == 1,
 		    ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount));
 		dev->si_usecount += vp->v_usecount;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	int error;
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_mode, ap->a_cred, NULL);
 	if (!error)
 		return (error);
 	if (error != EACCES)
 		return (error);
 	/* We do, however, allow access to the controlling terminal */
 	if (!(ap->a_td->td_proc->p_flag & P_CONTROLT))
 		return (error);
 	if (ap->a_td->td_proc->p_session->s_ttyvp == de->de_vnode)
 		return (0);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_advlock(struct vop_advlock_args *ap)
 {
 
 	return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL);
 }
 
 /* ARGSUSED */
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int vp_locked, error;
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	oldvp = NULL;
 	sx_xlock(&proctree_lock);
 	if (td && vp == td->td_proc->p_session->s_ttyvp) {
 		SESS_LOCK(td->td_proc->p_session);
 		VI_LOCK(vp);
 		if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) {
 			td->td_proc->p_session->s_ttyvp = NULL;
 			oldvp = vp;
 		}
 		VI_UNLOCK(vp);
 		SESS_UNLOCK(td->td_proc->p_session);
 	}
 	sx_xunlock(&proctree_lock);
 	if (oldvp != NULL)
 		vrele(oldvp);
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev);
 	if (dsw == NULL)
 		return (ENXIO);
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/* Forced close. */
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (count_dev(dev) > 1) {
 		VI_UNLOCK(vp);
 		dev_relthread(dev);
 		return (0);
 	}
 	vholdl(vp);
 	VI_UNLOCK(vp);
 	vp_locked = VOP_ISLOCKED(vp, td);
 	VOP_UNLOCK(vp, 0, td);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else {
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 	}
 	dev_relthread(dev);
-	vn_lock(vp, vp_locked | LK_RETRY, td);
+	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 
 	return (vnops.fo_close(fp, td));
 }
 
 /* ARGSUSED */
 static int
 devfs_fsync(struct vop_fsync_args *ap)
 {
 	if (!vn_isdisk(ap->a_vp, NULL))
 		return (0);
 
 	return (vop_stdfsync(ap));
 }
 
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	struct devfs_dirent *de;
 	struct cdev *dev;
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	bzero((caddr_t) vap, sizeof(*vap));
 	vattr_null(vap);
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = dev->si_priv->cdp_inode;
 	}
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct vnode *vp;
 	struct vnode *vpold;
 	int error, i;
 	const char *p;
 	struct fiodgname_arg *fgn;
 
 	error = devfs_fp_check(fp, &dev, &dsw);
 	if (error)
 		return (error);
 
 	if (com == FIODTYPE) {
 		*(int *)data = dsw->d_flags & D_TYPEMASK;
 		dev_relthread(dev);
 		return (0);
 	} else if (com == FIODGNAME) {
 		fgn = data;
 		p = devtoname(dev);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fgn->buf, i);
 		dev_relthread(dev);
 		return (error);
 	}
 	error = dsw->d_ioctl(dev, com, data, fp->f_flag, td);
 	dev_relthread(dev);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 	if (error == 0 && com == TIOCSCTTY) {
 		vp = fp->f_vnode;
 
 		/* Do nothing if reassigning same control tty */
 		sx_slock(&proctree_lock);
 		if (td->td_proc->p_session->s_ttyvp == vp) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		mtx_lock(&Giant);	/* XXX TTY */
 
 		vpold = td->td_proc->p_session->s_ttyvp;
 		VREF(vp);
 		SESS_LOCK(td->td_proc->p_session);
 		td->td_proc->p_session->s_ttyvp = vp;
 		SESS_UNLOCK(td->td_proc->p_session);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			vrele(vpold);
 		mtx_unlock(&Giant);	/* XXX TTY */
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error;
 
 	error = devfs_fp_check(fp, &dev, &dsw);
 	if (error)
 		return (error);
 	error = dsw->d_kqfilter(dev, kn);
 	dev_relthread(dev);
 	return (error);
 }
 
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct cdev *cdev;
 	int error, flags, nameiop;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	dmp = VFSTODEVFS(dvp->v_mount);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		VOP_UNLOCK(dvp, 0, td);
 		de = TAILQ_FIRST(&dd->de_dlist);	/* "." */
 		de = TAILQ_NEXT(de, de_list);		/* ".." */
 		de = de->de_dir;
 		error = devfs_allocv(de, dvp->v_mount, vpp, td);
 		*dm_unlock = 0;
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		*dm_unlock = 0;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dvp, cnp);
 		if (pname == NULL)
 			break;
 
 		cdev = NULL;
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 		sx_xlock(&dmp->dm_lock);
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			return (ENOENT);
 		}
 		if (cdev == NULL)
 			break;
 
 		DEVFS_DMP_HOLD(dmp);
 		devfs_populate(dmp);
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			return (ENOENT);
 		}
 
 		dev_lock();
 		dde = &cdev->si_priv->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, dvp->v_mount, vpp, td);
 	*dm_unlock = 0;
 	return (error);
 }
 
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	int j;
 	struct devfs_mount *dmp;
 	int dm_unlock;
 
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	dm_unlock = 1;
 	sx_xlock(&dmp->dm_lock);
 	j = devfs_lookupx(ap, &dm_unlock);
 	if (dm_unlock == 1)
 		sx_xunlock(&dmp->dm_lock);
 	return (j);
 }
 
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	td = cnp->cn_thread;
 	dd = dvp->v_data;
 
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	error = devfs_allocv(de, dvp->v_mount, vpp, td);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error;
 	struct cdevsw *dsw;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev);
 	if (dsw == NULL)
 		return (ENXIO);
 
 	/* XXX: Special casing of ttys for deadfs.  Probably redundant. */
 	if (dsw->d_flags & D_TTY)
 		vp->v_vflag |= VV_ISTTY;
 
 	VOP_UNLOCK(vp, 0, td);
 
 	if(!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		if (dsw->d_fdopen != NULL)
 			error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 		else
 			error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else {
 		if (dsw->d_fdopen != NULL)
 			error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 		else
 			error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	}
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	dev_relthread(dev);
 
 	if (error)
 		return (error);
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL,
 	     ("Could not vnode bypass device on NULL fp"));
 #else
 	if(fp == NULL)
 		return (error);
 #endif
 	KASSERT(fp->f_ops == &badfileops,
 	     ("Could not vnode bypass device on fdops %p", fp->f_ops));
 	finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	return (error);
 }
 
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		/*
 		 * If MAC is enabled, devfs automatically supports
 		 * trivial non-persistant label storage.
 		 */
 		*ap->a_retval = 1;
 #else
 		*ap->a_retval = 0;
 #endif
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error;
 
 	error = devfs_fp_check(fp, &dev, &dsw);
 	if (error)
 		return (error);
 	error = dsw->d_poll(dev, events, td);
 	dev_relthread(dev);
 	return(error);
 }
 
 /*
  * Print out the contents of a special device vnode.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 
 	printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev));
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int ioflag, error, resid;
 	struct cdevsw *dsw;
 
 	error = devfs_fp_check(fp, &dev, &dsw);
 	if (error)
 		return (error);
 	resid = uio->uio_resid;
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		vfs_timestamp(&dev->si_atime);
 	dev_relthread(dev);
 
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	return (error);
 }
 
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off, oldoff;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on 
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	oldoff = uio->uio_offset;
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & DE_WHITEOUT)
 			continue;
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *de;
 
 	de = ap->a_vp->v_data;
 	return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio));
 }
 
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	struct cdev *dev;
 
 	mtx_lock(&devfs_de_interlock);
 	de = vp->v_data;
 	if (de != NULL) {
 		de->de_vnode = NULL;
 		vp->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 
 	vnode_destroy_vobject(vp);
 
 	VI_LOCK(vp);
 	dev_lock();
 	dev = vp->v_rdev;
 	vp->v_rdev = NULL;
 
 	if (dev == NULL) {
 		dev_unlock();
 		VI_UNLOCK(vp);
 		return (0);
 	}
 
 	dev->si_usecount -= vp->v_usecount;
 	dev_unlock();
 	VI_UNLOCK(vp);
 	dev_rel(dev);
 	return (0);
 }
 
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		devfs_delete(dmp, de, 1);
 	} else {
 		de->de_flags |= DE_WHITEOUT;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = dev->si_priv;
  
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp,0,curthread);
  loop:
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				dev_unlock();
 				VI_LOCK(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK,
 				    curthread))
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			} 
 		}
 		if (vp2 != NULL) {
 			continue;
 		}
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	int error;
 	struct devfs_mount *dmp;
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(ap->a_td, PRIV_VFS_CHOWN);
 			if (error)
 				return (error);
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(ap->a_td, PRIV_VFS_ADMIN);
 			if (error)
 				return (error);
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 	return (0);
 }
 
 #ifdef MAC
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_dirent *de;
 
 	vp = ap->a_vp;
 	de = vp->v_data;
 
 	mac_vnode_relabel(ap->a_cred, vp, ap->a_label);
 	mac_devfs_update(vp->v_mount, de, vp);
 
 	return (0);
 }
 #endif
 
 static int
 devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_stat(fp, sb, cred, td));
 }
 
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct thread *td;
 
 	td = ap->a_cnp->cn_thread;
 	KASSERT(td == curthread, ("devfs_symlink: td != curthread"));
 
 	error = priv_check(td, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dirent->d_type = DT_LNK;
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 	sx_xlock(&dmp->dm_lock);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list);
 	return (devfs_allocv(de, ap->a_dvp->v_mount, ap->a_vpp, td));
 }
 
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_truncate(fp, length, cred, td));
 }
 
 /* ARGSUSED */
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int error, ioflag, resid;
 	struct cdevsw *dsw;
 
 	error = devfs_fp_check(fp, &dev, &dsw);
 	if (error)
 		return (error);
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 
 	resid = uio->uio_resid;
 
 	error = dsw->d_write(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		vfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	dev_relthread(dev);
 
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	return (error);
 }
 
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x == NULL)
 		return (NODEV);
 	return (x->si_priv->cdp_inode);
 }
 
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 };
 
 static struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_advlock =		devfs_advlock,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		devfs_fsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_lease =		VOP_NULL,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_print =		devfs_print,
 	.vop_read =		VOP_PANIC,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ 
  * flags instead since that's what open(), close() and ioctl() takes and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTS make sure nobody
  * pulls the rug out under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
Index: head/sys/fs/fdescfs/fdesc_vfsops.c
===================================================================
--- head/sys/fs/fdescfs/fdesc_vfsops.c	(revision 175201)
+++ head/sys/fs/fdescfs/fdesc_vfsops.c	(revision 175202)
@@ -1,214 +1,214 @@
 /*-
  * Copyright (c) 1992, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vfsops.c	8.4 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 static MALLOC_DEFINE(M_FDESCMNT, "fdesc_mount", "FDESC mount structure");
 
 static vfs_cmount_t	fdesc_cmount;
 static vfs_mount_t	fdesc_mount;
 static vfs_unmount_t	fdesc_unmount;
 static vfs_statfs_t	fdesc_statfs;
 static vfs_root_t	fdesc_root;
 
 /*
  * Compatibility shim for old mount(2) system call.
  */
 int
 fdesc_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
 {
 	return kernel_mount(ma, flags);
 }
 
 /*
  * Mount the per-process file descriptors (/dev/fd)
  */
 static int
 fdesc_mount(struct mount *mp, struct thread *td)
 {
 	int error = 0;
 	struct fdescmount *fmp;
 	struct vnode *rvp;
 
 	/*
 	 * Update is a no-op
 	 */
 	if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS))
 		return (EOPNOTSUPP);
 
 	error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp, td);
 	if (error)
 		return (error);
 
 	MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount),
 				M_FDESCMNT, M_WAITOK);	/* XXX */
 	rvp->v_type = VDIR;
 	rvp->v_vflag |= VV_ROOT;
 	fmp->f_root = rvp;
 	/* XXX -- don't mark as local to work around fts() problems */
 	/*mp->mnt_flag |= MNT_LOCAL;*/
 	mp->mnt_data =  fmp;
 	vfs_getnewfsid(mp);
 
 	vfs_mountedfrom(mp, "fdescfs");
 	return (0);
 }
 
 static int
 fdesc_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	int error;
 	int flags = 0;
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	/*
 	 * Clear out buffer cache.  I don't think we
 	 * ever get anything cached at this level at the
 	 * moment, but who knows...
 	 *
 	 * There is 1 extra root vnode reference corresponding
 	 * to f_root.
 	 */
 	if ((error = vflush(mp, 1, flags, td)) != 0)
 		return (error);
 
 	/*
 	 * Finally, throw away the fdescmount structure
 	 */
 	free(mp->mnt_data, M_FDESCMNT);	/* XXX */
 	mp->mnt_data = 0;
 
 	return (0);
 }
 
 static int
 fdesc_root(mp, flags, vpp, td)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct vnode *vp;
 
 	/*
 	 * Return locked reference to root.
 	 */
 	vp = VFSTOFDESC(mp)->f_root;
 	VREF(vp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	*vpp = vp;
 	return (0);
 }
 
 static int
 fdesc_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
 	struct filedesc *fdp;
 	int lim;
 	int i;
 	int last;
 	int freefd;
 
 	/*
 	 * Compute number of free file descriptors.
 	 * [ Strange results will ensue if the open file
 	 * limit is ever reduced below the current number
 	 * of open files... ]
 	 */
 	PROC_LOCK(td->td_proc);
 	lim = lim_cur(td->td_proc, RLIMIT_NOFILE);
 	PROC_UNLOCK(td->td_proc);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	last = min(fdp->fd_nfiles, lim);
 	freefd = 0;
 	for (i = fdp->fd_freefile; i < last; i++)
 		if (fdp->fd_ofiles[i] == NULL)
 			freefd++;
 
 	/*
 	 * Adjust for the fact that the fdesc array may not
 	 * have been fully allocated yet.
 	 */
 	if (fdp->fd_nfiles < lim)
 		freefd += (lim - fdp->fd_nfiles);
 	FILEDESC_SUNLOCK(fdp);
 
 	sbp->f_flags = 0;
 	sbp->f_bsize = DEV_BSIZE;
 	sbp->f_iosize = DEV_BSIZE;
 	sbp->f_blocks = 2;		/* 1K to keep df happy */
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = lim + 1;		/* Allow for "." */
 	sbp->f_ffree = freefd;		/* See comments above */
 	return (0);
 }
 
 static struct vfsops fdesc_vfsops = {
 	.vfs_cmount =		fdesc_cmount,
 	.vfs_init =		fdesc_init,
 	.vfs_mount =		fdesc_mount,
 	.vfs_root =		fdesc_root,
 	.vfs_statfs =		fdesc_statfs,
 	.vfs_unmount =		fdesc_unmount,
 };
 
 VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC);
Index: head/sys/fs/fdescfs/fdesc_vnops.c
===================================================================
--- head/sys/fs/fdescfs/fdesc_vnops.c	(revision 175201)
+++ head/sys/fs/fdescfs/fdesc_vnops.c	(revision 175202)
@@ -1,536 +1,536 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vnops.c	8.9 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>	/* boottime */
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>	/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 #define FDL_WANT	0x01
 #define FDL_LOCKED	0x02
 static int fdcache_lock;
 
 #define	NFDCACHE 4
 #define FD_NHASH(ix) \
 	(&fdhashtbl[(ix) & fdhash])
 static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl;
 static u_long fdhash;
 
 static vop_getattr_t	fdesc_getattr;
 static vop_inactive_t	fdesc_inactive;
 static vop_lookup_t	fdesc_lookup;
 static vop_open_t	fdesc_open;
 static vop_readdir_t	fdesc_readdir;
 static vop_reclaim_t	fdesc_reclaim;
 static vop_setattr_t	fdesc_setattr;
 
 static struct vop_vector fdesc_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		fdesc_getattr,
 	.vop_inactive =		fdesc_inactive,
 	.vop_lookup =		fdesc_lookup,
 	.vop_open =		fdesc_open,
 	.vop_pathconf =		vop_stdpathconf,
 	.vop_readdir =		fdesc_readdir,
 	.vop_reclaim =		fdesc_reclaim,
 	.vop_setattr =		fdesc_setattr,
 };
 
 /*
  * Initialise cache headers
  */
 int
 fdesc_init(vfsp)
 	struct vfsconf *vfsp;
 {
 
 	fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
 	return (0);
 }
 
 int
 fdesc_allocvp(ftype, ix, mp, vpp, td)
 	fdntype ftype;
 	int ix;
 	struct mount *mp;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct fdhashhead *fc;
 	struct fdescnode *fd;
 	int error = 0;
 
 	fc = FD_NHASH(ix);
 loop:
 	LIST_FOREACH(fd, fc, fd_hash) {
 		if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) {
 			if (vget(fd->fd_vnode, 0, td))
 				goto loop;
 			*vpp = fd->fd_vnode;
 			return (error);
 		}
 	}
 
 	/*
 	 * otherwise lock the array while we call getnewvnode
 	 * since that can block.
 	 */
 	if (fdcache_lock & FDL_LOCKED) {
 		fdcache_lock |= FDL_WANT;
 		(void) tsleep( &fdcache_lock, PINOD, "fdalvp", 0);
 		goto loop;
 	}
 	fdcache_lock |= FDL_LOCKED;
 
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced
 	 * elsewhere if MALLOC should block.
 	 */
 	MALLOC(fd, struct fdescnode *, sizeof(struct fdescnode), M_TEMP, M_WAITOK);
 
 	error = getnewvnode("fdesc", mp, &fdesc_vnodeops, vpp);
 	if (error) {
 		FREE(fd, M_TEMP);
 		goto out;
 	}
 	(*vpp)->v_data = fd;
 	fd->fd_vnode = *vpp;
 	fd->fd_type = ftype;
 	fd->fd_fd = -1;
 	fd->fd_ix = ix;
 	/* XXX: vnode should be locked here */
 	error = insmntque(*vpp, mp); /* XXX: Too early for mpsafe fs */
 	if (error != 0) {
 		free(fd, M_TEMP);
 		*vpp = NULLVP;
 		goto out;
 	}
 	LIST_INSERT_HEAD(fc, fd, fd_hash);
 
 out:
 	fdcache_lock &= ~FDL_LOCKED;
 
 	if (fdcache_lock & FDL_WANT) {
 		fdcache_lock &= ~FDL_WANT;
 		wakeup( &fdcache_lock);
 	}
 
 	return (error);
 }
 
 /*
  * vp is the current namei directory
  * ndp is the name to locate in that directory...
  */
 static int
 fdesc_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode * a_dvp;
 		struct vnode ** a_vpp;
 		struct componentname * a_cnp;
 	} */ *ap;
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	char *pname = cnp->cn_nameptr;
 	struct thread *td = cnp->cn_thread;
 	struct file *fp;
 	int nlen = cnp->cn_namelen;
 	u_int fd;
 	int error;
 	struct vnode *fvp;
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad;
 	}
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (VTOFDESC(dvp)->fd_type != Froot) {
 		error = ENOTDIR;
 		goto bad;
 	}
 
 	fd = 0;
 	/* the only time a leading 0 is acceptable is if it's "0" */
 	if (*pname == '0' && nlen != 1) {
 		error = ENOENT;
 		goto bad;
 	}
 	while (nlen--) {
 		if (*pname < '0' || *pname > '9') {
 			error = ENOENT;
 			goto bad;
 		}
 		fd = 10 * fd + *pname++ - '0';
 	}
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		goto bad;
 
 	error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp, td);
 	fdrop(fp, td);
 	if (error)
 		goto bad;
 	VTOFDESC(fvp)->fd_fd = fd;
 	if (fvp != dvp)
-		vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
 	*vpp = fvp;
 	return (0);
 
 bad:
 	*vpp = NULL;
 	return (error);
 }
 
 static int
 fdesc_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (0);
 
 	/*
 	 * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the the file
 	 * descriptor being sought for duplication. The error return ensures
 	 * that the vnode for this device will be released by vn_open. Open
 	 * will detect this special error and take the actions in dupfdopen.
 	 * Other callers of vn_open or VOP_OPEN will simply report the
 	 * error.
 	 */
 	ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd;	/* XXX */
 	return (ENODEV);
 }
 
 static int
 fdesc_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct file *fp;
 	struct stat stb;
 	u_int fd;
 	int error = 0;
 
 	switch (VTOFDESC(vp)->fd_type) {
 	case Froot:
 		VATTR_NULL(vap);
 
 		vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 		vap->va_type = VDIR;
 		vap->va_nlink = 2;
 		vap->va_size = DEV_BSIZE;
 		vap->va_fileid = VTOFDESC(vp)->fd_ix;
 		vap->va_uid = 0;
 		vap->va_gid = 0;
 		vap->va_blocksize = DEV_BSIZE;
 		vap->va_atime.tv_sec = boottime.tv_sec;
 		vap->va_atime.tv_nsec = 0;
 		vap->va_mtime = vap->va_atime;
 		vap->va_ctime = vap->va_mtime;
 		vap->va_gen = 0;
 		vap->va_flags = 0;
 		vap->va_rdev = 0;
 		vap->va_bytes = 0;
 		break;
 
 	case Fdesc:
 		fd = VTOFDESC(vp)->fd_fd;
 
 		if ((error = fget(ap->a_td, fd, &fp)) != 0)
 			return (error);
 
 		bzero(&stb, sizeof(stb));
 		error = fo_stat(fp, &stb, ap->a_td->td_ucred, ap->a_td);
 		fdrop(fp, ap->a_td);
 		if (error == 0) {
 			VATTR_NULL(vap);
 			vap->va_type = IFTOVT(stb.st_mode);
 			vap->va_mode = stb.st_mode;
 #define FDRX (VREAD|VEXEC)
 			if (vap->va_type == VDIR)
 				vap->va_mode &= ~((FDRX)|(FDRX>>3)|(FDRX>>6));
 #undef FDRX
 			vap->va_nlink = 1;
 			vap->va_flags = 0;
 			vap->va_bytes = stb.st_blocks * stb.st_blksize;
 			vap->va_fileid = VTOFDESC(vp)->fd_ix;
 			vap->va_size = stb.st_size;
 			vap->va_blocksize = stb.st_blksize;
 			vap->va_rdev = stb.st_rdev;
 
 			/*
 			 * If no time data is provided, use the current time.
 			 */
 			if (stb.st_atimespec.tv_sec == 0 &&
 			    stb.st_atimespec.tv_nsec == 0)
 				nanotime(&stb.st_atimespec);
 
 			if (stb.st_ctimespec.tv_sec == 0 &&
 			    stb.st_ctimespec.tv_nsec == 0)
 				nanotime(&stb.st_ctimespec);
 
 			if (stb.st_mtimespec.tv_sec == 0 &&
 			    stb.st_mtimespec.tv_nsec == 0)
 				nanotime(&stb.st_mtimespec);
 
 			vap->va_atime = stb.st_atimespec;
 			vap->va_mtime = stb.st_mtimespec;
 			vap->va_ctime = stb.st_ctimespec;
 			vap->va_uid = stb.st_uid;
 			vap->va_gid = stb.st_gid;
 		}
 		break;
 
 	default:
 		panic("fdesc_getattr");
 		break;
 	}
 
 	if (error == 0)
 		vp->v_type = vap->va_type;
 	return (error);
 }
 
 static int
 fdesc_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	unsigned fd;
 	int error;
 
 	/*
 	 * Can't mess with the root vnode
 	 */
 	if (VTOFDESC(ap->a_vp)->fd_type == Froot)
 		return (EACCES);
 
 	fd = VTOFDESC(ap->a_vp)->fd_fd;
 
 	/*
 	 * Allow setattr where there is an underlying vnode.
 	 */
 	error = getvnode(ap->a_td->td_proc->p_fd, fd, &fp);
 	if (error) {
 		/*
 		 * getvnode() returns EINVAL if the file descriptor is not
 		 * backed by a vnode.  Silently drop all changes except
 		 * chflags(2) in this case.
 		 */
 		if (error == EINVAL) {
 			if (vap->va_flags != VNOVAL)
 				error = EOPNOTSUPP;
 			else
 				error = 0;
 		}
 		return (error);
 	}
 	vp = fp->f_vnode;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) {
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
 		VOP_UNLOCK(vp, 0, ap->a_td);
 		vn_finished_write(mp);
 	}
 	fdrop(fp, ap->a_td);
 	return (error);
 }
 
 #define UIO_MX 16
 
 static int
 fdesc_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		u_long *a_cookies;
 		int a_ncookies;
 	} */ *ap;
 {
 	struct uio *uio = ap->a_uio;
 	struct filedesc *fdp;
 	struct dirent d;
 	struct dirent *dp = &d;
 	int error, i, off, fcnt;
 
 	/*
 	 * We don't allow exporting fdesc mounts, and currently local
 	 * requests do not need cookies.
 	 */
 	if (ap->a_ncookies)
 		panic("fdesc_readdir: not hungry");
 
 	if (VTOFDESC(ap->a_vp)->fd_type != Froot)
 		panic("fdesc_readdir: not dir");
 
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
 	    uio->uio_resid < UIO_MX)
 		return (EINVAL);
 	i = (u_int)off / UIO_MX;
 	fdp = uio->uio_td->td_proc->p_fd;
 	error = 0;
 
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
 	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
 		switch (i) {
 		case 0:	/* `.' */
 		case 1: /* `..' */
 			bzero((caddr_t)dp, UIO_MX);
 
 			dp->d_fileno = i + FD_ROOT;
 			dp->d_namlen = i + 1;
 			dp->d_reclen = UIO_MX;
 			bcopy("..", dp->d_name, dp->d_namlen);
 			dp->d_name[i + 1] = '\0';
 			dp->d_type = DT_DIR;
 			break;
 		default:
 			if (fdp->fd_ofiles[fcnt] == NULL) {
 				FILEDESC_SUNLOCK(fdp);
 				goto done;
 			}
 
 			bzero((caddr_t) dp, UIO_MX);
 			dp->d_namlen = sprintf(dp->d_name, "%d", fcnt);
 			dp->d_reclen = UIO_MX;
 			dp->d_type = DT_UNKNOWN;
 			dp->d_fileno = i + FD_DESC;
 			break;
 		}
 		/*
 		 * And ship to userland
 		 */
 		FILEDESC_SUNLOCK(fdp);
 		error = uiomove(dp, UIO_MX, uio);
 		if (error)
 			goto done;
 		FILEDESC_SLOCK(fdp);
 		i++;
 		fcnt++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 done:
 	uio->uio_offset = i * UIO_MX;
 	return (error);
 }
 
 static int
 fdesc_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	/*
 	 * Clear out the v_type field to avoid
 	 * nasty things happening in vgone().
 	 */
 	vp->v_type = VNON;
 	return (0);
 }
 
 static int
 fdesc_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct fdescnode *fd = VTOFDESC(vp);
 
 	LIST_REMOVE(fd, fd_hash);
 	FREE(vp->v_data, M_TEMP);
 	vp->v_data = 0;
 
 	return (0);
 }
Index: head/sys/fs/fifofs/fifo_vnops.c
===================================================================
--- head/sys/fs/fifofs/fifo_vnops.c	(revision 175201)
+++ head/sys/fs/fifofs/fifo_vnops.c	(revision 175202)
@@ -1,748 +1,748 @@
 /*-
  * Copyright (c) 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fifo_vnops.c	8.10 (Berkeley) 5/27/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/proc.h> /* XXXKSE */
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/un.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <fs/fifofs/fifo.h>
 
 static fo_rdwr_t        fifo_read_f;
 static fo_rdwr_t        fifo_write_f;
 static fo_ioctl_t       fifo_ioctl_f;
 static fo_poll_t        fifo_poll_f;
 static fo_kqfilter_t    fifo_kqfilter_f;
 static fo_stat_t        fifo_stat_f;
 static fo_close_t       fifo_close_f;
 static fo_truncate_t    fifo_truncate_f;
 
 /*
  * File-descriptor level operations for fifos.  fifo_open() installs this
  * vector on the struct file through finit().
  */
 struct fileops fifo_ops_f = {
 	.fo_close =     fifo_close_f,
 	.fo_ioctl =     fifo_ioctl_f,
 	.fo_kqfilter =  fifo_kqfilter_f,
 	.fo_poll =      fifo_poll_f,
 	.fo_read =      fifo_read_f,
 	.fo_stat =      fifo_stat_f,
 	.fo_truncate =  fifo_truncate_f,
 	.fo_write =     fifo_write_f,
 	.fo_flags =     DFLAG_PASSABLE
 };
 
 /*
  * This structure is associated with the FIFO vnode and stores
  * the state associated with the FIFO.
  *
  * It is allocated by fifo_open() on the first open and released by
  * fifo_cleanup() once both counters are zero.
  */
 struct fifoinfo {
 	struct socket	*fi_readsock;	/* socket fifo_read_f() receives from */
 	struct socket	*fi_writesock;	/* socket fifo_write_f() sends on */
 	long		fi_readers;	/* opens with FREAD outstanding */
 	long		fi_writers;	/* opens with FWRITE outstanding */
 };
 
 static vop_print_t	fifo_print;
 static vop_open_t	fifo_open;
 static vop_close_t	fifo_close;
 static vop_ioctl_t	fifo_ioctl;
 static vop_kqfilter_t	fifo_kqfilter;
 static vop_pathconf_t	fifo_pathconf;
 static vop_advlock_t	fifo_advlock;
 
 static void	filt_fifordetach(struct knote *kn);
 static int	filt_fiforead(struct knote *kn, long hint);
 static void	filt_fifowdetach(struct knote *kn);
 static int	filt_fifowrite(struct knote *kn, long hint);
 static void	filt_fifodetach_notsup(struct knote *kn);
 static int	filt_fifo_notsup(struct knote *kn, long hint);
 
 /*
  * kqueue filter tables.  fifo_kqfilter_f() selects the read or write
  * table when the descriptor has the matching access mode, and the
  * "notsup" table (never fires) otherwise.
  */
 static struct filterops fiforead_filtops = {
 	1, NULL, filt_fifordetach, filt_fiforead
 };
 static struct filterops fifowrite_filtops = {
 	1, NULL, filt_fifowdetach, filt_fifowrite
 };
 static struct filterops fifo_notsup_filtops = {
 	1, NULL, filt_fifodetach_notsup, filt_fifo_notsup
 };
 
 struct vop_vector fifo_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_EBADF,
 	.vop_advlock =		fifo_advlock,
 	.vop_close =		fifo_close,
 	.vop_create =		VOP_PANIC,
 	.vop_getattr =		VOP_EBADF,
 	.vop_ioctl =		fifo_ioctl,
 	.vop_kqfilter =		fifo_kqfilter,
 	.vop_lease =		VOP_NULL,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		fifo_open,
 	.vop_pathconf =		fifo_pathconf,
 	.vop_print =		fifo_print,
 	.vop_read =		VOP_PANIC,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		VOP_NULL,
 	.vop_remove =		VOP_PANIC,
 	.vop_rename =		VOP_PANIC,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		VOP_EBADF,
 	.vop_symlink =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
 
 /*
  * Global fifo mutex.  fifo_open() holds it while bumping the reader and
  * writer counts and passes it to msleep() as the interlock.
  */
 struct mtx fifo_mtx;
 MTX_SYSINIT(fifo, &fifo_mtx, "fifo mutex", MTX_DEF);
 
 /*
  * Release the per-vnode fifo state once nobody has the fifo open.
  *
  * Nothing happens while either the reader or the writer count is
  * non-zero.  Otherwise the fifoinfo is detached from the vnode, both
  * sockets are closed and the structure is freed.  The vnode is asserted
  * to be locked.
  */
 static void
 fifo_cleanup(struct vnode *vp)
 {
 	struct fifoinfo *fip;
 
 	fip = vp->v_fifoinfo;
 	ASSERT_VOP_LOCKED(vp, "fifo_cleanup");
 	if (fip->fi_readers != 0 || fip->fi_writers != 0)
 		return;
 
 	vp->v_fifoinfo = NULL;
 	(void)soclose(fip->fi_readsock);
 	(void)soclose(fip->fi_writesock);
 	FREE(fip, M_VNODE);
 }
 
 /*
  * Open called to set up a new instance of a fifo or
  * to find an active instance of a fifo.
  *
  * Returns EINVAL when no struct file was supplied, ENXIO for a
  * non-blocking write open with no readers, the socreate()/soconnect2()
  * error if the socket pair cannot be built, or the msleep() error if a
  * blocking open is interrupted.  On success the struct file is switched
  * to fifo_ops_f and 0 is returned.
  *
  * NOTE(review): the body reads ap->a_fp, which the argument summary
  * below does not list (it still shows a_fdidx) -- confirm and update.
  */
 /* ARGSUSED */
 static int
 fifo_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		int a_fdidx;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct fifoinfo *fip;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	struct file *fp = ap->a_fp;
 	struct socket *rso, *wso;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "fifo_open");
 	if (fp == NULL)
 		return (EINVAL);
 	/*
 	 * First open of this vnode: build the fifoinfo and a connected
 	 * pair of AF_LOCAL stream sockets (write side connected to read
 	 * side).  The fail1/fail2 labels unwind in reverse order.
 	 */
 	if ((fip = vp->v_fifoinfo) == NULL) {
 		MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK);
 		error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, cred, td);
 		if (error)
 			goto fail1;
 		fip->fi_readsock = rso;
 		error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, cred, td);
 		if (error)
 			goto fail2;
 		fip->fi_writesock = wso;
 		error = soconnect2(wso, rso);
 		if (error) {
 			(void)soclose(wso);
 fail2:
 			(void)soclose(rso);
 fail1:
 			free(fip, M_VNODE);
 			return (error);
 		}
 		fip->fi_readers = fip->fi_writers = 0;
 		wso->so_snd.sb_lowat = PIPE_BUF;
 		/* No writer yet, so the read side starts out at EOF. */
 		SOCKBUF_LOCK(&rso->so_rcv);
 		rso->so_rcv.sb_state |= SBS_CANTRCVMORE;
 		SOCKBUF_UNLOCK(&rso->so_rcv);
 		KASSERT(vp->v_fifoinfo == NULL,
 		    ("fifo_open: v_fifoinfo race"));
 		vp->v_fifoinfo = fip;
 	}
 
 	/*
 	 * General access to fi_readers and fi_writers is protected using
 	 * the vnode lock.
 	 *
 	 * Protect the increment of fi_readers and fi_writers and the
 	 * associated calls to wakeup() with the fifo mutex in addition
 	 * to the vnode lock.  This allows the vnode lock to be dropped
 	 * for the msleep() calls below, and using the fifo mutex with
 	 * msleep() prevents the wakeup from being missed.
 	 */
 	mtx_lock(&fifo_mtx);
 	if (ap->a_mode & FREAD) {
 		fip->fi_readers++;
 		/* First reader: let writers send again and wake them up. */
 		if (fip->fi_readers == 1) {
 			SOCKBUF_LOCK(&fip->fi_writesock->so_snd);
 			fip->fi_writesock->so_snd.sb_state &= ~SBS_CANTSENDMORE;
 			SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd);
 			if (fip->fi_writers > 0) {
 				wakeup(&fip->fi_writers);
 				sowwakeup(fip->fi_writesock);
 			}
 		}
 	}
 	if (ap->a_mode & FWRITE) {
 		/*
 		 * NOTE(review): this return does not call fifo_cleanup().
 		 * If the fifoinfo was created above and both counts are
 		 * still zero, it stays attached to the vnode -- confirm
 		 * whether it is reclaimed elsewhere.
 		 */
 		if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
 			mtx_unlock(&fifo_mtx);
 			return (ENXIO);
 		}
 		fip->fi_writers++;
 		/* First writer: clear EOF on the read side and wake readers. */
 		if (fip->fi_writers == 1) {
 			SOCKBUF_LOCK(&fip->fi_readsock->so_rcv);
 			fip->fi_readsock->so_rcv.sb_state &= ~SBS_CANTRCVMORE;
 			SOCKBUF_UNLOCK(&fip->fi_readsock->so_rcv);
 			if (fip->fi_readers > 0) {
 				wakeup(&fip->fi_readers);
 				sorwakeup(fip->fi_readsock);
 			}
 		}
 	}
 	/*
 	 * Blocking open: sleep until the other end shows up.  msleep() is
 	 * called with PDROP, and the code re-takes fifo_mtx itself on the
 	 * success path; the error paths return without it.
 	 */
 	if ((ap->a_mode & O_NONBLOCK) == 0) {
 		if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
 			VOP_UNLOCK(vp, 0, td);
 			error = msleep(&fip->fi_readers, &fifo_mtx,
 			    PDROP | PCATCH | PSOCK, "fifoor", 0);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (error) {
 				/* Undo our reader reference. */
 				fip->fi_readers--;
 				if (fip->fi_readers == 0) {
 					socantsendmore(fip->fi_writesock);
 					fifo_cleanup(vp);
 				}
 				return (error);
 			}
 			mtx_lock(&fifo_mtx);
 			/*
 			 * We must have got woken up because we had a writer.
 			 * That (and not still having one) is the condition
 			 * that we must wait for.
 			 */
 		}
 		if ((ap->a_mode & FWRITE) && fip->fi_readers == 0) {
 			VOP_UNLOCK(vp, 0, td);
 			error = msleep(&fip->fi_writers, &fifo_mtx,
 			    PDROP | PCATCH | PSOCK, "fifoow", 0);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (error) {
 				/* Undo our writer reference. */
 				fip->fi_writers--;
 				if (fip->fi_writers == 0) {
 					socantrcvmore(fip->fi_readsock);
 					fifo_cleanup(vp);
 				}
 				return (error);
 			}
 			/*
 			 * We must have got woken up because we had
 			 * a reader.  That (and not still having one)
 			 * is the condition that we must wait for.
 			 */
 			mtx_lock(&fifo_mtx);
 		}
 	}
 	mtx_unlock(&fifo_mtx);
 	KASSERT(fp != NULL, ("can't fifo/vnode bypass"));
 	KASSERT(fp->f_ops == &badfileops, ("not badfileops in fifo_open"));
 	/* Hand the descriptor over to the fifo file operations. */
 	finit(fp, fp->f_flag, DTYPE_FIFO, fip, &fifo_ops_f);
 	return (0);
 }
 
 /*
  * Now unused vnode ioctl routine.
  */
 /* ARGSUSED */
 static int
 fifo_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	printf("WARNING: fifo_ioctl called unexpectedly\n");
 	return (ENOTTY);
 }
 
 /*
  * Now unused vnode kqfilter routine.
  */
 /* ARGSUSED */
 static int
 fifo_kqfilter(ap)
 	struct vop_kqfilter_args /* {
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap;
 {
 
 	printf("WARNING: fifo_kqfilter called unexpectedly\n");
 	return (EINVAL);
 }
 
 static void
 filt_fifordetach(struct knote *kn)
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 }
 
 /*
  * Read filter event routine.  Stores the receive buffer byte count in
  * kn_data.  Fires with EV_EOF when the receive side is marked
  * SBS_CANTRCVMORE; otherwise clears EV_EOF and fires only if data is
  * queued.  The receive buffer lock must be held.
  */
 static int
 filt_fiforead(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = (struct socket *)kn->kn_hook;
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	kn->kn_data = so->so_rcv.sb_cc;
 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 
 	kn->kn_flags &= ~EV_EOF;
 	return (kn->kn_data > 0);
 }
 
 static void
 filt_fifowdetach(struct knote *kn)
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 }
 
 /*
  * Write filter event routine.  Stores the send buffer's free space in
  * kn_data.  Fires with EV_EOF when the send side is marked
  * SBS_CANTSENDMORE; otherwise clears EV_EOF and fires only if the free
  * space has reached sb_lowat.  The send buffer lock must be held.
  */
 static int
 filt_fifowrite(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = (struct socket *)kn->kn_hook;
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 
 	kn->kn_data = sbspace(&so->so_snd);
 	if ((so->so_snd.sb_state & SBS_CANTSENDMORE) != 0) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 
 	kn->kn_flags &= ~EV_EOF;
 	return (kn->kn_data >= so->so_snd.sb_lowat);
 }
 
 /*
  * Detach routine for the "not supported" filter table.  Nothing was
  * attached, so there is nothing to undo.
  */
 static void
 filt_fifodetach_notsup(struct knote *kn)
 {
 
 	return;
 }
 
 /*
  * Event routine for the "not supported" filter table: never reports an
  * event.
  */
 static int
 filt_fifo_notsup(struct knote *kn, long hint)
 {
 	const int fired = 0;
 
 	return (fired);
 }
 
 /*
  * Vnode close routine for fifos.
  *
  * Drops the reader and/or writer reference named by a_fflag.  When the
  * last reader goes away the write socket is told it can send no more;
  * when the last writer goes away the read socket is told it can receive
  * no more.  fifo_cleanup() then frees the state if nobody is left.
  */
 /* ARGSUSED */
 static int
 fifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct fifoinfo *fip;
 	int fflag;
 
 	vp = ap->a_vp;
 	fip = vp->v_fifoinfo;
 	fflag = ap->a_fflag;
 
 	ASSERT_VOP_LOCKED(vp, "fifo_close");
 	KASSERT(fip != NULL, ("fifo_close: no v_fifoinfo"));
 	if ((fflag & FREAD) != 0 && --fip->fi_readers == 0)
 		socantsendmore(fip->fi_writesock);
 	if ((fflag & FWRITE) != 0 && --fip->fi_writers == 0)
 		socantrcvmore(fip->fi_readsock);
 	fifo_cleanup(vp);
 
 	return (0);
 }
 
 /*
  * Print the fifo-specific part of a vnode description: the reader and
  * writer counts, or a note that no fifoinfo is attached.  Always
  * returns 0.
  */
 int
 fifo_printinfo(vp)
 	struct vnode *vp;
 {
 	struct fifoinfo *fip;
 
 	fip = vp->v_fifoinfo;
 	if (fip != NULL)
 		printf(", fifo with %ld readers and %ld writers",
 		    fip->fi_readers, fip->fi_writers);
 	else
 		printf(", NULL v_fifoinfo");
 
 	return (0);
 }
 
 /*
  * VOP_PRINT for fifos: emit the fifo details followed by a newline.
  */
 static int
 fifo_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 
 	(void)fifo_printinfo(ap->a_vp);
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to fifo's.
  */
 static int
 fifo_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = LINK_MAX;
 		return (0);
 	case _PC_PIPE_BUF:
 		*ap->a_retval = PIPE_BUF;
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Fifo advisory byte-level locks.  Never granted: flock-style requests
  * (F_FLOCK set) get EOPNOTSUPP, everything else gets EINVAL.
  */
 /* ARGSUSED */
 static int
 fifo_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 
 	if ((ap->a_flags & F_FLOCK) != 0)
 		return (EOPNOTSUPP);
 
 	return (EINVAL);
 }
 
 static int
 fifo_close_f(struct file *fp, struct thread *td)
 {
 
 	return (vnops.fo_close(fp, td));
 }
 
 /*
  * File-level ioctl for named fifos.
  *
  * O_RDWR fifo descriptors are permitted, so an ioctl may have to be
  * applied to both underlying sockets rather than just one.  The original
  * implementation simply forwarded the ioctl to one or both sockets based
  * on fp->f_flag; each ioctl is now considered separately, because
  * composing them requires careful ordering.
  *
  * Ioctls are not blindly passed through to the socket, to avoid exposing
  * ones that applications might improperly come to depend on (such as
  * socket-specific, routing, and interface ioctls).
  *
  * Unlike sys_pipe.c, fifos do not implement the deprecated TIOCSPGRP and
  * TIOCGPGRP ioctls.  Earlier implementations of fifos did forward
  * SIOCSPGRP and SIOCGPGRP, so those might need to be re-added here.
  *
  * Returns 0 for FIONBIO, the socket layer's result for FIOASYNC,
  * FIOSETOWN, FIOGETOWN and FIONREAD, and ENOTTY for everything else.
  */
 static int
 fifo_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred,
     struct thread *td)
 {
 	struct fifoinfo *fi;
 	struct file filetmp;	/* Local, so need not be locked. */
 	int error;
 
 	fi = fp->f_data;
 
 	/*
 	 * Non-blocking I/O is implemented at the fifo layer using MSG_NBIO,
 	 * so FIONBIO does not need to be forwarded down the stack.
 	 */
 	if (com == FIONBIO)
 		return (0);
 
 	/*
 	 * These socket ioctls have no ordering requirements, so they are
 	 * issued in an arbitrary order, and only on the sockets indicated
 	 * by the file descriptor rights.
 	 *
 	 * XXXRW: If O_RDWR and the read socket accepts an ioctl but the
 	 * write socket doesn't, the socketpair is left in an inconsistent
 	 * state.
 	 */
 	if (com == FIOASYNC || com == FIOSETOWN || com == FIOGETOWN) {
 		/* Result if the descriptor is neither readable nor writable. */
 		error = ENOTTY;
 		if ((fp->f_flag & FREAD) != 0) {
 			filetmp.f_data = fi->fi_readsock;
 			filetmp.f_cred = cred;
 			error = soo_ioctl(&filetmp, com, data, cred, td);
 			if (error != 0)
 				return (error);
 		}
 		if ((fp->f_flag & FWRITE) != 0) {
 			filetmp.f_data = fi->fi_writesock;
 			filetmp.f_cred = cred;
 			error = soo_ioctl(&filetmp, com, data, cred, td);
 		}
 		return (error);
 	}
 
 	/*
 	 * FIONREAD reports 0 for non-readable descriptors, and whatever the
 	 * read socket reports for readable ones.
 	 */
 	if (com == FIONREAD) {
 		if ((fp->f_flag & FREAD) == 0) {
 			*(int *)data = 0;
 			return (0);
 		}
 		filetmp.f_data = fi->fi_readsock;
 		filetmp.f_cred = cred;
 		return (soo_ioctl(&filetmp, com, data, cred, td));
 	}
 
 	return (ENOTTY);
 }
 
 /*
  * File-level kqueue attach for fifos.
  *
  * Because fifos are now a file descriptor layer object, EVFILT_VNODE is
  * not implemented.  Likely, fifo_kqfilter() should be removed, and this
  * routine should learn to forward the request to the underlying vnode
  * using f_vnode in the file descriptor.
  *
  * A read or write filter requested on a descriptor lacking that access
  * mode is accepted but wired to the "notsup" table, so it never fires.
  * Any other filter type yields EINVAL.
  */
 static int
 fifo_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct fifoinfo *fi;
 	struct socket *so;
 	struct sockbuf *sb;
 
 	fi = fp->f_data;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		if ((fp->f_flag & FREAD) == 0) {
 			kn->kn_fop = &fifo_notsup_filtops;
 			return (0);
 		}
 		kn->kn_fop = &fiforead_filtops;
 		so = fi->fi_readsock;
 		sb = &so->so_rcv;
 		break;
 	case EVFILT_WRITE:
 		if ((fp->f_flag & FWRITE) == 0) {
 			kn->kn_fop = &fifo_notsup_filtops;
 			return (0);
 		}
 		kn->kn_fop = &fifowrite_filtops;
 		so = fi->fi_writesock;
 		sb = &so->so_snd;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* Remember the socket for the detach/event routines. */
 	kn->kn_hook = (caddr_t)so;
 
 	SOCKBUF_LOCK(sb);
 	knlist_add(&sb->sb_sel.si_note, kn, 1);
 	sb->sb_flags |= SB_KNOTE;
 	SOCKBUF_UNLOCK(sb);
 
 	return (0);
 }
 
 /*
  * File-level poll for fifos.  Read-class events are polled on the read
  * socket and write-class events on the write socket, each only when the
  * descriptor has the matching access mode.  Returns the combined
  * revents mask.
  */
 static int
 fifo_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct fifoinfo *fip;
 	struct file filetmp;
 	int rmask, wmask, revents;
 
 	fip = fp->f_data;
 	revents = 0;
 
 	rmask = events &
 	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
 	if ((fp->f_flag & FREAD) && rmask) {
 		/*
 		 * If POLLIN or POLLRDNORM is requested and POLLINIGNEOF is
 		 * not, then convert the first two to the last one.  This
 		 * tells the socket poll function to ignore EOF so that we
 		 * block if there is no writer (and no data).  Callers can
 		 * set POLLINIGNEOF to get non-blocking behavior.
 		 */
 		if ((rmask & (POLLIN | POLLRDNORM)) &&
 		    !(rmask & POLLINIGNEOF)) {
 			rmask &= ~(POLLIN | POLLRDNORM);
 			rmask |= POLLINIGNEOF;
 		}
 
 		filetmp.f_data = fip->fi_readsock;
 		filetmp.f_cred = cred;
 		revents |= soo_poll(&filetmp, rmask, cred, td);
 
 		/* Reverse the above conversion. */
 		if ((revents & POLLINIGNEOF) && !(events & POLLINIGNEOF)) {
 			revents |= (events & (POLLIN | POLLRDNORM));
 			revents &= ~POLLINIGNEOF;
 		}
 	}
 
 	wmask = events & (POLLOUT | POLLWRNORM | POLLWRBAND);
 	if ((fp->f_flag & FWRITE) && wmask) {
 		filetmp.f_data = fip->fi_writesock;
 		filetmp.f_cred = cred;
 		revents |= soo_poll(&filetmp, wmask, cred, td);
 	}
 
 	return (revents);
 }
 
 /*
  * File-level read: receive from the fifo's read socket under Giant.
  * A zero-length request returns 0 immediately.  MSG_NBIO is passed when
  * the descriptor has FNONBLOCK set.
  */
 static int
 fifo_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
 {
 	struct fifoinfo *fip;
 	int error, sflags;
 
 	fip = fp->f_data;
 	KASSERT(uio->uio_rw == UIO_READ,("fifo_read mode"));
 	if (uio->uio_resid == 0)
 		return (0);
 
 	sflags = 0;
 	if ((fp->f_flag & FNONBLOCK) != 0)
 		sflags = MSG_NBIO;
 
 	mtx_lock(&Giant);
 	error = soreceive(fip->fi_readsock, NULL, uio, NULL, NULL, &sflags);
 	mtx_unlock(&Giant);
 
 	return (error);
 }
 
 static int
 fifo_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_stat(fp, sb, cred, td));
 }
 
 /*
  * File-level truncate: delegate to the generic vnode file operations.
  */
 static int
 fifo_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 	int error;
 
 	error = vnops.fo_truncate(fp, length, cred, td);
 	return (error);
 }
 
 /*
  * File-level write: send on the fifo's write socket under Giant.
  * MSG_NBIO is passed when the descriptor has FNONBLOCK set.
  */
 static int
 fifo_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
 {
 	struct fifoinfo *fip;
 	int error, sflags;
 
 	fip = fp->f_data;
 	KASSERT(uio->uio_rw == UIO_WRITE,("fifo_write mode"));
 
 	sflags = 0;
 	if ((fp->f_flag & FNONBLOCK) != 0)
 		sflags = MSG_NBIO;
 
 	mtx_lock(&Giant);
 	error = sosend(fip->fi_writesock, NULL, uio, 0, NULL, sflags, td);
 	mtx_unlock(&Giant);
 
 	return (error);
 }
Index: head/sys/fs/hpfs/hpfs_vnops.c
===================================================================
--- head/sys/fs/hpfs/hpfs_vnops.c	(revision 175201)
+++ head/sys/fs/hpfs/hpfs_vnops.c	(revision 175202)
@@ -1,1258 +1,1258 @@
 /*-
  * Copyright (c) 1998, 1999 Semen Ustimenko (semenu@FreeBSD.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/conf.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/dirent.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
 #include <sys/unistd.h> /* for pathconf(2) constants */
 
 #include <fs/hpfs/hpfs.h>
 #include <fs/hpfs/hpfsmount.h>
 #include <fs/hpfs/hpfs_subr.h>
 #include <fs/hpfs/hpfs_ioctl.h>
 
 static int	hpfs_de_uiomove(struct hpfsmount *, struct hpfsdirent *,
 				     struct uio *);
 static vop_ioctl_t	hpfs_ioctl;
 static vop_read_t	hpfs_read;
 static vop_write_t	hpfs_write;
 static vop_getattr_t	hpfs_getattr;
 static vop_setattr_t	hpfs_setattr;
 static vop_inactive_t	hpfs_inactive;
 static vop_print_t	hpfs_print;
 static vop_reclaim_t	hpfs_reclaim;
 static vop_strategy_t	hpfs_strategy;
 static vop_access_t	hpfs_access;
 static vop_open_t	hpfs_open;
 static vop_close_t	hpfs_close;
 static vop_readdir_t	hpfs_readdir;
 static vop_cachedlookup_t	hpfs_lookup;
 static vop_create_t	hpfs_create;
 static vop_remove_t	hpfs_remove;
 static vop_bmap_t	hpfs_bmap;
 static vop_fsync_t	hpfs_fsync;
 static vop_pathconf_t	hpfs_pathconf;
 static vop_vptofh_t	hpfs_vptofh;
 
 /*
  * VOP_FSYNC for hpfs: flush the vnode's dirty buffers, then write the
  * node itself.  Only the hpfs_update() status is returned.
  */
 static int
 hpfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct hpfsnode *hp;
 
 	/* Flush our dirty buffers; the result is not examined. */
 	vop_stdfsync(ap);
 
 	/* Write out the on-disc version of the vnode. */
 	hp = VTOHP(ap->a_vp);
 	return (hpfs_update(hp));
 }
 
 /*
  * Vnode ioctl handler exposing a file's extended attributes (EAs).
  *
  * HPFSIOCGEANUM: stores the number of EAs in *(u_long *)a_data.  Returns
  *	ENOENT when fn_ealen is zero (a count of 0 is still stored).
  * HPFSIOCGEASZ: on entry *(u_long *)a_data is an EA index; on success it
  *	is replaced by that EA's ea_namelen + 1 + ea_vallen.  ENOENT if
  *	there is no EA with that index.
  * HPFSIOCRDEA: a_data is a struct hpfs_rdea.  ea_no selects the EA,
  *	ea_sz receives its size, and that many bytes starting at
  *	EA_NAME() are copied out to ea_data.  A copyout() failure is
  *	returned to the caller.  ENOENT if there is no such EA.
  * Any other command: ENOTTY.
  *
  * The handler logs its arguments and every EA name it walks over with
  * printf().
  */
 static int
 hpfs_ioctl (
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct hpfsnode *hp = VTOHP(vp);
 	int error;
 
 	printf("hpfs_ioctl(0x%x, 0x%lx, 0x%p, 0x%x): ",
 		hp->h_no, ap->a_command, ap->a_data, ap->a_fflag);
 
 	switch (ap->a_command) {
 	case HPFSIOCGEANUM: {
 		u_long eanum;
 		u_long passed;
 		struct ea *eap;
 
 		eanum = 0;
 
 		if (hp->h_fn.fn_ealen > 0) {
 			eap = (struct ea *)&(hp->h_fn.fn_int);
 			passed = 0;
 
 			/* Walk the packed EA records, counting them. */
 			while (passed < hp->h_fn.fn_ealen) {
 
 				printf("EAname: %s\n", EA_NAME(eap));
 
 				eanum++;
 				passed += sizeof(struct ea) +
 					  eap->ea_namelen + 1 + eap->ea_vallen;
 				eap = (struct ea *)((caddr_t)hp->h_fn.fn_int +
 						passed);
 			}
 			error = 0;
 		} else {
 			error = ENOENT;
 		}
 
 		printf("%lu eas\n", eanum);
 
 		*(u_long *)ap->a_data = eanum;
 
 		break;
 	}
 	case HPFSIOCGEASZ: {
 		u_long eanum;
 		u_long passed;
 		struct ea *eap;
 
 		printf("EA%ld\n", *(u_long *)ap->a_data);
 
 		eanum = 0;
 		if (hp->h_fn.fn_ealen > 0) {
 			eap = (struct ea *)&(hp->h_fn.fn_int);
 			passed = 0;
 
 			/* ENOENT unless the requested index is found. */
 			error = ENOENT;
 			while (passed < hp->h_fn.fn_ealen) {
 				printf("EAname: %s\n", EA_NAME(eap));
 
 				if (eanum == *(u_long *)ap->a_data) {
 					*(u_long *)ap->a_data =
 					  	eap->ea_namelen + 1 +
 						eap->ea_vallen;
 
 					error = 0;
 					break;
 				}
 
 				eanum++;
 				passed += sizeof(struct ea) +
 					  eap->ea_namelen + 1 + eap->ea_vallen;
 				eap = (struct ea *)((caddr_t)hp->h_fn.fn_int +
 						passed);
 			}
 		} else {
 			error = ENOENT;
 		}
 
 		break;
 	}
 	case HPFSIOCRDEA: {
 		u_long eanum;
 		u_long passed;
 		struct hpfs_rdea *rdeap;
 		struct ea *eap;
 
 		rdeap = (struct hpfs_rdea *)ap->a_data;
 		printf("EA%ld\n", rdeap->ea_no);
 
 		eanum = 0;
 		if (hp->h_fn.fn_ealen > 0) {
 			eap = (struct ea *)&(hp->h_fn.fn_int);
 			passed = 0;
 
 			/* ENOENT unless the requested index is found. */
 			error = ENOENT;
 			while (passed < hp->h_fn.fn_ealen) {
 				printf("EAname: %s\n", EA_NAME(eap));
 
 				if (eanum == rdeap->ea_no) {
 					rdeap->ea_sz = eap->ea_namelen + 1 +
 							eap->ea_vallen;
 					/*
 					 * Report a faulting user buffer
 					 * instead of claiming success.
 					 */
 					error = copyout(EA_NAME(eap),
 						rdeap->ea_data, rdeap->ea_sz);
 					break;
 				}
 
 				eanum++;
 				passed += sizeof(struct ea) +
 					  eap->ea_namelen + 1 + eap->ea_vallen;
 				eap = (struct ea *)((caddr_t)hp->h_fn.fn_int +
 						passed);
 			}
 		} else {
 			error = ENOENT;
 		}
 
 		break;
 	}
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Map file offset to disk offset.
  */
 int
 hpfs_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 	register struct hpfsnode *hp = VTOHP(ap->a_vp);
 	daddr_t blkno;
 	int error;
 
 	if (ap->a_bop != NULL) 
 		*ap->a_bop = &hp->h_devvp->v_bufobj;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 	if (ap->a_bnp == NULL)
 		return (0);
 
 	dprintf(("hpfs_bmap(0x%x, 0x%x): ",hp->h_no, ap->a_bn));
 
 	error = hpfs_hpbmap (hp, ap->a_bn, &blkno, ap->a_runp);
 	*ap->a_bnp = blkno;
 
 	return (error);
 }
 
 static int
 hpfs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct hpfsnode *hp = VTOHP(vp);
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	u_int xfersz, toread;
 	u_int off;
 	daddr_t lbn, bn;
 	int resid;
 	int runl;
 	int error = 0;
 
 	resid = min (uio->uio_resid, hp->h_fn.fn_size - uio->uio_offset);
 
 	dprintf(("hpfs_read(0x%x, off: %d resid: %d, segflg: %d): [resid: 0x%x]\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg, resid));
 
 	while (resid) {
 		lbn = uio->uio_offset >> DEV_BSHIFT;
 		off = uio->uio_offset & (DEV_BSIZE - 1);
 		dprintf(("hpfs_read: resid: 0x%x lbn: 0x%x off: 0x%x\n",
 			uio->uio_resid, lbn, off));
 		error = hpfs_hpbmap(hp, lbn, &bn, &runl);
 		if (error)
 			return (error);
 
 		toread = min(off + resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE));
 		xfersz = (toread + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		dprintf(("hpfs_read: bn: 0x%x (0x%x) toread: 0x%x (0x%x)\n",
 			bn, runl, toread, xfersz));
 
 		if (toread == 0) 
 			break;
 
 		error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			break;
 		}
 
 		error = uiomove(bp->b_data + off, toread - off, uio);
 		if(error) {
 			brelse(bp);
 			break;
 		}
 		brelse(bp);
 		resid -= toread;
 	}
 	dprintf(("hpfs_read: successful\n"));
 	return (error);
 }
 
 static int
 hpfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct hpfsnode *hp = VTOHP(vp);
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	u_int xfersz, towrite;
 	u_int off;
 	daddr_t lbn, bn;
 	int runl;
 	int error = 0;
 
 	dprintf(("hpfs_write(0x%x, off: %d resid: %d, segflg: %d):\n",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg));
 
 	if (ap->a_ioflag & IO_APPEND) {
 		dprintf(("hpfs_write: APPEND mode\n"));
 		uio->uio_offset = hp->h_fn.fn_size;
 	}
 	if (uio->uio_offset + uio->uio_resid > hp->h_fn.fn_size) {
 		error = hpfs_extend (hp, uio->uio_offset + uio->uio_resid);
 		if (error) {
 			printf("hpfs_write: hpfs_extend FAILED %d\n", error);
 			return (error);
 		}
 	}
 
 	while (uio->uio_resid) {
 		lbn = uio->uio_offset >> DEV_BSHIFT;
 		off = uio->uio_offset & (DEV_BSIZE - 1);
 		dprintf(("hpfs_write: resid: 0x%x lbn: 0x%x off: 0x%x\n",
 			uio->uio_resid, lbn, off));
 		error = hpfs_hpbmap(hp, lbn, &bn, &runl);
 		if (error)
 			return (error);
 
 		towrite = min(off + uio->uio_resid, min(DFLTPHYS, (runl+1)*DEV_BSIZE));
 		xfersz = (towrite + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		dprintf(("hpfs_write: bn: 0x%x (0x%x) towrite: 0x%x (0x%x)\n",
 			bn, runl, towrite, xfersz));
 
 		if ((off == 0) && (towrite == xfersz)) {
 			bp = getblk(hp->h_devvp, bn, xfersz, 0, 0, 0);
 			clrbuf(bp);
 		} else {
 			error = bread(hp->h_devvp, bn, xfersz, NOCRED, &bp);
 			if (error) {
 				brelse(bp);
 				return (error);
 			}
 		}
 
 		error = uiomove(bp->b_data + off, towrite - off, uio);
 		if(error) {
 			brelse(bp);
 			return (error);
 		}
 
 		if (ap->a_ioflag & IO_SYNC)
 			bwrite(bp);
 		else
 			bawrite(bp);
 	}
 
 	dprintf(("hpfs_write: successful\n"));
 	return (0);
 }
 
 /*
  * VOP_GETATTR for hpfs: fill *a_vap from the in-core hpfsnode.
  *
  * If the node's parent information is not yet marked valid
  * (H_PARVALID), hpfs_validateparent() is called before the timestamps
  * are read; its error, if any, is returned.
  *
  * XXXXX do we need hpfsnode locking inside?
  */
 static int
 hpfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct hpfsnode *hp;
 	struct vattr *vap;
 	int error;
 
 	vp = ap->a_vp;
 	hp = VTOHP(vp);
 	vap = ap->a_vap;
 
 	dprintf(("hpfs_getattr(0x%x):\n", hp->h_no));
 
 	/* Identity and ownership. */
 	vap->va_fsid = dev2udev(hp->h_dev);
 	vap->va_fileid = hp->h_no;
 	vap->va_mode = hp->h_mode;
 	vap->va_nlink = 1;
 	vap->va_uid = hp->h_uid;
 	vap->va_gid = hp->h_gid;
 	vap->va_rdev = 0;				/* XXX UNODEV ? */
 
 	/* Size, and size rounded up to DEV_BSIZE plus one more block. */
 	vap->va_size = hp->h_fn.fn_size;
 	vap->va_bytes = ((hp->h_fn.fn_size + DEV_BSIZE-1) & ~(DEV_BSIZE-1)) +
 			DEV_BSIZE;
 
 	if ((hp->h_flag & H_PARVALID) == 0) {
 		error = hpfs_validateparent(hp);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Timestamps, converted from the hpfs representation. */
 	vap->va_atime = hpfstimetounix(hp->h_atime);
 	vap->va_mtime = hpfstimetounix(hp->h_mtime);
 	vap->va_ctime = hpfstimetounix(hp->h_ctime);
 
 	/* Remaining fields are constants or come from the vnode. */
 	vap->va_type = vp->v_type;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_flags = 0;
 	vap->va_gen = 0;
 	vap->va_filerev = 0;
 
 	return (0);
 }
 
 /*
  * VOP_SETATTR for hpfs.
  *
  * Only timestamps (atime/mtime) and the file size can be changed.
  * Requests touching any other attribute -- including flags, uid/gid and
  * mode -- fail with EINVAL.
  *
  * Timestamp updates require VADMIN access; with VA_UTIMES_NULL set,
  * VWRITE access is accepted as an alternative.  The access error is
  * returned and nothing is modified when the check fails.  Read-only
  * mounts yield EROFS.
  *
  * Size changes are allowed on regular files only (EISDIR for
  * directories, EINVAL for other types) and are carried out with
  * hpfs_truncate() or hpfs_extend().
  *
  * XXXXX do we need hpfsnode locking inside?
  */
 static int
 hpfs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct hpfsnode *hp = VTOHP(vp);
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = ap->a_td;
 	int error;
 
 	dprintf(("hpfs_setattr(0x%x):\n", hp->h_no));
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		dprintf(("hpfs_setattr: changing nonsettable attr\n"));
 		return (EINVAL);
 	}
 
 	/* Can't change flags XXX Could be implemented */
 	if (vap->va_flags != VNOVAL) {
 		printf("hpfs_setattr: FLAGS CANNOT BE SET\n");
 		return (EINVAL);
 	}
 
 	/* Can't change uid/gid XXX Could be implemented */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		printf("hpfs_setattr: UID/GID CANNOT BE SET\n");
 		return (EINVAL);
 	}
 
 	/* Can't change mode XXX Could be implemented */
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		printf("hpfs_setattr: MODE CANNOT BE SET\n");
 		return (EINVAL);
 	}
 
 	/* Update times */
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (vap->va_vaflags & VA_UTIMES_NULL) {
 			error = VOP_ACCESS(vp, VADMIN, cred, td);
 			if (error)
 				error = VOP_ACCESS(vp, VWRITE, cred, td);
 		} else
 			error = VOP_ACCESS(vp, VADMIN, cred, td);
 		/* Refuse the update when the caller lacks permission. */
 		if (error)
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL)
 			hp->h_atime = vap->va_atime.tv_sec;
 		if (vap->va_mtime.tv_sec != VNOVAL)
 			hp->h_mtime = vap->va_mtime.tv_sec;
 
 		hp->h_flag |= H_PARCHANGE;
 	}
 
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			printf("hpfs_setattr: WRONG v_type\n");
 			return (EINVAL);
 		}
 
 		if (vap->va_size < hp->h_fn.fn_size) {
 			/* Shrinking: drop cached buffers past the new end. */
 			error = vtruncbuf(vp, cred, td, vap->va_size, DEV_BSIZE);
 			if (error)
 				return (error);
 			error = hpfs_truncate(hp, vap->va_size);
 			if (error)
 				return (error);
 
 		} else if (vap->va_size > hp->h_fn.fn_size) {
 			/* Growing: tell the pager, then extend on disk. */
 			vnode_pager_setsize(vp, vap->va_size);
 			error = hpfs_extend(hp, vap->va_size);
 			if (error)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Last reference to a node.  If necessary, write or delete it.
  *
  * A node flagged H_CHANGE is written with hpfs_update(); a node flagged
  * H_PARCHANGE is written with hpfs_updateparent().  A failure of either
  * is returned immediately.  A node flagged H_INVAL is handed to
  * vrecycle().  Returns 0 otherwise.
  */
 int
 hpfs_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vnp = ap->a_vp;
 	struct hpfsnode *node = VTOHP(vnp);
 	int rc;
 
 	dprintf(("hpfs_inactive(0x%x): \n", node->h_no));
 
 	if ((node->h_flag & H_CHANGE) != 0) {
 		dprintf(("hpfs_inactive: node changed, update\n"));
 		rc = hpfs_update (node);
 		if (rc != 0)
 			return (rc);
 	}
 
 	if ((node->h_flag & H_PARCHANGE) != 0) {
 		dprintf(("hpfs_inactive: parent node changed, update\n"));
 		rc = hpfs_updateparent (node);
 		if (rc != 0)
 			return (rc);
 	}
 
 	if (prtactive && vrefcnt(vnp) != 0)
 		vprint("hpfs_inactive: pushing active", vnp);
 
 	/* Invalidated nodes are recycled right away. */
 	if ((node->h_flag & H_INVAL) != 0)
 		vrecycle(vnp, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Reclaim an inode so that it can be used for other purposes.
  *
  * Destroys the vnode's VM object, removes the vnode from the vfs hash,
  * destroys the hpfsnode interlock, clears v_data and frees the hpfsnode.
  * Always returns 0.
  */
 int
 hpfs_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct hpfsnode *hp = VTOHP(vp);
 
 	/* Format fixed: the old "0x%x0" appended a stray '0' to the number. */
 	dprintf(("hpfs_reclaim(0x%x): \n", hp->h_no));
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	vfs_hash_remove(vp);
 
 	mtx_destroy(&hp->h_interlock);
 
 	/* Detach the hpfsnode from the vnode before releasing its memory. */
 	vp->v_data = NULL;
 
 	FREE(hp, M_HPFSNO);
 
 	return (0);
 }
 
 /*
  * Print the fnode number of the hpfsnode behind a vnode.  Always
  * returns 0.
  */
 static int
 hpfs_print(struct vop_print_args *ap)
 {
 	struct hpfsnode *node = VTOHP(ap->a_vp);
 
 	printf("\tino 0x%x\n", node->h_no);
 	return (0);
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * Reads both ap->a_bp and ap->a_vp.  Returns the hpfs_hpbmap() error
  * after completing the buffer with BIO_ERROR set, or 0 otherwise.
  *
  * In order to be able to swap to a file, the hpfs_hpbmap operation may not
  * deadlock on memory.  See hpfs_bmap() for details. XXXXXXX (not impl)
  */
 int
 hpfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bufp = ap->a_bp;
 	struct vnode *vnp = ap->a_vp;
 	struct hpfsnode *node = VTOHP(ap->a_vp);
 	struct bufobj *devbo;
 	daddr_t physblk;
 	int rc;
 
 	dprintf(("hpfs_strategy(): \n"));
 
 	if (vnp->v_type == VBLK || vnp->v_type == VCHR)
 		panic("hpfs_strategy: spec");
 
 	/* b_blkno still equal to b_lblkno: ask hpfs_hpbmap for the mapping. */
 	if (bufp->b_blkno == bufp->b_lblkno) {
 		rc = hpfs_hpbmap (node, bufp->b_lblkno, &physblk, NULL);
 		bufp->b_blkno = physblk;
 		if (rc != 0) {
 			printf("hpfs_strategy: hpfs_bpbmap FAILED %d\n", rc);
 			bufp->b_error = rc;
 			bufp->b_ioflags |= BIO_ERROR;
 			bufdone(bufp);
 			return (rc);
 		}
 		if ((long)bufp->b_blkno == -1)
 			vfs_bio_clrbuf(bufp);
 	}
 
 	/* A block number of -1 is completed here without device I/O. */
 	if ((long)bufp->b_blkno == -1) {
 		bufdone(bufp);
 		return (0);
 	}
 
 	/* Hand the request to the mount's buffer object. */
 	bufp->b_iooffset = dbtob(bufp->b_blkno);
 	devbo = node->h_hpmp->hpm_bo;
 	BO_STRATEGY(devbo, bufp);
 	return (0);
 }
 
 /*
  * Check whether the credentials in ap->a_cred may access the vnode in
  * the way described by ap->a_mode.
  *
  * A write request on a directory, symlink or regular file is refused
  * with EROFS when the mount is read-only; other vnode types skip that
  * check.  Everything else is decided by vaccess() using the mode, uid
  * and gid stored in the hpfsnode.
  *
  * XXXXX do we need hpfsnode locking inside?
  */
 int
 hpfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vnp = ap->a_vp;
 	struct hpfsnode *node = VTOHP(vnp);
 	mode_t wanted = ap->a_mode;
 
 	dprintf(("hpfs_access(0x%x):\n", node->h_no));
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if ((wanted & VWRITE) != 0 &&
 	    (vnp->v_type == VDIR || vnp->v_type == VLNK ||
 	     vnp->v_type == VREG) &&
 	    (vnp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	return (vaccess(vnp->v_type, node->h_mode, node->h_uid, node->h_gid,
 	    ap->a_mode, ap->a_cred, NULL));
 }
 
 /*
  * Open called.
  *
  * Nothing to do: no open-time checks are performed (in particular no
  * append-only check is implemented).  With HPFS_DEBUG defined the fnode
  * number is printed.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hpfs_open(struct vop_open_args *ap)
 {
 #ifdef HPFS_DEBUG
 	printf("hpfs_open(0x%x):\n", VTOHP(ap->a_vp)->h_no);
 #endif
 
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Nothing is done here; no node state is touched.  With HPFS_DEBUG
  * defined the fnode number is printed.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hpfs_close(struct vop_close_args *ap)
 {
 #ifdef HPFS_DEBUG
 	printf("hpfs_close: %d\n", VTOHP(ap->a_vp)->h_no);
 #endif
 
 	return (0);
 }
 
 /*
  * Convert one HPFS directory entry into a struct dirent and copy it out
  * through uio.
  *
  * hpmp	mount, passed to hpfs_d2u() for per-character name translation
  * dep	HPFS directory entry to convert
  * uio	destination; always receives sizeof(struct dirent) bytes
  *
  * Returns 0 on success or the error from uiomove().
  */
 static int
 hpfs_de_uiomove (
 	struct hpfsmount *hpmp,
 	struct hpfsdirent *dep,
 	struct uio *uio)
 {
 	struct dirent cde;
 	int i, error;
 
 	dprintf(("[no: 0x%x, size: %d, name: %2d:%.*s, flag: 0x%x] ",
 		dep->de_fnode, dep->de_size, dep->de_namelen,
 		dep->de_namelen, dep->de_name, dep->de_flag));
 
 	/*
 	 * The whole fixed-size record is copied out below, so clear it
 	 * first; otherwise the bytes of d_name past the terminator would
 	 * carry whatever was previously on the stack.
 	 */
 	bzero(&cde, sizeof(cde));
 
 	/*strncpy(cde.d_name, dep->de_name, dep->de_namelen);*/
 	for (i=0; i<dep->de_namelen; i++) 
 		cde.d_name[i] = hpfs_d2u(hpmp, dep->de_name[i]);
 
 	cde.d_name[dep->de_namelen] = '\0';
 	cde.d_namlen = dep->de_namelen;
 	cde.d_fileno = dep->de_fnode;
 	cde.d_type = (dep->de_flag & DE_DIR) ? DT_DIR : DT_REG;
 	/* Every record is emitted at the full struct size. */
 	cde.d_reclen = sizeof(struct dirent);
 
 	error = uiomove((char *)&cde, sizeof(struct dirent), uio);
 	if (error)
 		return (error);
 	
 	dprintf(("[0x%x] ", uio->uio_resid));
 	return (error);
 }
 
 
 /*
  * Template records for the synthesized "." and ".." entries.
  * hpfs_readdir() stores the proper d_fileno into them before each copy-out.
  * NOTE(review): these are file-scope and written on every call; confirm
  * that concurrent readdir calls cannot interleave here.
  */
 static struct dirent hpfs_de_dot =
 	{ 0, sizeof(struct dirent), DT_DIR, 1, "." };
 static struct dirent hpfs_de_dotdot =
 	{ 0, sizeof(struct dirent), DT_DIR, 2, ".." };
 /*
  * Read directory entries into ap->a_uio as fixed-size struct dirent
  * records.
  *
  * "." and ".." are faked for offsets inside the first two records.  The
  * remaining entries come from walking the directory's dirblk tree,
  * starting at the block named by the fnode's first allocation leaf.  An
  * entry flagged DE_DOWN leads to a child block that is visited first
  * ("dive"); when a block is exhausted the walk climbs to d_parent and
  * resumes after the entry whose down pointer matches the block just
  * left.  Entries are counted (cnum) and skipped until the count reaches
  * the number implied by the starting offset (num).
  *
  * Besides the fields sketched below, the body also uses ap->a_eofflag
  * and ap->a_cookies.
  *
  * Returns 0 on success, EINVAL when a block has a bad magic or the walk
  * cannot find its way back, or the error from bread()/uiomove().
  */
 int
 hpfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_ncookies;
 		u_int **cookies;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct hpfsnode *hp = VTOHP(vp);
 	struct hpfsmount *hpmp = hp->h_hpmp;
 	struct uio *uio = ap->a_uio;
 	int ncookies = 0, i, num, cnum;
 	int error = 0;
 	off_t off;
 	struct buf *bp;
 	struct dirblk *dp;
 	struct hpfsdirent *dep;
 	lsn_t olsn;
 	lsn_t lsn;
 	int level;
 
 	dprintf(("hpfs_readdir(0x%x, 0x%x, 0x%x): ",hp->h_no,(u_int32_t)uio->uio_offset,uio->uio_resid));
 
 	/* Remember the starting offset; the cookie code below needs it. */
 	off = uio->uio_offset;
 
 	/* Offset inside the first record: emit the fake "." entry. */
 	if( uio->uio_offset < sizeof(struct dirent) ) {
 		dprintf((". faked, "));
 		hpfs_de_dot.d_fileno = hp->h_no;
 		error = uiomove((char *)&hpfs_de_dot,sizeof(struct dirent),uio);
 		if(error) {
 			return (error);
 		}
 
 		ncookies ++;
 	}
 
 	/* Offset inside the second record: emit the fake ".." entry. */
 	if( uio->uio_offset < 2 * sizeof(struct dirent) ) {
 		dprintf((".. faked, "));
 		hpfs_de_dotdot.d_fileno = hp->h_fn.fn_parent;
 
 		error = uiomove((char *)&hpfs_de_dotdot, sizeof(struct dirent),
 				uio);
 		if(error) {
 			return (error);
 		}
 
 		ncookies ++;
 	}
 
 	/* Number of on-disk entries to skip before copying out. */
 	num = uio->uio_offset / sizeof(struct dirent) - 2;
 	cnum = 0;
 
 	/* Start the walk at the block named by the first allocation leaf. */
 	lsn = ((alleaf_t *)hp->h_fn.fn_abd)->al_lsn;
 
 	/* olsn != 0 means "coming back up from block olsn". */
 	olsn = 0;
 	level = 1;
 
 dive:
 	dprintf(("[dive 0x%x] ", lsn));
 	error = bread(hp->h_devvp, lsn, D_BSIZE, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 
 	dp = (struct dirblk *) bp->b_data;
 	if (dp->d_magic != D_MAGIC) {
 		printf("hpfs_readdir: MAGIC DOESN'T MATCH\n");
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	dep = D_DIRENT(dp);
 
 	if (olsn) {
 		dprintf(("[restore 0x%x] ", olsn));
 
 		/* Find the entry whose down pointer is the block we left. */
 		while(!(dep->de_flag & DE_END) ) {
 			if((dep->de_flag & DE_DOWN) &&
 			   (olsn == DE_DOWNLSN(dep)))
 					 break;
 			dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen);
 		}
 
 		if((dep->de_flag & DE_DOWN) && (olsn == DE_DOWNLSN(dep))) {
 			/* It was the end entry: this block is finished too. */
 			if (dep->de_flag & DE_END)
 				goto blockdone;
 
 			/* Emit that entry itself, then continue after it. */
 			if (!(dep->de_flag & DE_SPECIAL)) {
 				if (num <= cnum) {
 					if (uio->uio_resid < sizeof(struct dirent)) {
 						brelse(bp);
 						dprintf(("[resid] "));
 						goto readdone;
 					}
 
 					error = hpfs_de_uiomove(hpmp, dep, uio);
 					if (error) {
 						brelse (bp);
 						return (error);
 					}
 					ncookies++;
 
 					if (uio->uio_resid < sizeof(struct dirent)) {
 						brelse(bp);
 						dprintf(("[resid] "));
 						goto readdone;
 					}
 				}
 				cnum++;
 			}
 
 			dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen);
 		} else {
 			printf("hpfs_readdir: ERROR! oLSN not found\n");
 			brelse(bp);
 			return (EINVAL);
 		}
 	}
 
 	olsn = 0;
 
 	/* Scan the rest of this block up to its end entry. */
 	while(!(dep->de_flag & DE_END)) {
 		/* Child block first: release this one and descend. */
 		if(dep->de_flag & DE_DOWN) {
 			lsn = DE_DOWNLSN(dep);
 			brelse(bp);
 			level++;
 			goto dive;
 		}
 
 		if (!(dep->de_flag & DE_SPECIAL)) {
 			if (num <= cnum) {
 				if (uio->uio_resid < sizeof(struct dirent)) {
 					brelse(bp);
 					dprintf(("[resid] "));
 					goto readdone;
 				}
 
 				error = hpfs_de_uiomove(hpmp, dep, uio);
 				if (error) {
 					brelse (bp);
 					return (error);
 				}
 				ncookies++;
 				
 				if (uio->uio_resid < sizeof(struct dirent)) {
 					brelse(bp);
 					dprintf(("[resid] "));
 					goto readdone;
 				}
 			}
 			cnum++;
 		}
 
 		dep = (hpfsdirent_t *)((caddr_t)dep + dep->de_reclen);
 	}
 
 	/* The end entry can carry a down pointer as well. */
 	if(dep->de_flag & DE_DOWN) {
 		dprintf(("[enddive] "));
 		lsn = DE_DOWNLSN(dep);
 		brelse(bp);
 		level++;
 		goto dive;
 	}
 
 blockdone:
 	/* Block exhausted: climb to the parent and resume after olsn. */
 	dprintf(("[EOB] "));
 	olsn = lsn;
 	lsn = dp->d_parent;
 	brelse(bp);
 	level--;
 
 	dprintf(("[level %d] ", level));
 
 	if (level > 0)
 		goto dive;	/* undive really */
 
 	/* Climbed out of the top block: the whole directory was read. */
 	if (ap->a_eofflag) {
 	    dprintf(("[EOF] "));
 	    *ap->a_eofflag = 1;
 	}
 
 readdone:
 	dprintf(("[readdone]\n"));
 	/*
 	 * Cookies were requested: walk the records just copied out and
 	 * store the offset following each one.
 	 */
 	if (!error && ap->a_ncookies != NULL) {
 		struct dirent* dpStart;
 		struct dirent* dp;
 		u_long *cookies;
 		u_long *cookiep;
 
 		dprintf(("%d cookies, ",ncookies));
 		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 			panic("hpfs_readdir: unexpected uio from NFS server");
 		/* Step back from the current iov position to the first record. */
 		dpStart = (struct dirent *)
 		     ((caddr_t)uio->uio_iov->iov_base -
 			 (uio->uio_offset - off));
 		MALLOC(cookies, u_long *, ncookies * sizeof(u_long),
 		       M_TEMP, M_WAITOK);
 		for (dp = dpStart, cookiep = cookies, i=0;
 		     i < ncookies;
 		     dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) {
 			off += dp->d_reclen;
 			*cookiep++ = (u_int) off;
 		}
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	}
 
 	return (0);
 }
 
 /*
  * Look up the name in ap->a_cnp inside directory ap->a_dvp and return
  * the resulting vnode through ap->a_vpp.
  *
  * "." returns the directory itself with an extra reference.  ".." (with
  * ISDOTDOT set) fetches the parent fnode via VFS_VGET.  Any other name
  * is resolved with hpfs_genlookupbyname(); on a hit the new node's
  * times and name are filled in from the directory entry and the node is
  * flagged H_PARVALID.
  *
  * Returns 0 on success, EOPNOTSUPP for operations other than LOOKUP,
  * CREATE and DELETE, EJUSTRETURN when the last component is missing and
  * is about to be created, or the error from VOP_ACCESS(), VFS_VGET() or
  * hpfs_genlookupbyname().
  */
 int
 hpfs_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	register struct vnode *dvp = ap->a_dvp;
 	register struct hpfsnode *dhp = VTOHP(dvp);
 	struct hpfsmount *hpmp = dhp->h_hpmp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int error;
 	int nameiop = cnp->cn_nameiop;
 	int flags = cnp->cn_flags;
 	dprintf(("hpfs_lookup(0x%x, %s, %ld):\n",
 		dhp->h_no, cnp->cn_nameptr, cnp->cn_namelen));
 
 	/* RENAME (and anything else) is rejected up front. */
 	if (nameiop != CREATE && nameiop != DELETE && nameiop != LOOKUP) {
 		printf("hpfs_lookup: LOOKUP, DELETE and CREATE are only supported\n");
 		return (EOPNOTSUPP);
 	}
 
 	/* Searching a directory requires execute permission on it. */
 	error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_thread);
 	if(error)
 		return (error);
 
 	if( (cnp->cn_namelen == 1) &&
 	    !strncmp(cnp->cn_nameptr,".",1) ) {
 		dprintf(("hpfs_lookup(0x%x,...): . faked\n",dhp->h_no));
 
 		VREF(dvp);
 		*ap->a_vpp = dvp;
 
 		return (0);
 	} else if( (cnp->cn_namelen == 2) &&
 	    !strncmp(cnp->cn_nameptr,"..",2) && (flags & ISDOTDOT) ) {
 		dprintf(("hpfs_lookup(0x%x,...): .. faked (0x%x)\n",
 			dhp->h_no, dhp->h_fn.fn_parent));
 
 		/*
 		 * Try the parent without sleeping first.  If that fails,
 		 * unlock dvp, fetch the parent with a blocking lock, then
 		 * relock dvp.
 		 */
 		if (VFS_VGET(hpmp->hpm_mp, dhp->h_fn.fn_parent,
 		    LK_NOWAIT | LK_EXCLUSIVE, ap->a_vpp)) {
 			VOP_UNLOCK(dvp,0,cnp->cn_thread);
 			error = VFS_VGET(hpmp->hpm_mp,
 				 dhp->h_fn.fn_parent, LK_EXCLUSIVE, ap->a_vpp); 
-			vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread);
+			vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY);
 			if (error)
 				return(error);
 		}
 		return (0);
 	} else {
 		struct buf *bp;
 		struct hpfsdirent *dep;
 		struct hpfsnode *hp;
 
 		error = hpfs_genlookupbyname(dhp,
 				cnp->cn_nameptr, cnp->cn_namelen, &bp, &dep);
 		if (error) {
 			/*
 			 * Missing last component of a create: keep the name.
 			 * NOTE(review): the RENAME half of this test cannot
 			 * be true, RENAME was rejected at the top.
 			 */
 			if ((error == ENOENT) && (flags & ISLASTCN) &&
 			    (nameiop == CREATE || nameiop == RENAME)) {
 				cnp->cn_flags |= SAVENAME;
 				return (EJUSTRETURN);
 			}
 
 			return (error);
 		}
 
 		dprintf(("hpfs_lookup: fnode: 0x%x, CPID: 0x%x\n",
 			 dep->de_fnode, dep->de_cpid));
 
 		/* Deleting the last component needs write access to dvp. */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			error = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_thread);
 			if (error) {
 				brelse(bp);
 				return (error);
 			}
 		}
 
 		/* The entry names the directory itself: return dvp. */
 		if (dhp->h_no == dep->de_fnode) {
 			brelse(bp);
 			VREF(dvp);
 			*ap->a_vpp = dvp;
 			return (0);
 		}
 
 		error = VFS_VGET(hpmp->hpm_mp, dep->de_fnode, LK_EXCLUSIVE,
 				 ap->a_vpp);
 		if (error) {
 			printf("hpfs_lookup: VFS_VGET FAILED %d\n", error);
 			brelse(bp);
 			return(error);
 		}
 
 		hp = VTOHP(*ap->a_vpp);
 
 		/* Copy times and name from the directory entry. */
 		hp->h_mtime = dep->de_mtime;
 		hp->h_ctime = dep->de_ctime;
 		hp->h_atime = dep->de_atime;
 		bcopy(dep->de_name, hp->h_name, dep->de_namelen);
 		hp->h_name[dep->de_namelen] = '\0';
 		hp->h_namelen = dep->de_namelen;
 		hp->h_flag |= H_PARVALID;
 
 		brelse(bp);
 
 		/* Cache unless this is the last component of DELETE/CREATE. */
 		if ((flags & MAKEENTRY) &&
 		    (!(flags & ISLASTCN) || 
 		     (nameiop != DELETE && nameiop != CREATE)))
 			cache_enter(dvp, *ap->a_vpp, cnp);
 	}
 	return (error);
 }
 
 int
 hpfs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	int error;
 
 	dprintf(("hpfs_remove(0x%x, %s, %ld): \n", VTOHP(ap->a_vp)->h_no,
 		ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen));
 
 	if (ap->a_vp->v_type == VDIR)
 		return (EPERM);
 
 	error = hpfs_removefnode (ap->a_dvp, ap->a_vp, ap->a_cnp);
 	return (error);
 }
 
 int
 hpfs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	dprintf(("hpfs_create(0x%x, %s, %ld): \n", VTOHP(ap->a_dvp)->h_no,
 		ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen));
 
 	if (!(ap->a_cnp->cn_flags & HASBUF)) 
 		panic ("hpfs_create: no name\n");
 
 	error = hpfs_makefnode (ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap);
 
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to an HPFS filesystem.
  *
  * The answer for ap->a_name is stored through ap->a_retval and 0 is
  * returned; names not handled here yield EINVAL and leave *a_retval
  * untouched.
  */
 int
 hpfs_pathconf(struct vop_pathconf_args *ap)
 {
 	int answer;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 	case _PC_CHOWN_RESTRICTED:
 		answer = 1;
 		break;
 	case _PC_NAME_MAX:
 		answer = HPFS_MAXFILENAME;
 		break;
 	case _PC_PATH_MAX:
 		answer = PATH_MAX;
 		break;
 	case _PC_NO_TRUNC:
 		answer = 0;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*ap->a_retval = answer;
 	return (0);
 }
 
 /*
  * Build a file handle for a vnode: the handle records its own length
  * and the node's fnode number.  No generation number is stored.
  * Always returns 0.
  */
 int
 hpfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct hpfsnode *node = VTOHP(ap->a_vp);
 	struct hpfid *handle = (struct hpfid *)ap->a_fhp;
 
 	handle->hpfid_len = sizeof(struct hpfid);
 	handle->hpfid_ino = node->h_no;
 	/* handle->hpfid_gen = node->h_gen; */
 	return (0);
 }
 
 
 /*
  * Global vfs data structures
  *
  * Vnode operations vector for HPFS.  vop_default points at
  * default_vnodeops (presumably used for any operation not listed here;
  * confirm against the VFS dispatch code).  Name lookups enter through
  * vfs_cache_lookup, with hpfs_lookup installed as the cached-lookup
  * handler.
  */
 struct vop_vector hpfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		hpfs_access,
 	.vop_bmap =		hpfs_bmap,
 	.vop_cachedlookup =	hpfs_lookup,
 	.vop_close =		hpfs_close,
 	.vop_create =		hpfs_create,
 	.vop_fsync =		hpfs_fsync,
 	.vop_getattr =		hpfs_getattr,
 	.vop_inactive =		hpfs_inactive,
 	.vop_ioctl =		hpfs_ioctl,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_open =		hpfs_open,
 	.vop_pathconf =		hpfs_pathconf,
 	.vop_print =		hpfs_print,
 	.vop_read =		hpfs_read,
 	.vop_readdir =		hpfs_readdir,
 	.vop_reclaim =		hpfs_reclaim,
 	.vop_remove =		hpfs_remove,
 	.vop_setattr =		hpfs_setattr,
 	.vop_strategy =		hpfs_strategy,
 	.vop_write =		hpfs_write,
 	.vop_vptofh =		hpfs_vptofh,
 };
Index: head/sys/fs/msdosfs/msdosfs_lookup.c
===================================================================
--- head/sys/fs/msdosfs/msdosfs_lookup.c	(revision 175201)
+++ head/sys/fs/msdosfs/msdosfs_lookup.c	(revision 175202)
@@ -1,1088 +1,1088 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 /*
  * When we search a directory the blocks containing directory entries are
  * read and examined.  The directory entries contain information that would
  * normally be in the inode of a unix filesystem.  This means that some of
  * a directory's contents may also be in memory resident denodes (sort of
  * an inode).  This can cause problems if we are searching while some other
  * process is modifying a directory.  To prevent one process from accessing
  * incompletely modified directory information we depend upon being the
  * sole owner of a directory block.  bread/brelse provide this service.
  * This being the case, when a process modifies a directory it must first
  * acquire the disk block that contains the directory entry to be modified.
  * Then update the disk block and the denode, and then write the disk block
  * out to disk.  This way disk blocks containing directory entries and in
  * memory denode's will be in synch.
  */
 int
 msdosfs_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct mbnambuf nb;
 	struct vnode *vdp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	daddr_t bn;
 	int error;
 	int slotcount;
 	int slotoffset = 0;
 	int frcn;
 	u_long cluster;
 	int blkoff;
 	int diroff;
 	int blsize;
 	int isadir;		/* ~0 if found direntry is a directory	 */
 	u_long scn;		/* starting cluster number		 */
 	struct vnode *pdp;
 	struct denode *dp;
 	struct denode *tdp;
 	struct msdosfsmount *pmp;
 	struct buf *bp = 0;
 	struct direntry *dep = NULL;
 	u_char dosfilename[12];
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	struct thread *td = cnp->cn_thread;
 	int unlen;
 
 	int wincnt = 1;
 	int chksum = -1, chksum_ok;
 	int olddos = 1;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): looking for %s\n", cnp->cn_nameptr);
 #endif
 	dp = VTODE(vdp);
 	pmp = dp->de_pmp;
 	*vpp = NULL;
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): vdp %p, dp %p, Attr %02x\n",
 	    vdp, dp, dp->de_Attributes);
 #endif
 
 	/*
 	 * If they are going after the . or .. entry in the root directory,
 	 * they won't find it.  DOS filesystems don't have them in the root
 	 * directory.  So, we fake it. deget() is in on this scam too.
 	 */
 	if ((vdp->v_vflag & VV_ROOT) && cnp->cn_nameptr[0] == '.' &&
 	    (cnp->cn_namelen == 1 ||
 		(cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.'))) {
 		isadir = ATTR_DIRECTORY;
 		scn = MSDOSFSROOT;
 #ifdef MSDOSFS_DEBUG
 		printf("msdosfs_lookup(): looking for . or .. in root directory\n");
 #endif
 		cluster = MSDOSFSROOT;
 		blkoff = MSDOSFSROOT_OFS;
 		goto foundroot;
 	}
 
 	switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename,
 	    cnp->cn_namelen, 0, pmp)) {
 	case 0:
 		return (EINVAL);
 	case 1:
 		break;
 	case 2:
 		wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr,
 		    cnp->cn_namelen, pmp) + 1;
 		break;
 	case 3:
 		olddos = 0;
 		wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr,
 		    cnp->cn_namelen, pmp) + 1;
 		break;
 	}
 	if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) {
 		wincnt = 1;
 		olddos = 1;
 	}
 	unlen = winLenFixup(cnp->cn_nameptr, cnp->cn_namelen);
 
 	/*
 	 * Suppress search for slots unless creating
 	 * file and at end of pathname, in which case
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 */
 	slotcount = wincnt;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN))
 		slotcount = 0;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): dos version of filename %s, length %ld\n",
 	    dosfilename, cnp->cn_namelen);
 #endif
 	/*
 	 * Search the directory pointed at by vdp for the name pointed at
 	 * by cnp->cn_nameptr.
 	 */
 	tdp = NULL;
 	mbnambuf_init(&nb);
 	/*
 	 * The outer loop ranges over the clusters that make up the
 	 * directory.  Note that the root directory is different from all
 	 * other directories.  It has a fixed number of blocks that are not
 	 * part of the pool of allocatable clusters.  So, we treat it a
 	 * little differently. The root directory starts at "cluster" 0.
 	 */
 	diroff = 0;
 	for (frcn = 0;; frcn++) {
 		error = pcbmap(dp, frcn, &bn, &cluster, &blsize);
 		if (error) {
 			if (error == E2BIG)
 				break;
 			return (error);
 		}
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 		for (blkoff = 0; blkoff < blsize;
 		     blkoff += sizeof(struct direntry),
 		     diroff += sizeof(struct direntry)) {
 			dep = (struct direntry *)(bp->b_data + blkoff);
 			/*
 			 * If the slot is empty and we are still looking
 			 * for an empty then remember this one.  If the
 			 * slot is not empty then check to see if it
 			 * matches what we are looking for.  If the slot
 			 * has never been filled with anything, then the
 			 * remainder of the directory has never been used,
 			 * so there is no point in searching it.
 			 */
 			if (dep->deName[0] == SLOT_EMPTY ||
 			    dep->deName[0] == SLOT_DELETED) {
 				/*
 				 * Drop memory of previous long matches
 				 */
 				chksum = -1;
 				mbnambuf_init(&nb);
 
 				if (slotcount < wincnt) {
 					slotcount++;
 					slotoffset = diroff;
 				}
 				if (dep->deName[0] == SLOT_EMPTY) {
 					brelse(bp);
 					goto notfound;
 				}
 			} else {
 				/*
 				 * If there wasn't enough space for our winentries,
 				 * forget about the empty space
 				 */
 				if (slotcount < wincnt)
 					slotcount = 0;
 
 				/*
 				 * Check for Win95 long filename entry
 				 */
 				if (dep->deAttributes == ATTR_WIN95) {
 					if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 						continue;
 
 					chksum = win2unixfn(&nb,
 					    (struct winentry *)dep, chksum,
 					    pmp);
 					continue;
 				}
 
 				chksum = winChkName(&nb,
 				    (const u_char *)cnp->cn_nameptr, unlen,
 				    chksum, pmp);
 				if (chksum == -2) {
 					chksum = -1;
 					continue;
 				}
 
 				/*
 				 * Ignore volume labels (anywhere, not just
 				 * the root directory).
 				 */
 				if (dep->deAttributes & ATTR_VOLUME) {
 					chksum = -1;
 					continue;
 				}
 
 				/*
 				 * Check for a checksum or name match
 				 */
 				chksum_ok = (chksum == winChksum(dep));
 				if (!chksum_ok
 				    && (!olddos || bcmp(dosfilename, dep->deName, 11))) {
 					chksum = -1;
 					continue;
 				}
 #ifdef MSDOSFS_DEBUG
 				printf("msdosfs_lookup(): match blkoff %d, diroff %d\n",
 				    blkoff, diroff);
 #endif
 				/*
 				 * Remember where this directory
 				 * entry came from for whoever did
 				 * this lookup.
 				 */
 				dp->de_fndoffset = diroff;
 				if (chksum_ok && nameiop == RENAME) {
 					/*
 					 * Target had correct long name
 					 * directory entries, reuse them
 					 * as needed.
 					 */
 					dp->de_fndcnt = wincnt - 1;
 				} else {
 					/*
 					 * Long name directory entries
 					 * not present or corrupt, can only
 					 * reuse dos directory entry.
 					 */
 					dp->de_fndcnt = 0;
 				}
 
 				goto found;
 			}
 		}	/* for (blkoff = 0; .... */
 		/*
 		 * Release the buffer holding the directory cluster just
 		 * searched.
 		 */
 		brelse(bp);
 	}	/* for (frcn = 0; ; frcn++) */
 
 notfound:
 	/*
 	 * We hold no disk buffers at this point.
 	 */
 
 	/*
 	 * Fixup the slot description to point to the place where
 	 * we might put the new DOS direntry (putting the Win95
 	 * long name entries before that)
 	 */
 	if (!slotcount) {
 		slotcount = 1;
 		slotoffset = diroff;
 	}
 	if (wincnt > slotcount)
 		slotoffset += sizeof(struct direntry) * (wincnt - slotcount);
 
 	/*
 	 * If we get here we didn't find the entry we were looking for. But
 	 * that's ok if we are creating or renaming and are at the end of
 	 * the pathname and the directory hasn't been removed.
 	 */
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): op %d, refcnt %ld\n",
 	    nameiop, dp->de_refcnt);
 	printf("               slotcount %d, slotoffset %d\n",
 	       slotcount, slotoffset);
 #endif
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN) && dp->de_refcnt != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 		/*
 		 * Return an indication of where the new directory
 		 * entry should be put.
 		 */
 		dp->de_fndoffset = slotoffset;
 		dp->de_fndcnt = wincnt - 1;
 
 		/*
 		 * We return with the directory locked, so that
 		 * the parameters we set up above will still be
 		 * valid if we actually decide to do a direnter().
 		 * We return ni_vp == NULL to indicate that the entry
 		 * does not currently exist; we leave a pointer to
 		 * the (locked) directory inode in ndp->ni_dvp.
 		 * The pathname buffer is saved so that the name
 		 * can be obtained later.
 		 *
 		 * NB - if the directory is unlocked, then this
 		 * information cannot be used.
 		 */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 #if 0
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 *
 	 * XXX Negative caching is broken for msdosfs because the name
 	 * cache doesn't understand peculiarities such as case insensitivity
 	 * and 8.3 filenames.  Hence, it may not invalidate all negative
 	 * entries if a file with this name is later created.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(vdp, *vpp, cnp);
 #endif
 	return (ENOENT);
 
 found:
 	/*
 	 * NOTE:  We still have the buffer with matched directory entry at
 	 * this point.
 	 */
 	isadir = dep->deAttributes & ATTR_DIRECTORY;
 	scn = getushort(dep->deStartCluster);
 	if (FAT32(pmp)) {
 		scn |= getushort(dep->deHighClust) << 16;
 		if (scn == pmp->pm_rootdirblk) {
 			/*
 			 * There should actually be 0 here.
 			 * Just ignore the error.
 			 */
 			scn = MSDOSFSROOT;
 		}
 	}
 
 	if (isadir) {
 		cluster = scn;
 		if (cluster == MSDOSFSROOT)
 			blkoff = MSDOSFSROOT_OFS;
 		else
 			blkoff = 0;
 	} else if (cluster == MSDOSFSROOT)
 		blkoff = diroff;
 
 	/*
 	 * Now release buf to allow deget to read the entry again.
 	 * Reserving it here and giving it to deget could result
 	 * in a deadlock.
 	 */
 	brelse(bp);
 	bp = 0;
 	
 foundroot:
 	/*
 	 * If we entered at foundroot, then we are looking for the . or ..
 	 * entry of the filesystems root directory.  isadir and scn were
 	 * setup before jumping here.  And, bp is already null.
 	 */
 	if (FAT32(pmp) && scn == MSDOSFSROOT)
 		scn = pmp->pm_rootdirblk;
 
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		/*
 		 * Don't allow deleting the root.
 		 */
 		if (blkoff == MSDOSFSROOT_OFS)
 			return EROFS;				/* really? XXX */
 
 		/*
 		 * Write access to directory required to delete files.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 
 		/*
 		 * Return pointer to current entry in dp->i_offset.
 		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
 		 */
 		if (dp->de_StartCluster == scn && isadir) {	/* "." */
 			VREF(vdp);
 			*vpp = vdp;
 			return (0);
 		}
 		error = deget(pmp, cluster, blkoff, &tdp);
 		if (error)
 			return (error);
 		*vpp = DETOV(tdp);
 		return (0);
 	}
 
 	/*
 	 * If rewriting (RENAME), return the inode and the
 	 * information required to rewrite the present directory
 	 * Must get inode of directory entry to verify it's a
 	 * regular file, or empty directory.
 	 */
 	if (nameiop == RENAME && (flags & ISLASTCN)) {
 		if (blkoff == MSDOSFSROOT_OFS)
 			return EROFS;				/* really? XXX */
 
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 
 		/*
 		 * Careful about locking second inode.
 		 * This can only occur if the target is ".".
 		 */
 		if (dp->de_StartCluster == scn && isadir)
 			return (EISDIR);
 
 		if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0)
 			return (error);
 		*vpp = DETOV(tdp);
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to insure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 * There is a potential race condition here if both the current
 	 * and parent directories are removed before the VFS_VGET for the
 	 * inode associated with ".." returns.  We hope that this occurs
 	 * infrequently since we cannot avoid this race condition without
 	 * implementing a sophisticated deadlock detection algorithm.
 	 * Note also that this simple deadlock detection scheme will not
 	 * work if the filesystem has any hard links other than ".."
 	 * that point backwards in the directory structure.
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
 		VOP_UNLOCK(pdp, 0, td);
 		error = deget(pmp, cluster, blkoff,  &tdp);
-		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); 
+		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); 
 		if (error)
 			return (error);
 		*vpp = DETOV(tdp);
 	} else if (dp->de_StartCluster == scn && isadir) {
 		VREF(vdp);	/* we want ourself, ie "." */
 		*vpp = vdp;
 	} else {
 		if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0)
 			return (error);
 		*vpp = DETOV(tdp);
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 /*
  * Write the on-disk directory entry for dep into directory ddep at the
  * slot recorded in ddep->de_fndoffset, extending the directory first when
  * that slot lies at or beyond the current directory size.  When
  * ddep->de_fndcnt is positive, the Win95 long-name slots are written into
  * the entries immediately preceding the short entry.
  *
  * dep  - directory entry to copy into the directory
  * ddep - directory to add to
  * depp - return the address of the denode for the created directory entry
  *	  if depp != 0
  * cnp  - componentname needed for Win95 long filenames
  *
  * Returns 0 on success, otherwise the error reported by extendfile(),
  * pcbmap(), bread(), bwrite() or deget().
  */
 int
 createde(dep, ddep, depp, cnp)
 	struct denode *dep;
 	struct denode *ddep;
 	struct denode **depp;
 	struct componentname *cnp;
 {
 	int error;
 	u_long dirclust, diroffset;
 	struct direntry *ndep;
 	struct msdosfsmount *pmp = ddep->de_pmp;
 	struct buf *bp;
 	daddr_t bn;
 	int blsize;
 
 #ifdef MSDOSFS_DEBUG
 	printf("createde(dep %p, ddep %p, depp %p, cnp %p)\n",
 	    dep, ddep, depp, cnp);
 #endif
 
 	/*
 	 * If no space left in the directory then allocate another cluster
 	 * and chain it onto the end of the file.  There is one exception
 	 * to this.  That is, if the root directory has no more space it
 	 * can NOT be expanded.  extendfile() checks for and fails attempts
 	 * to extend the root directory.  We just return an error in that
 	 * case.
 	 */
 	if (ddep->de_fndoffset >= ddep->de_FileSize) {
 		/* Number of bytes, then clusters, by which the directory must grow. */
 		diroffset = ddep->de_fndoffset + sizeof(struct direntry)
 		    - ddep->de_FileSize;
 		dirclust = de_clcount(pmp, diroffset);
 		error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR);
 		if (error) {
 			/* Trim back to the recorded size; the result is ignored. */
 			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED, NULL);
 			return error;
 		}
 
 		/*
 		 * Update the size of the directory
 		 */
 		ddep->de_FileSize += de_cn2off(pmp, dirclust);
 	}
 
 	/*
 	 * We just read in the cluster with space.  Copy the new directory
 	 * entry in.  Then write it to disk. NOTE:  DOS directories
 	 * do not get smaller as clusters are emptied.
 	 */
 	error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset),
 		       &bn, &dirclust, &blsize);
 	if (error)
 		return error;
 	/*
 	 * dirclust/diroffset computed here are what deget() is handed at the
 	 * end of the function when the new entry is not a directory.
 	 */
 	diroffset = ddep->de_fndoffset;
 	if (dirclust != MSDOSFSROOT)
 		diroffset &= pmp->pm_crbomask;
 	if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) != 0) {
 		brelse(bp);
 		return error;
 	}
 	ndep = bptoep(pmp, bp, ddep->de_fndoffset);
 
 	DE_EXTERNALIZE(ndep, dep);
 
 	/*
 	 * Now write the Win95 long name
 	 */
 	if (ddep->de_fndcnt > 0) {
 		u_int8_t chksum = winChksum(ndep);
 		const u_char *un = (const u_char *)cnp->cn_nameptr;
 		int unlen = cnp->cn_namelen;
 		int cnt = 1;
 
 		/* Each pass moves one slot backwards and fills it in. */
 		while (--ddep->de_fndcnt >= 0) {
 			if (!(ddep->de_fndoffset & pmp->pm_crbomask)) {
 				/*
 				 * The masked offset is zero, so the preceding
 				 * slot is not in the buffer we hold: write
 				 * this buffer out and read the one containing
 				 * the preceding slot.
 				 */
 				if (DETOV(ddep)->v_mount->mnt_flag & MNT_ASYNC)
 					bdwrite(bp);
 				else if ((error = bwrite(bp)) != 0)
 					return error;
 
 				ddep->de_fndoffset -= sizeof(struct direntry);
 				error = pcbmap(ddep,
 					       de_cluster(pmp,
 							  ddep->de_fndoffset),
 					       &bn, 0, &blsize);
 				if (error)
 					return error;
 
 				error = bread(pmp->pm_devvp, bn, blsize,
 					      NOCRED, &bp);
 				if (error) {
 					brelse(bp);
 					return error;
 				}
 				ndep = bptoep(pmp, bp, ddep->de_fndoffset);
 			} else {
 				/* Preceding slot is in the same buffer. */
 				ndep--;
 				ddep->de_fndoffset -= sizeof(struct direntry);
 			}
 			/* A zero return from unix2winfn() ends the loop early. */
 			if (!unix2winfn(un, unlen, (struct winentry *)ndep,
 					cnt++, chksum, pmp))
 				break;
 		}
 	}
 
 	/* Flush the last buffer touched (delayed write on MNT_ASYNC mounts). */
 	if (DETOV(ddep)->v_mount->mnt_flag & MNT_ASYNC)
 		bdwrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		return error;
 
 	/*
 	 * If they want us to return with the denode gotten.
 	 */
 	if (depp) {
 		if (dep->de_Attributes & ATTR_DIRECTORY) {
 			/*
 			 * For a directory, deget() is given the directory's
 			 * own start cluster instead of the slot location;
 			 * the FAT32 root cluster is mapped to MSDOSFSROOT.
 			 */
 			dirclust = dep->de_StartCluster;
 			if (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)
 				dirclust = MSDOSFSROOT;
 			if (dirclust == MSDOSFSROOT)
 				diroffset = MSDOSFSROOT_OFS;
 			else
 				diroffset = 0;
 		}
 		return deget(pmp, dirclust, diroffset, depp);
 	}
 
 	return 0;
 }
 
 /*
  * Be sure a directory is empty except for "." and "..". Return 1 if empty,
  * return 0 if not empty or error.
  *
  * dep - denode of the directory to scan.
  *
  * Deleted slots and entries carrying ATTR_VOLUME are skipped.  A pcbmap()
  * failure of E2BIG is treated as end of directory (empty); any other
  * pcbmap() or bread() failure is reported as "not empty".
  */
 int
 dosdirempty(dep)
 	struct denode *dep;
 {
 	int blsize;
 	int error;
 	u_long cn;
 	daddr_t bn;
 	struct buf *bp;
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 
 	/*
 	 * Since the filesize field in directory entries for a directory is
 	 * zero, we just have to feel our way through the directory until
 	 * we hit end of file.
 	 */
 	for (cn = 0;; cn++) {
 		if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) {
 			if (error == E2BIG)
 				return (1);	/* it's empty */
 			return (0);
 		}
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (0);
 		}
 		for (dentp = (struct direntry *)bp->b_data;
 		     (char *)dentp < bp->b_data + blsize;
 		     dentp++) {
 			if (dentp->deName[0] != SLOT_DELETED &&
 			    (dentp->deAttributes & ATTR_VOLUME) == 0) {
 				/*
 				 * In dos directories an entry whose name
 				 * starts with SLOT_EMPTY (0) starts the
 				 * beginning of the unused part of the
 				 * directory, so we can just return that it
 				 * is empty.
 				 */
 				if (dentp->deName[0] == SLOT_EMPTY) {
 					brelse(bp);
 					return (1);
 				}
 				/*
 				 * Any names other than "." and ".." in a
 				 * directory mean it is not empty.
 				 */
 				if (bcmp(dentp->deName, ".          ", 11) &&
 				    bcmp(dentp->deName, "..         ", 11)) {
 #ifdef MSDOSFS_DEBUG
 					/*
 					 * dentp points into bp's data, so
 					 * report the entry before the buffer
 					 * is released.
 					 */
 					printf("dosdirempty(): entry found %02x, %02x\n",
 					    dentp->deName[0], dentp->deName[1]);
 #endif
 					brelse(bp);
 					return (0);	/* not empty */
 				}
 			}
 		}
 		brelse(bp);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Check to see if the directory described by target is in some
  * subdirectory of source.  This prevents something like the following from
  * succeeding and leaving a bunch of files and directories orphaned. mv
  * /a/b/c /a/b/c/d/e/f Where c and f are directories.
  *
  * source - the inode for /a/b/c
  * target - the inode for /a/b/c/d/e/f
  *
  * Returns 0 if target is NOT a subdirectory of source.
  * Otherwise returns a non-zero error number.
  * The target inode is always unlocked on return.
  *
  * The walk starts at target and follows each directory's ".." entry
  * (the second slot of the directory's first cluster) upwards until it
  * meets source (EINVAL), reaches the root (0), or fails.
  */
 int
 doscheckpath(source, target)
 	struct denode *source;
 	struct denode *target;
 {
 	daddr_t scn;
 	struct msdosfsmount *pmp;
 	struct direntry *ep;
 	struct denode *dep;
 	struct buf *bp = NULL;
 	int error = 0;
 
 	/* dep is the directory currently being examined; out: vput()s it. */
 	dep = target;
 	if ((target->de_Attributes & ATTR_DIRECTORY) == 0 ||
 	    (source->de_Attributes & ATTR_DIRECTORY) == 0) {
 		error = ENOTDIR;
 		goto out;
 	}
 	if (dep->de_StartCluster == source->de_StartCluster) {
 		error = EEXIST;
 		goto out;
 	}
 	/* Target is the root directory: nothing above it to check. */
 	if (dep->de_StartCluster == MSDOSFSROOT)
 		goto out;
 	pmp = dep->de_pmp;
 #ifdef	DIAGNOSTIC
 	if (pmp != source->de_pmp)
 		panic("doscheckpath: source and target on different filesystems");
 #endif
 	if (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)
 		goto out;
 
 	for (;;) {
 		if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) {
 			error = ENOTDIR;
 			break;
 		}
 		scn = dep->de_StartCluster;
 		error = bread(pmp->pm_devvp, cntobn(pmp, scn),
 			      pmp->pm_bpcluster, NOCRED, &bp);
 		if (error)
 			break;
 
 		/* Second slot of the first cluster; it must be "..". */
 		ep = (struct direntry *) bp->b_data + 1;
 		if ((ep->deAttributes & ATTR_DIRECTORY) == 0 ||
 		    bcmp(ep->deName, "..         ", 11) != 0) {
 			error = ENOTDIR;
 			break;
 		}
 		/* Parent's start cluster; FAT32 keeps the high 16 bits separately. */
 		scn = getushort(ep->deStartCluster);
 		if (FAT32(pmp))
 			scn |= getushort(ep->deHighClust) << 16;
 
 		if (scn == source->de_StartCluster) {
 			error = EINVAL;
 			break;
 		}
 		if (scn == MSDOSFSROOT)
 			break;
 		if (FAT32(pmp) && scn == pmp->pm_rootdirblk) {
 			/*
 			 * scn should be 0 in this case,
 			 * but we silently ignore the error.
 			 */
 			break;
 		}
 
 		/* Release the current directory before fetching its parent. */
 		vput(DETOV(dep));
 		brelse(bp);
 		bp = NULL;
 		/* NOTE: deget() clears dep on error */
 		if ((error = deget(pmp, scn, 0, &dep)) != 0)
 			break;
 	}
 out:;
 	if (bp)
 		brelse(bp);
 	/*
 	 * NOTE(review): this message is also printed when source or target
 	 * itself is not a directory, not only for a bad ".." entry.
 	 */
 	if (error == ENOTDIR)
 		printf("doscheckpath(): .. not a directory?\n");
 	if (dep != NULL)
 		vput(DETOV(dep));
 	return (error);
 }
 
 /*
  * Fetch the disk block that holds the directory entry located at
  * (dirclust, diroffset).
  *
  * On success 0 is returned, *bpp is the buffer and, when epp is non-NULL,
  * *epp points at the entry inside that buffer.  On a bread() failure the
  * buffer is released, *bpp is set to NULL and the error is returned.
  */
 int
 readep(pmp, dirclust, diroffset, bpp, epp)
 	struct msdosfsmount *pmp;
 	u_long dirclust, diroffset;
 	struct buf **bpp;
 	struct direntry **epp;
 {
 	daddr_t blkno;
 	int nbytes;
 	int rc;
 
 	/*
 	 * A whole cluster is read by default.  Inside the root directory
 	 * area the read is shortened when a full cluster would extend past
 	 * pm_rootdirsize.
 	 */
 	nbytes = pmp->pm_bpcluster;
 	if (dirclust == MSDOSFSROOT &&
 	    de_blk(pmp, diroffset + nbytes) > pmp->pm_rootdirsize) {
 		nbytes = de_bn2off(pmp, pmp->pm_rootdirsize) &
 		    pmp->pm_crbomask;
 	}
 
 	blkno = detobn(pmp, dirclust, diroffset);
 	rc = bread(pmp->pm_devvp, blkno, nbytes, NOCRED, bpp);
 	if (rc != 0) {
 		brelse(*bpp);
 		*bpp = NULL;
 		return (rc);
 	}
 
 	if (epp != NULL)
 		*epp = bptoep(pmp, *bpp, diroffset);
 	return (0);
 }
 
 /*
  * Convenience wrapper around readep(): read the disk block containing the
  * directory entry that dep was loaded from (dep->de_dirclust,
  * dep->de_diroffset) and hand back the buffer header and the address of
  * the entry within it.  The return value is readep()'s.
  */
 int
 readde(dep, bpp, epp)
 	struct denode *dep;
 	struct buf **bpp;
 	struct direntry **epp;
 {
 	struct msdosfsmount *pmp = dep->de_pmp;
 
 	return (readep(pmp, dep->de_dirclust, dep->de_diroffset, bpp, epp));
 }
 
 /*
  * Remove a directory entry. At this point the file represented by the
  * directory entry to be removed is still full length until no one has it
  * open.  When the file is no longer being used msdosfs_inactive() is called
  * and will truncate the file to 0 length.  When the vnode containing the
  * denode is needed for some other purpose by VFS it will call
  * msdosfs_reclaim() which will remove the denode from the denode cache.
  *
  * The entry at pdep->de_fndoffset is marked SLOT_DELETED, together with
  * any Win95 long-name slots immediately preceding it (unless the mount has
  * MSDOSFSMNT_NOWIN95 set).
  *
  * Returns 0 on success or the error from pcbmap(), bread() or bwrite().
  */
 int
 removede(pdep, dep)
 	struct denode *pdep;	/* directory where the entry is removed */
 	struct denode *dep;	/* file to be removed */
 {
 	int error;
 	struct direntry *ep;
 	struct buf *bp;
 	daddr_t bn;
 	int blsize;
 	struct msdosfsmount *pmp = pdep->de_pmp;
 	u_long offset = pdep->de_fndoffset;
 
 #ifdef MSDOSFS_DEBUG
 	printf("removede(): filename %s, dep %p, offset %08lx\n",
 	    dep->de_Name, dep, offset);
 #endif
 
 	/*
 	 * NOTE(review): the reference count is dropped before any I/O, so it
 	 * stays decremented even when an error is returned below.
 	 */
 	dep->de_refcnt--;
 	/* Pre-bias offset; the loop body subtracts one slot on entry. */
 	offset += sizeof(struct direntry);
 	do {
 		offset -= sizeof(struct direntry);
 		error = pcbmap(pdep, de_cluster(pmp, offset), &bn, 0, &blsize);
 		if (error)
 			return error;
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return error;
 		}
 		ep = bptoep(pmp, bp, offset);
 		/*
 		 * Check whether, if we came here the second time, i.e.
 		 * when underflowing into the previous block, the last
 		 * entry in this block is a longfilename entry, too.
 		 */
 		if (ep->deAttributes != ATTR_WIN95
 		    && offset != pdep->de_fndoffset) {
 			brelse(bp);
 			break;
 		}
 		offset += sizeof(struct direntry);
 		while (1) {
 			/*
 			 * We are a bit aggressive here in that we delete any Win95
 			 * entries preceding this entry, not just the ones we "own".
 			 * Since these presumably aren't valid anyway,
 			 * there should be no harm.
 			 */
 			offset -= sizeof(struct direntry);
 			/*
 			 * Mark the current slot deleted, then step ep back
 			 * one slot; the checks below look at that earlier
 			 * slot.
 			 */
 			ep--->deName[0] = SLOT_DELETED;
 			if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 			    || !(offset & pmp->pm_crbomask)
 			    || ep->deAttributes != ATTR_WIN95)
 				break;
 		}
 		if (DETOV(pdep)->v_mount->mnt_flag & MNT_ASYNC)
 			bdwrite(bp);
 		else if ((error = bwrite(bp)) != 0)
 			return error;
 		/*
 		 * Go round again only when the walk stopped with a zero
 		 * masked offset and there may be earlier slots to examine.
 		 */
 	} while (!(pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 	    && !(offset & pmp->pm_crbomask)
 	    && offset);
 	return 0;
 }
 
 /*
  * Create a unique DOS name in dvp
  *
  * dep - denode of the directory the name must be unique in
  * cnp - component name to derive the DOS (8.3) name from
  * cp  - output buffer; the 11-byte DOS name is compared against directory
  *       entries from here
  *
  * Returns 0 once a name not present in the directory has been generated,
  * EINVAL if the name cannot be converted at all, EEXIST if unix2dosfn()
  * gives up on a later generation number, or the error from pcbmap() or
  * bread().  On MSDOSFSMNT_SHORTNAME mounts the name is only converted
  * (generation 0) and no directory scan is done.
  */
 int
 uniqdosname(dep, cnp, cp)
 	struct denode *dep;
 	struct componentname *cnp;
 	u_char *cp;
 {
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 	int gen;
 	int blsize;
 	u_long cn;
 	daddr_t bn;
 	struct buf *bp;
 	int error;
 	
 	if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 		return (unix2dosfn((const u_char *)cnp->cn_nameptr, cp,
 		    cnp->cn_namelen, 0, pmp) ? 0 : EINVAL);
 
 	for (gen = 1;; gen++) {
 		/*
 		 * Generate DOS name with generation number
 		 */
 		if (!unix2dosfn((const u_char *)cnp->cn_nameptr, cp,
 		    cnp->cn_namelen, gen, pmp))
 			return gen == 1 ? EINVAL : EEXIST;
 
 		/*
 		 * Now look for a dir entry with this exact name
 		 */
 		/*
 		 * error doubles as the loop control: it is set to EEXIST on
 		 * a name clash, which ends this scan and lets the outer loop
 		 * try the next generation number.
 		 */
 		for (cn = error = 0; !error; cn++) {
 			if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) {
 				if (error == E2BIG)	/* EOF reached and not found */
 					return 0;
 				return error;
 			}
 			error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 			if (error) {
 				brelse(bp);
 				return error;
 			}
 			for (dentp = (struct direntry *)bp->b_data;
 			     (char *)dentp < bp->b_data + blsize;
 			     dentp++) {
 				if (dentp->deName[0] == SLOT_EMPTY) {
 					/*
 					 * Last used entry and not found
 					 */
 					brelse(bp);
 					return 0;
 				}
 				/*
 				 * Ignore volume labels and Win95 entries
 				 */
 				if (dentp->deAttributes & ATTR_VOLUME)
 					continue;
 				if (!bcmp(dentp->deName, cp, 11)) {
 					error = EEXIST;
 					break;
 				}
 			}
 			brelse(bp);
 		}
 	}
 }
 
 /*
  * Scan directory dep for a Win'95 long filename slot.
  *
  * Returns 1 as soon as an entry whose attributes equal ATTR_WIN95 is seen.
  * If the scan ends (SLOT_EMPTY marker, or a pcbmap()/bread() failure,
  * which is currently treated just like end of directory -- XXX) the
  * result is 1 when no live entry at all was encountered and 0 when at
  * least one live, non-Win'95 entry was.
  */
 int
 findwin95(dep)
 	struct denode *dep;
 {
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *slot;
 	struct buf *blk;
 	daddr_t blkno;
 	u_long clust;
 	int nbytes;
 	int verdict = 1;	/* answer so far if the scan stops here */
 
 	for (clust = 0;; clust++) {
 		if (pcbmap(dep, clust, &blkno, 0, &nbytes) != 0)
 			return (verdict);
 		if (bread(pmp->pm_devvp, blkno, nbytes, NOCRED, &blk) != 0) {
 			brelse(blk);
 			return (verdict);
 		}
 
 		slot = (struct direntry *)blk->b_data;
 		for (; (char *)slot < blk->b_data + nbytes; slot++) {
 			/* End-of-directory marker: nothing further to see. */
 			if (slot->deName[0] == SLOT_EMPTY) {
 				brelse(blk);
 				return (verdict);
 			}
 			/*
 			 * Deleted slots are skipped.
 			 * Note: might be an indication of Win'95 anyway	XXX
 			 */
 			if (slot->deName[0] != SLOT_DELETED) {
 				if (slot->deAttributes == ATTR_WIN95) {
 					brelse(blk);
 					return 1;
 				}
 				/* A live entry that is not a Win'95 slot. */
 				verdict = 0;
 			}
 		}
 		brelse(blk);
 	}
 }
Index: head/sys/fs/msdosfs/msdosfs_vfsops.c
===================================================================
--- head/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 175201)
+++ head/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 175202)
@@ -1,969 +1,969 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/iconv.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <fs/msdosfs/bootsect.h>
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 /*
  * NULL-terminated list of the mount option names msdosfs accepts;
  * msdosfs_mount() passes it to vfs_filteropt().
  */
 static const char *msdosfs_opts[] = {
 	"async", "noatime", "noclusterr",
 	"noclusterw", "export", "force",
 	"from", "sync", "cs_dos",
 	"cs_local", "cs_win", "dirmask",
 	"gid", "kiconv", "large",
 	"longname", "longnames", "mask",
 	"shortname", "shortnames", "uid",
 	"win95", "nowin95",
 	NULL
 };
 
 #if 1 /*def PC98*/
 /*
  * XXX - The boot signature written by NEC PC-98 DOS looks like garbage
  *       or a random value :-{
  *       Define the following symbol to mount media carrying such a
  *       broken signature, even on PC/AT
  *       (e.g. mounting a PC-98 DOS formatted floppy on PC/AT).
  *       Because of the "#if 1" above the symbol is currently always
  *       defined, which compiles out the boot signature check in
  *       mountmsdosfs().
  */
 #define	MSDOSFS_NOCHECKSIG
 #endif
 
 /* Malloc types for the per-mount structure and for FAT-related data. */
 MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure");
 static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table");
 
 /*
  * Character-set conversion hooks; update_mp() only opens conversion
  * handles when this pointer is non-NULL.
  */
 struct iconv_functions *msdosfs_iconv;
 
 /* Forward declarations of the file-local helpers and VFS operations. */
 static int	update_mp(struct mount *mp, struct thread *td);
 static int	mountmsdosfs(struct vnode *devvp, struct mount *mp,
 		    struct thread *td);
 static vfs_fhtovp_t	msdosfs_fhtovp;
 static vfs_mount_t	msdosfs_mount;
 static vfs_root_t	msdosfs_root;
 static vfs_statfs_t	msdosfs_statfs;
 static vfs_sync_t	msdosfs_sync;
 static vfs_unmount_t	msdosfs_unmount;
 
 /* Maximum length of a character set name (arbitrary). */
 #define	MAXCSLEN	64
 
 /*
  * Apply the options in mp->mnt_optnew to the msdosfs mount structure:
  * character-set conversion handles, uid/gid, permission masks and the
  * short/long filename flags.  When neither a short nor a long name mode
  * was requested, the mode is chosen automatically (always long names on
  * FAT32, otherwise by scanning the root directory with findwin95()).
  *
  * Returns 0 on success, or the error from vfs_getopt() or msdosfs_root().
  */
 static int
 update_mp(struct mount *mp, struct thread *td)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	void *dos, *win, *local;
 	int error, v;
 
 	/* vfs_getopt() returning 0 means the "kiconv" option is present. */
 	if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) {
 		if (msdosfs_iconv != NULL) {
 			/* All three character set names are required. */
 			error = vfs_getopt(mp->mnt_optnew,
 			    "cs_win", &win, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_local", &local, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_dos", &dos, NULL);
 			if (!error) {
 				/*
 				 * NOTE(review): the results of these open()
 				 * calls are not checked.
 				 */
 				msdosfs_iconv->open(win, local, &pmp->pm_u2w);
 				msdosfs_iconv->open(local, win, &pmp->pm_w2u);
 				msdosfs_iconv->open(dos, local, &pmp->pm_u2d);
 				msdosfs_iconv->open(local, dos, &pmp->pm_d2u);
 			}
 			if (error != 0)
 				return (error);
 		} else {
 			/* No conversion hooks registered: clear the handles. */
 			pmp->pm_w2u = NULL;
 			pmp->pm_u2w = NULL;
 			pmp->pm_d2u = NULL;
 			pmp->pm_u2d = NULL;
 		}
 	}
 
 	/* Each value is only taken when exactly one item was scanned. */
 	if (1 == vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v))
 		pmp->pm_gid = v;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v))
 		pmp->pm_uid = v;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v))
 		pmp->pm_mask = v & ALLPERMS;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v))
 		pmp->pm_dirmask = v & ALLPERMS;
 	/* Singular and plural spellings map to the same flag. */
 	vfs_flagopt(mp->mnt_optnew, "shortname",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "shortnames",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "longname",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "longnames",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "kiconv",
 	    &pmp->pm_flags, MSDOSFSMNT_KICONV);
 
 	if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0)
 		pmp->pm_flags |= MSDOSFSMNT_NOWIN95;
 	else
 		pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95;
 
 	/* "nowin95" forces short names. */
 	if (pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 		pmp->pm_flags |= MSDOSFSMNT_SHORTNAME;
 	else if (!(pmp->pm_flags &
 	    (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) {
 		struct vnode *rootvp;
 
 		/*
 		 * Try to divine whether to support Win'95 long filenames
 		 */
 		if (FAT32(pmp))
 			pmp->pm_flags |= MSDOSFSMNT_LONGNAME;
 		else {
 			if ((error =
 			    msdosfs_root(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
 				return error;
 			pmp->pm_flags |= findwin95(VTODE(rootvp)) ?
 			    MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME;
 			vput(rootvp);
 		}
 	}
 	return 0;
 }
 
 static int
 msdosfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
 {
 	struct msdosfs_args args;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 	error = copyin(data, &args, sizeof args);
 	if (error)
 		return (error);
 
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
 	ma = mount_argf(ma, "uid", "%d", args.uid);
 	ma = mount_argf(ma, "gid", "%d", args.gid);
 	ma = mount_argf(ma, "mask", "%d", args.mask);
 	ma = mount_argf(ma, "dirmask", "%d", args.dirmask);
 
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname");
 	ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv");
 
 	ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN);
 
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /*
  * Mount an msdosfs filesystem, or update an existing mount.
  *
  * mp - mount point; the requested options are read from mp->mnt_optnew
  *      (the "from" option names the disk device to mount).
  * td - calling thread, used for credential and privilege checks.
  *
  * For MNT_UPDATE requests a read-write <-> read-only transition is
  * carried out first.  In every case the "from" device is then looked up
  * and checked, a new mount is set up through mountmsdosfs(), and the
  * remaining options are applied by update_mp().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 msdosfs_mount(struct mount *mp, struct thread *td)
 {
 	struct vnode *devvp;	  /* vnode for blk device to mount */
 	/* msdosfs specific mount control block */
 	struct msdosfsmount *pmp = NULL;
 	struct nameidata ndp;
 	int error, flags;
 	mode_t accessmode;
 	char *from;
 
 	if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts))
 		return (EINVAL);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		pmp = VFSTOMSDOSFS(mp);
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			/*
 			 * Forbid export requests if filesystem has
 			 * MSDOSFS_LARGEFS flag set.
 			 */
 			if ((pmp->pm_flags & MSDOSFS_LARGEFS) != 0) {
 				vfs_mount_error(mp,
 				    "MSDOSFS_LARGEFS flag set, cannot export");
 				return (EOPNOTSUPP);
 			}
 		}
 		if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/* Read-write -> read-only: flush everything first. */
 			error = VFS_SYNC(mp, MNT_WAIT, td);
 			if (error)
 				return (error);
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			error = vflush(mp, 0, flags, td);
 			if (error)
 				return (error);
 
 			/*
 			 * Now the volume is clean.  Mark it so while the
 			 * device is still rw.
 			 */
 			error = markvoldirty(pmp, 0);
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/* Downgrade the device from rw to ro. */
 			DROP_GIANT();
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, -1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/*
 			 * Backing out after an error was painful in the
 			 * above.  Now we are committed to succeeding.
 			 */
 			pmp->pm_fmod = 0;
 			pmp->pm_flags |= MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		} else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			devvp = pmp->pm_devvp;
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp, 0, td);
 				return (error);
 			}
 			VOP_UNLOCK(devvp, 0, td);
 			/* Upgrade the device from ro to rw. */
 			DROP_GIANT();
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, 1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error)
 				return (error);
 
 			pmp->pm_fmod = 1;
 			pmp->pm_flags &= ~MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 
 			/* Now that the volume is modifiable, mark it dirty. */
 			error = markvoldirty(pmp, 1);
 			if (error)
 				return (error);
 		}
 	}
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL))
 		return (EINVAL);
 	/* LOCKLEAF: on success devvp comes back locked and referenced. */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	error = namei(&ndp);
 	if (error)
 		return (error);
 	devvp = ndp.ni_vp;
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	accessmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accessmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		error = mountmsdosfs(devvp, mp, td);
 #ifdef MSDOSFS_DEBUG		/* only needed for the printf below */
 		pmp = VFSTOMSDOSFS(mp);
 #endif
 	} else {
 		/*
 		 * An update only needed devvp to confirm that "from" still
 		 * names the mounted device.  The vnode is still locked from
 		 * the lookup, so release lock and reference together with
 		 * vput() on both outcomes; only the pointer value is
 		 * compared afterwards.
 		 */
 		vput(devvp);
 		if (devvp != pmp->pm_devvp)
 			return (EINVAL);	/* XXX needs translation */
 	}
 	if (error) {
 		/*
 		 * Only reachable after a failed mountmsdosfs(), which has
 		 * already unlocked devvp; drop the remaining reference.
 		 */
 		vrele(devvp);
 		return (error);
 	}
 
 	error = update_mp(mp, td);
 	if (error) {
 		/* Tear down a freshly created mount; leave updates alone. */
 		if ((mp->mnt_flag & MNT_UPDATE) == 0)
 			msdosfs_unmount(mp, MNT_FORCE, td);
 		return error;
 	}
 
 	vfs_mountedfrom(mp, from);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap);
 #endif
 	return (0);
 }
 
 static int
 mountmsdosfs(struct vnode *devvp, struct mount *mp, struct thread *td)
 {
 	struct msdosfsmount *pmp;
 	struct buf *bp;
 	struct cdev *dev = devvp->v_rdev;
 	union bootsector *bsp;
 	struct byte_bpb33 *b33;
 	struct byte_bpb50 *b50;
 	struct byte_bpb710 *b710;
 	u_int8_t SecPerClust;
 	u_long clusters;
 	int ronly, error;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	/* XXX: use VOP_ACCESS to check FS perms */
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 
 	bo = &devvp->v_bufobj;
 	bp = NULL;		/* This and pmp both used in error_exit. */
 	pmp = NULL;
 
 	/*
 	 * Read the boot sector of the filesystem, and then check the
 	 * boot signature.  If not a dos boot sector then error out.
 	 *
 	 * NOTE: 8192 is a magic size that works for ffs.
 	 */
 	error = bread(devvp, 0, 8192, NOCRED, &bp);
 	if (error)
 		goto error_exit;
 	bp->b_flags |= B_AGE;
 	bsp = (union bootsector *)bp->b_data;
 	b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB;
 	b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB;
 	b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB;
 
 #ifndef MSDOSFS_NOCHECKSIG
 	if (bsp->bs50.bsBootSectSig0 != BOOTSIG0
 	    || bsp->bs50.bsBootSectSig1 != BOOTSIG1) {
 		error = EINVAL;
 		goto error_exit;
 	}
 #endif
 
 	pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO);
 	pmp->pm_mountp = mp;
 	pmp->pm_cp = cp;
 	pmp->pm_bo = bo;
 
 	/*
 	 * Initialize ownerships and permissions, since nothing else will
 	 * initialize them iff we are mounting root.
 	 */
 	pmp->pm_uid = UID_ROOT;
 	pmp->pm_gid = GID_WHEEL;
 	pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH |
 	    S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR;
 
 	/*
 	 * Experimental support for large MS-DOS filesystems.
 	 * WARNING: This uses at least 32 bytes of kernel memory (which is not
 	 * reclaimed until the FS is unmounted) for each file on disk to map
 	 * between the 32-bit inode numbers used by VFS and the 64-bit
 	 * pseudo-inode numbers used internally by msdosfs. This is only
 	 * safe to use in certain controlled situations (e.g. read-only FS
 	 * with less than 1 million files).
 	 * Since the mappings do not persist across unmounts (or reboots), these
 	 * filesystems are not suitable for exporting through NFS, or any other
 	 * application that requires fixed inode numbers.
 	 */
 	vfs_flagopt(mp->mnt_optnew, "large", &pmp->pm_flags, MSDOSFS_LARGEFS);
 
 	/*
 	 * Compute several useful quantities from the bpb in the
 	 * bootsector.  Copy in the dos 5 variant of the bpb then fix up
 	 * the fields that are different between dos 5 and dos 3.3.
 	 */
 	SecPerClust = b50->bpbSecPerClust;
 	pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec);
 	if (pmp->pm_BytesPerSec < DEV_BSIZE) {
 		error = EINVAL;
 		goto error_exit;
 	}
 	pmp->pm_ResSectors = getushort(b50->bpbResSectors);
 	pmp->pm_FATs = b50->bpbFATs;
 	pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts);
 	pmp->pm_Sectors = getushort(b50->bpbSectors);
 	pmp->pm_FATsecs = getushort(b50->bpbFATsecs);
 	pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack);
 	pmp->pm_Heads = getushort(b50->bpbHeads);
 	pmp->pm_Media = b50->bpbMedia;
 
 	/* calculate the ratio of sector size to DEV_BSIZE */
 	pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE;
 
 	/* XXX - We should probably check more values here */
 	if (!pmp->pm_BytesPerSec || !SecPerClust
 		|| !pmp->pm_Heads
 #ifdef PC98
     		|| !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 255) {
 #else
 		|| !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 63) {
 #endif
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	if (pmp->pm_Sectors == 0) {
 		pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs);
 		pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors);
 	} else {
 		pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs);
 		pmp->pm_HugeSectors = pmp->pm_Sectors;
 	}
 	if (!(pmp->pm_flags & MSDOSFS_LARGEFS)) {
 		if (pmp->pm_HugeSectors > 0xffffffff /
 		    (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) {
 			/*
 			 * We cannot deal currently with this size of disk
 			 * due to fileid limitations (see msdosfs_getattr and
 			 * msdosfs_readdir)
 			 */
 			error = EINVAL;
 			vfs_mount_error(mp,
 			    "Disk too big, try '-o large' mount option");
 			goto error_exit;
 		}
 	}
 
 	if (pmp->pm_RootDirEnts == 0) {
 		if (pmp->pm_Sectors
 		    || pmp->pm_FATsecs
 		    || getushort(b710->bpbFSVers)) {
 			error = EINVAL;
 			printf("mountmsdosfs(): bad FAT32 filesystem\n");
 			goto error_exit;
 		}
 		pmp->pm_fatmask = FAT32_MASK;
 		pmp->pm_fatmult = 4;
 		pmp->pm_fatdiv = 1;
 		pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs);
 		if (getushort(b710->bpbExtFlags) & FATMIRROR)
 			pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM;
 		else
 			pmp->pm_flags |= MSDOSFS_FATMIRROR;
 	} else
 		pmp->pm_flags |= MSDOSFS_FATMIRROR;
 
 	/*
 	 * Check a few values (could do some more):
 	 * - logical sector size: power of 2, >= block size
 	 * - sectors per cluster: power of 2, >= 1
 	 * - number of sectors:   >= 1, <= size of partition
 	 * - number of FAT sectors: >= 1
 	 */
 	if ( (SecPerClust == 0)
 	  || (SecPerClust & (SecPerClust - 1))
 	  || (pmp->pm_BytesPerSec < DEV_BSIZE)
 	  || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1))
 	  || (pmp->pm_HugeSectors == 0)
 	  || (pmp->pm_FATsecs == 0)
 	) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	pmp->pm_HugeSectors *= pmp->pm_BlkPerSec;
 	pmp->pm_HiddenSects *= pmp->pm_BlkPerSec;	/* XXX not used? */
 	pmp->pm_FATsecs     *= pmp->pm_BlkPerSec;
 	SecPerClust         *= pmp->pm_BlkPerSec;
 
 	pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec;
 
 	if (FAT32(pmp)) {
 		pmp->pm_rootdirblk = getulong(b710->bpbRootClust);
 		pmp->pm_firstcluster = pmp->pm_fatblk
 			+ (pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec;
 	} else {
 		pmp->pm_rootdirblk = pmp->pm_fatblk +
 			(pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry)
 				       + DEV_BSIZE - 1)
 			/ DEV_BSIZE; /* in blocks */
 		pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize;
 	}
 
 	pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) /
 	    SecPerClust + 1;
 	pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE;	/* XXX not used? */
 
 	if (pmp->pm_fatmask == 0) {
 		if (pmp->pm_maxcluster
 		    <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) {
 			/*
 			 * This will usually be a floppy disk. This size makes
 			 * sure that one fat entry will not be split across
 			 * multiple blocks.
 			 */
 			pmp->pm_fatmask = FAT12_MASK;
 			pmp->pm_fatmult = 3;
 			pmp->pm_fatdiv = 2;
 		} else {
 			pmp->pm_fatmask = FAT16_MASK;
 			pmp->pm_fatmult = 2;
 			pmp->pm_fatdiv = 1;
 		}
 	}
 
 	clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv;
 	if (pmp->pm_maxcluster >= clusters) {
 		printf("Warning: number of clusters (%ld) exceeds FAT "
 		    "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters);
 		pmp->pm_maxcluster = clusters - 1;
 	}
 
 	if (FAT12(pmp))
 		pmp->pm_fatblocksize = 3 * 512;
 	else
 		pmp->pm_fatblocksize = PAGE_SIZE;
 	pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize,
 	    pmp->pm_BytesPerSec);
 	pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE;
 	pmp->pm_bnshift = ffs(DEV_BSIZE) - 1;
 
 	/*
 	 * Compute mask and shift value for isolating cluster relative byte
 	 * offsets and cluster numbers from a file offset.
 	 */
 	pmp->pm_bpcluster = SecPerClust * DEV_BSIZE;
 	pmp->pm_crbomask = pmp->pm_bpcluster - 1;
 	pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1;
 
 	/*
 	 * Check for valid cluster size
 	 * must be a power of 2
 	 */
 	if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	/*
 	 * Release the bootsector buffer.
 	 */
 	brelse(bp);
 	bp = NULL;
 
 	/*
 	 * Check the fsinfo sector if we have one.  Silently fix up our
 	 * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff)
 	 * or too large.  Ignore fp->fsinfree for now, since we need to
 	 * read the entire FAT anyway to fill the inuse map.
 	 */
 	if (pmp->pm_fsinfo) {
 		struct fsinfo *fp;
 
 		if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
 		    NOCRED, &bp)) != 0)
 			goto error_exit;
 		fp = (struct fsinfo *)bp->b_data;
 		if (!bcmp(fp->fsisig1, "RRaA", 4)
 		    && !bcmp(fp->fsisig2, "rrAa", 4)
 		    && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) {
 			pmp->pm_nxtfree = getulong(fp->fsinxtfree);
 			if (pmp->pm_nxtfree > pmp->pm_maxcluster)
 				pmp->pm_nxtfree = CLUST_FIRST;
 		} else
 			pmp->pm_fsinfo = 0;
 		brelse(bp);
 		bp = NULL;
 	}
 
 	/*
 	 * Finish initializing pmp->pm_nxtfree (just in case the first few
 	 * sectors aren't properly reserved in the FAT).  This completes
 	 * the fixup for fp->fsinxtfree, and fixes up the zero-initialized
 	 * value if there is no fsinfo.  We will use pmp->pm_nxtfree
 	 * internally even if there is no fsinfo.
 	 */
 	if (pmp->pm_nxtfree < CLUST_FIRST)
 		pmp->pm_nxtfree = CLUST_FIRST;
 
 	/*
 	 * Allocate memory for the bitmap of allocated clusters, and then
 	 * fill it in.
 	 */
 	pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS)
 				  * sizeof(*pmp->pm_inusemap),
 				  M_MSDOSFSFAT, M_WAITOK);
 
 	/*
 	 * fillinusemap() needs pm_devvp.
 	 */
 	pmp->pm_devvp = devvp;
 
 	/*
 	 * Have the inuse map filled in.
 	 */
 	if ((error = fillinusemap(pmp)) != 0)
 		goto error_exit;
 
 	/*
 	 * If they want fat updates to be synchronous then let them suffer
 	 * the performance degradation in exchange for the on disk copy of
 	 * the fat being correct just about all the time.  I suppose this
 	 * would be a good thing to turn on if the kernel is still flakey.
 	 */
 	if (mp->mnt_flag & MNT_SYNCHRONOUS)
 		pmp->pm_flags |= MSDOSFSMNT_WAITONFAT;
 
 	/*
 	 * Finish up.
 	 */
 	if (ronly)
 		pmp->pm_flags |= MSDOSFSMNT_RONLY;
 	else {
 		if ((error = markvoldirty(pmp, 1)) != 0) {
 			(void)markvoldirty(pmp, 0);
 			goto error_exit;
 		}
 		pmp->pm_fmod = 1;
 	}
 	mp->mnt_data =  pmp;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
 		msdosfs_fileno_init(mp);
 
 	return 0;
 
 error_exit:
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
 		PICKUP_GIANT();
 	}
 	if (pmp) {
 		if (pmp->pm_inusemap)
 			free(pmp->pm_inusemap, M_MSDOSFSFAT);
 		free(pmp, M_MSDOSFSMNT);
 		mp->mnt_data = NULL;
 	}
 	return (error);
 }
 
 /*
  * Unmount the filesystem described by mp.
  *
  * Flushes all vnodes of the mount (forcibly when MNT_FORCE is given),
  * updates the volume-dirty marker on read/write mounts, closes any kernel
  * iconv handles, closes the GEOM consumer, drops the device vnode and
  * frees the per-mount state.
  *
  * Returns 0 on success, or the error from vflush() or markvoldirty(); in
  * both failure cases the mount is left in place.
  */
 static int
 msdosfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
 	struct msdosfsmount *pmp;
 	int error, flags;
 
 	flags = 0;
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 	error = vflush(mp, 0, flags, td);
 	if (error)
 		return error;
 	pmp = VFSTOMSDOSFS(mp);
 	if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) {
 		/*
 		 * Read/write mounts called markvoldirty(pmp, 1) at mount
 		 * time; undo that here (presumably clearing the on-disk
 		 * dirty marker).  If that fails, try to put the marker back
 		 * and abort the unmount.
 		 */
 		error = markvoldirty(pmp, 0);
 		if (error) {
 			(void)markvoldirty(pmp, 1);
 			return (error);
 		}
 	}
 	/* Close whichever character-set conversion handles were opened. */
 	if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
 		if (pmp->pm_w2u)
 			msdosfs_iconv->close(pmp->pm_w2u);
 		if (pmp->pm_u2w)
 			msdosfs_iconv->close(pmp->pm_u2w);
 		if (pmp->pm_d2u)
 			msdosfs_iconv->close(pmp->pm_d2u);
 		if (pmp->pm_u2d)
 			msdosfs_iconv->close(pmp->pm_u2d);
 	}
 
 #ifdef MSDOSFS_DEBUG
 	{
 		struct vnode *vp = pmp->pm_devvp;
 
 		VI_LOCK(vp);
 		vn_printf(vp,
 		    "msdosfs_umount(): just before calling VOP_CLOSE()\n");
 		printf("freef %p, freeb %p, mount %p\n",
 		    TAILQ_NEXT(vp, v_freelist), vp->v_freelist.tqe_prev,
 		    vp->v_mount);
 		printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n",
 		    TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd),
 		    TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd),
 		    vp->v_bufobj.bo_numoutput, vp->v_type);
 		VI_UNLOCK(vp);
 	}
 #endif
 	/*
 	 * Close the GEOM consumer with the topology lock held and Giant
 	 * dropped for the duration.
 	 */
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(pmp->pm_cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	/* Release the device vnode and all per-mount allocations. */
 	vrele(pmp->pm_devvp);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	/* The fileno mapping exists only for mounts using the "large" option. */
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
 		msdosfs_fileno_free(mp);
 	free(pmp, M_MSDOSFSMNT);
 	mp->mnt_data = NULL;
 	/* mnt_flag is modified under the mount interlock. */
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 static int
 msdosfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	struct denode *ndep;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp);
 #endif
 	error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep);
 	if (error)
 		return (error);
 	*vpp = DETOV(ndep);
 	return (0);
 }
 
 /*
  * Fill in filesystem statistics for statfs(2).  All block counts are
  * expressed in clusters.  Always returns 0.
  */
 static int
 msdosfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 
 	/* The cluster size serves as both the block size and the I/O size. */
 	sbp->f_bsize = pmp->pm_bpcluster;
 	sbp->f_iosize = pmp->pm_bpcluster;
 
 	/* Cluster accounting; every free cluster is reported as available. */
 	sbp->f_blocks = pmp->pm_maxcluster + 1;
 	sbp->f_bfree = pmp->pm_freeclustercount;
 	sbp->f_bavail = pmp->pm_freeclustercount;
 
 	/* File counts: only the root directory entry count is reported. */
 	sbp->f_files = pmp->pm_RootDirEnts;	/* XXX */
 	sbp->f_ffree = 0;	/* what to put in here? */
 
 	return (0);
 }
 
 /*
  * Write back every modified denode on the mount and then, unless this is
  * a lazy sync, flush the underlying device vnode as well.
  *
  * waitfor is handed to VOP_FSYNC() unchanged.  With MNT_LAZY, vnodes that
  * have no pending denode flag updates are skipped even if they have dirty
  * buffers, and the device vnode is not flushed.
  *
  * Panics if the mount is marked modified (pm_fmod) while read-only.
  * Returns 0, or the most recent error reported by VOP_FSYNC().
  */
 static int
 msdosfs_sync(struct mount *mp, int waitfor, struct thread *td)
 {
 	struct vnode *vp, *nvp;
 	struct denode *dep;
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	int error, allerror = 0;
 
 	/*
 	 * If we ever switch to not updating all of the fats all the time,
 	 * this would be the place to update them from the first one.
 	 */
 	if (pmp->pm_fmod != 0) {
 		if (pmp->pm_flags & MSDOSFSMNT_RONLY)
 			panic("msdosfs_sync: rofs mod");
 		else {
 			/* update fats here */
 		}
 	}
 	/*
 	 * Write back each (modified) denode.
 	 */
 	MNT_ILOCK(mp);
 loop:
 	/*
 	 * The mount interlock is held at the top of every iteration; each
 	 * path that drops it below re-acquires it before continuing.
 	 */
 	MNT_VNODE_FOREACH(vp, mp, nvp) {
 		VI_LOCK(vp);
 		/* Skip vnodes with no type yet or marked VI_DOOMED. */
 		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		MNT_IUNLOCK(mp);
 		dep = VTODE(vp);
 		/*
 		 * Nothing to do when the denode has no pending updates and
 		 * either has no dirty buffers or this is a lazy sync.
 		 */
 		if ((dep->de_flag &
 		    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 &&
 		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY)) {
 			VI_UNLOCK(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * LK_INTERLOCK passes the vnode interlock taken above to
 		 * vget(); LK_NOWAIT makes a contended vnode get skipped
 		 * rather than waited for.
 		 */
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
 		if (error) {
 			MNT_ILOCK(mp);
 			/* On ENOENT, restart the scan from the beginning. */
 			if (error == ENOENT)
 				goto loop;
 			continue;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 
 	/*
 	 * Flush filesystem control info.
 	 */
 	if (waitfor != MNT_LAZY) {
-		vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(pmp->pm_devvp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(pmp->pm_devvp, 0, td);
 	}
 	return (allerror);
 }
 
 /*
  * Translate a file handle into a vnode.  The handle is interpreted as a
  * struct defid, whose directory cluster and offset identify the denode.
  *
  * On success *vpp receives the vnode, a VM object is set up for it, and 0
  * is returned.  On failure *vpp is set to NULLVP and the error from
  * deget() is returned.
  */
 static int
 msdosfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
 {
 	struct msdosfsmount *pmp;
 	struct defid *defhp;
 	struct denode *dep;
 	struct vnode *vp;
 	int error;
 
 	pmp = VFSTOMSDOSFS(mp);
 	defhp = (struct defid *)fhp;
 
 	error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	vp = DETOV(dep);
 	*vpp = vp;
 	vnode_create_vobject(vp, dep->de_FileSize, curthread);
 	return (0);
 }
 
 /*
  * VFS operations vector for msdosfs.  It is handed to VFS_SET() below
  * together with the filesystem name "msdosfs".
  */
 static struct vfsops msdosfs_vfsops = {
 	.vfs_fhtovp =		msdosfs_fhtovp,
 	.vfs_mount =		msdosfs_mount,
 	.vfs_cmount =		msdosfs_cmount,
 	.vfs_root =		msdosfs_root,
 	.vfs_statfs =		msdosfs_statfs,
 	.vfs_sync =		msdosfs_sync,
 	.vfs_unmount =		msdosfs_unmount,
 };
 
 VFS_SET(msdosfs_vfsops, msdosfs, 0);
 MODULE_VERSION(msdosfs, 1);
Index: head/sys/fs/msdosfs/msdosfs_vnops.c
===================================================================
--- head/sys/fs/msdosfs/msdosfs_vnops.c	(revision 175201)
+++ head/sys/fs/msdosfs/msdosfs_vnops.c	(revision 175202)
@@ -1,2016 +1,2016 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/clock.h>
 #include <sys/dirent.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 /*
  * Upper bound on a file's size in bytes.  msdosfs_write() fails with EFBIG
  * when a write would extend a file past this value.
  */
 #define	DOS_FILESIZE_MAX	0xffffffff
 
 /*
  * Prototypes for MSDOSFS vnode operations
  */
 static vop_advlock_t	msdosfs_advlock;
 static vop_create_t	msdosfs_create;
 static vop_mknod_t	msdosfs_mknod;
 static vop_open_t	msdosfs_open;
 static vop_close_t	msdosfs_close;
 static vop_access_t	msdosfs_access;
 static vop_getattr_t	msdosfs_getattr;
 static vop_setattr_t	msdosfs_setattr;
 static vop_read_t	msdosfs_read;
 static vop_write_t	msdosfs_write;
 static vop_fsync_t	msdosfs_fsync;
 static vop_remove_t	msdosfs_remove;
 static vop_link_t	msdosfs_link;
 static vop_rename_t	msdosfs_rename;
 static vop_mkdir_t	msdosfs_mkdir;
 static vop_rmdir_t	msdosfs_rmdir;
 static vop_symlink_t	msdosfs_symlink;
 static vop_readdir_t	msdosfs_readdir;
 static vop_bmap_t	msdosfs_bmap;
 static vop_strategy_t	msdosfs_strategy;
 static vop_print_t	msdosfs_print;
 static vop_pathconf_t	msdosfs_pathconf;
 static vop_vptofh_t	msdosfs_vptofh;
 
 /*
  * Some general notes:
  *
  * In the ufs filesystem the inodes, superblocks, and indirect blocks are
  * read/written using the vnode for the filesystem. Blocks that represent
  * the contents of a file are read/written using the vnode for the file
  * (including directories when they are read/written as files). This
  * presents problems for the dos filesystem because data that should be in
  * an inode (if dos had them) resides in the directory itself.  Since we
  * must update directory entries without the benefit of having the vnode
  * for the directory we must use the vnode for the filesystem.  This means
  * that when a directory is actually read/written (via read, write, or
  * readdir, or seek) we must use the vnode for the filesystem instead of
  * the vnode for the directory as would happen in ufs. This is to ensure we
  * retrieve the correct block from the buffer cache since the hash value is
  * based upon the vnode address and the desired block number.
  */
 
 /*
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 msdosfs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode ndirent;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct timespec ts;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap);
 #endif
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad;
 	}
 
 	/*
 	 * Create a directory entry for the file, then call createde() to
 	 * have it installed. NOTE: DOS files are always executable.  We
 	 * use the absence of the owner write bit to make the file
 	 * readonly.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_create: no name");
 #endif
 	bzero(&ndirent, sizeof(ndirent));
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = (ap->a_vap->va_mode & VWRITE) ?
 				ATTR_ARCHIVE : ATTR_ARCHIVE | ATTR_READONLY;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = 0;
 	ndirent.de_FileSize = 0;
 	ndirent.de_dev = pdep->de_dev;
 	ndirent.de_pmp = pdep->de_pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	getnanotime(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	return (0);
 
 bad:
 	return (error);
 }
 
 /*
  * Creating device nodes is not supported on msdosfs; every request is
  * rejected with EINVAL.
  */
 static int
 msdosfs_mknod(ap)
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 
 	return (EINVAL);
 }
 
 /*
  * Open a file: set up a VM object for the vnode, sized to the file's
  * current length.  Always returns 0.
  */
 static int
 msdosfs_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		int a_fdidx;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct denode *dep;
 
 	vp = ap->a_vp;
 	dep = VTODE(vp);
 	vnode_create_vobject(vp, dep->de_FileSize, ap->a_td);
 	return (0);
 }
 
 static int
 msdosfs_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct timespec ts;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1) {
 		getnanotime(&ts);
 		DETIMES(dep, &ts, &ts, &ts);
 	}
 	VI_UNLOCK(vp);
 	return 0;
 }
 
 /*
  * Check whether the requested access mode is permitted on a vnode.
  *
  * The effective file mode is synthesized: read and execute are always
  * present, write is present unless the DOS read-only attribute is set,
  * and the result is filtered through the mount's file or directory mask.
  * Ownership comes from the mount (pm_uid/pm_gid).
  *
  * Returns EROFS for write access to a directory or regular file on a
  * read-only mount, otherwise whatever vaccess() decides.
  */
 static int
 msdosfs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	mode_t file_mode, mode = ap->a_mode;
 
 	/* Everything is readable and executable ... */
 	file_mode = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH;
 	/* ... and writable unless flagged read-only on disk. */
 	if ((dep->de_Attributes & ATTR_READONLY) == 0)
 		file_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
 	if (vp->v_type == VDIR)
 		file_mode &= pmp->pm_dirmask;
 	else
 		file_mode &= pmp->pm_mask;
 
 	/*
 	 * Disallow writing to directories and regular files if the
 	 * filesystem is read-only.
 	 */
 	if ((mode & VWRITE) != 0 &&
 	    (vp->v_type == VDIR || vp->v_type == VREG) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid,
 	    ap->a_mode, ap->a_cred, NULL));
 }
 
 static int
 msdosfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	mode_t mode;
 	struct timespec ts;
 	u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 	uint64_t fileid;
 
 	getnanotime(&ts);
 	DETIMES(dep, &ts, &ts, &ts);
 	vap->va_fsid = dev2udev(dep->de_dev);
 	/*
 	 * The following computation of the fileid must be the same as that
 	 * used in msdosfs_readdir() to compute d_fileno. If not, pwd
 	 * doesn't work.
 	 */
 	if (dep->de_Attributes & ATTR_DIRECTORY) {
 		fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) *
 		    dirsperblk;
 		if (dep->de_StartCluster == MSDOSFSROOT)
 			fileid = 1;
 	} else {
 		fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) *
 		    dirsperblk;
 		if (dep->de_dirclust == MSDOSFSROOT)
 			fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk;
 		fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry);
 	}
 
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
 		vap->va_fileid = msdosfs_fileno_map(pmp->pm_mountp, fileid);
 	else
 		vap->va_fileid = (long)fileid;
 
 	if ((dep->de_Attributes & ATTR_READONLY) == 0)
 		mode = S_IRWXU|S_IRWXG|S_IRWXO;
 	else
 		mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 	vap->va_mode = mode & 
 	    (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask);
 	vap->va_uid = pmp->pm_uid;
 	vap->va_gid = pmp->pm_gid;
 	vap->va_nlink = 1;
 	vap->va_rdev = 0;
 	vap->va_size = dep->de_FileSize;
 	fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime;
 	if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) {
 		fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime);
 		fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun,
 		    0, &vap->va_birthtime);
 	} else {
 		vap->va_atime = vap->va_mtime;
 		vap->va_birthtime.tv_sec = -1;
 		vap->va_birthtime.tv_nsec = 0;
 	}
 	vap->va_flags = 0;
 	if ((dep->de_Attributes & ATTR_ARCHIVE) == 0)
 		vap->va_flags |= SF_ARCHIVED;
 	vap->va_gen = 0;
 	vap->va_blocksize = pmp->pm_bpcluster;
 	vap->va_bytes =
 	    (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask;
 	vap->va_type = ap->a_vp->v_type;
 	vap->va_filerev = dep->de_modrev;
 	return (0);
 }
 
 /*
  * Set attributes on a file or directory.
  *
  * Only the archive flag (SF_ARCHIVED), the size of regular files, the
  * access/modification times and the owner write bit of the mode can
  * actually be changed.  uid and gid may only be "set" to the values the
  * mount already uses.  Requests to change type, nlink, fsid, fileid,
  * blocksize, rdev, bytes or gen fail with EINVAL, and every change is
  * refused with EROFS on a read-only mount.
  *
  * Returns 0 or an errno value; on success the denode is written back
  * through deupdat().
  */
 static int
 msdosfs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	int error = 0;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_setattr(): vp %p, vap %p, cred %p, p %p\n",
 	    ap->a_vp, vap, cred, ap->a_td);
 #endif
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 #ifdef MSDOSFS_DEBUG
 		printf("msdosfs_setattr(): returning EINVAL\n");
 		printf("    va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n",
 		    vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid);
 		printf("    va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n",
 		    vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen);
 		printf("    va_uid %x, va_gid %x\n",
 		    vap->va_uid, vap->va_gid);
 #endif
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/* Only the mount's owner may change flags without privilege. */
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 			if (error)
 				return (error);
 		}
 		/*
 		 * We are very inconsistent about handling unsupported
 		 * attributes.  We ignored the access time and the
 		 * read and execute bits.  We were strict for the other
 		 * attributes.
 		 *
 		 * Here we are strict, stricter than ufs in not allowing
 		 * users to attempt to set SF_SETTABLE bits or anyone to
 		 * set unsupported bits.  However, we ignore attempts to
 		 * set ATTR_ARCHIVE for directories `cp -pr' from a more
 		 * sensible filesystem attempts it a lot.
 		 */
 		if (vap->va_flags & SF_SETTABLE) {
 			error = priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0);
 			if (error)
 				return (error);
 		}
 		if (vap->va_flags & ~SF_ARCHIVED)
 			return EOPNOTSUPP;
 		/* SF_ARCHIVED is the inverse of the DOS archive attribute. */
 		if (vap->va_flags & SF_ARCHIVED)
 			dep->de_Attributes &= ~ATTR_ARCHIVE;
 		else if (!(dep->de_Attributes & ATTR_DIRECTORY))
 			dep->de_Attributes |= ATTR_ARCHIVE;
 		dep->de_flag |= DE_MODIFIED;
 	}
 
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		uid_t uid;
 		gid_t gid;
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/* An id left at VNOVAL defaults to the mount's current value. */
 		uid = vap->va_uid;
 		if (uid == (uid_t)VNOVAL)
 			uid = pmp->pm_uid;
 		gid = vap->va_gid;
 		if (gid == (gid_t)VNOVAL)
 			gid = pmp->pm_gid;
 		if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid ||
 		    (gid != pmp->pm_gid && !groupmember(gid, cred))) {
 			error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
 			if (error)
 				return (error);
 		}
 		/* Ownership is per mount, so it cannot really be changed. */
 		if (uid != pmp->pm_uid || gid != pmp->pm_gid)
 			return EINVAL;
 	}
 
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VREG:
 			/*
 			 * Truncation is only supported for regular files,
 			 * Disallow it if the filesystem is read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support any file types except regular
 			 * files and directories in this file system, so
 			 * this (default) case is unreachable and can do
 			 * anything.  Keep falling through to detrunc()
 			 * for now.
 			 */
 			break;
 		}
 		error = detrunc(dep, vap->va_size, 0, cred, ap->a_td);
 		if (error)
 			return error;
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * With VA_UTIMES_NULL either VADMIN or, failing that, VWRITE
 		 * access is sufficient; otherwise VADMIN access is required.
 		 */
 		if (vap->va_vaflags & VA_UTIMES_NULL) {
 			error = VOP_ACCESS(vp, VADMIN, cred, ap->a_td);
 			if (error)
 				error = VOP_ACCESS(vp, VWRITE, cred,
 				    ap->a_td);
 		} else
 			error = VOP_ACCESS(vp, VADMIN, cred, ap->a_td);
 		/*
 		 * Refuse the update when the permission check fails.  The
 		 * result of VOP_ACCESS() used to be computed but never
 		 * examined, which let any caller change the timestamps.
 		 */
 		if (error)
 			return (error);
 		if (vp->v_type != VDIR) {
 			if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 &&
 			    vap->va_atime.tv_sec != VNOVAL) {
 				/* Explicit value given: drop the pending update. */
 				dep->de_flag &= ~DE_ACCESS;
 				timespec2fattime(&vap->va_atime, 0,
 				    &dep->de_ADate, NULL, NULL);
 			}
 			if (vap->va_mtime.tv_sec != VNOVAL) {
 				dep->de_flag &= ~DE_UPDATE;
 				timespec2fattime(&vap->va_mtime, 0,
 				    &dep->de_MDate, &dep->de_MTime, NULL);
 			}
 			dep->de_Attributes |= ATTR_ARCHIVE;
 			dep->de_flag |= DE_MODIFIED;
 		}
 	}
 	/*
 	 * DOS files only have the ability to have their writability
 	 * attribute set, so we use the owner write bit to set the readonly
 	 * attribute.
 	 */
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
 			if (error)
 				return (error);
 		}
 		if (vp->v_type != VDIR) {
 			/* We ignore the read and execute bits. */
 			if (vap->va_mode & VWRITE)
 				dep->de_Attributes &= ~ATTR_READONLY;
 			else
 				dep->de_Attributes |= ATTR_READONLY;
 			dep->de_Attributes |= ATTR_ARCHIVE;
 			dep->de_flag |= DE_MODIFIED;
 		}
 	}
 	return (deupdat(dep, 0));
 }
 
 /*
  * Read data from a file or directory into the caller's uio.
  *
  * Data is transferred one cluster-sized buffer at a time until the
  * request is satisfied, end of file is reached, or an error occurs.
  * Directories are read through the device vnode (pm_devvp) after mapping
  * the cluster to a device block; regular files are read through their own
  * vnode, with read-ahead chosen from the sequential-access hint carried
  * in a_ioflag.
  *
  * Returns 0 or an errno value.  For regular files, DE_ACCESS is set on
  * the denode unless the mount has MNT_NOATIME.
  */
 static int
 msdosfs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int error = 0;
 	int blsize;
 	int isadir;
 	int orig_resid;
 	u_int n;
 	u_long diff;
 	u_long on;
 	daddr_t lbn;
 	daddr_t rablock;
 	int rasize;
 	int seqcount;
 	struct buf *bp;
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct uio *uio = ap->a_uio;
 
 	/*
 	 * If they didn't ask for any data, then we are done.
 	 */
 	orig_resid = uio->uio_resid;
 	if (orig_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 * We don't need to check for large offsets as in ffs because
 	 * dep->de_FileSize <= DOS_FILESIZE_MAX < OFF_MAX, so large
 	 * offsets cannot cause overflow even in theory.
 	 */
 
 	/* Sequential-access hint, stored in the upper bits of a_ioflag. */
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	isadir = dep->de_Attributes & ATTR_DIRECTORY;
 	do {
 		if (uio->uio_offset >= dep->de_FileSize)
 			break;
 		/* Cluster holding the offset, and the byte offset within it. */
 		lbn = de_cluster(pmp, uio->uio_offset);
 		rablock = lbn + 1;
 		blsize = pmp->pm_bpcluster;
 		on = uio->uio_offset & pmp->pm_crbomask;
 		/*
 		 * If we are operating on a directory file then be sure to
 		 * do i/o with the vnode for the filesystem instead of the
 		 * vnode for the directory.
 		 */
 		if (isadir) {
 			/* convert cluster # to block # */
 			error = pcbmap(dep, lbn, &lbn, 0, &blsize);
 			/* E2BIG from pcbmap() is reported to the caller as EINVAL. */
 			if (error == E2BIG) {
 				error = EINVAL;
 				break;
 			} else if (error)
 				break;
 			error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp);
 		} else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) {
 			/* The next cluster is past EOF: no read-ahead. */
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			error = cluster_read(vp, dep->de_FileSize, lbn, blsize,
 			    NOCRED, on + uio->uio_resid, seqcount, &bp);
 		} else if (seqcount > 1) {
 			/* Clustered reads disabled: read ahead one cluster. */
 			rasize = blsize;
 			error = breadn(vp, lbn,
 			    blsize, &rablock, &rasize, 1, NOCRED, &bp);
 		} else {
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		}
 		if (error) {
 			brelse(bp);
 			break;
 		}
 		/*
 		 * Clamp the transfer to, in turn: the rest of the cluster,
 		 * the caller's remaining request, the rest of the file, and
 		 * the amount of the buffer not counted in b_resid.
 		 */
 		diff = pmp->pm_bpcluster - on;
 		n = diff > uio->uio_resid ? uio->uio_resid : diff;
 		diff = dep->de_FileSize - uio->uio_offset;
 		if (diff < n)
 			n = diff;
 		diff = blsize - bp->b_resid;
 		if (diff < n)
 			n = diff;
 		error = uiomove(bp->b_data + on, (int) n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	/*
 	 * Request an access-time update for regular files when the read
 	 * succeeded or transferred at least some data.
 	 */
 	if (!isadir && (error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 		dep->de_flag |= DE_ACCESS;
 	return (error);
 }
 
 /*
  * Write data to a regular file.
  *
  * Only VREG vnodes are written: a VDIR vnode yields EISDIR and any other
  * vnode type panics.  With IO_APPEND the write offset is first moved to the
  * current end of file.  A zero-length write returns 0 without touching the
  * file.
  *
  * EFBIG is returned when the write would end past DOS_FILESIZE_MAX or past
  * the process' RLIMIT_FSIZE (in the latter case SIGXFSZ is also posted).
  *
  * On failure with IO_UNIT set the file is truncated back to its original
  * size, the uio is rewound and the error is returned.  Without IO_UNIT a
  * partially completed write is reported as success (error is cleared when
  * at least one byte was transferred).  On success with IO_SYNC the result
  * of deupdat() is returned.
  */
 static int
 msdosfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int n;
 	int croffset;
 	int resid;
 	u_long osize;
 	int error = 0;
 	u_long count;
 	int seqcount;
 	daddr_t bn, lastcn;
 	struct buf *bp;
 	int ioflag = ap->a_ioflag;
 	struct uio *uio = ap->a_uio;
 	struct thread *td = uio->uio_td;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *thisvp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct ucred *cred = ap->a_cred;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n",
 	    vp, uio, ioflag, cred);
 	printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n",
 	    dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster);
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = dep->de_FileSize;
 		thisvp = vp;
 		break;
 	case VDIR:
 		return EISDIR;
 	default:
 		panic("msdosfs_write(): bad file type");
 	}
 
 	/*
 	 * This is needed (unlike in ffs_write()) because we extend the
 	 * file outside of the loop but we don't want to extend the file
 	 * for writes of 0 bytes.
 	 */
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 */
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > DOS_FILESIZE_MAX)
 		return (EFBIG);
 
 	/*
 	 * If they've exceeded their filesize limit, tell them about it.
 	 */
 	if (td != NULL) {
 		PROC_LOCK(td->td_proc);
 		if ((uoff_t)uio->uio_offset + uio->uio_resid >
 		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 			psignal(td->td_proc, SIGXFSZ);
 			PROC_UNLOCK(td->td_proc);
 			return (EFBIG);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 
 	/*
 	 * If the offset we are starting the write at is beyond the end of
 	 * the file, then they've done a seek.  Unix filesystems allow
 	 * files with holes in them, DOS doesn't so we must fill the hole
 	 * with zeroed blocks.
 	 */
 	if (uio->uio_offset > dep->de_FileSize) {
 		error = deextend(dep, uio->uio_offset, cred);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Remember some values in case the write fails.
 	 */
 	resid = uio->uio_resid;
 	osize = dep->de_FileSize;
 
 	/*
 	 * If we write beyond the end of the file, extend it to its ultimate
 	 * size ahead of the time to hopefully get a contiguous area.
 	 *
 	 * ENOSPC from extendfile() is tolerated unless IO_UNIT is set; in
 	 * that case lastcn bounds how far the copy loop below may go.
 	 */
 	if (uio->uio_offset + resid > osize) {
 		count = de_clcount(pmp, uio->uio_offset + resid) -
 			de_clcount(pmp, osize);
 		error = extendfile(dep, count, NULL, NULL, 0);
 		if (error &&  (error != ENOSPC || (ioflag & IO_UNIT)))
 			goto errexit;
 		lastcn = dep->de_fc[FC_LASTFC].fc_frcn;
 	} else
 		lastcn = de_clcount(pmp, osize) - 1;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 	do {
 		/* Ran out of allocated clusters: stop with ENOSPC. */
 		if (de_cluster(pmp, uio->uio_offset) > lastcn) {
 			error = ENOSPC;
 			break;
 		}
 
 		/* Byte offset within the cluster and bytes to copy this pass. */
 		croffset = uio->uio_offset & pmp->pm_crbomask;
 		n = min(uio->uio_resid, pmp->pm_bpcluster - croffset);
 		if (uio->uio_offset + n > dep->de_FileSize) {
 			dep->de_FileSize = uio->uio_offset + n;
 			/* The object size needs to be set before buffer is allocated */
 			vnode_pager_setsize(vp, dep->de_FileSize);
 		}
 
 		bn = de_cluster(pmp, uio->uio_offset);
 		if ((uio->uio_offset & pmp->pm_crbomask) == 0
 		    && (de_cluster(pmp, uio->uio_offset + uio->uio_resid)
 			> de_cluster(pmp, uio->uio_offset)
 			|| uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) {
 			/*
 			 * If either the whole cluster gets written,
 			 * or we write the cluster from its start beyond EOF,
 			 * then no need to read data from disk.
 			 */
 			bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 			vfs_bio_clrbuf(bp);
 			/*
 			 * Do the bmap now, since pcbmap needs buffers
 			 * for the fat table. (see msdosfs_strategy)
 			 */
 			if (bp->b_blkno == bp->b_lblkno) {
 				error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0);
 				if (error)
 					bp->b_blkno = -1;
 				else
 					bp->b_blkno = bn;
 			}
 			if (bp->b_blkno == -1) {
 				brelse(bp);
 				if (!error)
 					error = EIO;		/* XXX */
 				break;
 			}
 		} else {
 			/*
 			 * The block we need to write into exists, so read it in.
 			 */
 			error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp);
 			if (error) {
 				brelse(bp);
 				break;
 			}
 		}
 
 		/*
 		 * Should these vnode_pager_* functions be done on dir
 		 * files?
 		 */
 
 		/*
 		 * Copy the data from user space into the buf header.
 		 */
 		error = uiomove(bp->b_data + croffset, n, uio);
 		if (error) {
 			brelse(bp);
 			break;
 		}
 
 		/* Prepare for clustered writes in some else clauses. */
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 			bp->b_flags |= B_CLUSTEROK;
 
 		/*
 		 * If IO_SYNC, then each buffer is written synchronously.
 		 * Otherwise, if we have a severe page deficiency then
 		 * write the buffer asynchronously.  Otherwise, if on a
 		 * cluster boundary then write the buffer asynchronously,
 		 * combining it with contiguous clusters if permitted and
 		 * possible, since we don't expect more writes into this
 		 * buffer soon.  Otherwise, do a delayed write because we
 		 * expect more writes into this buffer soon.
 		 */
 		if (ioflag & IO_SYNC)
 			(void)bwrite(bp);
 		else if (vm_page_count_severe() || buf_dirty_count_severe())
 			bawrite(bp);
 		else if (n + croffset == pmp->pm_bpcluster) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 				cluster_write(vp, bp, dep->de_FileSize,
 				    seqcount);
 			else
 				bawrite(bp);
 		} else
 			bdwrite(bp);
 		dep->de_flag |= DE_UPDATE;
 	} while (error == 0 && uio->uio_resid > 0);
 
 	/*
 	 * If the write failed and they want us to, truncate the file back
 	 * to the size it was before the write was attempted.
 	 */
 errexit:
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			/* All-or-nothing: undo the size change and rewind the uio. */
 			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		} else {
 			/*
 			 * Keep what was written; a short write is reported
 			 * as success.
 			 */
 			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL);
 			if (uio->uio_resid != resid)
 				error = 0;
 		}
 	} else if (ioflag & IO_SYNC)
 		error = deupdat(dep, 1);
 	return (error);
 }
 
 /*
  * Flush the blocks of a file to disk.
  *
  * This function is worthless for vnodes that represent directories. Maybe we
  * could just do a sync if they try an fsync on a directory file.
  */
 static int
 msdosfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vop_stdfsync(ap);
 	return (deupdat(VTODE(ap->a_vp), ap->a_waitfor == MNT_WAIT));
 }
 
 static int
 msdosfs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct denode *ddep = VTODE(ap->a_dvp);
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
 		error = EPERM;
 	else
 		error = removede(ddep, dep);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount);
 #endif
 	return (error);
 }
 
 /*
  * Hard links cannot be represented on a DOS filesystem, so every link
  * request is refused.
  *
  * The arguments (a_tdvp, a_vp, a_cnp) are ignored and EOPNOTSUPP is
  * always returned.
  */
 static int
 msdosfs_link(struct vop_link_args *ap)
 {
 	const int error = EOPNOTSUPP;
 
 	(void)ap;
 	return (error);
 }
 
 /*
  * Renames on files require moving the denode to a new hash queue since the
  * denode's location is used to compute which hash queue to put the file
  * in. Unless it is a rename in place.  For example "mv a b".
  *
  * What follows is the basic algorithm:
  *
  * if (file move) {
  *	if (dest file exists) {
  *		remove dest file
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing directory slot
  *	} else {
  *		write new entry in dest directory
  *		update offset and dirclust in denode
  *		move denode to new hash chain
  *		clear old directory entry
  *	}
  * } else {
  *	directory move
  *	if (dest directory exists) {
  *		if (dest is not empty) {
  *			return ENOTEMPTY
  *		}
  *		remove dest directory
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing entry
  *	} else {
  *		be sure dest is not a child of src directory
  *		write entry in dest directory
  *		update "." and ".." in moved directory
  *		clear old directory entry for moved directory
  *	}
  * }
  *
  * On entry:
  *	source's parent directory is unlocked
  *	source file or directory is unlocked
  *	destination's parent directory is locked
  *	destination file or directory is locked if it exists
  *
  * On exit:
  *	all denodes should be released
  *
  * Returns 0 on success (including the cases where source and target are
  * the same vnode or the source name vanished in the meantime), otherwise
  * an errno value such as EXDEV, EINVAL, ENOTEMPTY, ENOTDIR or EISDIR.
  */
 static int
 msdosfs_rename(ap)
 	struct vop_rename_args /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct denode *ip, *xp, *dp, *zp;
 	u_char toname[11], oldname[11];
 	u_long from_diroffset, to_diroffset;
 	u_char to_count;
 	int doingdirectory = 0, newparent = 0;
 	int error;
 	u_long cn;
 	daddr_t bn;
 	struct denode *fddep;	/* from file's parent directory	 */
 	struct msdosfsmount *pmp;
 	struct direntry *dotdotp;
 	struct buf *bp;
 
 	fddep = VTODE(ap->a_fdvp);
 	pmp = fddep->de_pmp;
 
 	/* NOTE(review): pmp was just set above; this assignment overrides it. */
 	pmp = VFSTOMSDOSFS(fdvp->v_mount);
 
 #ifdef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 		/*
 		 * Common early-failure exit: drop the target side with
 		 * vput() (vrele() for tdvp when it is the same vnode as
 		 * tvp) and the source side with vrele().
 		 */
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	/*
 	 * If source and dest are the same, do nothing.
 	 */
 	if (tvp == fvp) {
 		error = 0;
 		goto abortit;
 	}
 
-	error = vn_lock(fvp, LK_EXCLUSIVE, td);
+	error = vn_lock(fvp, LK_EXCLUSIVE);
 	if (error)
 		goto abortit;
 	dp = VTODE(fdvp);
 	ip = VTODE(fvp);
 
 	/*
 	 * Be sure we are not renaming ".", "..", or an alias of ".". This
 	 * leads to a crippled directory tree.  It's pretty tough to do a
 	 * "ls" or "pwd" with the "." directory entry missing, and "cd .."
 	 * doesn't work if the ".." entry is missing.
 	 */
 	if (ip->de_Attributes & ATTR_DIRECTORY) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip ||
 		    (fcnp->cn_flags & ISDOTDOT) ||
 		    (tcnp->cn_flags & ISDOTDOT) ||
 		    (ip->de_flag & DE_RENAME)) {
 			VOP_UNLOCK(fvp, 0, td);
 			error = EINVAL;
 			goto abortit;
 		}
 		/* Mark the directory as being renamed; cleared at "out". */
 		ip->de_flag |= DE_RENAME;
 		doingdirectory++;
 	}
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTODE(tdvp);
 	xp = tvp ? VTODE(tvp) : NULL;
 	/*
 	 * Remember direntry place to use for destination
 	 */
 	to_diroffset = dp->de_fndoffset;
 	to_count = dp->de_fndcnt;
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to doscheckpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0, td);
 	/* Different start clusters mean source and target parents differ. */
 	if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster)
 		newparent = 1;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		/*
 		 * doscheckpath() vput()'s dp,
 		 * so we have to do a relookup afterwards
 		 */
 		error = doscheckpath(ip, dp);
 		if (error)
 			goto out;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("msdosfs_rename: lost to startdir");
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		dp = VTODE(tdvp);
 		xp = tvp ? VTODE(tvp) : NULL;
 	}
 
 	if (xp != NULL) {
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if (xp->de_Attributes & ATTR_DIRECTORY) {
 			if (!dosdirempty(xp)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = removede(dp, xp);
 		if (error)
 			goto bad;
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * Convert the filename in tcnp into a dos filename. We copy this
 	 * into the denode and directory entry for the destination
 	 * file/directory.
 	 */
 	error = uniqdosname(VTODE(tdvp), tcnp, toname);
 	if (error)
 		goto abortit;
 
 	/*
 	 * Since from wasn't locked at various places above,
 	 * have to do a relookup here.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	if ((fcnp->cn_flags & SAVESTART) == 0)
 		panic("msdosfs_rename: lost from startdir");
 	if (!newparent)
 		VOP_UNLOCK(tdvp, 0, td);
 	if (relookup(fdvp, &fvp, fcnp) == 0)
 		vrele(fdvp);
 	if (fvp == NULL) {
 		/*
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		if (newparent)
 			VOP_UNLOCK(tdvp, 0, td);
 		vrele(tdvp);
 		vrele(ap->a_fvp);
 		return 0;
 	}
 	xp = VTODE(fvp);
 	zp = VTODE(fdvp);
 	from_diroffset = zp->de_fndoffset;
 
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed till now. If the source is a file the entry may
 	 * have been unlinked or renamed. In either case there is
 	 * no further work to be done. If the source is a directory
 	 * then it cannot have been rmdir'ed or renamed; this is
 	 * prohibited by the DE_RENAME flag.
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		VOP_UNLOCK(fvp, 0, td);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0, td);
 		vrele(ap->a_fvp);
 		xp = NULL;
 	} else {
 		vrele(fvp);
 		xp = NULL;
 
 		/*
 		 * First write a new entry in the destination
 		 * directory and mark the entry in the source directory
 		 * as deleted.  Then move the denode to the correct hash
 		 * chain for its new location in the filesystem.  And, if
 		 * we moved a directory, then update its .. entry to point
 		 * to the new parent directory.
 		 */
 		bcopy(ip->de_Name, oldname, 11);
 		bcopy(toname, ip->de_Name, 11);	/* update denode */
 		dp->de_fndoffset = to_diroffset;
 		dp->de_fndcnt = to_count;
 		error = createde(ip, dp, (struct denode **)0, tcnp);
 		if (error) {
 			/* Creation failed: restore the old name in the denode. */
 			bcopy(oldname, ip->de_Name, 11);
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0, td);
 			VOP_UNLOCK(fvp, 0, td);
 			goto bad;
 		}
 		ip->de_refcnt++;
 		zp->de_fndoffset = from_diroffset;
 		error = removede(zp, ip);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0, td);
 			VOP_UNLOCK(fvp, 0, td);
 			goto bad;
 		}
 		if (!doingdirectory) {
 			error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0,
 				       &ip->de_dirclust, 0);
 			if (error) {
 				/* XXX should downgrade to ro here, fs is corrupt */
 				if (newparent)
 					VOP_UNLOCK(fdvp, 0, td);
 				VOP_UNLOCK(fvp, 0, td);
 				goto bad;
 			}
 			if (ip->de_dirclust == MSDOSFSROOT)
 				ip->de_diroffset = to_diroffset;
 			else
 				ip->de_diroffset = to_diroffset & pmp->pm_crbomask;
 		}
 		reinsert(ip);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0, td);
 	}
 
 	/*
 	 * If we moved a directory to a new parent directory, then we must
 	 * fixup the ".." entry in the moved directory.
 	 */
 	if (doingdirectory && newparent) {
 		cn = ip->de_StartCluster;
 		if (cn == MSDOSFSROOT) {
 			/* this should never happen */
 			panic("msdosfs_rename(): updating .. in root directory?");
 		} else
 			bn = cntobn(pmp, cn);
 		error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster,
 			      NOCRED, &bp);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			brelse(bp);
 			VOP_UNLOCK(fvp, 0, td);
 			goto bad;
 		}
 		/* ".." is the second directory entry of the cluster. */
 		dotdotp = (struct direntry *)bp->b_data + 1;
 		putushort(dotdotp->deStartCluster, dp->de_StartCluster);
 		if (FAT32(pmp))
 			putushort(dotdotp->deHighClust, dp->de_StartCluster >> 16);
 		if (fvp->v_mount->mnt_flag & MNT_ASYNC)
 			bdwrite(bp);
 		else if ((error = bwrite(bp)) != 0) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			VOP_UNLOCK(fvp, 0, td);
 			goto bad;
 		}
 	}
 
 	VOP_UNLOCK(fvp, 0, td);
 	/* "bad": release the target side, then fall into "out". */
 bad:
 	if (xp)
 		vput(tvp);
 	vput(tdvp);
 	/* "out": clear the rename-in-progress mark and drop the source side. */
 out:
 	ip->de_flag &= ~DE_RENAME;
 	vrele(fdvp);
 	vrele(fvp);
 	return (error);
 
 }
 
 /*
  * Template for the first two entries ("." and "..") of a newly created
  * directory.  msdosfs_mkdir() copies this into the new directory's first
  * cluster and then overwrites the start cluster and the create, access
  * and modify time stamps of both entries, so the values given here for
  * those fields are only placeholders.
  */
 static struct {
 	struct direntry dot;
 	struct direntry dotdot;
 } dosdirtemplate = {
 	{	".       ", "   ",			/* the . entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	},
 	{	"..      ", "   ",			/* the .. entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	}
 };
 
 static int
 msdosfs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struvt componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct direntry *denp;
 	struct msdosfsmount *pmp = pdep->de_pmp;
 	struct buf *bp;
 	u_long newcluster, pcl;
 	int bn;
 	int error;
 	struct denode ndirent;
 	struct timespec ts;
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad2;
 	}
 
 	/*
 	 * Allocate a cluster to hold the about to be created directory.
 	 */
 	error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL);
 	if (error)
 		goto bad2;
 
 	bzero(&ndirent, sizeof(ndirent));
 	ndirent.de_pmp = pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	getnanotime(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 
 	/*
 	 * Now fill the cluster with the "." and ".." entries. And write
 	 * the cluster to disk.  This way it is there for the parent
 	 * directory to be pointing at if there were a crash.
 	 */
 	bn = cntobn(pmp, newcluster);
 	/* always succeeds */
 	bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 	bzero(bp->b_data, pmp->pm_bpcluster);
 	bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate);
 	denp = (struct direntry *)bp->b_data;
 	putushort(denp[0].deStartCluster, newcluster);
 	putushort(denp[0].deCDate, ndirent.de_CDate);
 	putushort(denp[0].deCTime, ndirent.de_CTime);
 	denp[0].deCHundredth = ndirent.de_CHun;
 	putushort(denp[0].deADate, ndirent.de_ADate);
 	putushort(denp[0].deMDate, ndirent.de_MDate);
 	putushort(denp[0].deMTime, ndirent.de_MTime);
 	pcl = pdep->de_StartCluster;
 	if (FAT32(pmp) && pcl == pmp->pm_rootdirblk)
 		pcl = 0;
 	putushort(denp[1].deStartCluster, pcl);
 	putushort(denp[1].deCDate, ndirent.de_CDate);
 	putushort(denp[1].deCTime, ndirent.de_CTime);
 	denp[1].deCHundredth = ndirent.de_CHun;
 	putushort(denp[1].deADate, ndirent.de_ADate);
 	putushort(denp[1].deMDate, ndirent.de_MDate);
 	putushort(denp[1].deMTime, ndirent.de_MTime);
 	if (FAT32(pmp)) {
 		putushort(denp[0].deHighClust, newcluster >> 16);
 		putushort(denp[1].deHighClust, pdep->de_StartCluster >> 16);
 	}
 
 	if (ap->a_dvp->v_mount->mnt_flag & MNT_ASYNC)
 		bdwrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		goto bad;
 
 	/*
 	 * Now build up a directory entry pointing to the newly allocated
 	 * cluster.  This will be written to an empty slot in the parent
 	 * directory.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_mkdir: no name");
 #endif
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = ATTR_DIRECTORY;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = newcluster;
 	ndirent.de_FileSize = 0;
 	ndirent.de_dev = pdep->de_dev;
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	return (0);
 
 bad:
 	clusterfree(pmp, newcluster, NULL);
 bad2:
 	return (error);
 }
 
 /*
  * Remove an empty directory.
  *
  * ap->a_dvp is the parent directory and ap->a_vp the directory to remove.
  * ENOTEMPTY is returned when the directory is not empty or is currently
  * marked DE_RENAME.  Otherwise its entry is removed from the parent and
  * the directory is truncated to zero length; the parent is unlocked while
  * the truncation runs and locked again before returning.  The return
  * value is the error from removede() or, if that succeeded, from
  * detrunc().
  */
 static int
 msdosfs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *ip, *dp;
 	struct thread *td = cnp->cn_thread;
 	int error;
 
 	ip = VTODE(vp);
 	dp = VTODE(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	error = 0;
 	if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	/*
 	 * Delete the entry from the directory.  For dos filesystems this
 	 * gets rid of the directory entry on disk, the in memory copy
 	 * still exists but the de_refcnt is <= 0.  This prevents it from
 	 * being found by deget().  When the vput() on dep is done we give
 	 * up access and eventually msdosfs_reclaim() will be called which
 	 * will remove it from the denode cache.
 	 */
 	error = removede(dp, ip);
 	if (error)
 		goto out;
 	/*
 	 * This is where we decrement the link count in the parent
 	 * directory.  Since dos filesystems don't do this we just purge
 	 * the name cache.
 	 */
 	cache_purge(dvp);
 	VOP_UNLOCK(dvp, 0, td);
 	/*
 	 * Truncate the directory that is being deleted.
 	 */
 	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, td);
 	cache_purge(vp);
 
 	/* Re-take the parent lock that was dropped above. */
-	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 out:
 	return (error);
 }
 
 /*
  * Symbolic links cannot be represented on a DOS filesystem, so every
  * symlink request is refused.
  *
  * The arguments (a_dvp, a_vpp, a_cnp, a_vap, a_target) are ignored and
  * EOPNOTSUPP is always returned.
  */
 static int
 msdosfs_symlink(struct vop_symlink_args *ap)
 {
 	const int error = EOPNOTSUPP;
 
 	(void)ap;
 	return (error);
 }
 
 /*
  * Read directory entries, converting DOS directory entries into
  * filesystem-independent struct dirent records.
  *
  * ap->a_uio describes the caller's buffer and the starting offset, which
  * must be a multiple of sizeof(struct direntry); the buffer must be able
  * to hold at least one DOS directory entry, otherwise EINVAL is returned.
  * ENOTDIR is returned when the vnode is not a directory.
  *
  * When ap->a_ncookies is non-NULL an array of seek cookies (one slot per
  * 16 bytes of caller buffer) is allocated and returned through
  * ap->a_cookies, with *ap->a_ncookies set to the number actually used.
  * When ap->a_eofflag is non-NULL it is set to 1 if the end of the
  * directory was reached and 0 otherwise.
  *
  * For the root directory "." and ".." are synthesised, since they are
  * not stored on disk there.
  */
 static int
 msdosfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct mbnambuf nb;
 	int error = 0;
 	int diff;
 	long n;
 	int blsize;
 	long on;
 	u_long cn;
 	uint64_t fileno;
 	u_long dirsperblk;
 	long bias = 0;
 	daddr_t bn, lbn;
 	struct buf *bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 	struct dirent dirbuf;
 	struct uio *uio = ap->a_uio;
 	u_long *cookies = NULL;
 	int ncookies = 0;
 	off_t offset, off;
 	int chksum = -1;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n",
 	    ap->a_vp, uio, ap->a_cred, ap->a_eofflag);
 #endif
 
 	/*
 	 * msdosfs_readdir() won't operate properly on regular files since
 	 * it does i/o only with the filesystem vnode, and hence can
 	 * retrieve the wrong block from the buffer cache for a plain file.
 	 * So, fail attempts to readdir() on a plain file.
 	 */
 	if ((dep->de_Attributes & ATTR_DIRECTORY) == 0)
 		return (ENOTDIR);
 
 	/*
 	 * To be safe, initialize dirbuf
 	 */
 	bzero(dirbuf.d_name, sizeof(dirbuf.d_name));
 
 	/*
 	 * If the user buffer is smaller than the size of one dos directory
 	 * entry or the file offset is not a multiple of the size of a
 	 * directory entry, then we fail the read.
 	 */
 	off = offset = uio->uio_offset;
 	if (uio->uio_resid < sizeof(struct direntry) ||
 	    (offset & (sizeof(struct direntry) - 1)))
 		return (EINVAL);
 
 	if (ap->a_ncookies) {
 		ncookies = uio->uio_resid / 16;
 		MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP,
 		       M_WAITOK);
 		*ap->a_cookies = cookies;
 		*ap->a_ncookies = ncookies;
 	}
 
 	dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 
 	/*
 	 * If they are reading from the root directory then, we simulate
 	 * the . and .. entries since these don't exist in the root
 	 * directory.  We also set the offset bias to make up for having to
 	 * simulate these entries. By this I mean that at file offset 64 we
 	 * read the first entry in the root directory that lives on disk.
 	 */
 	if (dep->de_StartCluster == MSDOSFSROOT
 	    || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) {
 #if 0
 		printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n",
 		    offset);
 #endif
 		bias = 2 * sizeof(struct direntry);
 		if (offset < bias) {
 			for (n = (int)offset / sizeof(struct direntry);
 			     n < 2; n++) {
 				if (FAT32(pmp))
 					fileno = (uint64_t)cntobn(pmp,
 								 pmp->pm_rootdirblk)
 							  * dirsperblk;
 				else
 					fileno = 1;
 				if (pmp->pm_flags & MSDOSFS_LARGEFS) {
 					dirbuf.d_fileno =
 					    msdosfs_fileno_map(pmp->pm_mountp,
 					    fileno);
 				} else {
 
 					dirbuf.d_fileno = (uint32_t)fileno;
 				}
 				dirbuf.d_type = DT_DIR;
 				switch (n) {
 				case 0:
 					dirbuf.d_namlen = 1;
 					strcpy(dirbuf.d_name, ".");
 					break;
 				case 1:
 					dirbuf.d_namlen = 2;
 					strcpy(dirbuf.d_name, "..");
 					break;
 				}
 				dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
 				if (uio->uio_resid < dirbuf.d_reclen)
 					goto out;
 				error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 				if (error)
 					goto out;
 				offset += sizeof(struct direntry);
 				off = offset;
 				if (cookies) {
 					*cookies++ = offset;
 					if (--ncookies <= 0)
 						goto out;
 				}
 			}
 		}
 	}
 
 	mbnambuf_init(&nb);
 	off = offset;
 	while (uio->uio_resid > 0) {
 		/*
 		 * Locate the cluster holding the next entry; "bias" removes
 		 * the space taken by the simulated root "." and "..".
 		 */
 		lbn = de_cluster(pmp, offset - bias);
 		on = (offset - bias) & pmp->pm_crbomask;
 		n = min(pmp->pm_bpcluster - on, uio->uio_resid);
 		diff = dep->de_FileSize - (offset - bias);
 		if (diff <= 0)
 			break;
 		n = min(n, diff);
 		error = pcbmap(dep, lbn, &bn, &cn, &blsize);
 		if (error)
 			break;
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 		n = min(n, blsize - bp->b_resid);
 		if (n == 0) {
 			brelse(bp);
 			return (EIO);
 		}
 
 		/*
 		 * Convert from dos directory entries to fs-independent
 		 * directory entries.
 		 */
 		for (dentp = (struct direntry *)(bp->b_data + on);
 		     (char *)dentp < bp->b_data + on + n;
 		     dentp++, offset += sizeof(struct direntry)) {
 #if 0
 			printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n",
 			    dentp, prev, crnt, dentp->deName[0], dentp->deAttributes);
 #endif
 			/*
 			 * If this is an unused entry, we can stop.
 			 */
 			if (dentp->deName[0] == SLOT_EMPTY) {
 				brelse(bp);
 				goto out;
 			}
 			/*
 			 * Skip deleted entries.
 			 */
 			if (dentp->deName[0] == SLOT_DELETED) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 
 			/*
 			 * Handle Win95 long directory entries
 			 */
 			if (dentp->deAttributes == ATTR_WIN95) {
 				if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 					continue;
 				chksum = win2unixfn(&nb,
 				    (struct winentry *)dentp, chksum, pmp);
 				continue;
 			}
 
 			/*
 			 * Skip volume labels
 			 */
 			if (dentp->deAttributes & ATTR_VOLUME) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 			/*
 			 * This computation of d_fileno must match
 			 * the computation of va_fileid in
 			 * msdosfs_getattr.
 			 */
 			if (dentp->deAttributes & ATTR_DIRECTORY) {
 				fileno = getushort(dentp->deStartCluster);
 				if (FAT32(pmp))
 					fileno |= getushort(dentp->deHighClust) << 16;
 				/* if this is the root directory */
 				/*
 				 * Note: the first "else" below pairs with the
 				 * inner FAT32() test, the second with the
 				 * MSDOSFSROOT test.
 				 */
 				if (fileno == MSDOSFSROOT)
 					if (FAT32(pmp))
 						fileno = (uint64_t)cntobn(pmp,
 								pmp->pm_rootdirblk)
 							 * dirsperblk;
 					else
 						fileno = 1;
 				else
 					fileno = (uint64_t)cntobn(pmp, fileno) *
 					    dirsperblk;
 				dirbuf.d_type = DT_DIR;
 			} else {
 				fileno = (uoff_t)offset /
 				    sizeof(struct direntry);
 				dirbuf.d_type = DT_REG;
 			}
 			if (pmp->pm_flags & MSDOSFS_LARGEFS) {
 				dirbuf.d_fileno =
 				    msdosfs_fileno_map(pmp->pm_mountp, fileno);
 			} else
 				dirbuf.d_fileno = (uint32_t)fileno;
 
 			/*
 			 * Use the accumulated long name only when its
 			 * checksum matches this short entry; otherwise
 			 * fall back to the converted 8.3 name.
 			 */
 			if (chksum != winChksum(dentp)) {
 				dirbuf.d_namlen = dos2unixfn(dentp->deName,
 				    (u_char *)dirbuf.d_name,
 				    dentp->deLowerCase |
 					((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ?
 					(LCASE_BASE | LCASE_EXT) : 0),
 				    pmp);
 				mbnambuf_init(&nb);
 			} else
 				mbnambuf_flush(&nb, &dirbuf);
 			chksum = -1;
 			dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
 			if (uio->uio_resid < dirbuf.d_reclen) {
 				brelse(bp);
 				goto out;
 			}
 			error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 			if (error) {
 				brelse(bp);
 				goto out;
 			}
 			if (cookies) {
 				*cookies++ = offset + sizeof(struct direntry);
 				if (--ncookies <= 0) {
 					brelse(bp);
 					goto out;
 				}
 			}
 			/* Offset just past the last entry handed to the caller. */
 			off = offset + sizeof(struct direntry);
 		}
 		brelse(bp);
 	}
 out:
 	/* Subtract unused cookies */
 	if (ap->a_ncookies)
 		*ap->a_ncookies -= ncookies;
 
 	uio->uio_offset = off;
 
 	/*
 	 * Set the eofflag (NFS uses it)
 	 */
 	if (ap->a_eofflag) {
 		if (dep->de_FileSize - (offset - bias) <= 0)
 			*ap->a_eofflag = 1;
 		else
 			*ap->a_eofflag = 0;
 	}
 	return (error);
 }
 
 /*
  * Translate a logical block of a file into a block of the underlying
  * special file.
  *
  * Arguments (all carried in *ap):
  *   a_vp   - vnode of the file
  *   a_bn   - logical block number within the file (a cluster number here)
  *   a_bop  - if non-NULL, receives the bufobj of the special file that
  *            contains the filesystem
  *   a_bnp  - if non-NULL, receives the "physical" block number for a_bn
  *            (relative to the special file, in DEV_BSIZE units); if NULL
  *            nothing beyond a_bop is computed
  *   a_runp - if non-NULL, receives the number of logical blocks after
  *            a_bn whose physical blocks are contiguous with a_bn's
  *   a_runb - if non-NULL, receives the corresponding count of blocks
  *            before a_bn
  *
  * Returns 0 on success, EFBIG if a_bn does not fit in a cluster number,
  * or the error from pcbmap() for a_bn itself.
  */
 static int
 msdosfs_bmap(struct vop_bmap_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct mount *mp;
 	daddr_t nextbn;
 	u_long cn;
 	int bnpercn, error, maxio, limit, run;
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &pmp->pm_devvp->v_bufobj;
 	if (ap->a_bnp == NULL)
 		return (0);
 
 	/* Start from "no contiguous run" for whichever outputs were asked for. */
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	/* Reject block numbers that do not survive the narrowing to u_long. */
 	cn = ap->a_bn;
 	if (cn != ap->a_bn)
 		return (EFBIG);
 
 	error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (ap->a_runp == NULL && ap->a_runb == NULL)
 		return (0);
 
 	mp = vp->v_mount;
 	maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize;
 	bnpercn = de_cn2bn(pmp, 1);
 
 	if (ap->a_runp != NULL) {
 		/* Walk forward while each cluster directly follows the last. */
 		limit = ulmin(maxio - 1, pmp->pm_maxcluster - cn);
 		run = 1;
 		while (run <= limit &&
 		    pcbmap(dep, cn + run, &nextbn, NULL, NULL) == 0 &&
 		    nextbn == *ap->a_bnp + run * bnpercn)
 			run++;
 		*ap->a_runp = run - 1;
 	}
 	if (ap->a_runb != NULL) {
 		/* Same walk in the backward direction. */
 		limit = ulmin(maxio - 1, cn);
 		run = 1;
 		while (run < limit &&
 		    pcbmap(dep, cn - run, &nextbn, NULL, NULL) == 0 &&
 		    nextbn == *ap->a_bnp - run * bnpercn)
 			run++;
 		*ap->a_runb = run - 1;
 	}
 	return (0);
 }
 
 static int
 msdosfs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct bufobj *bo;
 	int error = 0;
 	daddr_t blkno;
 
 	/*
 	 * If we don't already know the filesystem relative block number
 	 * then get it using pcbmap().  If pcbmap() returns the block
 	 * number as -1 then we've got a hole in the file.  DOS filesystems
 	 * don't allow files with holes, so we shouldn't ever see this.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (error);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if (bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	/*
 	 * Read/write the block from/to the disk that contains the desired
 	 * file block.
 	 */
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = dep->de_pmp->pm_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * VOP_PRINT for msdosfs: print the denode's start cluster, directory
  * cluster, directory offset and device name for ap->a_vp.  Always returns 0.
  */
 static int
 msdosfs_print(struct vop_print_args *ap)
 {
 	struct denode *dep;
 
 	dep = VTODE(ap->a_vp);
 	printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ",
 	    dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset);
 	printf("on dev %s\n", devtoname(dep->de_dev));
 	return (0);
 }
 
 /*
  * VOP_PATHCONF for msdosfs: store the value of the pathconf variable
  * ap->a_name in *ap->a_retval.
  *
  * Returns 0 for the names handled below and EINVAL (leaving *a_retval
  * untouched) for anything else.
  */
 static int
 msdosfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp;
 	int value;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 	case _PC_CHOWN_RESTRICTED:
 		value = 1;
 		break;
 	case _PC_NAME_MAX:
 		/* The limit depends on the mount's long-name flag. */
 		if (pmp->pm_flags & MSDOSFSMNT_LONGNAME)
 			value = WIN_MAXLEN;
 		else
 			value = 12;
 		break;
 	case _PC_PATH_MAX:
 		value = PATH_MAX;
 		break;
 	case _PC_NO_TRUNC:
 		value = 0;
 		break;
 	default:
 		return (EINVAL);
 	}
 	*ap->a_retval = value;
 	return (0);
 }
 
 /*
  * VOP_ADVLOCK for msdosfs: forward the request to lf_advlock() using the
  * denode's lock list and current file size, and return its result.
  */
 static int
 msdosfs_advlock(struct vop_advlock_args *ap)
 {
 	struct denode *dep;
 
 	dep = VTODE(ap->a_vp);
 	return (lf_advlock(ap, &dep->de_lockf, dep->de_FileSize));
 }
 
 /*
  * VOP_VPTOFH for msdosfs: fill the file handle at ap->a_fhp (viewed as a
  * struct defid) with the denode's directory cluster and directory offset.
  * The generation field is not filled in.  Always returns 0.
  */
 static int
 msdosfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct defid *defhp = (struct defid *)ap->a_fhp;
 
 	defhp->defid_len = sizeof(*defhp);
 	defhp->defid_dirclust = dep->de_dirclust;
 	defhp->defid_dirofs = dep->de_diroffset;
 	/* defhp->defid_gen = dep->de_gen; */
 	return (0);
 }
 
 /*
  * Vnode operations vector for msdosfs.  Operations not listed here fall
  * through to default_vnodeops.  Entries are kept in alphabetical order.
  */
 struct vop_vector msdosfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		msdosfs_access,
 	.vop_advlock =		msdosfs_advlock,
 	.vop_bmap =		msdosfs_bmap,
 	.vop_cachedlookup =	msdosfs_lookup,
 	.vop_close =		msdosfs_close,
 	.vop_create =		msdosfs_create,
 	.vop_fsync =		msdosfs_fsync,
 	.vop_getattr =		msdosfs_getattr,
 	.vop_inactive =		msdosfs_inactive,
 	.vop_link =		msdosfs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		msdosfs_mkdir,
 	.vop_mknod =		msdosfs_mknod,
 	.vop_open =		msdosfs_open,
 	.vop_pathconf =		msdosfs_pathconf,
 	.vop_print =		msdosfs_print,
 	.vop_read =		msdosfs_read,
 	.vop_readdir =		msdosfs_readdir,
 	.vop_reclaim =		msdosfs_reclaim,
 	.vop_remove =		msdosfs_remove,
 	.vop_rename =		msdosfs_rename,
 	.vop_rmdir =		msdosfs_rmdir,
 	.vop_setattr =		msdosfs_setattr,
 	.vop_strategy =		msdosfs_strategy,
 	.vop_symlink =		msdosfs_symlink,
 	.vop_vptofh =		msdosfs_vptofh,
 	.vop_write =		msdosfs_write,
 };
Index: head/sys/fs/ntfs/ntfs_vfsops.c
===================================================================
--- head/sys/fs/ntfs/ntfs_vfsops.c	(revision 175201)
+++ head/sys/fs/ntfs/ntfs_vfsops.c	(revision 175202)
@@ -1,795 +1,795 @@
 /*	$NetBSD: ntfs_vfsops.c,v 1.23 1999/11/15 19:38:14 jdolecek Exp $	*/
 
 /*-
  * Copyright (c) 1998, 1999 Semen Ustimenko
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/conf.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/systm.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 /*#define NTFS_DEBUG 1*/
 #include <fs/ntfs/ntfs.h>
 #include <fs/ntfs/ntfs_inode.h>
 #include <fs/ntfs/ntfs_subr.h>
 #include <fs/ntfs/ntfs_vfsops.h>
 #include <fs/ntfs/ntfs_ihash.h>
 #include <fs/ntfs/ntfsmount.h>
 
 /*
  * malloc(9) types used by the NTFS code.  M_NTFSMNT is private to this
  * file; the other three are defined without "static".
  */
 static MALLOC_DEFINE(M_NTFSMNT, "ntfs_mount", "NTFS mount structure");
 MALLOC_DEFINE(M_NTFSNTNODE,"ntfs_ntnode",  "NTFS ntnode information");
 MALLOC_DEFINE(M_NTFSFNODE,"ntfs_fnode",  "NTFS fnode information");
 MALLOC_DEFINE(M_NTFSDIR,"ntfs_dir",  "NTFS dir buffer");
 
 struct sockaddr;
 
 /* Helpers local to this file. */
 static int	ntfs_mountfs(register struct vnode *, struct mount *, 
 				  struct thread *);
 static int	ntfs_calccfree(struct ntfsmount *ntmp, cn_t *cfreep);
 
 /* VFS operations; all but ntfs_vgetex() are wired into ntfs_vfsops below. */
 static vfs_init_t       ntfs_init;
 static vfs_uninit_t     ntfs_uninit;
 static vfs_vget_t       ntfs_vget;
 static vfs_fhtovp_t     ntfs_fhtovp;
 static vfs_cmount_t     ntfs_cmount;
 static vfs_mount_t      ntfs_mount;
 static vfs_root_t       ntfs_root;
 static vfs_statfs_t     ntfs_statfs;
 static vfs_unmount_t    ntfs_unmount;
 
 /* Strategy routine installed on every NTFS vnode's bufobj. */
 static b_strategy_t     ntfs_bufstrategy;
 
 /*
  * Buffer operations installed on NTFS vnodes.
  *
  * VOP_BMAP is punted on for NTFS, so buffer I/O has to be issued through
  * the file's own vnode instead of the underlying device's.
  */
 static struct buf_ops ntfs_vnbufops = {
 	.bop_name	= "NTFS",
 	.bop_strategy	= ntfs_bufstrategy,
 };
 
 /*
  * VFS init hook: set up the ntnode hash and the toupper state.
  * The vfsconf argument is not used.  Always returns 0.
  */
 static int
 ntfs_init(struct vfsconf *vcp)
 {
 
 	ntfs_nthashinit();
 	ntfs_toupper_init();
 	return (0);
 }
 
 /*
  * VFS uninit hook: tear down what ntfs_init() set up, in reverse order.
  * The vfsconf argument is not used.  Always returns 0.
  */
 static int
 ntfs_uninit(struct vfsconf *vcp)
 {
 
 	ntfs_toupper_destroy();
 	ntfs_nthashdestroy();
 	return (0);
 }
 
 static int
 ntfs_cmount ( 
 	struct mntarg *ma,
 	void *data,
 	int flags,
 	struct thread *td )
 {
 	int error;
 	struct ntfs_args args;
 
 	error = copyin(data, (caddr_t)&args, sizeof args);
 	if (error)
 		return (error);
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
 	ma = mount_argf(ma, "uid", "%d", args.uid);
 	ma = mount_argf(ma, "gid", "%d", args.gid);
 	ma = mount_argf(ma, "mode", "%d", args.mode);
 	ma = mount_argb(ma, args.flag & NTFS_MFLAG_CASEINS, "nocaseins");
 	ma = mount_argb(ma, args.flag & NTFS_MFLAG_ALLNAMES, "noallnames");
 	if (args.flag & NTFS_MFLAG_KICONV) {
 		ma = mount_argsu(ma, "cs_ntfs", args.cs_ntfs, 64);
 		ma = mount_argsu(ma, "cs_local", args.cs_local, 64);
 	}
 
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /* NULL-terminated option name list passed to vfs_filteropt() in ntfs_mount(). */
 static const char *ntfs_opts[] = {
 	"from",
 	"export",
 	"uid",
 	"gid",
 	"mode",
 	"caseins",
 	"allnames",
 	"kiconv",
 	"cs_ntfs",
 	"cs_local",
 	NULL
 };
 
 /*
  * VFS mount entry point for NTFS.
  *
  * Checks the option list against ntfs_opts, fetches the "from" option,
  * looks that path up, checks it with vn_isdisk() and passes the device
  * vnode to ntfs_mountfs().  For MNT_UPDATE only the "export" option is
  * accepted; any other update request fails with EINVAL.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 ntfs_mount ( 
 	struct mount *mp,
 	struct thread *td )
 {
 	int		err = 0, error;
 	struct vnode	*devvp;
 	struct nameidata ndp;
 	char *from;
 
 	if (vfs_filteropt(mp->mnt_optnew, ntfs_opts))
 		return (EINVAL);
 
 	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)	
 		return (error);
 
 	/*
 	 * If updating, only an export request is honoured; every other
 	 * kind of update is rejected.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			/* Process export requests in vfs_mount.c */
 			goto success;
 		} else {
 			printf("ntfs_mount(): MNT_UPDATE not supported\n");
 			err = EINVAL;
 			goto error_1;
 		}
 	}
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	err = namei(&ndp);
 	if (err) {
 		/* can't get devvp!*/
 		goto error_1;
 	}
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 
 	if (!vn_isdisk(devvp, &err))  {
 		vput(devvp);
 		return (err);
 	}
 
 	/*
 	 * Every MNT_UPDATE request has already left the function via the
 	 * gotos above, so only the "new mount" arm below can be reached.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 #if 0
 		/*
 		 ********************
 		 * UPDATE
 		 ********************
 		 */
 
 		if (devvp != ntmp->um_devvp)
 			err = EINVAL;	/* needs translation */
 		vput(devvp);
 		if (err)
 			return (err);
 #endif
 	} else {
 		/*
 		 ********************
 		 * NEW MOUNT
 		 ********************
 		 */
 
 		/*
 		 * Since this is a new mount, we want the names for
 		 * the device and the mount point copied in.  If an
 		 * error occurs, the mountpoint is discarded by the
 		 * upper level code.  Note that vfs_mount() handles
 		 * copying the mountpoint f_mntonname for us, so we
 		 * don't have to do it here unless we want to set it
 		 * to something other than "path" for some reason.
 		 */
 		/* Save "mounted from" info for mount point (NULL pad)*/
 		vfs_mountedfrom(mp, from);
 
 		err = ntfs_mountfs(devvp, mp, td);
 	}
 	if (err) {
 		/* ntfs_mountfs() has already unlocked devvp, hence vrele(). */
 		vrele(devvp);
 		return (err);
 	}
 
 	goto success;
 
 error_1:	/* no state to back out*/
 	/* XXX: missing NDFREE(&ndp, ...) */
 
 success:
 	return(err);
 }
 
 /*
  * Common code for mount and mountroot
  *
  * Opens the device through GEOM, reads the boot block into a freshly
  * allocated struct ntfsmount, applies the mount options, pins the $MFT,
  * root and $BitMap vnodes, counts the free clusters and loads the
  * attribute definition table.
  *
  * devvp is unlocked here whether or not the GEOM open succeeds.
  * Returns 0 on success or an errno value; the GEOM consumer is closed
  * again on the "out" path.
  */
 int
 ntfs_mountfs(devvp, mp, td)
 	register struct vnode *devvp;
 	struct mount *mp;
 	struct thread *td;
 {
 	struct buf *bp;
 	struct ntfsmount *ntmp;
 	struct cdev *dev = devvp->v_rdev;
 	int error, ronly, i, v;
 	struct vnode *vp;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	char *cs_ntfs, *cs_local;
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	DROP_GIANT();
 	g_topology_lock();
 
 	/*
 	 * XXX: Do not allow more than one consumer to open a device
 	 *      associated with a particular GEOM provider.
 	 *      This disables multiple read-only mounts of a device,
 	 *      but it gets rid of panics in vget() when you try to
 	 *      mount the same device more than once.
 	 */
 	pp = g_dev_getprovider(devvp->v_rdev);
  	if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) 
 		error = EPERM;
 	else 
 		error = g_vfs_open(devvp, &cp, "ntfs", ronly ? 0 : 1);
 
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 
 	bp = NULL;
 
 	/* Read the boot block and keep a copy of it in the mount structure. */
 	error = bread(devvp, BBLOCK, BBSIZE, NOCRED, &bp);
 	if (error)
 		goto out;
 	/*
 	 * NOTE(review): from here on, neither "out1" nor "out" frees ntmp,
 	 * so every later failure appears to leak it -- confirm.
 	 */
 	ntmp = malloc( sizeof *ntmp, M_NTFSMNT, M_WAITOK | M_ZERO);
 	bcopy( bp->b_data, &ntmp->ntm_bootfile, sizeof(struct bootfile) );
 	/*
 	 * We must not cache the boot block if its size is not exactly
 	 * one cluster in order to avoid confusing the buffer cache when
 	 * the boot file is read later by ntfs_readntvattr_plain(), which
 	 * reads a cluster at a time.
 	 */
 	if (ntfs_cntob(1) != BBSIZE)
 		bp->b_flags |= B_NOCACHE;
 	brelse( bp );
 	bp = NULL;
 
 	/* Refuse anything whose system id is not the NTFS signature. */
 	if (strncmp((const char *)ntmp->ntm_bootfile.bf_sysid, NTFS_BBID, NTFS_BBIDLEN)) {
 		error = EINVAL;
 		dprintf(("ntfs_mountfs: invalid boot block\n"));
 		goto out;
 	}
 
 	/*
 	 * Derive the MFT record size in sectors: a positive ntm_mftrecsz
 	 * is multiplied by ntm_spc, otherwise 2^(-ntm_mftrecsz) is divided
 	 * by ntm_bps.
 	 * NOTE(review): ntm_bps is not checked for zero before the division.
 	 */
 	{
 		int8_t cpr = ntmp->ntm_mftrecsz;
 		if( cpr > 0 )
 			ntmp->ntm_bpmftrec = ntmp->ntm_spc * cpr;
 		else
 			ntmp->ntm_bpmftrec = (1 << (-cpr)) / ntmp->ntm_bps;
 	}
 	dprintf(("ntfs_mountfs(): bps: %d, spc: %d, media: %x, mftrecsz: %d (%d sects)\n",
 		ntmp->ntm_bps,ntmp->ntm_spc,ntmp->ntm_bootfile.bf_media,
 		ntmp->ntm_mftrecsz,ntmp->ntm_bpmftrec));
 	dprintf(("ntfs_mountfs(): mftcn: 0x%x|0x%x\n",
 		(u_int32_t)ntmp->ntm_mftcn,(u_int32_t)ntmp->ntm_mftmirrcn));
 
 	/* Record the mount options; absent ones keep the zeroed defaults. */
 	ntmp->ntm_mountp = mp;
 	ntmp->ntm_devvp = devvp;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v))
 		ntmp->ntm_uid = v;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v))
 		ntmp->ntm_gid = v;
 	if (1 == vfs_scanopt(mp->mnt_optnew, "mode", "%d", &v))
 		ntmp->ntm_mode = v & ACCESSPERMS;
 	vfs_flagopt(mp->mnt_optnew,
 	    "caseins", &ntmp->ntm_flag, NTFS_MFLAG_CASEINS);
 	vfs_flagopt(mp->mnt_optnew,
 	    "allnames", &ntmp->ntm_flag, NTFS_MFLAG_ALLNAMES);
 	ntmp->ntm_cp = cp;
 	ntmp->ntm_bo = &devvp->v_bufobj;
 
 	/* A missing charset option (ENOENT) is not treated as an error. */
 	cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error);
 	if (error && error != ENOENT)
 		goto out;
 	cs_ntfs = vfs_getopts(mp->mnt_optnew, "cs_ntfs", &error);
 	if (error && error != ENOENT)
 		goto out;
 	/* Copy in the 8-bit to Unicode conversion table */
 	/* Initialize Unicode to 8-bit table from 8toU table */
 	ntfs_82u_init(ntmp, cs_local, cs_ntfs);
 	if (cs_local != NULL && cs_ntfs != NULL)
 		ntfs_u28_init(ntmp, NULL, cs_local, cs_ntfs);
 	else
 		ntfs_u28_init(ntmp, ntmp->ntm_82u, cs_local, cs_ntfs);
 
 	mp->mnt_data = ntmp;
 
 	dprintf(("ntfs_mountfs(): case-%s,%s uid: %d, gid: %d, mode: %o\n",
 		(ntmp->ntm_flag & NTFS_MFLAG_CASEINS)?"insens.":"sens.",
 		(ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES)?" allnames,":"",
 		ntmp->ntm_uid, ntmp->ntm_gid, ntmp->ntm_mode));
 
 	/*
 	 * Read in some system nodes and keep a reference on each, so
 	 * that they are not reclaimed and are always accessible.
 	 */ 
 	{
 		int pi[3] = { NTFS_MFTINO, NTFS_ROOTINO, NTFS_BITMAPINO };
 		for (i=0; i<3; i++) {
 			error = VFS_VGET(mp, pi[i], LK_EXCLUSIVE,
 					 &(ntmp->ntm_sysvn[pi[i]]));
 			if(error)
 				goto out1;
 			ntmp->ntm_sysvn[pi[i]]->v_vflag |= VV_SYSTEM;
 			/* Take an extra reference, then drop the lock and the VFS_VGET one. */
 			VREF(ntmp->ntm_sysvn[pi[i]]);
 			vput(ntmp->ntm_sysvn[pi[i]]);
 		}
 	}
 
 	/* read the Unicode lowercase --> uppercase translation table,
 	 * if necessary */
 	if ((error = ntfs_toupper_use(mp, ntmp)))
 		goto out1;
 
 	/*
 	 * Scan $BitMap and count free clusters
 	 */
 	error = ntfs_calccfree(ntmp, &ntmp->ntm_cfree);
 	if(error)
 		goto out1;
 
 	/*
 	 * Read and translate to internal format attribute
 	 * definition file. 
 	 */
 	{
 		int num,j;
 		struct attrdef ad;
 
 		/* Open $AttrDef */
 		error = VFS_VGET(mp, NTFS_ATTRDEFINO, LK_EXCLUSIVE, &vp );
 		if(error) 
 			goto out1;
 
 		/*
 		 * Count valid entries
 		 * NOTE(review): the "goto out1" exits in the two loops below
 		 * skip the vput(vp) at the end of this block -- confirm
 		 * whether vp is left locked and referenced on error.
 		 */
 		for(num=0;;num++) {
 			error = ntfs_readattr(ntmp, VTONT(vp),
 					NTFS_A_DATA, NULL,
 					num * sizeof(ad), sizeof(ad),
 					&ad, NULL);
 			if (error)
 				goto out1;
 			if (ad.ad_name[0] == 0)
 				break;
 		}
 
 		/* Alloc memory for attribute definitions */
 		MALLOC(ntmp->ntm_ad, struct ntvattrdef *,
 			num * sizeof(struct ntvattrdef),
 			M_NTFSMNT, M_WAITOK);
 
 		ntmp->ntm_adnum = num;
 
 		/* Read them and translate */
 		for(i=0;i<num;i++){
 			error = ntfs_readattr(ntmp, VTONT(vp),
 					NTFS_A_DATA, NULL,
 					i * sizeof(ad), sizeof(ad),
 					&ad, NULL);
 			if (error)
 				goto out1;
 			/* Copy the name including its terminator; j ends one past it. */
 			j = 0;
 			do {
 				ntmp->ntm_ad[i].ad_name[j] = ad.ad_name[j];
 			} while(ad.ad_name[j++]);
 			ntmp->ntm_ad[i].ad_namelen = j - 1;
 			ntmp->ntm_ad[i].ad_type = ad.ad_type;
 		}
 
 		vput(vp);
 	}
 
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_maxsymlinklen = 0;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (0);
 
 	/* Failure after system vnodes may have been pinned: drop them and flush. */
 out1:
 	for(i=0;i<NTFS_SYSNODESNUM;i++)
 		if(ntmp->ntm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]);
 
 	if (vflush(mp, 0, 0, td))
 		dprintf(("ntfs_mountfs: vflush failed\n"));
 
 	/* Common failure exit: release any held buffer and close the consumer. */
 out:
 	if (bp)
 		brelse(bp);
 
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	
 	return (error);
 }
 
 /*
  * VFS unmount entry point for NTFS.
  *
  * Flushes the ordinary vnodes, makes sure nobody else still holds one of
  * the pinned system vnodes, releases and flushes those, closes the GEOM
  * consumer and frees the per-mount state.
  *
  * Returns the error of the first vflush() if it fails, EBUSY if a system
  * vnode has more than one reference, and otherwise the result of the
  * second vflush() (teardown is completed even if that one fails).
  */
 static int
 ntfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
 	struct ntfsmount *ntmp;
 	int error, flags, i;
 
 	dprintf(("ntfs_unmount: unmounting...\n"));
 	ntmp = VFSTONTFS(mp);
 
 	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 
 	dprintf(("ntfs_unmount: vflushing...\n"));
 	error = vflush(mp, 0, flags | SKIPSYSTEM, td);
 	if (error != 0) {
 		printf("ntfs_unmount: vflush failed: %d\n",error);
 		return (error);
 	}
 
 	/* Only our own references to the system vnodes may remain. */
 	for (i = 0; i < NTFS_SYSNODESNUM; i++) {
 		if (ntmp->ntm_sysvn[i] != NULL &&
 		    vrefcnt(ntmp->ntm_sysvn[i]) > 1)
 			return (EBUSY);
 	}
 
 	/* Drop the references held on the system vnodes. */
 	for (i = 0; i < NTFS_SYSNODESNUM; i++) {
 		if (ntmp->ntm_sysvn[i] != NULL)
 			vrele(ntmp->ntm_sysvn[i]);
 	}
 
 	/* Second pass, this time including the system vnodes. */
 	error = vflush(mp, 0, flags, td);
 	if (error != 0)
 		printf("ntfs_unmount: vflush failed(sysnodes): %d\n",error);
 
 	vinvalbuf(ntmp->ntm_devvp, V_SAVE, td, 0, 0);
 
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(ntmp->ntm_cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	vrele(ntmp->ntm_devvp);
 
 	/* Release the toupper table if this was the last mounted NTFS volume. */
 	ntfs_toupper_unuse();
 
 	dprintf(("ntfs_umount: freeing memory...\n"));
 	ntfs_u28_uninit(ntmp);
 	ntfs_82u_uninit(ntmp);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	FREE(ntmp->ntm_ad, M_NTFSMNT);
 	FREE(ntmp, M_NTFSMNT);
 	return (error);
 }
 
 static int
 ntfs_root(
 	struct mount *mp,
 	int flags,
 	struct vnode **vpp,
 	struct thread *td )
 {
 	struct vnode *nvp;
 	int error = 0;
 
 	dprintf(("ntfs_root(): sysvn: %p\n",
 		VFSTONTFS(mp)->ntm_sysvn[NTFS_ROOTINO]));
 	error = VFS_VGET(mp, (ino_t)NTFS_ROOTINO, LK_EXCLUSIVE, &nvp);
 	if(error) {
 		printf("ntfs_root: VFS_VGET failed: %d\n",error);
 		return (error);
 	}
 
 	*vpp = nvp;
 	return (0);
 }
 
 /*
  * Count the clear bits in the $BitMap data attribute and store the total
  * in *cfreep.
  *
  * The whole attribute is read into a temporary buffer that is released
  * before returning.  Returns 0 on success; on a read error the
  * ntfs_readattr() error is returned and *cfreep is not written.
  */
 static int
 ntfs_calccfree(struct ntfsmount *ntmp, cn_t *cfreep)
 {
 	struct vnode *vp = ntmp->ntm_sysvn[NTFS_BITMAPINO];
 	u_int8_t *bitmap;
 	size_t bmsize, byte;
 	long nfree;
 	int bit, error;
 
 	bmsize = VTOF(vp)->f_size;
 
 	MALLOC(bitmap, u_int8_t *, bmsize, M_TEMP, M_WAITOK);
 
 	error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL,
 	    0, bmsize, bitmap, NULL);
 	if (error == 0) {
 		nfree = 0;
 		for (byte = 0; byte < bmsize; byte++) {
 			for (bit = 0; bit < 8; bit++) {
 				/* Each clear bit is counted as one free cluster. */
 				if ((bitmap[byte] & (1 << bit)) == 0)
 					nfree++;
 			}
 		}
 		*cfreep = nfree;
 	}
 
 	FREE(bitmap, M_TEMP);
 	return (error);
 }
 
 /*
  * VFS statfs entry point: fill *sbp from the cached boot block geometry,
  * the free cluster count and the allocated size of the $MFT.
  *
  * The file counts are derived by dividing by the MFT record size.  That
  * size is computed from boot block fields at mount time, so a zero value
  * is guarded against here: both file counts are then reported as 0 rather
  * than dividing by zero.  The "td" argument is not used.
  *
  * Always returns 0.
  */
 static int
 ntfs_statfs(
 	struct mount *mp,
 	struct statfs *sbp,
 	struct thread *td)
 {
 	struct ntfsmount *ntmp = VFSTONTFS(mp);
 	u_int64_t mftallocated, mftrecbytes;
 
 	dprintf(("ntfs_statfs():\n"));
 
 	mftallocated = VTOF(ntmp->ntm_sysvn[NTFS_MFTINO])->f_allocated;
 	mftrecbytes = ntfs_bntob(ntmp->ntm_bpmftrec);
 
 	sbp->f_type = mp->mnt_vfc->vfc_typenum;
 	sbp->f_bsize = ntmp->ntm_bps;
 	sbp->f_iosize = ntmp->ntm_bps * ntmp->ntm_spc;
 	sbp->f_blocks = ntmp->ntm_bootfile.bf_spv;
 	sbp->f_bfree = sbp->f_bavail = ntfs_cntobn(ntmp->ntm_cfree);
 	if (ntmp->ntm_bpmftrec != 0 && mftrecbytes != 0) {
 		sbp->f_ffree = sbp->f_bfree / ntmp->ntm_bpmftrec;
 		sbp->f_files = mftallocated / mftrecbytes + sbp->f_ffree;
 	} else {
 		/* Degenerate record size: report no file slots at all. */
 		sbp->f_ffree = 0;
 		sbp->f_files = 0;
 	}
 	sbp->f_flags = mp->mnt_flag;
 
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 ntfs_fhtovp(
 	struct mount *mp,
 	struct fid *fhp,
 	struct vnode **vpp)
 {
 	struct vnode *nvp;
 	struct ntfid *ntfhp = (struct ntfid *)fhp;
 	int error;
 
 	ddprintf(("ntfs_fhtovp(): %d\n", ntfhp->ntfid_ino));
 
 	if ((error = VFS_VGET(mp, ntfhp->ntfid_ino, LK_EXCLUSIVE, &nvp)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	/* XXX as unlink/rmdir/mkdir/creat are not currently possible
 	 * with NTFS, we don't need to check anything else for now */
 	*vpp = nvp;
 	vnode_create_vobject(nvp, VTOF(nvp)->f_size, curthread);
 	return (0);
 }
 
 /*
  * Look up (or create) the vnode for attribute attrtype/attrname of inode
  * "ino" on mount "mp".
  *
  * lkflags is passed to vget()/vn_lock(); flags may contain
  * VG_DONTLOADIN (skip loading the ntnode), VG_DONTVALIDFN (skip
  * validating the fnode) and VG_EXT (treat the fnode as VNON with zero
  * size).  *vpp is set to NULL up front and to the vnode on success.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ntfs_vgetex(
 	struct mount *mp,
 	ino_t ino,
 	u_int32_t attrtype,
 	char *attrname,
 	u_long lkflags,
 	u_long flags,
 	struct thread *td,
 	struct vnode **vpp) 
 {
 	int error;
 	register struct ntfsmount *ntmp;
 	struct ntnode *ip;
 	struct fnode *fp;
 	struct vnode *vp;
 	enum vtype f_type;
 
 	dprintf(("ntfs_vgetex: ino: %d, attr: 0x%x:%s, lkf: 0x%lx, f: 0x%lx\n",
 		ino, attrtype, attrname?attrname:"", (u_long)lkflags,
 		(u_long)flags));
 
 	ntmp = VFSTONTFS(mp);
 	*vpp = NULL;
 
 	/* Get ntnode */
 	error = ntfs_ntlookup(ntmp, ino, &ip);
 	if (error) {
 		printf("ntfs_vget: ntfs_ntget failed\n");
 		return (error);
 	}
 
 	/* It may be not initialized fully, so force load it */
 	if (!(flags & VG_DONTLOADIN) && !(ip->i_flag & IN_LOADED)) {
 		error = ntfs_loadntnode(ntmp, ip);
 		if(error) {
 			printf("ntfs_vget: CAN'T LOAD ATTRIBUTES FOR INO: %d\n",
 			       ip->i_number);
 			ntfs_ntput(ip);
 			return (error);
 		}
 	}
 
 	/* Find or create the fnode describing the requested attribute. */
 	error = ntfs_fget(ntmp, ip, attrtype, attrname, &fp);
 	if (error) {
 		printf("ntfs_vget: ntfs_fget failed\n");
 		ntfs_ntput(ip);
 		return (error);
 	}
 
 	/*
 	 * Classify a not-yet-validated fnode: the unnamed data attribute of
 	 * a directory becomes VDIR, VG_EXT requests stay VNON with zero
 	 * size, everything else is a regular file whose sizes are looked up.
 	 */
 	f_type = VNON;
 	if (!(flags & VG_DONTVALIDFN) && !(fp->f_flag & FN_VALID)) {
 		if ((ip->i_frflag & NTFS_FRFLAG_DIR) &&
 		    (fp->f_attrtype == NTFS_A_DATA && fp->f_attrname == NULL)) {
 			f_type = VDIR;
 		} else if (flags & VG_EXT) {
 			f_type = VNON;
 			fp->f_size = fp->f_allocated = 0;
 		} else {
 			f_type = VREG;	
 
 			error = ntfs_filesize(ntmp, fp, 
 					      &fp->f_size, &fp->f_allocated);
 			if (error) {
 				/* NOTE(review): fp is not released on this path -- confirm. */
 				ntfs_ntput(ip);
 				return (error);
 			}
 		}
 
 		fp->f_flag |= FN_VALID;
 	}
 
 	/* The fnode already has a vnode: take a reference and return it. */
 	if (FTOV(fp)) {
 		/* NOTE(review): the return value of vget() is ignored here. */
 		vget(FTOV(fp), lkflags, td);
 		*vpp = FTOV(fp);
 		ntfs_ntput(ip);
 		return (0);
 	}
 
 	error = getnewvnode("ntfs", ntmp->ntm_mountp, &ntfs_vnodeops, &vp);
 	if(error) {
 		ntfs_frele(fp);
 		ntfs_ntput(ip);
 		return (error);
 	}
 	/* XXX: Too early for mpsafe fs, lacks vnode lock */
 	error = insmntque(vp, ntmp->ntm_mountp);
 	if (error) {
 		ntfs_frele(fp);
 		ntfs_ntput(ip);
 		return (error);
 	}
 	dprintf(("ntfs_vget: vnode: %p for ntnode: %d\n", vp,ino));
 
 	/* Tie the fnode and the new vnode together. */
 	fp->f_vp = vp;
 	vp->v_data = fp;
 	vp->v_type = f_type;
 
 	/* Buffer I/O on this vnode goes through ntfs_bufstrategy(). */
 	vp->v_bufobj.bo_ops = &ntfs_vnbufops;
 	vp->v_bufobj.bo_private = vp;
 
 	if (ino == NTFS_ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 
 	ntfs_ntput(ip);
 
 	/* Lock the new vnode only if the caller asked for a lock type. */
 	if (lkflags & LK_TYPE_MASK) {
-		error = vn_lock(vp, lkflags, td);
+		error = vn_lock(vp, lkflags);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 	}
 
 	*vpp = vp;
 	return (0);
 	
 }
 
 /*
  * VFS vget entry point: fetch the vnode for the unnamed data attribute of
  * inode "ino" by calling ntfs_vgetex() with no extra flags and the current
  * thread.  Returns whatever ntfs_vgetex() returns.
  */
 static int
 ntfs_vget(struct mount *mp, ino_t ino, int lkflags, struct vnode **vpp)
 {
 
 	return (ntfs_vgetex(mp, ino, NTFS_A_DATA, NULL, lkflags, 0,
 	    curthread, vpp));
 }
 
 /*
  * bufobj strategy routine for NTFS vnodes: recover the vnode stored in
  * bo_private and issue VOP_STRATEGY() on it.  Both the bufobj/vnode
  * pairing and a zero return from VOP_STRATEGY() are asserted with
  * KASSERT().
  */
 static void
 ntfs_bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	struct vnode *vp = bo->bo_private;
 	int rc;
 
 	KASSERT(bo == &vp->v_bufobj, ("BO/VP mismatch: vp %p bo %p != %p",
 	    vp, &vp->v_bufobj, bo));
 
 	rc = VOP_STRATEGY(vp, bp);
 
 	KASSERT(rc == 0, ("NTFS VOP_STRATEGY failed: bp=%p, "
 		"vp=%p, rc=%d", bp, vp, rc));
 }
 
 /*
  * VFS operations vector for NTFS (entries in alphabetical order), followed
  * by the filesystem and module version registration.
  */
 static struct vfsops ntfs_vfsops = {
 	.vfs_cmount =	ntfs_cmount,
 	.vfs_fhtovp =	ntfs_fhtovp,
 	.vfs_init =	ntfs_init,
 	.vfs_mount =	ntfs_mount,
 	.vfs_root =	ntfs_root,
 	.vfs_statfs =	ntfs_statfs,
 	.vfs_uninit =	ntfs_uninit,
 	.vfs_unmount =	ntfs_unmount,
 	.vfs_vget =	ntfs_vget,
 };
 VFS_SET(ntfs_vfsops, ntfs, 0);
 MODULE_VERSION(ntfs, 1);
Index: head/sys/fs/ntfs/ntfs_vnops.c
===================================================================
--- head/sys/fs/ntfs/ntfs_vnops.c	(revision 175201)
+++ head/sys/fs/ntfs/ntfs_vnops.c	(revision 175202)
@@ -1,777 +1,777 @@
 /*	$NetBSD: ntfs_vnops.c,v 1.23 1999/10/31 19:45:27 jdolecek Exp $	*/
 
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * John Heidemann of the UCLA Ficus project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/dirent.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
 #include <sys/sysctl.h>
 
 /*#define NTFS_DEBUG 1*/
 #include <fs/ntfs/ntfs.h>
 #include <fs/ntfs/ntfs_inode.h>
 #include <fs/ntfs/ntfs_subr.h>
 
 #include <sys/unistd.h> /* for pathconf(2) constants */
 
 /* Forward declarations of the vnode operation handlers in this file. */
 static vop_read_t	ntfs_read;
 static vop_write_t	ntfs_write;
 static vop_getattr_t	ntfs_getattr;
 static vop_inactive_t	ntfs_inactive;
 static vop_reclaim_t	ntfs_reclaim;
 static vop_bmap_t	ntfs_bmap;
 static vop_strategy_t	ntfs_strategy;
 static vop_access_t	ntfs_access;
 static vop_open_t	ntfs_open;
 static vop_close_t	ntfs_close;
 static vop_readdir_t	ntfs_readdir;
 static vop_cachedlookup_t	ntfs_lookup;
 static vop_fsync_t	ntfs_fsync;
 static vop_pathconf_t	ntfs_pathconf;
 static vop_vptofh_t	ntfs_vptofh;
 
 /* Checked by ntfs_inactive() and ntfs_reclaim() before calling vprint(). */
 int	ntfs_prtactive = 1;	/* 1 => print out reclaim of active vnodes */
 
 /*
  * VOP_BMAP for NTFS: an identity mapping.  Each non-NULL output pointer
  * is filled in: the vnode's own bufobj, the unchanged block number, and
  * zero-length runs in both directions.  Always returns 0.
  */
 int
 ntfs_bmap(struct vop_bmap_args *ap)
 {
 
 	dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn));
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn;
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 	return (0);
 }
 
 /*
  * VOP_READ for NTFS: copy file data to the caller's uio one cluster-sized
  * buffer at a time.
  *
  * Reads that start beyond the end of the file return 0 without
  * transferring anything; otherwise the transfer is clipped to the file
  * size.  Returns 0 on success or the first bread()/uiomove() error.
  */
 static int
 ntfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fnode *fp = VTOF(vp);
 	struct ntnode *ip = FTONT(fp);
 	struct uio *uio = ap->a_uio;
 	/* Kept under this name: the ntfs_* macros are invoked without a mount argument. */
 	struct ntfsmount *ntmp = ip->i_mp;
 	struct buf *bp;
 	daddr_t cn;
 	int error, off, resid, toread;
 
 	dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg));
 
 	dprintf(("ntfs_read: filesize: %d",(u_int32_t)fp->f_size));
 
 	/* Nothing to hand out past the end of the file. */
 	if (uio->uio_offset > fp->f_size)
 		return (0);
 
 	resid = MIN(uio->uio_resid, fp->f_size - uio->uio_offset);
 
 	dprintf((", resid: %d\n", resid));
 
 	for (error = 0; resid != 0; resid -= toread - off) {
 		cn = ntfs_btocn(uio->uio_offset);
 		off = ntfs_btocnoff(uio->uio_offset);
 
 		/* End offset within this cluster, capped at the cluster size. */
 		toread = MIN(off + resid, ntfs_cntob(1));
 
 		error = bread(vp, cn, ntfs_cntob(1), NOCRED, &bp);
 		if (error == 0)
 			error = uiomove(bp->b_data + off, toread - off, uio);
 		brelse(bp);
 		if (error != 0)
 			break;
 	}
 
 	return (error);
 }
 
 static int
 ntfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	register struct vattr *vap = ap->a_vap;
 
 	dprintf(("ntfs_getattr: %d, flags: %d\n",ip->i_number,ip->i_flag));
 
 	vap->va_fsid = dev2udev(ip->i_dev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mp->ntm_mode;
 	vap->va_nlink = (ip->i_nlink || ip->i_flag & IN_LOADED ? ip->i_nlink : 1);
 	vap->va_uid = ip->i_mp->ntm_uid;
 	vap->va_gid = ip->i_mp->ntm_gid;
 	vap->va_rdev = 0;				/* XXX UNODEV ? */
 	vap->va_size = fp->f_size;
 	vap->va_bytes = fp->f_allocated;
 	vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access);
 	vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write);
 	vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create);
 	vap->va_flags = ip->i_flag;
 	vap->va_gen = 0;
 	vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps;
 	vap->va_type = vp->v_type;
 	vap->va_filerev = 0;
 	return (0);
 }
 
 /*
  * VOP_INACTIVE for NTFS.  Apart from an optional diagnostic when the vnode
  * still has references, nothing is done.  Always returns 0.
  */
 int
 ntfs_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 #ifdef NTFS_DEBUG
 	struct ntnode *ip = VTONT(vp);
 #endif
 
 	dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number));
 
 	if (ntfs_prtactive && vrefcnt(vp) != 0)
 		vprint("ntfs_inactive: pushing active", vp);
 
 	/*
 	 * XXX no filesystem changes are supported right now, so there is
 	 * nothing further to do here.
 	 */
 	return (0);
 }
 
 /*
  * Reclaim an fnode/ntnode so that it can be used for other purposes.
  */
 int
 ntfs_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	int error;
 
 	dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number));
 
 	if (ntfs_prtactive && vrefcnt(vp) != 0)
 		vprint("ntfs_reclaim: pushing active", vp);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	if ((error = ntfs_ntget(ip)) != 0)
 		return (error);
 	
 	/* Purge old data structures associated with the inode. */
 	ntfs_frele(fp);
 	ntfs_ntput(ip);
 	vp->v_data = NULL;
 
 	return (0);
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  */
 int
 ntfs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct buf *a_bp;
 	} */ *ap;
 {
 	register struct buf *bp = ap->a_bp;
 	register struct vnode *vp = ap->a_vp;
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	struct ntfsmount *ntmp = ip->i_mp;
 	int error;
 
 	dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n",
 		(u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno,
 		(u_int32_t)bp->b_lblkno));
 
 	dprintf(("strategy: bcount: %d flags: 0x%x\n", 
 		(u_int32_t)bp->b_bcount,bp->b_flags));
 
 	if (bp->b_iocmd == BIO_READ) {
 		u_int32_t toread;
 
 		if (ntfs_cntob(bp->b_blkno) >= fp->f_size) {
 			clrbuf(bp);
 			error = 0;
 		} else {
 			toread = MIN(bp->b_bcount,
 				 fp->f_size-ntfs_cntob(bp->b_blkno));
 			dprintf(("ntfs_strategy: toread: %d, fsize: %d\n",
 				toread,(u_int32_t)fp->f_size));
 
 			error = ntfs_readattr(ntmp, ip, fp->f_attrtype,
 				fp->f_attrname, ntfs_cntob(bp->b_blkno),
 				toread, bp->b_data, NULL);
 
 			if (error) {
 				printf("ntfs_strategy: ntfs_readattr failed\n");
 				bp->b_error = error;
 				bp->b_ioflags |= BIO_ERROR;
 			}
 
 			bzero(bp->b_data + toread, bp->b_bcount - toread);
 		}
 	} else {
 		size_t tmp;
 		u_int32_t towrite;
 
 		if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) {
 			printf("ntfs_strategy: CAN'T EXTEND FILE\n");
 			bp->b_error = error = EFBIG;
 			bp->b_ioflags |= BIO_ERROR;
 		} else {
 			towrite = MIN(bp->b_bcount,
 				fp->f_size-ntfs_cntob(bp->b_blkno));
 			dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n",
 				towrite,(u_int32_t)fp->f_size));
 
 			error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,	
 				fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite,
 				bp->b_data, &tmp, NULL);
 
 			if (error) {
 				printf("ntfs_strategy: ntfs_writeattr fail\n");
 				bp->b_error = error;
 				bp->b_ioflags |= BIO_ERROR;
 			}
 		}
 	}
 	bufdone(bp);
 	return (error);
 }
 
 static int
 ntfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	struct uio *uio = ap->a_uio;
 	struct ntfsmount *ntmp = ip->i_mp;
 	u_int64_t towrite;
 	size_t written;
 	int error;
 
 	dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg));
 	dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size));
 
 	if (uio->uio_resid + uio->uio_offset > fp->f_size) {
 		printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n");
 		return (EFBIG);
 	}
 
 	towrite = MIN(uio->uio_resid, fp->f_size - uio->uio_offset);
 
 	dprintf((", towrite: %d\n",(u_int32_t)towrite));
 
 	error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,
 		fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio);
 #ifdef NTFS_DEBUG
 	if (error)
 		printf("ntfs_write: ntfs_writeattr failed: %d\n", error);
 #endif
 
 	return (error);
 }
 
 int
 ntfs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct ntnode *ip = VTONT(vp);
 	mode_t mode = ap->a_mode;
 #ifdef QUOTA
 	int error;
 #endif
 
 	dprintf(("ntfs_access: %d\n",ip->i_number));
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (mode & VWRITE) {
 		switch ((int)vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 #ifdef QUOTA
 			if (error = getinoquota(ip))
 				return (error);
 #endif
 			break;
 		}
 	}
 
 	return (vaccess(vp->v_type, ip->i_mp->ntm_mode, ip->i_mp->ntm_uid,
 	    ip->i_mp->ntm_gid, ap->a_mode, ap->a_cred, NULL));
 } 
 
 /*
  * Open called.
  *
  * Nothing to do.
  */
 /* ARGSUSED */
 static int
 ntfs_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 #ifdef NTFS_DEBUG
 	register struct vnode *vp = ap->a_vp;
 	register struct ntnode *ip = VTONT(vp);
 
 	printf("ntfs_open: %d\n",ip->i_number);
 #endif
 
 	vnode_create_vobject(ap->a_vp, VTOF(ap->a_vp)->f_size, ap->a_td);
 
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Update the times on the inode.
  */
 /* ARGSUSED */
 static int
 ntfs_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 #ifdef NTFS_DEBUG
 	register struct vnode *vp = ap->a_vp;
 	register struct ntnode *ip = VTONT(vp);
 
 	printf("ntfs_close: %d\n",ip->i_number);
 #endif
 
 	return (0);
 }
 
 int
 ntfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_ncookies;
 		u_int **cookies;
 	} */ *ap;
 {
 	register struct vnode *vp = ap->a_vp;
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	struct uio *uio = ap->a_uio;
 	struct ntfsmount *ntmp = ip->i_mp;
 	int i, j, error = 0;
 	wchar c;
 	u_int32_t faked = 0, num;
 	int ncookies = 0;
 	struct dirent cde;
 	off_t off;
 
 	dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid));
 
 	off = uio->uio_offset;
 
 	/* Simulate . in every dir except ROOT */
 	if( ip->i_number != NTFS_ROOTINO ) {
 		struct dirent dot = { NTFS_ROOTINO,
 				sizeof(struct dirent), DT_DIR, 1, "." };
 
 		if( uio->uio_offset < sizeof(struct dirent) ) {
 			dot.d_fileno = ip->i_number;
 			error = uiomove((char *)&dot,sizeof(struct dirent),uio);
 			if(error)
 				return (error);
 
 			ncookies ++;
 		}
 	}
 
 	/* Simulate .. in every dir including ROOT */
 	if( uio->uio_offset < 2 * sizeof(struct dirent) ) {
 		struct dirent dotdot = { NTFS_ROOTINO,
 				sizeof(struct dirent), DT_DIR, 2, ".." };
 
 		error = uiomove((char *)&dotdot,sizeof(struct dirent),uio);
 		if(error)
 			return (error);
 
 		ncookies ++;
 	}
 
 	faked = (ip->i_number == NTFS_ROOTINO) ? 1 : 2;
 	num = uio->uio_offset / sizeof(struct dirent) - faked;
 
 	while( uio->uio_resid >= sizeof(struct dirent) ) {
 		struct attr_indexentry *iep;
 
 		error = ntfs_ntreaddir(ntmp, fp, num, &iep);
 
 		if(error)
 			return (error);
 
 		if( NULL == iep )
 			break;
 
 		for(; !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent));
 			iep = NTFS_NEXTREC(iep, struct attr_indexentry *))
 		{
 			if(!ntfs_isnamepermitted(ntmp,iep))
 				continue;
 
 			for(i=0, j=0; i<iep->ie_fnamelen; i++, j++) {
 				c = NTFS_U28(iep->ie_fname[i]);
 				if (c&0xFF00)
 					cde.d_name[j++] = (char)(c>>8);
 				cde.d_name[j] = (char)c&0xFF;
 			}
 			cde.d_name[j] = '\0';
 			dprintf(("ntfs_readdir: elem: %d, fname:[%s] type: %d, flag: %d, ",
 				num, cde.d_name, iep->ie_fnametype,
 				iep->ie_flag));
 			cde.d_namlen = j;
 			cde.d_fileno = iep->ie_number;
 			cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG;
 			cde.d_reclen = sizeof(struct dirent);
 			dprintf(("%s\n", (cde.d_type == DT_DIR) ? "dir":"reg"));
 
 			error = uiomove((char *)&cde, sizeof(struct dirent), uio);
 			if(error)
 				return (error);
 
 			ncookies++;
 			num++;
 		}
 	}
 
 	dprintf(("ntfs_readdir: %d entries (%d bytes) read\n",
 		ncookies,(u_int)(uio->uio_offset - off)));
 	dprintf(("ntfs_readdir: off: %d resid: %d\n",
 		(u_int32_t)uio->uio_offset,uio->uio_resid));
 
 	if (!error && ap->a_ncookies != NULL) {
 		struct dirent* dpStart;
 		struct dirent* dp;
 		u_long *cookies;
 		u_long *cookiep;
 
 		ddprintf(("ntfs_readdir: %d cookies\n",ncookies));
 		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 			panic("ntfs_readdir: unexpected uio from NFS server");
 		dpStart = (struct dirent *)
 		     ((caddr_t)uio->uio_iov->iov_base -
 			 (uio->uio_offset - off));
 		MALLOC(cookies, u_long *, ncookies * sizeof(u_long),
 		       M_TEMP, M_WAITOK);
 		for (dp = dpStart, cookiep = cookies, i=0;
 		     i < ncookies;
 		     dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) {
 			off += dp->d_reclen;
 			*cookiep++ = (u_int) off;
 		}
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	}
 /*
 	if (ap->a_eofflag)
 	    *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset;
 */
 	return (error);
 }
 
 int
 ntfs_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	register struct vnode *dvp = ap->a_dvp;
 	register struct ntnode *dip = VTONT(dvp);
 	struct ntfsmount *ntmp = dip->i_mp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int error;
 	dprintf(("ntfs_lookup: \"%.*s\" (%ld bytes) in %d\n",
 		(int)cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_namelen,
 		dip->i_number));
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_thread);
 	if(error)
 		return (error);
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	if(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		dprintf(("ntfs_lookup: faking . directory in %d\n",
 			dip->i_number));
 
 		VREF(dvp);
 		*ap->a_vpp = dvp;
 		error = 0;
 	} else if (cnp->cn_flags & ISDOTDOT) {
 		struct ntvattr *vap;
 
 		dprintf(("ntfs_lookup: faking .. directory in %d\n",
 			 dip->i_number));
 
 		error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap);
 		if(error)
 			return (error);
 
 		VOP_UNLOCK(dvp,0,cnp->cn_thread);
 		dprintf(("ntfs_lookup: parentdir: %d\n",
 			 vap->va_a_name->n_pnumber));
 		error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber,
 				 LK_EXCLUSIVE, ap->a_vpp); 
 		ntfs_ntvattrrele(vap);
 		if (error) {
-			vn_lock(dvp,LK_EXCLUSIVE|LK_RETRY,cnp->cn_thread);
+			vn_lock(dvp,LK_EXCLUSIVE|LK_RETRY);
 			return (error);
 		}
 	} else {
 		error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp);
 		if (error) {
 			dprintf(("ntfs_ntlookupfile: returned %d\n", error));
 			return (error);
 		}
 
 		dprintf(("ntfs_lookup: found ino: %d\n", 
 			VTONT(*ap->a_vpp)->i_number));
 	}
 
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, *ap->a_vpp, cnp);
 
 	return (error);
 }
 
 /*
  * Flush the blocks of a file to disk.
  *
  * This function is worthless for vnodes that represent directories. Maybe we
  * could just do a sync if they try an fsync on a directory file.
  */
 static int
 ntfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to NTFS filesystem
  */
 int
 ntfs_pathconf(ap)
 	struct vop_pathconf_args *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NTFS_MAXFILENAME;
 		return (0);
 	case _PC_PATH_MAX:
 		*ap->a_retval = PATH_MAX;
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 0;
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* NOTREACHED */
 }
 
 int
 ntfs_vptofh(ap)
 	struct vop_vptofh_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fhp;
 	} */ *ap;
 {
 	register struct ntnode *ntp;
 	register struct ntfid *ntfhp;
 
 	ddprintf(("ntfs_fhtovp(): %p\n", ap->a_vp));
 
 	ntp = VTONT(ap->a_vp);
 	ntfhp = (struct ntfid *)ap->a_fhp;
 	ntfhp->ntfid_len = sizeof(struct ntfid);
 	ntfhp->ntfid_ino = ntp->i_number;
 	/* ntfhp->ntfid_gen = ntp->i_gen; */
 	return (0);
 }
 
 /*
  * Global vfs data structures
  */
 struct vop_vector ntfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		ntfs_access,
 	.vop_bmap =		ntfs_bmap,
 	.vop_cachedlookup =	ntfs_lookup,
 	.vop_close =		ntfs_close,
 	.vop_fsync =		ntfs_fsync,
 	.vop_getattr =		ntfs_getattr,
 	.vop_inactive =		ntfs_inactive,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_open =		ntfs_open,
 	.vop_pathconf =		ntfs_pathconf,
 	.vop_read =		ntfs_read,
 	.vop_readdir =		ntfs_readdir,
 	.vop_reclaim =		ntfs_reclaim,
 	.vop_strategy =		ntfs_strategy,
 	.vop_write =		ntfs_write,
 	.vop_vptofh =		ntfs_vptofh,
 };
Index: head/sys/fs/nullfs/null_subr.c
===================================================================
--- head/sys/fs/nullfs/null_subr.c	(revision 175201)
+++ head/sys/fs/nullfs/null_subr.c	(revision 175202)
@@ -1,349 +1,349 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)null_subr.c	8.7 (Berkeley) 5/14/95
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 
 #include <fs/nullfs/null.h>
 
 #define LOG2_SIZEVNODE 8		/* log2(sizeof struct vnode) */
 #define	NNULLNODECACHE 16
 
 /*
  * Null layer cache:
  * Each cache entry holds a reference to the lower vnode
  * along with a pointer to the alias vnode.  When an
  * entry is added the lower vnode is VREF'd.  When the
  * alias is removed the lower vnode is vrele'd.
  */
 
 #define	NULL_NHASH(vp) \
 	(&null_node_hashtbl[(((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash])
 
 static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl;
 static u_long null_node_hash;
 struct mtx null_hashmtx;
 
 static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
 MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
 
 static struct vnode * null_hashget(struct mount *, struct vnode *);
 static struct vnode * null_hashins(struct mount *, struct null_node *);
 
 /*
  * Initialise cache headers
  */
 int
 nullfs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 
 	NULLFSDEBUG("nullfs_init\n");		/* printed during system boot */
 	null_node_hashtbl = hashinit(NNULLNODECACHE, M_NULLFSHASH, &null_node_hash);
 	mtx_init(&null_hashmtx, "nullhs", NULL, MTX_DEF);
 	return (0);
 }
 
 int
 nullfs_uninit(vfsp)
 	struct vfsconf *vfsp;
 {
 
 	mtx_destroy(&null_hashmtx);
 	free(null_node_hashtbl, M_NULLFSHASH);
 	return (0);
 }
 
 /*
  * Return a VREF'ed alias for lower vnode if already exists, else 0.
  * Lower vnode should be locked on entry and will be left locked on exit.
  */
 static struct vnode *
 null_hashget(mp, lowervp)
 	struct mount *mp;
 	struct vnode *lowervp;
 {
 	struct thread *td = curthread;	/* XXX */
 	struct null_node_hashhead *hd;
 	struct null_node *a;
 	struct vnode *vp;
 	int error;
 
 	ASSERT_VOP_LOCKED(lowervp, "null_hashget");
 
 	/*
 	 * Find hash base, and then search the (two-way) linked
 	 * list looking for a null_node structure which is referencing
 	 * the lower vnode.  If found, the increment the null_node
 	 * reference count (but NOT the lower vnode's VREF counter).
 	 */
 	hd = NULL_NHASH(lowervp);
 	mtx_lock(&null_hashmtx);
 	LIST_FOREACH(a, hd, null_hash) {
 		if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
 			vp = NULLTOV(a);
 			VI_LOCK(vp);
 			mtx_unlock(&null_hashmtx);
 			/*
 			 * We need to clear the OWEINACT flag here as this
 			 * may lead vget() to try to lock our vnode which
 			 * is already locked via lowervp.
 			 */
 			vp->v_iflag &= ~VI_OWEINACT;
 			error = vget(vp, LK_INTERLOCK, td);
 			/*
 			 * Since we have the lower node locked the nullfs
 			 * node can not be in the process of recycling.  If
 			 * it had been recycled before we grabed the lower
 			 * lock it would not have been found on the hash.
 			 */
 			if (error)
 				panic("null_hashget: vget error %d", error);
 			return (vp);
 		}
 	}
 	mtx_unlock(&null_hashmtx);
 	return (NULLVP);
 }
 
 /*
  * Act like null_hashget, but add passed null_node to hash if no existing
  * node found.
  */
 static struct vnode *
 null_hashins(mp, xp)
 	struct mount *mp;
 	struct null_node *xp;
 {
 	struct thread *td = curthread;	/* XXX */
 	struct null_node_hashhead *hd;
 	struct null_node *oxp;
 	struct vnode *ovp;
 	int error;
 
 	hd = NULL_NHASH(xp->null_lowervp);
 	mtx_lock(&null_hashmtx);
 	LIST_FOREACH(oxp, hd, null_hash) {
 		if (oxp->null_lowervp == xp->null_lowervp &&
 		    NULLTOV(oxp)->v_mount == mp) {
 			/*
 			 * See null_hashget for a description of this
 			 * operation.
 			 */
 			ovp = NULLTOV(oxp);
 			VI_LOCK(ovp);
 			mtx_unlock(&null_hashmtx);
 			ovp->v_iflag &= ~VI_OWEINACT;
 			error = vget(ovp, LK_INTERLOCK, td);
 			if (error)
 				panic("null_hashins: vget error %d", error);
 			return (ovp);
 		}
 	}
 	LIST_INSERT_HEAD(hd, xp, null_hash);
 	mtx_unlock(&null_hashmtx);
 	return (NULLVP);
 }
 
 static void
 null_insmntque_dtr(struct vnode *vp, void *xp)
 {
 	vp->v_data = NULL;
 	vp->v_vnlock = &vp->v_lock;
 	FREE(xp, M_NULLFSNODE);
 	vp->v_op = &dead_vnodeops;
-	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Make a new or get existing nullfs node.
  * Vp is the alias vnode, lowervp is the lower vnode.
  * 
  * The lowervp assumed to be locked and having "spare" reference. This routine
  * vrele lowervp if nullfs node was taken from hash. Otherwise it "transfers"
  * the caller's "spare" reference to created nullfs vnode.
  */
 int
 null_nodeget(mp, lowervp, vpp)
 	struct mount *mp;
 	struct vnode *lowervp;
 	struct vnode **vpp;
 {
 	struct null_node *xp;
 	struct vnode *vp;
 	int error;
 
 	/* Lookup the hash firstly */
 	*vpp = null_hashget(mp, lowervp);
 	if (*vpp != NULL) {
 		vrele(lowervp);
 		return (0);
 	}
 
 	/*
 	 * We do not serialize vnode creation, instead we will check for
 	 * duplicates later, when adding new vnode to hash.
 	 *
 	 * Note that duplicate can only appear in hash if the lowervp is
 	 * locked LK_SHARED.
 	 */
 
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced
 	 * elsewhere if MALLOC should block.
 	 */
 	MALLOC(xp, struct null_node *, sizeof(struct null_node),
 	    M_NULLFSNODE, M_WAITOK);
 
 	error = getnewvnode("null", mp, &null_vnodeops, &vp);
 	if (error) {
 		FREE(xp, M_NULLFSNODE);
 		return (error);
 	}
 
 	xp->null_vnode = vp;
 	xp->null_lowervp = lowervp;
 	vp->v_type = lowervp->v_type;
 	vp->v_data = xp;
 	vp->v_vnlock = lowervp->v_vnlock;
 	if (vp->v_vnlock == NULL)
 		panic("null_nodeget: Passed a NULL vnlock.\n");
 	error = insmntque1(vp, mp, null_insmntque_dtr, xp);
 	if (error != 0)
 		return (error);
 	/*
 	 * Atomically insert our new node into the hash or vget existing 
 	 * if someone else has beaten us to it.
 	 */
 	*vpp = null_hashins(mp, xp);
 	if (*vpp != NULL) {
 		vrele(lowervp);
 		vp->v_vnlock = &vp->v_lock;
 		xp->null_lowervp = NULL;
 		vrele(vp);
 		return (0);
 	}
 	*vpp = vp;
 
 	return (0);
 }
 
 /*
  * Remove node from hash.
  */
 void
 null_hashrem(xp)
 	struct null_node *xp;
 {
 
 	mtx_lock(&null_hashmtx);
 	LIST_REMOVE(xp, null_hash);
 	mtx_unlock(&null_hashmtx);
 }
 
 #ifdef DIAGNOSTIC
 
 #ifdef KDB
 #define	null_checkvp_barrier	1
 #else
 #define	null_checkvp_barrier	0
 #endif
 
 struct vnode *
 null_checkvp(vp, fil, lno)
 	struct vnode *vp;
 	char *fil;
 	int lno;
 {
 	int interlock = 0;
 	struct null_node *a = VTONULL(vp);
 #ifdef notyet
 	/*
 	 * Can't do this check because vop_reclaim runs
 	 * with a funny vop vector.
 	 */
 	if (vp->v_op != null_vnodeop_p) {
 		printf ("null_checkvp: on non-null-node\n");
 		while (null_checkvp_barrier) /*WAIT*/ ;
 		panic("null_checkvp");
 	};
 #endif
 	if (a->null_lowervp == NULLVP) {
 		/* Should never happen */
 		int i; u_long *p;
 		printf("vp = %p, ZERO ptr\n", (void *)vp);
 		for (p = (u_long *) a, i = 0; i < 8; i++)
 			printf(" %lx", p[i]);
 		printf("\n");
 		/* wait for debugger */
 		while (null_checkvp_barrier) /*WAIT*/ ;
 		panic("null_checkvp");
 	}
 	if (mtx_owned(VI_MTX(vp)) != 0) {
 		VI_UNLOCK(vp);
 		interlock = 1;
 	}
 	if (vrefcnt(a->null_lowervp) < 1) {
 		int i; u_long *p;
 		printf("vp = %p, unref'ed lowervp\n", (void *)vp);
 		for (p = (u_long *) a, i = 0; i < 8; i++)
 			printf(" %lx", p[i]);
 		printf("\n");
 		/* wait for debugger */
 		while (null_checkvp_barrier) /*WAIT*/ ;
 		panic ("null with unref'ed lowervp");
 	};
 	if (interlock != 0)
 		VI_LOCK(vp);
 #ifdef notyet
 	printf("null %x/%d -> %x/%d [%s, %d]\n",
 	        NULLTOV(a), vrefcnt(NULLTOV(a)),
 		a->null_lowervp, vrefcnt(a->null_lowervp),
 		fil, lno);
 #endif
 	return a->null_lowervp;
 }
 #endif
Index: head/sys/fs/nullfs/null_vfsops.c
===================================================================
--- head/sys/fs/nullfs/null_vfsops.c	(revision 175201)
+++ head/sys/fs/nullfs/null_vfsops.c	(revision 175202)
@@ -1,370 +1,370 @@
 /*-
  * Copyright (c) 1992, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)null_vfsops.c	8.2 (Berkeley) 1/21/94
  *
  * @(#)lofs_vfsops.c	1.2 (Berkeley) 6/18/92
  * $FreeBSD$
  */
 
 /*
  * Null Layer
  * (See null_vnops.c for a description of what this does.)
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 
 #include <fs/nullfs/null.h>
 
 static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure");
 
 static vfs_fhtovp_t	nullfs_fhtovp;
 static vfs_mount_t	nullfs_mount;
 static vfs_quotactl_t	nullfs_quotactl;
 static vfs_root_t	nullfs_root;
 static vfs_sync_t	nullfs_sync;
 static vfs_statfs_t	nullfs_statfs;
 static vfs_unmount_t	nullfs_unmount;
 static vfs_vget_t	nullfs_vget;
 static vfs_extattrctl_t	nullfs_extattrctl;
 
 /*
  * Mount null layer
  */
 static int
 nullfs_mount(struct mount *mp, struct thread *td)
 {
 	int error = 0;
 	struct vnode *lowerrootvp, *vp;
 	struct vnode *nullm_rootvp;
 	struct null_mount *xmp;
 	char *target;
 	int isvnunlocked = 0, len;
 	struct nameidata nd, *ndp = &nd;
 
 	NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
 
 	if (mp->mnt_flag & MNT_ROOTFS)
 		return (EOPNOTSUPP);
 	/*
 	 * Update is a no-op
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/*
 		 * Only support update mounts for NFS export.
 		 */
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
 			return (0);
 		else
 			return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Get argument
 	 */
 	error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len);
 	if (error || target[len - 1] != '\0')
 		return (EINVAL);
 
 	/*
 	 * Unlock lower node to avoid deadlock.
 	 * (XXX) VOP_ISLOCKED is needed?
 	 */
 	if ((mp->mnt_vnodecovered->v_op == &null_vnodeops) &&
 		VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) {
 		VOP_UNLOCK(mp->mnt_vnodecovered, 0, td);
 		isvnunlocked = 1;
 	}
 	/*
 	 * Find lower node
 	 */
 	NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF,
 		UIO_SYSSPACE, target, td);
 	error = namei(ndp);
 	/*
 	 * Re-lock vnode.
 	 */
 	if (isvnunlocked && !VOP_ISLOCKED(mp->mnt_vnodecovered, NULL))
-		vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY);
 
 	if (error)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 
 	/*
 	 * Sanity check on lower vnode
 	 */
 	lowerrootvp = ndp->ni_vp;
 
 	/*
 	 * Check multi null mount to avoid `lock against myself' panic.
 	 */
 	if (lowerrootvp == VTONULL(mp->mnt_vnodecovered)->null_lowervp) {
 		NULLFSDEBUG("nullfs_mount: multi null mount?\n");
 		vput(lowerrootvp);
 		return (EDEADLK);
 	}
 
 	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
 				M_NULLFSMNT, M_WAITOK);	/* XXX */
 
 	/*
 	 * Save reference to underlying FS
 	 */
 	xmp->nullm_vfs = lowerrootvp->v_mount;
 
 	/*
 	 * Save reference.  Each mount also holds
 	 * a reference on the root vnode.
 	 */
 	error = null_nodeget(mp, lowerrootvp, &vp);
 	/*
 	 * Make sure the node alias worked
 	 */
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		vrele(lowerrootvp);
 		free(xmp, M_NULLFSMNT);	/* XXX */
 		return (error);
 	}
 
 	/*
 	 * Keep a held reference to the root vnode.
 	 * It is vrele'd in nullfs_unmount.
 	 */
 	nullm_rootvp = vp;
 	nullm_rootvp->v_vflag |= VV_ROOT;
 	xmp->nullm_rootvp = nullm_rootvp;
 
 	/*
 	 * Unlock the node (either the lower or the alias)
 	 */
 	VOP_UNLOCK(vp, 0, td);
 
 	if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_LOCAL;
 		MNT_IUNLOCK(mp);
 	}
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 	mp->mnt_data =  xmp;
 	vfs_getnewfsid(mp);
 
 	vfs_mountedfrom(mp, target);
 
 	NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
 		mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
 	return (0);
 }
 
 /*
  * Free reference to null layer
  */
 static int
 nullfs_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	void *mntdata;
 	int error;
 	int flags = 0;
 
 	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	/* There is 1 extra root vnode reference (nullm_rootvp). */
 	error = vflush(mp, 1, flags, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Finally, throw away the null_mount structure
 	 */
 	mntdata = mp->mnt_data;
 	mp->mnt_data = 0;
 	free(mntdata, M_NULLFSMNT);
 	return 0;
 }
 
 static int
 nullfs_root(mp, flags, vpp, td)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct vnode *vp;
 
 	NULLFSDEBUG("nullfs_root(mp = %p, vp = %p->%p)\n", (void *)mp,
 	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp,
 	    (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp));
 
 	/*
 	 * Return locked reference to root.
 	 */
 	vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
 	VREF(vp);
 
 #ifdef NULLFS_DEBUG
 	if (VOP_ISLOCKED(vp, NULL))
 		panic("root vnode is locked.\n");
 #endif
-	vn_lock(vp, flags | LK_RETRY, td);
+	vn_lock(vp, flags | LK_RETRY);
 	*vpp = vp;
 	return 0;
 }
 
 static int
 nullfs_quotactl(mp, cmd, uid, arg, td)
 	struct mount *mp;
 	int cmd;
 	uid_t uid;
 	void *arg;
 	struct thread *td;
 {
 	return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
 }
 
 static int
 nullfs_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
 	int error;
 	struct statfs mstat;
 
 	NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp,
 	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp,
 	    (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp));
 
 	bzero(&mstat, sizeof(mstat));
 
 	error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
 	if (error)
 		return (error);
 
 	/* now copy across the "interesting" information and fake the rest */
 	sbp->f_type = mstat.f_type;
 	sbp->f_flags = mstat.f_flags;
 	sbp->f_bsize = mstat.f_bsize;
 	sbp->f_iosize = mstat.f_iosize;
 	sbp->f_blocks = mstat.f_blocks;
 	sbp->f_bfree = mstat.f_bfree;
 	sbp->f_bavail = mstat.f_bavail;
 	sbp->f_files = mstat.f_files;
 	sbp->f_ffree = mstat.f_ffree;
 	return (0);
 }
 
 static int
 nullfs_sync(mp, waitfor, td)
 	struct mount *mp;
 	int waitfor;
 	struct thread *td;
 {
 	/*
 	 * XXX - Assumes no data cached at null layer.
 	 */
 	return (0);
 }
 
 static int
 nullfs_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 	int error;
 	error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, flags, vpp);
 	if (error)
 		return (error);
 
 	return (null_nodeget(mp, *vpp, vpp));
 }
 
 static int
 nullfs_fhtovp(mp, fidp, vpp)
 	struct mount *mp;
 	struct fid *fidp;
 	struct vnode **vpp;
 {
 	int error;
 	error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, vpp);
 	if (error)
 		return (error);
 
 	return (null_nodeget(mp, *vpp, vpp));
 }
 
 static int                        
 nullfs_extattrctl(mp, cmd, filename_vp, namespace, attrname, td)
 	struct mount *mp;
 	int cmd;
 	struct vnode *filename_vp;
 	int namespace;
 	const char *attrname;
 	struct thread *td;            
 {
 	return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, filename_vp,
 	    namespace, attrname, td);
 }
 
 
 static struct vfsops null_vfsops = {
 	.vfs_extattrctl =	nullfs_extattrctl,
 	.vfs_fhtovp =		nullfs_fhtovp,
 	.vfs_init =		nullfs_init,
 	.vfs_mount =		nullfs_mount,
 	.vfs_quotactl =		nullfs_quotactl,
 	.vfs_root =		nullfs_root,
 	.vfs_statfs =		nullfs_statfs,
 	.vfs_sync =		nullfs_sync,
 	.vfs_uninit =		nullfs_uninit,
 	.vfs_unmount =		nullfs_unmount,
 	.vfs_vget =		nullfs_vget,
 };
 
 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK);
Index: head/sys/fs/nwfs/nwfs_node.c
===================================================================
--- head/sys/fs/nwfs/nwfs_node.c	(revision 175201)
+++ head/sys/fs/nwfs/nwfs_node.c	(revision 175202)
@@ -1,383 +1,383 @@
 /*-
  * Copyright (c) 1999, 2000 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #include <netncp/ncp.h>
 #include <netncp/ncp_conn.h>
 #include <netncp/ncp_subr.h>
 
 #include <fs/nwfs/nwfs.h>
 #include <fs/nwfs/nwfs_mount.h>
 #include <fs/nwfs/nwfs_node.h>
 #include <fs/nwfs/nwfs_subr.h>
 
 /*
  * Select the hash chain for a fid: index nwhashtbl with the fid's f_id
  * masked by nwnodehash.  The argument must be an lvalue of fid type.
  */
 #define	NWNOHASH(fhsum) (&nwhashtbl[(fhsum.f_id) & nwnodehash])
 
 /*
  * nwnode hash table, its mask (filled in by hashinit()) and the lock
  * taken around lookups, insertions and removals in this file.
  */
 static LIST_HEAD(nwnode_hash_head,nwnode) *nwhashtbl;
 static u_long nwnodehash;
 static struct lock nwhashlock;
 
 static MALLOC_DEFINE(M_NWNODE, "nwfs_node", "NWFS vnode private part");
 static MALLOC_DEFINE(M_NWFSHASH, "nwfs_hash", "NWFS has table");
 
 static int nwfs_sysctl_vnprint(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_vfs_nwfs);
 
 /* vfs.nwfs.vnprint: writable opaque node served by nwfs_sysctl_vnprint(). */
 SYSCTL_PROC(_vfs_nwfs, OID_AUTO, vnprint, CTLFLAG_WR|CTLTYPE_OPAQUE,
 	    NULL, 0, nwfs_sysctl_vnprint, "S,vnlist", "vnode hash");
 
 /*
  * Set up the nwnode hash: allocate the table (sized from desiredvnodes,
  * mask stored in nwnodehash) and initialize the lock used around it.
  */
 void
 nwfs_hash_init(void)
 {
 
 	nwhashtbl = hashinit(desiredvnodes, M_NWFSHASH, &nwnodehash);
 	lockinit(&nwhashlock, PVFS, "nwfshl", 0, 0);
 }
 
 /*
  * Tear down the nwnode hash: destroy its lock, then release the table
  * memory.
  */
 void
 nwfs_hash_free(void)
 {
 
 	lockdestroy(&nwhashlock);
 	free(nwhashtbl, M_NWFSHASH);
 }
 
 /*
  * Sysctl handler that dumps the nwnode hash table with printf().
  * Does nothing unless nwfs_debuglevel is non-zero.  Always returns 0.
  */
 int
 nwfs_sysctl_vnprint(SYSCTL_HANDLER_ARGS)
 {
 	struct nwnode_hash_head *bucket;
 	struct nwnode *node;
 	struct vnode *nodevp;
 	int idx;
 
 	if (nwfs_debuglevel != 0) {
 		printf("Name:uc:hc:fid:pfid\n");
 		/* nwnodehash is the index mask, so it is also the last slot. */
 		for (idx = 0; idx <= nwnodehash; idx++) {
 			bucket = &nwhashtbl[idx];
 			LIST_FOREACH(node, bucket, n_hash) {
 				nodevp = NWTOV(node);
 				vprint("", nodevp);
 				printf("%s:%d:%d:%d:%d\n",node->n_name,
 				    vrefcnt(nodevp), nodevp->v_holdcnt,
 				    node->n_fid.f_id, node->n_fid.f_parent);
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Find the nwnode that belongs to mount nmp and matches fid.
  * The caller is expected to hold the hash lock.
  *
  * Returns 0 and, when npp is non-NULL, stores the node in *npp;
  * returns ENOENT when no such node is hashed.
  */
 static int
 nwfs_hashlookup(struct nwmount *nmp, ncpfid fid, struct nwnode **npp)
 {
 	struct nwnode_hash_head *chain;
 	struct nwnode *cand;
 
 	chain = NWNOHASH(fid);
 	LIST_FOREACH(cand, chain, n_hash) {
 		if (nmp == cand->n_mount && NWCMPF(&fid, &cand->n_fid)) {
 			if (npp != NULL)
 				*npp = cand;
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 /*
  * Find or create the vnode/nwnode pair for fid on mount mp.
  *
  * If a node with this fid is already hashed for the mount it is
  * vget()'ed with LK_EXCLUSIVE (the lookup is retried if vget fails) and,
  * when fap is non-NULL, its n_attr is refreshed.  Otherwise a new nwnode
  * and vnode are allocated, initialized from fap and dvp, inserted into
  * the hash and locked exclusively.
  *
  * Returns 0 with *vpp set, or the error from getnewvnode()/insmntque()
  * with *vpp set to NULL.  Panics if a new node is needed but fap is NULL,
  * or if the entry is not a directory and dvp is NULL.
  *
  * NOTE(review): this comment used to say "Vnode referenced and not
  * locked", but both success paths below take the vnode lock with
  * LK_EXCLUSIVE - confirm the intended contract with the callers.
  */
 static int
 nwfs_allocvp(struct mount *mp, ncpfid fid, struct nw_entry_info *fap,
 	struct vnode *dvp, struct vnode **vpp)
 {
 	struct thread *td = curthread;	/* XXX */
 	struct nwnode *np;
 	struct nwnode_hash_head *nhpp;
 	struct nwmount *nmp = VFSTONWFS(mp);
 	struct vnode *vp;
 	int error;
 
 loop:
 	lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td);
 rescan:
 	/* Reached with nwhashlock held, both from "loop" and from the retry below. */
 	if (nwfs_hashlookup(nmp, fid, &np) == 0) {
 		vp = NWTOV(np);
 		/* Take the interlock before dropping the hash lock; vget consumes it. */
 		mtx_lock(&vp->v_interlock);
 		lockmgr(&nwhashlock, LK_RELEASE, NULL, td);
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td))
 			goto loop;
 		if (fap)
 			np->n_attr = fap->attributes;
 		*vpp = vp;
 		return(0);
 	}
 	lockmgr(&nwhashlock, LK_RELEASE, NULL, td);
 
 	if (fap == NULL || ((fap->attributes & aDIR) == 0 && dvp == NULL))
 		panic("nwfs_allocvp: fap = %p, dvp = %p\n", fap, dvp);
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced
 	 * elsewhere if MALLOC should block.
 	 */
 	MALLOC(np, struct nwnode *, sizeof *np, M_NWNODE, M_WAITOK | M_ZERO);
 	error = getnewvnode("nwfs", mp, &nwfs_vnodeops, &vp);
 	if (error) {
 		*vpp = NULL;
 		FREE(np, M_NWNODE);
 		return (error);
 	}
 	error = insmntque(vp, mp);	/* XXX: Too early for mpsafe fs */
 	if (error != 0) {
 		FREE(np, M_NWNODE);
 		*vpp = NULL;
 		return (error);
 	}
 	/* Tie the vnode and nwnode together and fill in the identity fields. */
 	vp->v_data = np;
 	np->n_vnode = vp;
 	np->n_mount = nmp;
 	np->n_attr = fap->attributes;
 	vp->v_type = np->n_attr & aDIR ? VDIR : VREG;
 	np->n_fid = fid;
 	if (dvp) {
 		np->n_parent = VTONW(dvp)->n_fid;
 	}
 	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 	lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td);
 	/*
 	 * Another process can create vnode while we blocked in malloc() or
 	 * getnewvnode(). Rescan list again.
 	 */
 	if (nwfs_hashlookup(nmp, fid, NULL) == 0) {
 		/* Lost the race: undo our node and retry with the hash lock still held. */
 		vp->v_data = NULL;
 		np->n_vnode = NULL;
 		vrele(vp);
 		FREE(np, M_NWNODE);
 		goto rescan;
 	}
 	*vpp = vp;
 	nhpp = NWNOHASH(fid);
 	LIST_INSERT_HEAD(nhpp, np, n_hash);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	lockmgr(&nwhashlock, LK_RELEASE, NULL, td);
 	
 	/* NOTE(review): dvp may be NULL here for directories - confirm the assertion tolerates that. */
 	ASSERT_VOP_LOCKED(dvp, "nwfs_allocvp");
 	/* A directory below a non-root parent keeps a reference on that parent. */
 	if (vp->v_type == VDIR && dvp && (dvp->v_vflag & VV_ROOT) == 0) {
 		np->n_flag |= NREFPARENT;
 		vref(dvp);
 	}
 	return 0;
 }
 
 /*
  * Obtain the vnode for fid via nwfs_allocvp() and, when entry info is
  * supplied in fap, load it into the node's attribute cache.
  *
  * *vpp is NULL on failure and the vnode on success.  Returns 0 or the
  * error from nwfs_allocvp().
  */
 int
 nwfs_nget(struct mount *mp, ncpfid fid, struct nw_entry_info *fap,
 	  struct vnode *dvp, struct vnode **vpp)
 {
 	struct vnode *newvp;
 	int error;
 
 	*vpp = NULL;
 	error = nwfs_allocvp(mp, fid, fap, dvp, &newvp);
 	if (error != 0)
 		return (error);
 	if (fap != NULL)
 		nwfs_attr_cacheenter(newvp, fap);
 	*vpp = newvp;
 	return (0);
 }
 
 /*
  * Locked wrapper around nwfs_hashlookup(): takes nwhashlock exclusively
  * for the duration of the lookup and returns the lookup's result.
  */
 int
 nwfs_lookupnp(struct nwmount *nmp, ncpfid fid, struct thread *td,
 	struct nwnode **npp)
 {
 	int rc;
 
 	lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td);
 	rc = nwfs_hashlookup(nmp, fid, npp);
 	lockmgr(&nwhashlock, LK_RELEASE, NULL, td);
 	return (rc);
 }
 
 /*
  * Reclaim an nwfs vnode: destroy its VM object, unhash and free the
  * attached nwnode, and clear v_data.  If the node was marked
  * NREFPARENT and its parent is still hashed, the parent's vnode is
  * vrele()'d at the end.  Always returns 0.
  */
 int
 nwfs_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct nwnode *np = VTONW(vp);
 	struct nwmount *nmp = VTONWFS(vp);
 	struct nwnode *parentnp;
 	struct vnode *parentvp = NULL;
 
 	NCPVNDEBUG("%s,%d\n", np->n_name, vrefcnt(vp));
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	if ((np->n_flag & NREFPARENT) != 0) {
 		np->n_flag &= ~NREFPARENT;
 		if (nwfs_lookupnp(nmp, np->n_parent, td, &parentnp) != 0) {
 			NCPVNDEBUG("%s: has no parent ?\n",np->n_name);
 		} else {
 			parentvp = parentnp->n_vnode;
 		}
 	}
 
 	/* Take the node off its hash chain under the hash lock. */
 	lockmgr(&nwhashlock, LK_EXCLUSIVE, NULL, td);
 	LIST_REMOVE(np, n_hash);
 	lockmgr(&nwhashlock, LK_RELEASE, NULL, td);
 
 	if (nmp->n_root == np)
 		nmp->n_root = NULL;
 	vp->v_data = NULL;
 	FREE(np, M_NWNODE);
 
 	if (parentvp != NULL)
 		vrele(parentvp);
 	return (0);
 }
 
 int
 nwfs_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct thread *td = ap->a_td;
 	struct ucred *cred = td->td_ucred;
 	struct vnode *vp = ap->a_vp;
 	struct nwnode *np = VTONW(vp);
 	int error;
 
 	NCPVNDEBUG("%s: %d\n", VTONW(vp)->n_name, vrefcnt(vp));
 	if (np->opened) {
 		error = nwfs_vinvalbuf(vp, td);
 		error = ncp_close_file(NWFSTOCONN(VTONWFS(vp)), &np->n_fh, td, cred);
 		np->opened = 0;
 	}
 	if (np->n_flag & NSHOULDFREE) {
 		cache_purge(vp);
 		vgone(vp);
 	}
 	return (0);
 }
 /*
  * Routines to maintain the per-node vnode attribute cache.
  *
  * nwfs_attr_cacheenter: unpack the NetWare entry info in fi into the
  * node's cached struct vattr (np->n_vattr) and related nwnode fields
  * (name, size, DOS directory number), then stamp the cache with the
  * current time.  For vnodes that are neither VREG nor VDIR only
  * va_type and the name are updated before returning.
  */
 void
 nwfs_attr_cacheenter(struct vnode *vp, struct nw_entry_info *fi)
 {
 	struct nwnode *np = VTONW(vp);
 	struct nwmount *nmp = VTONWFS(vp);
 	struct vattr *va = &np->n_vattr;
 
 	va->va_type = vp->v_type;		/* vnode type (for create) */
 	/* NOTE(review): nameLen is not checked against the size of n_name here - confirm the bound. */
 	np->n_nmlen = fi->nameLen;
 	bcopy(fi->entryName, np->n_name, np->n_nmlen);
 	np->n_name[fi->nameLen] = 0;
 	if (vp->v_type == VREG) {
 		/* Only tell the pager when the size actually changed. */
 		if (va->va_size != fi->dataStreamSize) {
 			va->va_size = fi->dataStreamSize;
 			vnode_pager_setsize(vp, va->va_size);
 		}
 		va->va_mode = nmp->m.file_mode;	/* files access mode and type */
 	} else if (vp->v_type == VDIR) {
 		va->va_size = 16384; 		/* should be a better way ... */
 		va->va_mode = nmp->m.dir_mode;	/* files access mode and type */
 	} else
 		return;
 	np->n_size = va->va_size;
 	va->va_nlink = 1;		/* number of references to file */
 	va->va_uid = nmp->m.uid;	/* owner user id */
 	va->va_gid = nmp->m.gid;	/* owner group id */
 	va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	va->va_fileid = np->n_fid.f_id;	/* file id */
 	/* A zero f_id is reported as NWFS_ROOT_INO instead. */
 	if (va->va_fileid == 0)
 		va->va_fileid = NWFS_ROOT_INO;
 	va->va_blocksize=nmp->connh->nh_conn->buffer_size;/* blocksize preferred for i/o */
 	/* time of last modification */
 	ncp_dos2unixtime(fi->modifyDate, fi->modifyTime, 0, nmp->m.tz, &va->va_mtime);
 	/* time of last access */
 	ncp_dos2unixtime(fi->lastAccessDate, 0, 0, nmp->m.tz, &va->va_atime);
 	va->va_ctime = va->va_mtime;	/* time file changed */
 	va->va_gen = VNOVAL;		/* generation number of file */
 	va->va_flags = 0;		/* flags defined for file */
 	va->va_rdev = VNOVAL;		/* device the special file represents */
 	va->va_bytes = va->va_size;	/* bytes of disk space held by file */
 	va->va_filerev = 0;		/* file modification number */
 	va->va_vaflags = 0;		/* operations flags */
 	/* va already points at np->n_vattr, so this assignment copies the struct onto itself. */
 	np->n_vattr = *va;
 	/* Record the first modification time seen for this node. */
 	if (np->n_mtime == 0) {
 		np->n_mtime = va->va_mtime.tv_sec;
 	}
 	/* n_atime is the cache timestamp checked by nwfs_attr_cachelookup(). */
 	np->n_atime = time_second;
 	np->n_dosfid = fi->DosDirNum;
 	return;
 }
 
 /*
  * Copy the node's cached attributes into *va if the cache entry is at
  * most two seconds old.  Returns 0 on a hit and ENOENT when the cached
  * data is older than that (in which case *va is left untouched).
  */
 int
 nwfs_attr_cachelookup(struct vnode *vp, struct vattr *va)
 {
 	struct nwnode *np = VTONW(vp);
 	int age;
 
 	age = time_second - np->n_atime;
 	if (age <= 2) {		/* XXX should be configurable */
 		*va = np->n_vattr;
 		return (0);
 	}
 	return (ENOENT);
 }
Index: head/sys/fs/nwfs/nwfs_vnops.c
===================================================================
--- head/sys/fs/nwfs/nwfs_vnops.c	(revision 175201)
+++ head/sys/fs/nwfs/nwfs_vnops.c	(revision 175202)
@@ -1,981 +1,981 @@
 /*-
  * Copyright (c) 1999, 2000, 2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <machine/mutex.h>
 
 #include <netncp/ncp.h>
 #include <netncp/ncp_conn.h>
 #include <netncp/ncp_subr.h>
 #include <netncp/nwerror.h>
 #include <netncp/ncp_nls.h>
 
 #include <fs/nwfs/nwfs.h>
 #include <fs/nwfs/nwfs_node.h>
 #include <fs/nwfs/nwfs_subr.h>
 
 /*
  * Prototypes for the NWFS vnode operations defined in this file.
  * Each is declared through the matching vop_*_t function typedef.
  */
 static vop_create_t	nwfs_create;
 static vop_mknod_t	nwfs_mknod;
 static vop_open_t	nwfs_open;
 static vop_close_t	nwfs_close;
 static vop_access_t	nwfs_access;
 static vop_getattr_t	nwfs_getattr;
 static vop_setattr_t	nwfs_setattr;
 static vop_read_t	nwfs_read;
 static vop_write_t	nwfs_write;
 static vop_fsync_t	nwfs_fsync;
 static vop_remove_t	nwfs_remove;
 static vop_link_t	nwfs_link;
 static vop_lookup_t	nwfs_lookup;
 static vop_rename_t	nwfs_rename;
 static vop_mkdir_t	nwfs_mkdir;
 static vop_rmdir_t	nwfs_rmdir;
 static vop_symlink_t	nwfs_symlink;
 static vop_readdir_t	nwfs_readdir;
 static vop_strategy_t	nwfs_strategy;
 static vop_print_t	nwfs_print;
 static vop_pathconf_t	nwfs_pathconf;
 
 /*
  * Global vfs data structures for nwfs.
  *
  * Vnode operations vector: operations not listed here fall through to
  * default_vnodeops via .vop_default.
  */
 struct vop_vector nwfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		nwfs_access,
 	.vop_close =		nwfs_close,
 	.vop_create =		nwfs_create,
 	.vop_fsync =		nwfs_fsync,
 	.vop_getattr =		nwfs_getattr,
 	.vop_getpages =		nwfs_getpages,
 	.vop_inactive =		nwfs_inactive,
 	.vop_ioctl =		nwfs_ioctl,
 	.vop_link =		nwfs_link,
 	.vop_lookup =		nwfs_lookup,
 	.vop_mkdir =		nwfs_mkdir,
 	.vop_mknod =		nwfs_mknod,
 	.vop_open =		nwfs_open,
 	.vop_pathconf =		nwfs_pathconf,
 	.vop_print =		nwfs_print,
 	.vop_putpages =		nwfs_putpages,
 	.vop_read =		nwfs_read,
 	.vop_readdir =		nwfs_readdir,
 	.vop_reclaim =		nwfs_reclaim,
 	.vop_remove =		nwfs_remove,
 	.vop_rename =		nwfs_rename,
 	.vop_rmdir =		nwfs_rmdir,
 	.vop_setattr =		nwfs_setattr,
 	.vop_strategy =		nwfs_strategy,
 	.vop_symlink =		nwfs_symlink,
 	.vop_write =		nwfs_write,
 };
 
 /*
  * nwfs_access vnode op
  *
  * Write access to a regular file, directory or symlink on a read-only
  * mount fails with EROFS.  Everything else is decided by vaccess()
  * using the mount-wide uid, gid and file/directory mode.
  */
 static int
 nwfs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct nwmount *nmp = VTONWFS(vp);
 	mode_t mpmode;
 
 	NCPVNDEBUG("\n");
 	if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (vp->v_type == VREG || vp->v_type == VDIR ||
 	    vp->v_type == VLNK))
 		return (EROFS);
 
 	/* Regular files use the file mode; every other type the dir mode. */
 	if (vp->v_type == VREG)
 		mpmode = nmp->m.file_mode;
 	else
 		mpmode = nmp->m.dir_mode;
 	return (vaccess(vp->v_type, mpmode, nmp->m.uid,
 	    nmp->m.gid, ap->a_mode, ap->a_cred, NULL));
 }
 /*
  * nwfs_open vnode op
  *
  * Only regular files and directories can be opened (EACCES otherwise);
  * directories need no work.  For files the cached data is validated
  * against the modification time first, then the file is opened on the
  * server unless the node is already open, in which case only the open
  * count is bumped.  The server open is tried read/write (read-only on a
  * read-only mount); if that fails and the caller did not ask for
  * FWRITE, it is retried read-only.
  */
 /* ARGSUSED */
 static int
 nwfs_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int mode = ap->a_mode;
 	struct nwnode *np = VTONW(vp);
 	struct ncp_open_info no;
 	struct nwmount *nmp = VTONWFS(vp);
 	struct vattr vattr;
 	int error, nwm;
 
 	NCPVNDEBUG("%s,%d\n", np->n_name, np->opened);
 	if (vp->v_type != VREG && vp->v_type != VDIR) { 
 		NCPFATAL("open vtype = %d\n", vp->v_type);
 		return (EACCES);
 	}
 	if (vp->v_type == VDIR) return 0;	/* nothing to do now */
 	if (np->n_flag & NMODIFIED) {
 		/* Locally modified: drop buffers and force a fresh getattr. */
 		if ((error = nwfs_vinvalbuf(vp, ap->a_td)) == EINTR)
 			return (error);
 		np->n_atime = 0;
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error) return (error);
 		np->n_mtime = vattr.va_mtime.tv_sec;
 	} else {
 		/* Not modified locally: drop buffers only if the mtime moved. */
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error) return (error);
 		if (np->n_mtime != vattr.va_mtime.tv_sec) {
 			if ((error = nwfs_vinvalbuf(vp, ap->a_td)) == EINTR)
 				return (error);
 			np->n_mtime = vattr.va_mtime.tv_sec;
 		}
 	}
 	/* Already open on the server: just count the additional open. */
 	if (np->opened) {
 		np->opened++;
 		return 0;
 	}
 	nwm = AR_READ;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
 		nwm |= AR_WRITE;
 	error = ncp_open_create_file_or_subdir(nmp, vp, 0, NULL, OC_MODE_OPEN,
 					       0, nwm, &no, ap->a_td, ap->a_cred);
 	if (error) {
 		/* A writer cannot fall back to a read-only handle. */
 		if (mode & FWRITE)
 			return EACCES;
 		nwm = AR_READ;
 		error = ncp_open_create_file_or_subdir(nmp, vp, 0, NULL, OC_MODE_OPEN, 0,
 						   nwm, &no, ap->a_td, ap->a_cred);
 	}
 	if (!error) {
 		np->opened++;
 		np->n_fh = no.fh;
 		np->n_origfh = no.origfh;
 	}
 	/* Invalidate the attribute cache regardless of the outcome. */
 	np->n_atime = 0;
 	return (error);
 }
 
 /*
  * nwfs_close vnode op
  *
  * Directories need no work.  For files, buffers are invalidated and the
  * open count is decremented; when it reaches zero the file is closed on
  * the server.  The open count is examined under the vnode interlock,
  * which is dropped around nwfs_vinvalbuf() and re-checked afterwards.
  *
  * Returns the result of ncp_close_file() when the last open goes away,
  * otherwise the result of nwfs_vinvalbuf(); 0 if the node was not open.
  */
 static int
 nwfs_close(ap)
 	struct vop_close_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct nwnode *np = VTONW(vp);
 	int error;
 
 	NCPVNDEBUG("name=%s,pid=%d,c=%d\n", np->n_name, ap->a_td->td_proc->p_pid,
 			np->opened);
 
 	if (vp->v_type == VDIR) return 0;	/* nothing to do now */
 	error = 0;
 	mtx_lock(&vp->v_interlock);
 	if (np->opened == 0) {
 		mtx_unlock(&vp->v_interlock);
 		return 0;
 	}
 	mtx_unlock(&vp->v_interlock);
 	error = nwfs_vinvalbuf(vp, ap->a_td);
 	mtx_lock(&vp->v_interlock);
 	/* The interlock was dropped above, so look at the open count again. */
 	if (np->opened == 0) {
 		mtx_unlock(&vp->v_interlock);
 		return 0;
 	}
 	if (--np->opened == 0) {
 		/* Last open gone: release the interlock before the server close. */
 		mtx_unlock(&vp->v_interlock);
 		error = ncp_close_file(NWFSTOCONN(VTONWFS(vp)), &np->n_fh, 
 		   ap->a_td, ap->a_cred);
 	} else
 		mtx_unlock(&vp->v_interlock);
 	/* Invalidate the attribute cache. */
 	np->n_atime = 0;
 	return (error);
 }
 
 /*
  * nwfs_getattr call from vfs.
  *
  * Serves the attributes from the node's cache when it is still fresh;
  * otherwise asks the server (by fid for a volume node, by parent and
  * name for everything else), refills the cache and returns the result.
  * While the file is open the previously known n_size is kept.
  */
 static int
 nwfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *va = ap->a_vap;
 	struct nwnode *np = VTONW(vp);
 	struct nwmount *nmp = VTONWFS(vp);
 	struct nw_entry_info fattr;
 	u_int32_t savedsize;
 	int error;
 
 	NCPVNDEBUG("%lx:%d: '%s' %d\n", (long)vp, nmp->n_volume, np->n_name, (vp->v_vflag & VV_ROOT) != 0);
 	if (nwfs_attr_cachelookup(vp, va) == 0)
 		return (0);
 	NCPVNDEBUG("not in cache\n");
 
 	savedsize = np->n_size;
 	if ((np->n_flag & NVOLUME) != 0) {
 		error = ncp_obtain_info(nmp, np->n_fid.f_id, 0, NULL, &fattr,
 		    ap->a_td, ap->a_cred);
 	} else {
 		error = ncp_obtain_info(nmp, np->n_fid.f_parent, np->n_nmlen,
 		    np->n_name, &fattr, ap->a_td, ap->a_cred);
 	}
 	if (error != 0) {
 		NCPVNDEBUG("error %d\n", error);
 		return (error);
 	}
 
 	nwfs_attr_cacheenter(vp, &fattr);
 	*va = np->n_vattr;
 	/* Cache refill overwrote n_size; restore it for an open file. */
 	if (np->opened)
 		np->n_size = savedsize;
 	return (0);
 }
 /*
  * nwfs_setattr call from vfs.
  *
  * Applies the attributes in ap->a_vap through ncp_setattr().  Changing
  * va_flags is not supported (EOPNOTSUPP).  Changing owner, times, mode
  * or size on a read-only mount fails with EROFS.  A size change is only
  * accepted for regular files (EISDIR for directories, EINVAL for other
  * types); the new size is applied locally first and rolled back if the
  * server request fails.
  *
  * Afterwards the attribute cache is invalidated and the attributes are
  * re-read into ap->a_vap.  Returns 0 on success or the error from
  * ncp_setattr(), so a rejected change is no longer reported as success.
  */
 static int
 nwfs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct nwnode *np = VTONW(vp);
 	struct vattr *vap = ap->a_vap;
 	u_quad_t tsize=0;
 	int error = 0;
 
 	NCPVNDEBUG("\n");
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
 	if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL ||
 	     vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
 	     vap->va_mode != (mode_t)VNOVAL) &&(vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VREG:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			vnode_pager_setsize(vp, (u_long)vap->va_size);
 			/* Remember the old size so a failure can be undone. */
 			tsize = np->n_size;
 			np->n_size = vap->va_size;
 			break;
 		default:
 			return EINVAL;
 		}
 	}
 	error = ncp_setattr(vp, vap, ap->a_cred, ap->a_td);
 	if (error && vap->va_size != VNOVAL) {
 		np->n_size = tsize;
 		vnode_pager_setsize(vp, (u_long)tsize);
 	}
 	np->n_atime = 0;	/* invalidate cache */
 	/*
 	 * Refresh the caller's vattr.  Only pick up the modification time
 	 * if the refresh worked; otherwise *vap still holds the request.
 	 */
 	if (VOP_GETATTR(vp, vap, ap->a_cred, ap->a_td) == 0)
 		np->n_mtime = vap->va_mtime.tv_sec;
 	return (error);
 }
 /*
  * nwfs_read call.
  *
  * Accepts regular files and directories only (EPERM otherwise) and
  * passes the request to nwfs_readvnode().
  */
 static int
 nwfs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	NCPVNDEBUG("nwfs_read:\n");
 
 	if (vp->v_type == VREG || vp->v_type == VDIR)
 		return (nwfs_readvnode(vp, ap->a_uio, ap->a_cred));
 	return (EPERM);
 }
 
 /*
  * nwfs_write call.
  *
  * Accepts regular files only (EPERM otherwise) and passes the request
  * to nwfs_writevnode().
  */
 static int
 nwfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	NCPVNDEBUG("%d,ofs=%d,sz=%d\n", vp->v_type, (int)uio->uio_offset, uio->uio_resid);
 
 	if (vp->v_type == VREG)
 		return (nwfs_writevnode(vp, uio, ap->a_cred, ap->a_ioflag));
 	return (EPERM);
 }
 /*
  * nwfs_create call
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 nwfs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp=ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *vp = (struct vnode *)0;
 	int error = 0, fmode;
 	struct vattr vattr;
 	struct nwnode *np;
 	struct ncp_open_info no;
 	struct nwmount *nmp=VTONWFS(dvp);
 	ncpfid fid;
 	
 
 	NCPVNDEBUG("\n");
 	*vpp = NULL;
 	if (vap->va_type == VSOCK)
 		return (EOPNOTSUPP);
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) {
 		return (error);
 	}
 	fmode = AR_READ | AR_WRITE;
 /*	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= AR_DENY_READ | AR_DENY_WRITE;*/
 	
 	error = ncp_open_create_file_or_subdir(nmp, dvp, cnp->cn_namelen, cnp->cn_nameptr, 
 			   OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE,
 			   0, fmode, &no, cnp->cn_thread, cnp->cn_cred);
 	if (!error) {
 		error = ncp_close_file(NWFSTOCONN(nmp), &no.fh, cnp->cn_thread, cnp->cn_cred);
 		fid.f_parent = VTONW(dvp)->n_fid.f_id;
 		fid.f_id = no.fattr.dirEntNum;
 		error = nwfs_nget(VTOVFS(dvp), fid, &no.fattr, dvp, &vp);
 		if (!error) {
 			np = VTONW(vp);
 			np->opened = 0;
 			*vpp = vp;
 		}
 		if (cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, vp, cnp);
 	}
 	return (error);
 }
 
 /*
  * nwfs_remove call. It isn't possible to emulate UFS behaviour because
  * NetWare doesn't allow delete/rename operations on an opened file.
  *
  * Directories, open files and vnodes with more than one reference are
  * refused with EPERM.  On success the node is flagged NSHOULDFREE; the
  * server status 0x899c is reported as EACCES.
  */
 static int
 nwfs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_dvp;
 		struct vnode * a_vp;
 		struct componentname * a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nwnode *np = VTONW(vp);
 	struct nwmount *nmp = VTONWFS(vp);
 	int error;
 
 	if (vp->v_type == VDIR || np->opened || vrefcnt(vp) != 1)
 		return (EPERM);
 
 	cache_purge(vp);
 	error = ncp_DeleteNSEntry(nmp, VTONW(dvp)->n_fid.f_id,
 	    cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_thread, cnp->cn_cred);
 	if (error == 0) {
 		np->n_flag |= NSHOULDFREE;
 		return (0);
 	}
 	if (error == 0x899c)
 		error = EACCES;
 	return (error);
 }
 
 /*
  * nwfs_file rename call
  *
  * Renames fvp (in fdvp) to the name in tcnp (in tdvp).  Cross-mount
  * renames fail with EXDEV, a target with additional references with
  * EBUSY, and sources that are neither files nor directories with
  * EINVAL.  An existing, different target is deleted on the server
  * first.  Server status 0x899c maps to EACCES and 0x8992 to EEXIST;
  * ENOENT is turned into success (see the kludge note below).
  *
  * All four vnodes are released before returning, on every path.
  */
 static int
 nwfs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct nwmount *nmp=VTONWFS(fvp);
 	u_int16_t oldtype = 6;
 	int error=0;
 
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (tvp && vrefcnt(tvp) > 1) {
 		error = EBUSY;
 		goto out;
 	}
 	if (fvp->v_type == VDIR) {
 		oldtype |= NW_TYPE_SUBDIR;
 	} else if (fvp->v_type == VREG) {
 		oldtype |= NW_TYPE_FILE;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 	/* An existing target has to be removed on the server first. */
 	if (tvp && tvp != fvp) {
 		error = ncp_DeleteNSEntry(nmp, VTONW(tdvp)->n_fid.f_id,
 		    tcnp->cn_namelen, tcnp->cn_nameptr,
 		    tcnp->cn_thread, tcnp->cn_cred);
 		if (error == 0x899c) error = EACCES;
 		if (error)
 			goto out_cacherem;
 	}
 	error = ncp_nsrename(NWFSTOCONN(nmp), nmp->n_volume, nmp->name_space,
 		oldtype, &nmp->m.nls,
 		VTONW(fdvp)->n_fid.f_id, fcnp->cn_nameptr, fcnp->cn_namelen,
 		VTONW(tdvp)->n_fid.f_id, tcnp->cn_nameptr, tcnp->cn_namelen,
 		tcnp->cn_thread, tcnp->cn_cred);
 
 	if (error == 0x8992)
 		error = EEXIST;
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 out_cacherem:
 	nwfs_attr_cacheremove(fdvp);
 	nwfs_attr_cacheremove(tdvp);
 	nwfs_attr_cacheremove(fvp);
 out:
 	/*
 	 * Invalidate the target's cached attributes while our reference
 	 * is still held; after vput() the vnode may no longer carry nwfs
 	 * private data.  On the EXDEV path tvp can belong to another
 	 * mount, in which case it is not ours to touch at all.
 	 */
 	if (tvp != NULL && tvp->v_mount == fvp->v_mount)
 		nwfs_attr_cacheremove(tvp);
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nwfs hard link create call
  * Netware filesystems don't know what links are, so this always fails
  * with EOPNOTSUPP.
  */
 static int
 nwfs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * nwfs_symlink link create call
  * Netware filesystems don't know what symlinks are, so this always
  * fails with EOPNOTSUPP.
  */
 static int
 nwfs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * nwfs_mknod call: not supported, always fails with EOPNOTSUPP.
  */
 static int
 nwfs_mknod(ap)
 	struct vop_mknod_args /* {
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * nwfs_mkdir call
  */
 static int
 nwfs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 /*	struct vattr *vap = ap->a_vap;*/
 	struct componentname *cnp = ap->a_cnp;
 	int len=cnp->cn_namelen;
 	struct ncp_open_info no;
 	struct nwnode *np;
 	struct vnode *newvp = (struct vnode *)0;
 	ncpfid fid;
 	int error = 0;
 	struct vattr vattr;
 	char *name=cnp->cn_nameptr;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) {
 		return (error);
 	}	
 	if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.')))) {
 		return EEXIST;
 	}
 	if (ncp_open_create_file_or_subdir(VTONWFS(dvp), dvp, cnp->cn_namelen,
 			cnp->cn_nameptr, OC_MODE_CREATE, aDIR, 0xffff,
 			&no, cnp->cn_thread, cnp->cn_cred) != 0) {
 		error = EACCES;
 	} else {
 		error = 0;
         }
 	if (!error) {
 		fid.f_parent = VTONW(dvp)->n_fid.f_id;
 		fid.f_id = no.fattr.dirEntNum;
 		error = nwfs_nget(VTOVFS(dvp), fid, &no.fattr, dvp, &newvp);
 		if (!error) {
 			np = VTONW(newvp);
 			newvp->v_type = VDIR;
 			*ap->a_vpp = newvp;
 		}
 	}
 	return (error);
 }
 
 /*
  * nwfs_remove directory call
  */
 static int
 nwfs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nwnode *np = VTONW(vp);
 	struct nwmount *nmp = VTONWFS(vp);
 	struct nwnode *dnp = VTONW(dvp);
 	int error = EIO;
 
 	if (dvp == vp)
 		return EINVAL;
 
 	error = ncp_DeleteNSEntry(nmp, dnp->n_fid.f_id, 
 		cnp->cn_namelen, cnp->cn_nameptr, cnp->cn_thread, cnp->cn_cred);
 	if (error == 0)
 		np->n_flag |= NSHOULDFREE;
 	else if (error == NWE_DIR_NOT_EMPTY)
 		error = ENOTEMPTY;
 	dnp->n_flag |= NMODIFIED;
 	nwfs_attr_cacheremove(dvp);
 	cache_purge(dvp);
 	cache_purge(vp);
 	return (error);
 }
 
 /*
  * nwfs_readdir call
  *
  * Only valid on directories (EPERM otherwise).  Requests that supply
  * a_ncookies are refused with EOPNOTSUPP; everything else is handed to
  * nwfs_readvnode().
  */
 static int
 nwfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		u_long *a_cookies;
 		int a_ncookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type != VDIR)
 		return (EPERM);
 	if (ap->a_ncookies) {
 		printf("nwfs_readdir: no support for cookies now...");
 		return (EOPNOTSUPP);
 	}
 	return (nwfs_readvnode(vp, ap->a_uio, ap->a_cred));
 }
 /*
  * nwfs_fsync call: performs no flushing and always reports success.
  */
 /* ARGSUSED */
 static int
 nwfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_vp;
 		struct ucred * a_cred;
 		int  a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (0);
 }
 
 /*
  * nwfs_print call: print the node's name, fid and parent fid.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 nwfs_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct nwnode *np = VTONW(ap->a_vp);
 
 	printf("\tnwfs node: name = '%s', fid = %d, pfid = %d\n",
 	    np->n_name, np->n_fid.f_id, np->n_fid.f_parent);
 	return (0);
 }
 
 /*
  * nwfs_pathconf call: answer _PC_LINK_MAX (0), _PC_NAME_MAX
  * (NCP_MAX_FILENAME) and _PC_PATH_MAX (NCP_MAXPATHLEN) through
  * *a_retval.  Any other name fails with EINVAL and leaves *a_retval
  * untouched.
  */
 static int
 nwfs_pathconf(ap)
 	struct vop_pathconf_args  /* {
 	struct vnode *vp;
 	int name;
 	register_t *retval;
 	} */ *ap;
 {
 	register_t *retval = ap->a_retval;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*retval = 0;
 		return (0);
 	case _PC_NAME_MAX:
 		*retval = NCP_MAX_FILENAME; /* XXX from nwfsnode */
 		return (0);
 	case _PC_PATH_MAX:
 		*retval = NCP_MAXPATHLEN; /* XXX from nwfsnode */
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 static int nwfs_strategy (ap) 
 	struct vop_strategy_args /* {
 	struct buf *a_bp
 	} */ *ap;
 {
 	struct buf *bp=ap->a_bp;
 	struct ucred *cr;
 	struct thread *td;
 	int error = 0;
 
 	NCPVNDEBUG("\n");
 	if (bp->b_flags & B_ASYNC)
 		td = (struct thread *)0;
 	else
 		td = curthread;	/* XXX */
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 	/*
 	 * If the op is asynchronous and an i/o daemon is waiting
 	 * queue the request, wake it up and wait for completion
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0 )
 		error = nwfs_doio(ap->a_vp, bp, cr, td);
 	return (error);
 }
 
 
 /*
  * How to keep the brain busy ...
  * Currently the lookup routine can perform two lookups for a vnode.  This
  * could be avoided by reorganizing the code.
  *
  * VOP_LOOKUP for nwfs: resolve the component described by ap->a_cnp inside
  * the directory ap->a_dvp and return the resulting vnode through ap->a_vpp.
  *
  * Returns 0 on success, EJUSTRETURN when the last component of a CREATE or
  * RENAME does not exist, or an errno value (ENOTDIR, EIO, EROFS, ENOENT,
  * EISDIR, or whatever the helpers return).
  */
 int
 nwfs_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	int flags = cnp->cn_flags;
 	struct vnode *vp;
 	struct nwmount *nmp;
 	struct mount *mp = dvp->v_mount;
 	struct nwnode *dnp, *npp;
 	struct nw_entry_info fattr, *fap;
 	ncpfid fid;
 	int nameiop=cnp->cn_nameiop, islastcn;
 	int error = 0, notfound;
 	struct thread *td = cnp->cn_thread;
 	/* NUL-terminated copy of the component; only used by NCPVNDEBUG below. */
 	char _name[cnp->cn_namelen+1];
 	bcopy(cnp->cn_nameptr, _name, cnp->cn_namelen);
 	_name[cnp->cn_namelen]=0;
 	
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	/* ".." from the filesystem root is rejected here. */
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) {
 		printf("nwfs_lookup: invalid '..'\n");
 		return EIO;
 	}
 
 	NCPVNDEBUG("%d '%s' in '%s' id=d\n", nameiop, _name, 
 		VTONW(dvp)->n_name/*, VTONW(dvp)->n_name*/);
 
 	islastcn = flags & ISLASTCN;
 	/* Any operation other than LOOKUP on a read-only mount is refused. */
 	if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP))
 		return (EROFS);
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)))
 		return (error);
 	nmp = VFSTONWFS(mp);
 	dnp = VTONW(dvp);
 /*
 printf("dvp %d:%d:%d\n", (int)mp, (int)dvp->v_vflag & VV_ROOT, (int)flags & ISDOTDOT);
 */
 	/*
 	 * Validate the name; the last argument is non-zero only for
 	 * CREATE/RENAME when NWHP_NOSTRICT is not set.  Any failure is
 	 * reported as ENOENT.
 	 */
 	error = ncp_pathcheck(cnp->cn_nameptr, cnp->cn_namelen, &nmp->m.nls, 
 	    (nameiop == CREATE || nameiop == RENAME) && (nmp->m.nls.opt & NWHP_NOSTRICT) == 0);
 	if (error) 
 	    return ENOENT;
 
 	/*
 	 * Name cache: a positive result is returned as an error, zero
 	 * falls through to a real lookup, and any other value is treated
 	 * as a hit.
 	 */
 	error = cache_lookup(dvp, vpp, cnp);
 	NCPVNDEBUG("cache_lookup returned %d\n", error);
 	if (error > 0)
 		return error;
 	if (error) {		/* name was found */
 		struct vattr vattr;
 
 		vp = *vpp;
 		/* Trust the cached vnode only while its ctime still matches. */
 		if (VOP_GETATTR(vp, &vattr, cnp->cn_cred, td) == 0 &&
 		    vattr.va_ctime.tv_sec == VTONW(vp)->n_ctime) {
 			if (nameiop != LOOKUP && islastcn)
 				cnp->cn_flags |= SAVENAME;
 			NCPVNDEBUG("use cached vnode");
 			return (0);
 		}
 		/* Stale entry: purge it and drop the reference we were given. */
 		cache_purge(vp);
 		if (vp != dvp)
 			vput(vp);
 		else
 			vrele(vp);
 		*vpp = NULLVP;
 	}
 	/* not in cache, so ...  */
 	error = 0;
 	*vpp = NULLVP;
 	fap = NULL;
 	if (flags & ISDOTDOT) {
 		if (NWCMPF(&dnp->n_parent, &nmp->n_rootent)) {
 			/* Parent is the root entry: no attributes are fetched. */
 			fid = nmp->n_rootent;
 			fap = NULL;
 			notfound = 0;
 		} else {
 			error = nwfs_lookupnp(nmp, dnp->n_parent, td, &npp);
 			if (error) {
 				return error;
 			}
 			fid = dnp->n_parent;
 			fap = &fattr;
 			/*np = *npp;*/
 			notfound = ncp_obtain_info(nmp, npp->n_dosfid,
 			    0, NULL, fap, td, cnp->cn_cred);
 		}
 	} else {
 		fap = &fattr;
 		notfound = ncp_lookup(dvp, cnp->cn_namelen, cnp->cn_nameptr,
 			fap, td, cnp->cn_cred);
 		/*
 		 * NOTE(review): fap->dirEntNum is read before notfound is
 		 * checked -- presumably harmless because fid is not used on
 		 * the failure paths below; confirm.
 		 */
 		fid.f_id = fap->dirEntNum;
 		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 			fid.f_parent = dnp->n_fid.f_parent;
 		} else
 			fid.f_parent = dnp->n_fid.f_id;
 		NCPVNDEBUG("call to ncp_lookup returned=%d\n", notfound);
 	}
 	/* Non-zero values below 0x80 are returned to the caller unchanged. */
 	if (notfound && notfound < 0x80 )
 		return (notfound);	/* hard error */
 	if (notfound) { /* entry not found */
 		/* Handle RENAME or CREATE case... */
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return ENOENT;
 	}/* else {
 		NCPVNDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum);
 	}*/
 	/* handle DELETE case ... */
 	if (nameiop == DELETE && islastcn) { 	/* delete last component */
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error) return (error);
 		if (NWCMPF(&dnp->n_fid, &fid)) {	/* we found ourselves */
 			VREF(dvp);
 			*vpp = dvp;
 			return 0;
 		}
 		error = nwfs_nget(mp, fid, fap, dvp, &vp);
 		if (error) return (error);
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;	/* I free it later */
 		return (0);
 	}
 	/* RENAME of the last component: the directory itself is rejected. */
 	if (nameiop == RENAME && islastcn) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error) return (error);
 		if (NWCMPF(&dnp->n_fid, &fid)) return EISDIR;
 		error = nwfs_nget(mp, fid, fap, dvp, &vp);
 		if (error) return (error);
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 	if (flags & ISDOTDOT) {
 		/*
 		 * dvp is unlocked while the parent vnode is obtained and is
 		 * relocked afterwards, whether or not nwfs_nget() succeeded.
 		 */
 		VOP_UNLOCK(dvp, 0, td);		/* race to get the inode */
 		error = nwfs_nget(mp, fid, NULL, NULL, &vp);
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error)
 			return (error);
 		*vpp = vp;
 	} else if (NWCMPF(&dnp->n_fid, &fid)) {
 		/* The component names dvp itself: return an extra reference. */
 		vref(dvp);
 		*vpp = dvp;
 	} else {
 		error = nwfs_nget(mp, fid, fap, dvp, &vp);
 		if (error) return (error);
 		*vpp = vp;
 		NCPVNDEBUG("lookup: getnewvp!\n");
 	}
 	if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) {
 		/* Record the ctime that the cache-hit path above compares against. */
 		VTONW(*vpp)->n_ctime = VTONW(*vpp)->n_vattr.va_ctime.tv_sec;
 		cache_enter(dvp, *vpp, cnp);
 	}
 	return (0);
 }
Index: head/sys/fs/portalfs/portal_vfsops.c
===================================================================
--- head/sys/fs/portalfs/portal_vfsops.c	(revision 175201)
+++ head/sys/fs/portalfs/portal_vfsops.c	(revision 175202)
@@ -1,258 +1,258 @@
 /*-
  * Copyright (c) 1992, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)portal_vfsops.c	8.11 (Berkeley) 5/14/95
  *
  * $FreeBSD$
  */
 
 /*
  * Portal Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/vnode.h>
 
 #include <fs/portalfs/portal.h>
 
 static MALLOC_DEFINE(M_PORTALFSMNT, "portal_mount", "PORTAL mount structure");
 
 static vfs_unmount_t	portal_unmount;
 static vfs_root_t	portal_root;
 static vfs_statfs_t	portal_statfs;
 
 /*
  * NULL-terminated list of option names passed to vfs_filteropt() in
  * portal_mount(); portal_cmount() supplies both "socket" and "config".
  */
 static const char *portal_opts[] = {
 	"socket", "config",
 	NULL
 };
 
 static int
 portal_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
 {
 	struct portal_args args;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 	error = copyin(data, &args, sizeof args);
 	if (error)
 		return (error);
 
 	ma = mount_argf(ma, "socket", "%d", args.pa_socket);
 	ma = mount_argsu(ma, "config", args.pa_config, MAXPATHLEN);
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /*
  * Mount a portal filesystem.
  *
  * The "socket" option names a descriptor in the calling thread that must
  * refer to an AF_UNIX socket; the "config" option is recorded as the
  * mounted-from name.  A root vnode and a portalmount structure are
  * allocated, and an extra reference on the socket's file is kept in
  * pm_server.
  *
  * Returns 0 on success, EINVAL for bad options, ENOTSOCK or
  * ESOCKTNOSUPPORT for an unsuitable descriptor, or the error from
  * fget()/getnewvnode()/insmntque().
  */
 static int
 portal_mount(struct mount *mp, struct thread *td)
 {
 	struct file *fp;
 	struct portalmount *fmp;
 	struct socket *so;
 	struct vnode *rvp;
 	struct portalnode *pn;
 	int error, v;
 	char *p;
 
 	if (vfs_filteropt(mp->mnt_optnew, portal_opts))
 		return (EINVAL);
 
 	/* Anything other than a result of 1 from vfs_scanopt() is rejected. */
 	error = vfs_scanopt(mp->mnt_optnew, "socket", "%d", &v);
 	if (error != 1)
 		return (EINVAL);
 	error = vfs_getopt(mp->mnt_optnew, "config", (void **)&p, NULL);
 	if (error)
 		return (error);
 
 	/* From here on every exit must fdrop() the reference taken by fget(). */
 	if ((error = fget(td, v, &fp)) != 0)
 		return (error);
         if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, td);
                 return(ENOTSOCK);
 	}
 	so = fp->f_data;	/* XXX race against userland */
 	if (so->so_proto->pr_domain->dom_family != AF_UNIX) {
 		fdrop(fp, td);
 		return (ESOCKTNOSUPPORT);
 	}
 
 	MALLOC(pn, struct portalnode *, sizeof(struct portalnode),
 		M_TEMP, M_WAITOK);
 
 	MALLOC(fmp, struct portalmount *, sizeof(struct portalmount),
 		M_PORTALFSMNT, M_WAITOK);	/* XXX */
 
 	error = getnewvnode("portal", mp, &portal_vnodeops, &rvp); /* XXX */
 	if (error) {
 		FREE(fmp, M_PORTALFSMNT);
 		FREE(pn, M_TEMP);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): rvp is not released on this failure path --
 	 * presumably insmntque() disposes of the vnode itself; confirm.
 	 */
 	error = insmntque(rvp, mp);	/* XXX: Too early for mpsafe fs */
 	if (error != 0) {
 		FREE(fmp, M_PORTALFSMNT);
 		FREE(pn, M_TEMP);
 		fdrop(fp, td);
 		return (error);
 	}
 	/* Set up the root directory node. */
 	rvp->v_data = pn;
 	rvp->v_type = VDIR;
 	rvp->v_vflag |= VV_ROOT;
 	VTOPORTAL(rvp)->pt_arg = 0;
 	VTOPORTAL(rvp)->pt_size = 0;
 	VTOPORTAL(rvp)->pt_fileid = PORTAL_ROOTFILEID;
 	fmp->pm_root = rvp;
 	/* The mount keeps its own reference on the server socket's file. */
 	fhold(fp);
 	fmp->pm_server = fp;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	mp->mnt_data =  fmp;
 	vfs_getnewfsid(mp);
 
 	vfs_mountedfrom(mp, p);
 	/* Drop the fget() reference; the fhold() reference stays in pm_server. */
 	fdrop(fp, td);
 	return (0);
 }
 
 static int
 portal_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	int error, flags = 0;
 
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	/*
 	 * Clear out buffer cache.  I don't think we
 	 * ever get anything cached at this level at the
 	 * moment, but who knows...
 	 */
 #ifdef notyet
 	mntflushbuf(mp, 0);
 	if (mntinvalbuf(mp, 1))
 		return (EBUSY);
 #endif
 	/* There is 1 extra root vnode reference (pm_root). */
 	error = vflush(mp, 1, flags, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Shutdown the socket.  This will cause the select in the
 	 * daemon to wake up, and then the accept will get ECONNABORTED
 	 * which it interprets as a request to go and bury itself.
 	 */
 	soshutdown(VFSTOPORTAL(mp)->pm_server->f_data, 2);
 	/*
 	 * Discard reference to underlying file.  Must call closef because
 	 * this may be the last reference.
 	 */
 	closef(VFSTOPORTAL(mp)->pm_server, (struct thread *) 0);
 	/*
 	 * Finally, throw away the portalmount structure
 	 */
 	free(mp->mnt_data, M_PORTALFSMNT);	/* XXX */
 	mp->mnt_data = 0;
 	return (0);
 }
 
 /*
  * Return the root vnode saved in the portalmount (pm_root) through *vpp,
  * with an added reference and exclusively locked.  Always returns 0;
  * `flags' is not consulted.
  */
 static int
 portal_root(mp, flags, vpp, td)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct vnode *vp;
 
 	/*
 	 * Return locked reference to root.
 	 */
 	vp = VFSTOPORTAL(mp)->pm_root;
 	VREF(vp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Fill in fixed filesystem statistics for a portal mount: two DEV_BSIZE
  * blocks with none free, and a single file with none free.  Always
  * returns 0; `mp' and `td' are not consulted.
  */
 static int
 portal_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 
 	sbp->f_flags = 0;
 
 	/* Block geometry and usage. */
 	sbp->f_bsize = DEV_BSIZE;
 	sbp->f_iosize = DEV_BSIZE;
 	sbp->f_blocks = 2;		/* 1K to keep df happy */
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 
 	/* File counts. */
 	sbp->f_files = 1;		/* Allow for "." */
 	sbp->f_ffree = 0;
 
 	return (0);
 }
 
 /*
  * VFS operations vector for portalfs; entries not listed here are left
  * unset in this initializer.
  */
 static struct vfsops portal_vfsops = {
 	.vfs_cmount =		portal_cmount,
 	.vfs_mount =		portal_mount,
 	.vfs_root =		portal_root,
 	.vfs_statfs =		portal_statfs,
 	.vfs_unmount =		portal_unmount,
 };
 
 /* Register the filesystem under the name "portalfs", flagged VFCF_SYNTHETIC. */
 VFS_SET(portal_vfsops, portalfs, VFCF_SYNTHETIC);
Index: head/sys/fs/portalfs/portal_vnops.c
===================================================================
--- head/sys/fs/portalfs/portal_vnops.c	(revision 175201)
+++ head/sys/fs/portalfs/portal_vnops.c	(revision 175202)
@@ -1,567 +1,566 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)portal_vnops.c	8.14 (Berkeley) 5/21/95
  *
  * $FreeBSD$
  */
 
 /*
  * Portal Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/systm.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/vnode.h>
 
 #include <fs/portalfs/portal.h>
 
 static int portal_fileid = PORTAL_ROOTFILEID+1;
 
 static void	portal_closefd(struct thread *td, int fd);
 static int	portal_connect(struct socket *so, struct socket *so2);
 static vop_getattr_t	portal_getattr;
 static vop_lookup_t	portal_lookup;
 static vop_open_t	portal_open;
 static vop_readdir_t	portal_readdir;
 static vop_reclaim_t	portal_reclaim;
 static vop_setattr_t	portal_setattr;
 
 /*
  * Close descriptor `fd' in thread `td' via kern_close().  A failure is
  * only logged: it should never happen and there is nothing useful the
  * caller could do about it.
  */
 static void
 portal_closefd(struct thread *td, int fd)
 {
 	int rv;
 
 	rv = kern_close(td, fd);
 	if (rv != 0)
 		printf("portal_closefd: error = %d\n", rv);
 }
 
 /*
  * vp is the current namei directory
  * cnp is the name to locate in that directory...
  *
  * VOP_LOOKUP for portalfs.  "." returns the directory itself with an extra
  * reference.  Any other name creates a fresh VREG vnode whose portalnode
  * stores the whole remaining pathname (pt_arg/pt_size); the rest of the
  * path is consumed via cn_consume.  DELETE and RENAME are refused with
  * EROFS.  On success *a_vpp holds the new vnode, exclusively locked.
  */
 static int
 portal_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode * a_dvp;
 		struct vnode ** a_vpp;
 		struct componentname * a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	char *pname = cnp->cn_nameptr;
-	struct thread *td = cnp->cn_thread;
 	struct portalnode *pt;
 	int error;
 	struct vnode *fvp = 0;
 	char *path;
 	int size;
 
 	*vpp = NULLVP;
 
 	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
 		return (EROFS);
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 	KASSERT((cnp->cn_flags & ISDOTDOT) == 0,
 	    ("portal_lookup: Can not handle dotdot lookups."));
 
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced
 	 * elsewhere if MALLOC should block.
 	 */
 	MALLOC(pt, struct portalnode *, sizeof(struct portalnode),
 		M_TEMP, M_WAITOK);
 
 	error = getnewvnode("portal", dvp->v_mount, &portal_vnodeops, &fvp);
 	if (error) {
 		FREE(pt, M_TEMP);
 		goto bad;
 	}
 	fvp->v_type = VREG;
 	fvp->v_data = pt;
 	/*
 	 * Save all of the remaining pathname and
 	 * advance the namei next pointer to the end
 	 * of the string.
 	 */
 	for (size = 0, path = pname; *path; path++)
 		size++;
 	cnp->cn_consume = size - cnp->cn_namelen;
 
 	/* pt_size counts the terminating NUL, which bcopy() copies as well. */
 	pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK);
 	pt->pt_size = size+1;
 	bcopy(pname, pt->pt_arg, pt->pt_size);
 	pt->pt_fileid = portal_fileid++;
 
 	*vpp = fvp;
-	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * NOTE(review): on insmntque() failure neither fvp nor pt/pt_arg is
 	 * released here -- presumably insmntque() disposes of the vnode;
 	 * confirm that the portalnode is not leaked.
 	 */
 	error = insmntque(fvp, dvp->v_mount);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	return (0);
 
 bad:;
 	if (fvp)
 		vrele(fvp);
 	return (error);
 }
 
 /*
  * Connect `so' to the listening socket `so2' without going through namei
  * (modelled on unp_connect).  A new connection socket is created on so2,
  * so2's bound address (if any) is duplicated onto it, and the pair is
  * joined with uipc_connect2().
  *
  * Returns ECONNREFUSED when so2 is missing, not accepting connections, or
  * no new connection could be created; EPROTOTYPE when the socket types
  * differ; otherwise the result of uipc_connect2().
  */
 static int
 portal_connect(struct socket *so, struct socket *so2)
 {
 	struct socket *newso;
 	struct unpcb *srvpcb;
 	struct unpcb *newpcb;
 
 	if (so2 == 0)
 		return (ECONNREFUSED);
 	if (so->so_type != so2->so_type)
 		return (EPROTOTYPE);
 	if ((so2->so_options & SO_ACCEPTCONN) == 0)
 		return (ECONNREFUSED);
 
 	newso = sonewconn(so2, 0);
 	if (newso == 0)
 		return (ECONNREFUSED);
 
 	srvpcb = sotounpcb(so2);
 	newpcb = sotounpcb(newso);
 	if (srvpcb->unp_addr)
 		newpcb->unp_addr = (struct sockaddr_un *)sodupsockaddr(
 		    (struct sockaddr *)srvpcb->unp_addr, M_NOWAIT);
 
 	return (uipc_connect2(so, newso));
 }
 
 /*
  * VOP_OPEN for portalfs.
  *
  * Opening the root vnode is a no-op.  For any other node a new AF_UNIX
  * stream socket is connected to the server socket saved in the mount, the
  * caller's credentials and the saved pathname are sent, and the reply is
  * read: an int error code in the data and/or a control message carrying
  * file descriptors.  On success the first received descriptor is stored in
  * td->td_dupfd and ENXIO is returned (see the comment near the end);
  * otherwise an errno value is returned.  The temporary socket and the
  * control mbuf are released on every path that reaches "bad".
  */
 static int
 portal_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct socket *so = 0;
 	struct portalnode *pt;
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct uio auio;
 	struct iovec aiov[2];
 	int res;
 	struct mbuf *cm = 0;
 	struct cmsghdr *cmsg;
 	int newfds;
 	int *ip;
 	int fd;
 	int error;
 	int len;
 	struct portalmount *fmp;
 	struct file *fp;
 	struct portal_cred pcred;
 
 	/*
 	 * Nothing to do when opening the root node.
 	 */
 	if (vp->v_vflag & VV_ROOT)
 		return (0);
 
 	/*
 	 * Can't be opened unless the caller is set up
 	 * to deal with the side effects.  Check for this
 	 * by testing whether td_dupfd has been set.
 	 */
 	if (td->td_dupfd >= 0)
 		return (ENODEV);
 
 	pt = VTOPORTAL(vp);
 	fmp = VFSTOPORTAL(vp->v_mount);
 
 	/*
 	 * Create a new socket.
 	 */
 	error = socreate(AF_UNIX, &so, SOCK_STREAM, 0, ap->a_td->td_ucred,
 	    ap->a_td);
 	if (error)
 		goto bad;
 
 	/*
 	 * Reserve some buffer space
 	 */
 	res = pt->pt_size + sizeof(pcred) + 512;	/* XXX */
 	error = soreserve(so, res, res);
 	if (error)
 		goto bad;
 
 	/*
 	 * Kick off connection
 	 */
 	error = portal_connect(so, fmp->pm_server->f_data);
 	if (error)
 		goto bad;
 
 	/*
 	 * Wait for connection to complete
 	 */
 	/*
 	 * XXX: Since the mount point is holding a reference on the
 	 * underlying server socket, it is not easy to find out whether
 	 * the server process is still running.  To handle this problem
 	 * we loop waiting for the new socket to be connected (something
 	 * which will only happen if the server is still running) or for
 	 * the reference count on the server socket to drop to 1, which
 	 * will happen if the server dies.  Sleep for 5 second intervals
 	 * and keep polling the reference count.   XXX.
 	 */
 	/* XXXRW: Locking? */
 	SOCK_LOCK(so);
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		if (fmp->pm_server->f_count == 1) {
 			SOCK_UNLOCK(so);
 			error = ECONNREFUSED;
 			goto bad;
 		}
 		(void) msleep((caddr_t) &so->so_timeo, SOCK_MTX(so), PSOCK,
 		    "portalcon", 5 * hz);
 	}
 	SOCK_UNLOCK(so);
 
 	if (so->so_error) {
 		error = so->so_error;
 		goto bad;
 	}
 
 	/*
 	 * Set miscellaneous flags
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_timeo = 0;
 	so->so_rcv.sb_flags |= SB_NOINTR;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_timeo = 0;
 	so->so_snd.sb_flags |= SB_NOINTR;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 
 	/*
 	 * Build the request: a portal_cred header (open mode, uid, groups)
 	 * followed by the pathname saved at lookup time.
 	 */
 	pcred.pcr_flag = ap->a_mode;
 	pcred.pcr_uid = ap->a_cred->cr_uid;
 	pcred.pcr_ngroups = ap->a_cred->cr_ngroups;
 	bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t));
 	aiov[0].iov_base = (caddr_t) &pcred;
 	aiov[0].iov_len = sizeof(pcred);
 	aiov[1].iov_base = pt->pt_arg;
 	aiov[1].iov_len = pt->pt_size;
 	auio.uio_iov = aiov;
 	auio.uio_iovcnt = 2;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len;
 
 	error = sosend(so, (struct sockaddr *) 0, &auio,
 			(struct mbuf *) 0, (struct mbuf *) 0, 0 , td);
 	if (error)
 		goto bad;
 
 	/*
 	 * Read the reply.  The loop repeats while no control message has
 	 * arrived, no data has been consumed and no error is pending.
 	 */
 	len = auio.uio_resid = sizeof(int);
 	do {
 		struct mbuf *m = 0;
 		int flags = MSG_WAITALL;
 		error = soreceive(so, (struct sockaddr **) 0, &auio,
 					&m, &cm, &flags);
 		if (error)
 			goto bad;
 
 		/*
 		 * Grab an error code from the mbuf.
 		 */
 		if (m) {
 			m = m_pullup(m, sizeof(int));	/* Needed? */
 			if (m) {
 				error = *(mtod(m, int *));
 				m_freem(m);
 			} else {
 				error = EINVAL;
 			}
 		} else {
 			if (cm == 0) {
 				error = ECONNRESET;	 /* XXX */
 #ifdef notdef
 				break;
 #endif
 			}
 		}
 	} while (cm == 0 && auio.uio_resid == len && !error);
 
 	/*
 	 * NOTE(review): if the server sent an error code of 0 without a
 	 * control message, this exits with error == 0 -- confirm that is
 	 * the intended result.
 	 */
 	if (cm == 0)
 		goto bad;
 
 	if (auio.uio_resid) {
 		error = 0;
 #ifdef notdef
 		error = EMSGSIZE;
 		goto bad;
 #endif
 	}
 
 	/*
 	 * XXX: Break apart the control message, and retrieve the
 	 * received file descriptor.  Note that more than one descriptor
 	 * may have been received, or that the rights chain may have more
 	 * than a single mbuf in it.  What to do?
 	 */
 	cmsg = mtod(cm, struct cmsghdr *);
 	newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int);
 	if (newfds == 0) {
 		error = ECONNREFUSED;
 		goto bad;
 	}
 	/*
 	 * At this point the rights message consists of a control message
 	 * header, followed by a data region containing a vector of
 	 * integer file descriptors.  The fds were allocated by the action
 	 * of receiving the control message.
 	 */
 	ip = (int *) (cmsg + 1);
 	fd = *ip++;
 	if (newfds > 1) {
 		/*
 		 * Close extra fds.
 		 */
 		int i;
 		printf("portal_open: %d extra fds\n", newfds - 1);
 		for (i = 1; i < newfds; i++) {
 			portal_closefd(td, *ip);
 			ip++;
 		}
 	}
 
 	/*
 	 * Check that the mode the file is being opened for is a subset
 	 * of the mode of the existing descriptor.
 	 */
 	if ((error = fget(td, fd, &fp)) != 0)
 		goto bad;
 	if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 		fdrop(fp, td);
 		portal_closefd(td, fd);
 		error = EACCES;
 		goto bad;
 	}
 	fdrop(fp, td);
 
 	/*
 	 * Save the dup fd in the proc structure then return the
 	 * special error code (ENXIO) which causes magic things to
 	 * happen in vn_open.  The whole concept is, well, hmmm.
 	 */
 	td->td_dupfd = fd;
 	error = ENXIO;
 
 bad:;
 	/*
 	 * And discard the control message.
 	 */
 	if (cm) {
 		m_freem(cm);
 	}
 
 	/* The temporary socket is always torn down, on success as well. */
 	if (so) {
 		soshutdown(so, 2);
 		soclose(so);
 	}
 	return (error);
 }
 
 /*
  * VOP_GETATTR for portalfs: synthesize attributes for ap->a_vp into
  * ap->a_vap.  Ownership is root:wheel (0:0), sizes are DEV_BSIZE and all
  * timestamps are "now".  The root vnode is reported as a directory with
  * read/write/execute for everyone; every other vnode as a regular file
  * with read/write for everyone.  Always returns 0.
  */
 static int
 portal_getattr(struct vop_getattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 
 	bzero(vap, sizeof(*vap));
 	vattr_null(vap);
 
 	/* All three timestamps carry the current time. */
 	nanotime(&vap->va_atime);
 	vap->va_mtime = vap->va_atime;
 	vap->va_ctime = vap->va_mtime;
 
 	/* Attributes common to every portal vnode. */
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_size = DEV_BSIZE;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_bytes = 0;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = 0;
 
 	if ((vp->v_vflag & VV_ROOT) == 0) {
 		/* Ordinary portal node. */
 		vap->va_type = VREG;
 		vap->va_mode = S_IRUSR|S_IWUSR|
 				S_IRGRP|S_IWGRP|
 				S_IROTH|S_IWOTH;
 		vap->va_nlink = 1;
 		vap->va_fileid = VTOPORTAL(vp)->pt_fileid;
 		return (0);
 	}
 
 	/* Root directory. */
 	vap->va_type = VDIR;
 	vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR|
 			S_IRGRP|S_IWGRP|S_IXGRP|
 			S_IROTH|S_IWOTH|S_IXOTH;
 	vap->va_nlink = 2;
 	vap->va_fileid = 2;
 	return (0);
 }
 
 /*
  * VOP_SETATTR for portalfs.  Nothing is actually changed: the root vnode
  * is refused with EACCES, a request to set file flags is refused with
  * EOPNOTSUPP, and every other request reports success.
  */
 static int
 portal_setattr(struct vop_setattr_args *ap)
 {
 
 	/* Can't mess with the root vnode. */
 	if ((ap->a_vp->v_vflag & VV_ROOT) != 0)
 		return (EACCES);
 
 	return (ap->a_vap->va_flags == VNOVAL ? 0 : EOPNOTSUPP);
 }
 
 /*
  * VOP_READDIR for portalfs: the directory is always reported as empty
  * ("." and ".." are deliberately not emitted), so nothing is copied out
  * and 0 is returned.  Portal mounts cannot be exported and local callers
  * never ask for cookies, so a cookie request is treated as fatal.
  */
 static int
 portal_readdir(struct vop_readdir_args *ap)
 {
 
 	if (ap->a_ncookies != 0)
 		panic("portal_readdir: not hungry");
 
 	return (0);
 }
 
 static int
 portal_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct portalnode *pt = VTOPORTAL(ap->a_vp);
 
 	if (pt->pt_arg) {
 		free((caddr_t) pt->pt_arg, M_TEMP);
 		pt->pt_arg = 0;
 	}
 	FREE(ap->a_vp->v_data, M_TEMP);
 	ap->a_vp->v_data = 0;
 
 	return (0);
 }
 
 /*
  * Vnode operations vector for portalfs.  Operations not listed here fall
  * back to default_vnodeops; access checks are a no-op (VOP_NULL) and
  * pathconf uses the standard implementation.
  */
 struct vop_vector portal_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		portal_getattr,
 	.vop_lookup =		portal_lookup,
 	.vop_open =		portal_open,
 	.vop_pathconf =		vop_stdpathconf,
 	.vop_readdir =		portal_readdir,
 	.vop_reclaim =		portal_reclaim,
 	.vop_setattr =		portal_setattr,
 };
Index: head/sys/fs/procfs/procfs.c
===================================================================
--- head/sys/fs/procfs/procfs.c	(revision 175201)
+++ head/sys/fs/procfs/procfs.c	(revision 175202)
@@ -1,210 +1,210 @@
 /*-
  * Copyright (c) 2001 Dag-Erling Smørgrav
  * Copyright (c) 1993 Jan-Simon Pendry
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)procfs_vfsops.c	8.7 (Berkeley) 5/10/95
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/exec.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/procfs/procfs.h>
 
 /*
  * Filler function for proc/pid/file
  *
  * Writes the full path of the process's text vnode (p->p_textvp) into sb,
  * or "unknown" when vn_fullpath() does not supply one.  Returns the
  * vn_lock() error if the vnode cannot be locked, otherwise 0.
  */
 int
 procfs_doprocfile(PFS_FILL_ARGS)
 {
 	char *fullpath = "unknown";
 	char *freepath = NULL;
 	struct vnode *textvp;
 	int err;
 
 	textvp = p->p_textvp;
 	/*
 	 * Hold the vnode across the lock attempt; LK_INTERLOCK hands the
 	 * vnode interlock taken here over to vn_lock().
 	 */
 	VI_LOCK(textvp);
 	vholdl(textvp);
-	err = vn_lock(textvp, LK_EXCLUSIVE | LK_INTERLOCK, td);
+	err = vn_lock(textvp, LK_EXCLUSIVE | LK_INTERLOCK);
 	vdrop(textvp);
 	if (err)
 		return (err);
 	/* The result of vn_fullpath() is ignored; fullpath keeps its default on failure. */
 	vn_fullpath(td, textvp, &fullpath, &freepath);
 	VOP_UNLOCK(textvp, 0, td);
 	sbuf_printf(sb, "%s", fullpath);
 	if (freepath)
 		free(freepath, M_TEMP);
 	return (0);
 }
 
 /*
  * Filler function for proc/curproc: emits the pid of the process that
  * owns the calling thread, formatted as a decimal number.
  */
 int
 procfs_docurproc(PFS_FILL_ARGS)
 {
 	long pid;
 
 	pid = (long)td->td_proc->p_pid;
 	sbuf_printf(sb, "%ld", pid);
 	return (0);
 }
 
 /*
  * Attribute callback: adjust the mode of nodes that need it and set the
  * owner to the target process's credentials.
  *
  * "ctl", "note" and "notepg" become write-only for the owner (0200);
  * "mem", "regs", "dbregs" and "fpregs" become owner read/write (0600).
  * For a P_SUGID process every node except the process directory gets
  * mode 0.  Must be called with the process locked.  Always returns 0.
  */
 int
 procfs_attr(PFS_ATTR_ARGS)
 {
 	const char *name;
 	int wronly, rdwr;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* XXX inefficient, split into separate functions */
 	name = pn->pn_name;
 	wronly = strcmp(name, "ctl") == 0 ||
 	    strcmp(name, "note") == 0 ||
 	    strcmp(name, "notepg") == 0;
 	rdwr = !wronly &&
 	    (strcmp(name, "mem") == 0 ||
 	    strcmp(name, "regs") == 0 ||
 	    strcmp(name, "dbregs") == 0 ||
 	    strcmp(name, "fpregs") == 0);
 
 	if (wronly)
 		vap->va_mode = 0200;
 	else if (rdwr)
 		vap->va_mode = 0600;
 
 	/* Hide everything but the directory itself for set-id processes. */
 	if ((p->p_flag & P_SUGID) != 0 && pn->pn_type != pfstype_procdir)
 		vap->va_mode = 0;
 
 	vap->va_uid = p->p_ucred->cr_uid;
 	vap->va_gid = p->p_ucred->cr_gid;
 
 	return (0);
 }
 
 /*
  * Visibility callback: returns 1 for ordinary processes and 0 for system
  * processes (P_SYSTEM), so the node only exists for the former.
  * Non-static because linprocfs uses it.  The process must be locked.
  */
 int
 procfs_notsystem(PFS_VIS_ARGS)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_SYSTEM) != 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Visibility callback: the node is visible (1) only when the target is
  * not a system process and p_candebug() allows the calling thread to
  * debug it; otherwise 0.  The process must be locked.
  */
 int
 procfs_candebug(PFS_VIS_ARGS)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* System processes are never visible; p_candebug() is not consulted. */
 	if ((p->p_flag & P_SYSTEM) != 0)
 		return (0);
 	return (p_candebug(td, p) == 0);
 }
 
 /*
  * Constructor
  *
  * Populates the pseudofs tree under pi->pi_root: a "curproc" link at the
  * top level and a per-process "pid" directory (PFS_PROCDEP) holding the
  * individual process files.  Always returns 0; the return values of the
  * pfs_create_*() calls are not checked, except that the "mem" node is
  * kept so its ioctl and close handlers can be set.
  */
 static int
 procfs_init(PFS_INIT_ARGS)
 {
 	struct pfs_node *root;
 	struct pfs_node *dir;
 	struct pfs_node *node;
 
 	root = pi->pi_root;
 
 	pfs_create_link(root, "curproc", procfs_docurproc,
 	    NULL, NULL, NULL, 0);
 
 	dir = pfs_create_dir(root, "pid",
 	    procfs_attr, NULL, NULL, PFS_PROCDEP);
 	pfs_create_file(dir, "cmdline", procfs_doproccmdline,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(dir, "ctl", procfs_doprocctl,
 	    procfs_attr, NULL, NULL, PFS_WR);
 	pfs_create_file(dir, "dbregs", procfs_doprocdbregs,
 	    procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
 	pfs_create_file(dir, "etype", procfs_doproctype,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(dir, "fpregs", procfs_doprocfpregs,
 	    procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
 	pfs_create_file(dir, "map", procfs_doprocmap,
 	    NULL, procfs_notsystem, NULL, PFS_RD);
 	/* "mem" additionally gets ioctl and close handlers. */
 	node = pfs_create_file(dir, "mem", procfs_doprocmem,
 	    procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
 	node->pn_ioctl = procfs_ioctl;
 	node->pn_close = procfs_close;
 	/* "note" and "notepg" share the same filler. */
 	pfs_create_file(dir, "note", procfs_doprocnote,
 	    procfs_attr, procfs_candebug, NULL, PFS_WR);
 	pfs_create_file(dir, "notepg", procfs_doprocnote,
 	    procfs_attr, procfs_candebug, NULL, PFS_WR);
 	pfs_create_file(dir, "regs", procfs_doprocregs,
 	    procfs_attr, procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
 	pfs_create_file(dir, "rlimit", procfs_doprocrlimit,
 	    NULL, NULL, NULL, PFS_RD);
 	pfs_create_file(dir, "status", procfs_doprocstatus,
 	    NULL, NULL, NULL, PFS_RD);
 
 	pfs_create_link(dir, "file", procfs_doprocfile,
 	    NULL, procfs_notsystem, NULL, 0);
 
 	return (0);
 }
 
 /*
  * Destructor
  *
  * Counterpart of procfs_init().  No explicit teardown is done here; the
  * function always returns 0.
  */
 static int
 procfs_uninit(PFS_INIT_ARGS)
 {
 	/* nothing to do, pseudofs will GC */
 	return (0);
 }
 
 /* Declare the pseudofs-based filesystem "procfs" (second argument: 1). */
 PSEUDOFS(procfs, 1);
Index: head/sys/fs/procfs/procfs_map.c
===================================================================
--- head/sys/fs/procfs/procfs_map.c	(revision 175201)
+++ head/sys/fs/procfs/procfs_map.c	(revision 175202)
@@ -1,244 +1,244 @@
 /*-
  * Copyright (c) 1993 Jan-Simon Pendry
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)procfs_status.c	8.3 (Berkeley) 2/17/94
  *
  * $FreeBSD$
  */
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/filedesc.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/procfs/procfs.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef COMPAT_IA32
 #include <sys/procfs.h>
 #include <machine/fpu.h>
 #include <compat/ia32/ia32_reg.h>
 
 extern struct sysentvec ia32_freebsd_sysvec;
 #endif
 
 
 #define MEBUFFERSIZE 256
 
 /*
  * The map entries can *almost* be read with programs like cat.  However,
  * large maps need special programs to read.  It is not easy to implement
  * a program that can sense the required size of the buffer, and then
  * subsequently do a read with the appropriate size.  This operation cannot
  * be atomic.  The best that we can do is to allow the program to do a read
  * with an arbitrarily large buffer, and return as much as we can.  We can
  * return an error code if the buffer is too small (EFBIG), then the program
  * can try a bigger buffer.
  *
  * Summary of the behaviour implemented below:
  *  - the caller must pass p_candebug() against the target process;
  *  - only reads are accepted (EOPNOTSUPP otherwise), and a read at a
  *    non-zero offset returns 0 without transferring anything;
  *  - one text line per map entry (sub-maps are skipped) is formatted into
  *    a MEBUFFERSIZE-byte stack buffer and copied out with uiomove();
  *  - EFBIG is returned as soon as a formatted line is longer than the
  *    space left in the uio.
  */
 int
 procfs_doprocmap(PFS_FILL_ARGS)
 {
 	int len;
 	int error, vfslocked;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry, tmp_entry;
 	struct vnode *vp;
 	char mebuffer[MEBUFFERSIZE];
 	char *fullpath, *freepath;
 	unsigned int last_timestamp;
 #ifdef COMPAT_IA32
 	int wrap32 = 0;
 #endif
 
 	/* the requesting thread must be allowed to debug the target */
 	PROC_LOCK(p);
 	error = p_candebug(td, p);
 	PROC_UNLOCK(p);
 	if (error)
 		return (error);
 
 	if (uio->uio_rw != UIO_READ)
 		return (EOPNOTSUPP);
 
 	/* the map is only produced for a read that starts at offset 0 */
 	if (uio->uio_offset != 0)
 		return (0);
 
 #ifdef COMPAT_IA32
 	/*
 	 * A caller running under the ia32 sysvec may only inspect targets
 	 * that use the same sysvec; wrap32 makes the object pointer print
 	 * as NULL further down.
 	 */
         if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
                 if (p->p_sysent != &ia32_freebsd_sysvec)
                         return (EOPNOTSUPP);
                 wrap32 = 1;
         }
 #endif
 
 	vm_map_lock_read(map);
 	for (entry = map->header.next;
 		((uio->uio_resid > 0) && (entry != &map->header));
 		entry = entry->next) {
 		vm_object_t obj, tobj, lobj;
 		int ref_count, shadow_count, flags;
 		vm_offset_t addr;
 		int resident, privateresident;
 		char *type;
 
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		/*
 		 * obj is locked here and stays locked until its counters
 		 * have been sampled below.  privateresident is only taken
 		 * from the object when its shadow_count is exactly 1.
 		 */
 		privateresident = 0;
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			VM_OBJECT_LOCK(obj);
 			if (obj->shadow_count == 1)
 				privateresident = obj->resident_page_count;
 		}
 
 		/*
 		 * Count the pages of this entry for which pmap_extract()
 		 * returns non-zero.  addr is left at or past entry->end and
 		 * is reused for the re-lookup at the bottom of the loop.
 		 */
 		resident = 0;
 		addr = entry->start;
 		while (addr < entry->end) {
 			if (pmap_extract(map->pmap, addr))
 				resident++;
 			addr += PAGE_SIZE;
 		}
 
 		/*
 		 * Walk the backing_object chain to its last object (lobj).
 		 * Each successor is locked before the previous intermediate
 		 * object is unlocked; obj itself is never unlocked here.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_LOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 			lobj = tobj;
 		}
 
 		freepath = NULL;
 		fullpath = "-";
 		if (lobj) {
 			/* the reported type is that of the last object in the chain */
 			switch(lobj->type) {
 			default:
 			case OBJT_DEFAULT:
 				type = "default";
 				vp = NULL;
 				break;
 			case OBJT_VNODE:
 				type = "vnode";
 				vp = lobj->handle;
 				/* reference the vnode; released by vput() below */
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				type = "swap";
 				vp = NULL;
 				break;
 			case OBJT_DEVICE:
 				type = "device";
 				vp = NULL;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 
 			/* flags and counts come from the entry's own object */
 			flags = obj->flags;
 			ref_count = obj->ref_count;
 			shadow_count = obj->shadow_count;
 			VM_OBJECT_UNLOCK(obj);
 			if (vp != NULL) {
 				/*
 				 * Resolve the vnode's path.  If vn_fullpath()
 				 * does not update fullpath, "-" is printed.
 				 */
 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				vn_fullpath(td, vp, &fullpath, &freepath);
 				vput(vp);
 				VFS_UNLOCK_GIANT(vfslocked);
 			}
 		} else {
 			type = "none";
 			flags = 0;
 			ref_count = 0;
 			shadow_count = 0;
 		}
 
 		/*
 		 * format:
 		 *  start, end, resident, private resident, object pointer,
 		 *  rwx protection, ref_count, shadow_count, object flags,
 		 *  COW/NCOW, NC/NNC, object type, path.
 		 */
 		snprintf(mebuffer, sizeof mebuffer,
 		    "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s\n",
 			(u_long)entry->start, (u_long)entry->end,
 			resident, privateresident,
 #ifdef COMPAT_IA32
 			wrap32 ? NULL : obj,	/* Hide 64 bit value */
 #else
 			obj,
 #endif
 			(entry->protection & VM_PROT_READ)?"r":"-",
 			(entry->protection & VM_PROT_WRITE)?"w":"-",
 			(entry->protection & VM_PROT_EXECUTE)?"x":"-",
 			ref_count, shadow_count, flags,
 			(entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW",
 			(entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC",
 			type, fullpath);
 
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/* refuse to emit a partial line */
 		len = strlen(mebuffer);
 		if (len > uio->uio_resid) {
 			error = EFBIG;
 			break;
 		}
 		/*
 		 * The map lock is dropped around uiomove() and retaken
 		 * afterwards; the timestamp is sampled first so that a
 		 * concurrent modification can be detected.
 		 * NOTE(review): the "+ 1" presumably accounts for the
 		 * timestamp bump caused by our own unlock/relock -- confirm.
 		 */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		error = uiomove(mebuffer, len, uio);
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		if (last_timestamp + 1 != map->timestamp) {
 			/*
 			 * Look again for the entry because the map was
 			 * modified while it was unlocked.  Specifically,
 			 * the entry may have been clipped, merged, or deleted.
 			 * addr - 1 is an address inside the range that was
 			 * just reported.
 			 */
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	return (error);
 }
Index: head/sys/fs/pseudofs/pseudofs_vncache.c
===================================================================
--- head/sys/fs/pseudofs/pseudofs_vncache.c	(revision 175201)
+++ head/sys/fs/pseudofs/pseudofs_vncache.c	(revision 175202)
@@ -1,308 +1,308 @@
 /*-
  * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_pseudofs.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/pseudofs/pseudofs_internal.h>
 
 /* malloc type used for the struct pfs_vdata entries allocated in this file */
 static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache");
 
 /*
  * pfs_vncache is the head of a doubly-linked list (pvd_next / pvd_prev) of
  * cache entries.  pfs_vncache_mutex is held while that list is walked or
  * modified and while pfs_vncache_entries / pfs_vncache_maxentries change.
  */
 static struct mtx pfs_vncache_mutex;
 static struct pfs_vdata *pfs_vncache;
 /* tag of the process_exit handler, kept so it can be deregistered on unload */
 static eventhandler_tag pfs_exit_tag;
 static void pfs_exit(void *arg, struct proc *p);
 
 SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW, 0,
     "pseudofs vnode cache");
 
 /* current number of cache entries */
 static int pfs_vncache_entries;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD,
     &pfs_vncache_entries, 0,
     "number of entries in the vnode cache");
 
 /* high-water mark of pfs_vncache_entries */
 static int pfs_vncache_maxentries;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD,
     &pfs_vncache_maxentries, 0,
     "highest number of entries in the vnode cache");
 
 /* hit / miss counters; incremented without holding pfs_vncache_mutex */
 static int pfs_vncache_hits;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD,
     &pfs_vncache_hits, 0,
     "number of cache hits since initialization");
 
 static int pfs_vncache_misses;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD,
     &pfs_vncache_misses, 0,
     "number of cache misses since initialization");
 
 extern struct vop_vector pfs_vnodeops;	/* XXX -> .h file */
 
 /*
  * Initialize vnode cache
  *
  * Must be called with Giant held.  Sets up the cache mutex and registers
  * pfs_exit() as a process_exit event handler, remembering the tag in
  * pfs_exit_tag for later deregistration.
  */
 void
 pfs_vncache_load(void)
 {
 
 	mtx_assert(&Giant, MA_OWNED);
 	mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF);
 	pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Tear down vnode cache
  *
  * Must be called with Giant held.  Deregisters the process_exit handler
  * and destroys the cache mutex.  The cache is expected to be empty by
  * now; this is only checked by the KASSERT below.
  */
 void
 pfs_vncache_unload(void)
 {
 
 	mtx_assert(&Giant, MA_OWNED);
 	EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag);
 	KASSERT(pfs_vncache_entries == 0,
 	    ("%d vncache entries remaining", pfs_vncache_entries));
 	mtx_destroy(&pfs_vncache_mutex);
 }
 
 /*
  * Allocate a vnode
  *
  * Returns in *vpp a vnode for the (mp, pn, pid) triple.  An existing
  * cache entry is reused when vget() on its vnode succeeds; otherwise a
  * new vnode and cache entry are created and linked at the head of the
  * cache list.  Both paths request an exclusive vnode lock (vget() on a
  * hit, vn_lock() on a miss).
  *
  * Returns 0 on success, or the error from getnewvnode() / insmntque().
  */
 int
 pfs_vncache_alloc(struct mount *mp, struct vnode **vpp,
 		  struct pfs_node *pn, pid_t pid)
 {
 	struct pfs_vdata *pvd;
 	struct vnode *vp;
 	int error;
 
 	/*
 	 * See if the vnode is in the cache.
 	 * XXX linear search is not very efficient.
 	 */
 retry:
 	mtx_lock(&pfs_vncache_mutex);
 	for (pvd = pfs_vncache; pvd; pvd = pvd->pvd_next) {
 		if (pvd->pvd_pn == pn && pvd->pvd_pid == pid &&
 		    pvd->pvd_vnode->v_mount == mp) {
 			vp = pvd->pvd_vnode;
 			/*
 			 * Take the vnode interlock before dropping the
 			 * cache mutex; vget() is passed LK_INTERLOCK.
 			 */
 			VI_LOCK(vp);
 			mtx_unlock(&pfs_vncache_mutex);
 			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
 				++pfs_vncache_hits;
 				*vpp = vp;
 				/*
 				 * Some callers cache_enter(vp) later, so
 				 * we have to make sure it's not in the
 				 * VFS cache so it doesn't get entered
 				 * twice.  A better solution would be to
 				 * make pfs_vncache_alloc() responsible
 				 * for entering the vnode in the VFS
 				 * cache.
 				 */
 				cache_purge(vp);
 				return (0);
 			}
 			/* vget() failed: rescan the list from the start */
 			goto retry;
 		}
 	}
 	mtx_unlock(&pfs_vncache_mutex);
 	++pfs_vncache_misses;
 
 	/* nope, get a new one */
 	MALLOC(pvd, struct pfs_vdata *, sizeof *pvd, M_PFSVNCACHE, M_WAITOK);
 	/* account for the new entry up front; undone on the error paths */
 	mtx_lock(&pfs_vncache_mutex);
 	if (++pfs_vncache_entries > pfs_vncache_maxentries)
 		pfs_vncache_maxentries = pfs_vncache_entries;
 	mtx_unlock(&pfs_vncache_mutex);
 	error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp);
 	if (error) {
 		mtx_lock(&pfs_vncache_mutex);
 		--pfs_vncache_entries;
 		mtx_unlock(&pfs_vncache_mutex);
 		FREE(pvd, M_PFSVNCACHE);
 		return (error);
 	}
 	pvd->pvd_pn = pn;
 	pvd->pvd_pid = pid;
 	(*vpp)->v_data = pvd;
 	/* map the pseudofs node type onto a vnode type */
 	switch (pn->pn_type) {
 	case pfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 #if 0
 		printf("root vnode allocated\n");
 #endif
 		/* fall through */
 	case pfstype_dir:
 	case pfstype_this:
 	case pfstype_parent:
 	case pfstype_procdir:
 		(*vpp)->v_type = VDIR;
 		break;
 	case pfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case pfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case pfstype_none:
 		/* falls into the panic below once the KASSERT is past */
 		KASSERT(0, ("pfs_vncache_alloc called for null node\n"));
 	default:
 		panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type);
 	}
 	/*
 	 * Propagate flag through to vnode so users know it can change
 	 * if the process changes (i.e. execve)
 	 */
 	if ((pn->pn_flags & PFS_PROCDEP) != 0)
 		(*vpp)->v_vflag |= VV_PROCDEP;
 	pvd->pvd_vnode = *vpp;
 	(*vpp)->v_vnlock->lk_flags |= LK_CANRECURSE;
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(*vpp, mp);
 	if (error != 0) {
 		/*
 		 * NOTE(review): only the cache entry is released here;
 		 * presumably insmntque() has already disposed of the vnode
 		 * on failure -- confirm.
 		 */
 		mtx_lock(&pfs_vncache_mutex);
 		--pfs_vncache_entries;
 		mtx_unlock(&pfs_vncache_mutex);
 		FREE(pvd, M_PFSVNCACHE);
 		*vpp = NULLVP;
 		return (error);
 	}
 	/* link the new entry at the head of the cache list */
 	mtx_lock(&pfs_vncache_mutex);
 	pvd->pvd_prev = NULL;
 	pvd->pvd_next = pfs_vncache;
 	if (pvd->pvd_next)
 		pvd->pvd_next->pvd_prev = pvd;
 	pfs_vncache = pvd;
 	mtx_unlock(&pfs_vncache_mutex);
 	return (0);
 }
 
 /*
  * Free a vnode
  *
  * Unlinks the cache entry hanging off vp->v_data from the cache list,
  * decrements the entry count, frees the entry and clears v_data.
  * Always returns 0.
  */
 int
 pfs_vncache_free(struct vnode *vp)
 {
 	struct pfs_vdata *entry, *before, *after;
 
 	mtx_lock(&pfs_vncache_mutex);
 	entry = (struct pfs_vdata *)vp->v_data;
 	KASSERT(entry != NULL, ("pfs_vncache_free(): no vnode data\n"));
 	before = entry->pvd_prev;
 	after = entry->pvd_next;
 	/* an entry without a predecessor is the list head */
 	if (before != NULL)
 		before->pvd_next = after;
 	else
 		pfs_vncache = after;
 	if (after != NULL)
 		after->pvd_prev = before;
 	pfs_vncache_entries--;
 	mtx_unlock(&pfs_vncache_mutex);
 
 	FREE(entry, M_PFSVNCACHE);
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Purge the cache of dead entries
  *
  * Entries are purged when they are marked pvd_dead or, if pn is not
  * NULL, when they belong to node pn.  With pn == NULL only dead entries
  * are purged.
  *
  * This is extremely inefficient due to the fact that vgone() not only
  * indirectly modifies the vnode cache, but may also sleep.  We can
  * neither hold pfs_vncache_mutex across a vgone() call, nor make any
  * assumptions about the state of the cache after vgone() returns.  In
  * consequence, we must start over after every vgone() call, and keep
  * trying until we manage to traverse the entire cache.
  *
  * The only way to improve this situation is to change the data structure
  * used to implement the cache.
  */
 void
 pfs_purge(struct pfs_node *pn)
 {
 	struct pfs_vdata *pvd;
 	struct vnode *vnp;
 
 	mtx_lock(&pfs_vncache_mutex);
 	pvd = pfs_vncache;
 	while (pvd != NULL) {
 		if (pvd->pvd_dead || (pn != NULL && pvd->pvd_pn == pn)) {
 			vnp = pvd->pvd_vnode;
 			/*
 			 * The vnode is held (vhold / vdrop) for the span in
 			 * which the cache mutex is dropped around vgone().
 			 */
 			vhold(vnp);
 			mtx_unlock(&pfs_vncache_mutex);
 			VOP_LOCK(vnp, LK_EXCLUSIVE, curthread);
 			vgone(vnp);
 			VOP_UNLOCK(vnp, 0, curthread);
 			vdrop(vnp);
 			mtx_lock(&pfs_vncache_mutex);
 			/* restart from the head, see the comment above */
 			pvd = pfs_vncache;
 		} else {
 			pvd = pvd->pvd_next;
 		}
 	}
 	mtx_unlock(&pfs_vncache_mutex);
 }
 
 /*
  * Free all vnodes associated with a defunct process
  *
  * process_exit event handler: marks every cache entry whose pid matches
  * the exiting process as dead and, if any was marked, purges the dead
  * entries.  Returns early, without taking any lock, when the cache list
  * is empty.
  *
  * XXXRW: It is unfortunate that pfs_exit() always acquires and releases two
  * mutexes (one of which is Giant) for every process exit, even if procfs
  * isn't mounted.
  */
 static void
 pfs_exit(void *arg, struct proc *p)
 {
 	struct pfs_vdata *entry;
 	int marked = 0;
 
 	if (pfs_vncache == NULL)
 		return;
 
 	mtx_lock(&Giant);
 	mtx_lock(&pfs_vncache_mutex);
 	entry = pfs_vncache;
 	while (entry != NULL) {
 		if (entry->pvd_pid == p->p_pid) {
 			entry->pvd_dead = 1;
 			marked = 1;
 		}
 		entry = entry->pvd_next;
 	}
 	mtx_unlock(&pfs_vncache_mutex);
 	if (marked != 0)
 		pfs_purge(NULL);
 	mtx_unlock(&Giant);
 }
Index: head/sys/fs/pseudofs/pseudofs_vnops.c
===================================================================
--- head/sys/fs/pseudofs/pseudofs_vnops.c	(revision 175201)
+++ head/sys/fs/pseudofs/pseudofs_vnops.c	(revision 175202)
@@ -1,892 +1,892 @@
 /*-
  * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_pseudofs.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/ctype.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/pseudofs/pseudofs_internal.h>
 
 /*
  * Returns the fileno, adjusted for target pid
  *
  * Without a pid (NO_PID) the node's own fileno is returned unchanged;
  * otherwise the result is pn_fileno * NO_PID + pid.
  */
 static uint32_t
 pn_fileno(struct pfs_node *pn, pid_t pid)
 {
 
 	KASSERT(pn->pn_fileno > 0,
 	    ("%s(): no fileno allocated", __func__));
 	if (pid == NO_PID)
 		return (pn->pn_fileno);
 	return (pn->pn_fileno * NO_PID + pid);
 }
 
 /*
  * Returns non-zero if given file is visible to given thread.
  *
  * A NULL proc is never visible.  Otherwise proc must be locked by the
  * caller, and the node is visible only if the process is not exiting
  * (P_WEXIT clear), td passes p_cansee() on it, and the node's optional
  * pn_vis handler returns non-zero.  The result is always 0 or 1.
  */
 static int
 pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc)
 {
 	int shown;
 
 	if (proc == NULL)
 		return (0);
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 
 	if ((proc->p_flag & P_WEXIT) != 0)
 		return (0);
 	if (p_cansee(td, proc) != 0)
 		return (0);
 	if (pn->pn_vis != NULL) {
 		shown = pn_vis(td, proc, pn);
 		if (!shown)
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * Returns non-zero if node pn, seen on behalf of process pid, is visible
  * to thread td.
  *
  * Nodes without a pid (NO_PID) are always visible.  If p is not NULL it
  * is set to NULL up front; when the node turns out to be visible for a
  * real pid, *p receives the process, which is left locked for the caller
  * to unlock.  In every other case the process is unlocked before
  * returning.
  */
 static int
 pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid, struct proc **p)
 {
 	struct proc *target;
 
 	PFS_TRACE(("%s (pid: %d, req: %d)",
 	    pn->pn_name, pid, td->td_proc->p_pid));
 
 	if (p != NULL)
 		*p = NULL;
 	if (pid == NO_PID)
 		PFS_RETURN (1);
 
 	target = pfind(pid);
 	if (target == NULL)
 		PFS_RETURN (0);
 
 	if (!pfs_visible_proc(td, pn, target)) {
 		PROC_UNLOCK(target);
 		PFS_RETURN (0);
 	}
 
 	/* visible: hand the locked process to the caller if it wants it */
 	if (p != NULL)
 		*p = target;
 	else
 		PROC_UNLOCK(target);
 	PFS_RETURN (1);
 }
 
 /*
  * Verify permissions
  */
 static int
 pfs_access(struct vop_access_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct vattr vattr;
 	int error;
 
 	PFS_TRACE(("%s", pvd->pvd_pn->pn_name));
 	(void)pvd;
 
 	error = VOP_GETATTR(vn, &vattr, va->a_cred, va->a_td);
 	if (error)
 		PFS_RETURN (error);
 	error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, va->a_mode, va->a_cred, NULL);
 	PFS_RETURN (error);
 }
 
 /*
  * Close a file or directory
  *
  * The node's pn_close handler is invoked only on the last close (vnode
  * reference count not above 1) and only if the node has such a handler.
  * For process-bound vnodes the process is looked up with pfind() and
  * passed to the handler, then unlocked; the handler may be handed a NULL
  * process if the lookup fails or the vnode has no pid.
  */
 static int
 pfs_close(struct vop_close_args *va)
 {
 	struct vnode *vp = va->a_vp;
 	struct pfs_vdata *vdata = vp->v_data;
 	struct pfs_node *node = vdata->pvd_pn;
 	struct proc *target;
 	int rc;
 
 	PFS_TRACE(("%s", node->pn_name));
 	pfs_assert_not_owned(node);
 
 	/* not the last close, or nothing to call */
 	if (vrefcnt(vp) > 1 || node->pn_close == NULL)
 		PFS_RETURN (0);
 
 	target = (vdata->pvd_pid != NO_PID) ? pfind(vdata->pvd_pid) : NULL;
 
 	rc = pn_close(va->a_td, target, node);
 
 	if (target != NULL)
 		PROC_UNLOCK(target);
 
 	PFS_RETURN (rc);
 }
 
 /*
  * Get file attributes
  *
  * Returns ENOENT when the node is not visible to the current thread.
  * Otherwise fills in synthetic attributes: zero size, one link, all three
  * timestamps set to the current time, mode 0555 for directories and 0444
  * for files and symlinks.  Process-bound nodes are owned by the target
  * process' real uid/gid and may have their attributes adjusted by the
  * node's pn_attr handler, whose return value becomes the result; all
  * other nodes are owned by uid 0 / gid 0.
  */
 static int
 pfs_getattr(struct vop_getattr_args *va)
 {
 	struct vnode *vp = va->a_vp;
 	struct pfs_vdata *vdata = vp->v_data;
 	struct pfs_node *node = vdata->pvd_pn;
 	struct vattr *attrs = va->a_vap;
 	struct proc *target;
 	int rc = 0;
 
 	PFS_TRACE(("%s", node->pn_name));
 	pfs_assert_not_owned(node);
 
 	if (!pfs_visible(curthread, node, vdata->pvd_pid, &target))
 		PFS_RETURN (ENOENT);
 
 	VATTR_NULL(attrs);
 	attrs->va_type = vp->v_type;
 	attrs->va_fileid = pn_fileno(node, vdata->pvd_pid);
 	attrs->va_flags = 0;
 	attrs->va_blocksize = PAGE_SIZE;
 	attrs->va_size = 0;
 	attrs->va_bytes = 0;
 	attrs->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	attrs->va_nlink = 1;
 	nanotime(&attrs->va_ctime);
 	attrs->va_mtime = attrs->va_ctime;
 	attrs->va_atime = attrs->va_ctime;
 
 	/* directories are r-x for everyone, files and links r-- */
 	if (node->pn_type == pfstype_procdir ||
 	    node->pn_type == pfstype_root ||
 	    node->pn_type == pfstype_dir) {
 		/* the link count of directories is not computed */
 		attrs->va_mode = 0555;
 	} else if (node->pn_type == pfstype_file ||
 	    node->pn_type == pfstype_symlink) {
 		attrs->va_mode = 0444;
 	} else {
 		printf("shouldn't be here!\n");
 		attrs->va_mode = 0;
 	}
 
 	if (target == NULL) {
 		attrs->va_uid = 0;
 		attrs->va_gid = 0;
 	} else {
 		attrs->va_uid = target->p_ucred->cr_ruid;
 		attrs->va_gid = target->p_ucred->cr_rgid;
 		/* the handler runs with the target process still locked */
 		if (node->pn_attr != NULL)
 			rc = pn_attr(va->a_td, target, node, attrs);
 		PROC_UNLOCK(target);
 	}
 
 	PFS_RETURN (rc);
 }
 
 /*
  * Perform an ioctl
  *
  * Only regular-file vnodes are accepted (EINVAL otherwise) and the node
  * must provide a pn_ioctl handler (ENOTTY otherwise).  Visibility is
  * checked again on every call and EIO is returned if it fails.  The
  * handler is called with the target process, if any, locked; the process
  * is unlocked afterwards and the handler's result returned.
  */
 static int
 pfs_ioctl(struct vop_ioctl_args *va)
 {
 	struct vnode *vp = va->a_vp;
 	struct pfs_vdata *vdata = vp->v_data;
 	struct pfs_node *node = vdata->pvd_pn;
 	struct proc *target;
 	int rc;
 
 	PFS_TRACE(("%s: %lx", node->pn_name, va->a_command));
 	pfs_assert_not_owned(node);
 
 	if (vp->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	if (node->pn_ioctl == NULL)
 		PFS_RETURN (ENOTTY);
 
 	/*
 	 * The privileges of the process may have changed since the
 	 * file was opened, so visibility is verified again here.
 	 */
 	if (!pfs_visible(curthread, node, vdata->pvd_pid, &target))
 		PFS_RETURN (EIO);
 
 	rc = pn_ioctl(curthread, target, node, va->a_command, va->a_data);
 
 	if (target != NULL)
 		PROC_UNLOCK(target);
 
 	PFS_RETURN (rc);
 }
 
 /*
  * Perform getextattr
  *
  * Visibility of the node is checked again on every call; EIO is returned
  * if the check fails.  Nodes without a pn_getextattr handler yield
  * EOPNOTSUPP, otherwise the handler's result is returned.  The target
  * process, if any, comes back locked from pfs_visible() and is unlocked
  * before returning.
  */
 static int
 pfs_getextattr(struct vop_getextattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct proc *proc;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 
 	if (pn->pn_getextattr == NULL)
 		error = EOPNOTSUPP;
 	else
 		error = pn_getextattr(curthread, proc, pn,
 		    va->a_attrnamespace, va->a_name, va->a_uio,
 		    va->a_size, va->a_cred);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	/*
 	 * The node lock is never taken in this function (it is asserted
 	 * unowned on entry), so there is nothing to pfs_unlock() here.
 	 */
 	PFS_RETURN (error);
 }
 
 /*
  * Look up a file or directory
  *
  * Resolves the component in va->a_cnp relative to directory vnode
  * va->a_dvp and returns the resulting vnode in *va->a_vpp.
  *
  * Errors returned by the code below: ENOTDIR if the parent is not a
  * directory, the VOP_ACCESS() error if VEXEC is denied, EOPNOTSUPP for
  * DELETE / RENAME on the last component, EIO for ".." on the root,
  * ENOENT if the name is too long, unknown, or the parent or target is
  * not visible, or the error from pfs_vncache_alloc().
  */
 static int
 pfs_lookup(struct vop_cachedlookup_args *va)
 {
 	struct vnode *vn = va->a_dvp;
 	struct vnode **vpp = va->a_vpp;
 	struct componentname *cnp = va->a_cnp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	struct pfs_node *pn, *pdn = NULL;
 	pid_t pid = pvd->pvd_pid;
 	char *pname;
 	int error, i, namelen, visible;
 
 	PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 
 	error = VOP_ACCESS(vn, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		PFS_RETURN (error);
 
 	/*
 	 * Don't support DELETE or RENAME.  CREATE is supported so
 	 * that O_CREAT will work, but the lookup will still fail if
 	 * the file does not exist.
 	 */
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		PFS_RETURN (EOPNOTSUPP);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= PFS_NAMELEN)
 		PFS_RETURN (ENOENT);
 
 	/* check that parent directory is visible... */
 	if (!pfs_visible(curthread, pd, pvd->pvd_pid, NULL))
 		PFS_RETURN (ENOENT);
 
 	/* self */
 	namelen = cnp->cn_namelen;
 	pname = cnp->cn_nameptr;
 	if (namelen == 1 && pname[0] == '.') {
 		/* "." resolves to the directory itself, with an extra reference */
 		pn = pd;
 		*vpp = vn;
 		VREF(vn);
 		PFS_RETURN (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (pd->pn_type == pfstype_root)
 			PFS_RETURN (EIO);
 		/*
 		 * The directory vnode is unlocked for the rest of the ".."
 		 * lookup and locked again on both exit paths at the bottom.
 		 */
 		VOP_UNLOCK(vn, 0, cnp->cn_thread);
 		KASSERT(pd->pn_parent != NULL,
 		    ("%s(): non-root directory has no parent", __func__));
 		/*
 		 * This one is tricky.  Descendents of procdir nodes
 		 * inherit their parent's process affinity, but
 		 * there's no easy reverse mapping.  For simplicity,
 		 * we assume that if this node is a procdir, its
 		 * parent isn't (which is correct as long as
 		 * descendents of procdir nodes are never procdir
 		 * nodes themselves)
 		 */
 		if (pd->pn_type == pfstype_procdir)
 			pid = NO_PID;
 		pfs_lock(pd);
 		pn = pd->pn_parent;
 		pfs_unlock(pd);
 		goto got_pnode;
 	}
 
 	pfs_lock(pd);
 
 	/*
 	 * named node: scan the children for an exact name match and
 	 * remember the procdir child, if any, for the numeric lookup below
 	 */
 	for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next)
 		if (pn->pn_type == pfstype_procdir)
 			pdn = pn;
 		else if (pn->pn_name[namelen] == '\0' &&
 		    bcmp(pname, pn->pn_name, namelen) == 0) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 
 	/*
 	 * process dependent node: the name must consist of decimal digits
 	 * only; parsing stops early once the value exceeds PID_MAX, which
 	 * leaves i short of the name length and so fails the lookup
 	 */
 	if ((pn = pdn) != NULL) {
 		pid = 0;
 		for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i)
 			if ((pid = pid * 10 + pname[i] - '0') > PID_MAX)
 				break;
 		if (i == cnp->cn_namelen) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 	}
 
 	pfs_unlock(pd);
 
 	PFS_RETURN (ENOENT);
 
  got_pnode:
 	/* pn is the target node and pid the process it is bound to, if any */
 	pfs_assert_not_owned(pd);
 	pfs_assert_not_owned(pn);
 	visible = pfs_visible(curthread, pn, pid, NULL);
 	if (!visible) {
 		error = ENOENT;
 		goto failed;
 	}
 
 	error = pfs_vncache_alloc(vn->v_mount, vpp, pn, pid);
 	if (error)
 		goto failed;
 
 	/* relock the parent that was unlocked for the ".." case */
 	if (cnp->cn_flags & ISDOTDOT)
-		vn_lock(vn, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread);
+		vn_lock(vn, LK_EXCLUSIVE|LK_RETRY);
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vn, *vpp, cnp);
 	PFS_RETURN (0);
  failed:
 	/* relock the parent that was unlocked for the ".." case */
 	if (cnp->cn_flags & ISDOTDOT)
-		vn_lock(vn, LK_EXCLUSIVE|LK_RETRY, cnp->cn_thread);
+		vn_lock(vn, LK_EXCLUSIVE|LK_RETRY);
 	PFS_RETURN(error);
 }
 
 /*
  * Open a file or directory.
  *
  * Returns EPERM if the mode check below fails, EOPNOTSUPP if the caller
  * asked for O_SHLOCK or O_EXLOCK, and 0 otherwise.
  *
  * NOTE(review): the permission test compares the open mode against
  * PFS_RD / PFS_WR rather than against pn->pn_flags -- confirm that this
  * is intended.
  */
 static int
 pfs_open(struct vop_open_args *va)
 {
 	struct vnode *vp = va->a_vp;
 	struct pfs_vdata *vdata = vp->v_data;
 	struct pfs_node *node = vdata->pvd_pn;
 	int mode = va->a_mode;
 
 	PFS_TRACE(("%s (mode 0x%x)", node->pn_name, mode));
 	pfs_assert_not_owned(node);
 
 	/* is the requested access mode permitted? */
 	if ((mode & FREAD) && !(mode & PFS_RD))
 		PFS_RETURN (EPERM);
 	if ((mode & FWRITE) && !(mode & PFS_WR))
 		PFS_RETURN (EPERM);
 
 	/* locking is not supported */
 	if ((mode & (O_SHLOCK | O_EXLOCK)) != 0)
 		PFS_RETURN (EOPNOTSUPP);
 
 	PFS_RETURN (0);
 }
 
 /*
  * Read from a file
  *
  * Only regular-file vnodes can be read (EINVAL otherwise); the node must
  * carry PFS_RD (EBADF otherwise) and have a pn_fill handler (EIO
  * otherwise).  Visibility is checked again on every read.
  *
  * Nodes flagged PFS_RAWRD have their fill handler called directly on the
  * uio.  For all other nodes the handler fills an sbuf sized for
  * offset + resid + 1 bytes, from which the requested window is then
  * copied out with uiomove_frombuf().
  */
 static int
 pfs_read(struct vop_read_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf *sb = NULL;
 	int error;
 	unsigned int buflen, offset, resid;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 
 	if (!(pn->pn_flags & PFS_RD))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 	/*
 	 * Swap the process lock for a hold; every exit path below
 	 * releases the hold with PRELE().
 	 */
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	/* raw nodes: the handler gets the uio and no sbuf */
 	if (pn->pn_flags & PFS_RAWRD) {
 		PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	/*
 	 * beaucoup sanity checks so we don't ask for bogus allocation:
 	 * offset and resid must be non-negative and survive the conversion
 	 * to unsigned int, and offset + resid + 1 must neither wrap around
 	 * nor exceed INT_MAX
 	 */
 	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
 	    (offset = uio->uio_offset) != uio->uio_offset ||
 	    (resid = uio->uio_resid) != uio->uio_resid ||
 	    (buflen = offset + resid + 1) < offset || buflen > INT_MAX) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (EINVAL);
 	}
 	/* requests needing more than MAXPHYS + 1 bytes of buffer are refused */
 	if (buflen > MAXPHYS + 1) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (EIO);
 	}
 
 	sb = sbuf_new(sb, NULL, buflen, 0);
 	if (sb == NULL) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (EIO);
 	}
 
 	error = pn_fill(curthread, proc, pn, sb, uio);
 
 	if (proc != NULL)
 		PRELE(proc);
 
 	if (error) {
 		sbuf_delete(sb);
 		PFS_RETURN (error);
 	}
 
 	/* copy the requested window of the generated text out to the caller */
 	sbuf_finish(sb);
 	error = uiomove_frombuf(sbuf_data(sb), sbuf_len(sb), uio);
 	sbuf_delete(sb);
 	PFS_RETURN (error);
 }
 
 /*
  * Iterate through directory entries
  *
  * Advances the cursor (*pn, *p) to the next visible entry of directory
  * pd.  Start with both *pn and *p set to NULL.  For a procdir child the
  * cursor stays on that node while *p steps through the allproc list, so
  * the node is returned once per visible process.
  *
  * The caller must hold allproc_lock shared and own pd's lock.  proc is
  * the process the directory itself is bound to (or NULL); it is used for
  * the visibility check of entries that are not procdir entries.
  *
  * Returns 0 with the cursor on the next visible entry, or -1 when the
  * directory is exhausted.
  */
 static int
 pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd,
 	    struct pfs_node **pn, struct proc **p)
 {
 	int visible;
 
 	sx_assert(&allproc_lock, SX_SLOCKED);
 	pfs_assert_owned(pd);
  again:
 	if (*pn == NULL) {
 		/* first node */
 		*pn = pd->pn_nodes;
 	} else if ((*pn)->pn_type != pfstype_procdir) {
 		/* next node */
 		*pn = (*pn)->pn_next;
 	}
 	if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
 		/* next process */
 		if (*p == NULL)
 			*p = LIST_FIRST(&allproc);
 		else
 			*p = LIST_NEXT(*p, p_list);
 		/* out of processes: next node */
 		if (*p == NULL)
 			*pn = (*pn)->pn_next;
 		else
 			PROC_LOCK(*p);
 	}
 
 	if ((*pn) == NULL)
 		return (-1);
 
 	/*
 	 * A process reached through the allproc walk was locked just above
 	 * and is unlocked as soon as its visibility is known; the
 	 * directory's own process is not unlocked here.
 	 */
 	if (*p != NULL) {
 		visible = pfs_visible_proc(td, *pn, *p);
 		PROC_UNLOCK(*p);
 	} else if (proc != NULL) {
 		visible = pfs_visible_proc(td, *pn, proc);
 	} else {
 		visible = 1;
 	}
 	if (!visible)
 		goto again;
 
 	return (0);
 }
 
 /*
  * Return directory entries.
  *
  * Every entry occupies exactly PFS_DELEN bytes, so the offset must be a
  * non-negative multiple of PFS_DELEN and a non-zero request must have
  * room for at least one entry (EINVAL otherwise).  Entries are built in
  * a temporary zeroed buffer while allproc_lock and the directory's lock
  * are held, and copied out with uiomove() after the locks are dropped.
  *
  * Returns ENOTDIR for non-directories, ENOENT if the directory is not
  * visible to the caller, 0 at or past the end of the directory, or the
  * result of uiomove().
  */
 static int
 pfs_readdir(struct vop_readdir_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	pid_t pid = pvd->pvd_pid;
 	struct proc *p, *proc;
 	struct pfs_node *pn;
 	struct dirent *entry;
 	struct uio *uio;
 	off_t offset;
 	int error, i, resid;
 	char *buf, *ent;
 
 	KASSERT(pd->pn_info == vn->v_mount->mnt_data,
 	    ("%s(): pn_info does not match mountpoint", __func__));
 	PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 	uio = va->a_uio;
 
 	/* only allow reading entire entries */
 	offset = uio->uio_offset;
 	resid = uio->uio_resid;
 	if (offset < 0 || offset % PFS_DELEN != 0 ||
 	    (resid && resid < PFS_DELEN))
 		PFS_RETURN (EINVAL);
 	if (resid == 0)
 		PFS_RETURN (0);
 
 	/* can't do this while holding the proc lock... */
 	buf = malloc(resid, M_IOV, M_WAITOK | M_ZERO);
 	sx_slock(&allproc_lock);
 	pfs_lock(pd);
 
         /* check if the directory is visible to the caller */
         if (!pfs_visible(curthread, pd, pid, &proc)) {
 		sx_sunlock(&allproc_lock);
 		pfs_unlock(pd);
 		free(buf, M_IOV);
                 PFS_RETURN (ENOENT);
 	}
 	/* from here on proc, if not NULL, is locked until we are done */
 	KASSERT(pid == NO_PID || proc != NULL,
 	    ("%s(): no process for pid %lu", __func__, (unsigned long)pid));
 
 	/* skip unwanted entries: one iterator step per PFS_DELEN of offset */
 	for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
 		if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
 			/* nothing left... */
 			if (proc != NULL)
 				PROC_UNLOCK(proc);
 			pfs_unlock(pd);
 			sx_sunlock(&allproc_lock);
 			free(buf, M_IOV);
 			PFS_RETURN (0);
 		}
 	}
 
 	/* fill in entries */
 	ent = buf;
 	while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
 	    resid >= PFS_DELEN) {
 		entry = (struct dirent *)ent;
 		entry->d_reclen = PFS_DELEN;
 		entry->d_fileno = pn_fileno(pn, pid);
 		/* PFS_DELEN was picked to fit PFS_NAMELEN */
 		for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i)
 			entry->d_name[i] = pn->pn_name[i];
 		entry->d_name[i] = 0;
 		entry->d_namlen = i;
 		switch (pn->pn_type) {
 		case pfstype_procdir:
 			KASSERT(p != NULL,
 			    ("reached procdir node with p == NULL"));
 			/* per-process directories are named after the pid */
 			entry->d_namlen = snprintf(entry->d_name,
 			    PFS_NAMELEN, "%d", p->p_pid);
 			/* fall through */
 		case pfstype_root:
 		case pfstype_dir:
 		case pfstype_this:
 		case pfstype_parent:
 			entry->d_type = DT_DIR;
 			break;
 		case pfstype_file:
 			entry->d_type = DT_REG;
 			break;
 		case pfstype_symlink:
 			entry->d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type);
 		}
 		PFS_TRACE(("%s", entry->d_name));
 		offset += PFS_DELEN;
 		resid -= PFS_DELEN;
 		ent += PFS_DELEN;
 	}
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 	pfs_unlock(pd);
 	sx_sunlock(&allproc_lock);
 	PFS_TRACE(("%zd bytes", ent - buf));
 	/* copy out only the part of the buffer that was filled in */
 	error = uiomove(buf, ent - buf, uio);
 	free(buf, M_IOV);
 	PFS_RETURN (error);
 }
 
 /*
  * Read a symbolic link
  */
 static int
 pfs_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc = NULL;
 	char buf[PATH_MAX];
 	struct sbuf sb;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VLNK)
 		PFS_RETURN (EINVAL);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	if (pvd->pvd_pid != NO_PID) {
 		if ((proc = pfind(pvd->pvd_pid)) == NULL)
 			PFS_RETURN (EIO);
 		if (proc->p_flag & P_WEXIT) {
 			PROC_UNLOCK(proc);
 			PFS_RETURN (EIO);
 		}
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	/* sbuf_new() can't fail with a static buffer */
 	sbuf_new(&sb, buf, sizeof buf, 0);
 
 	error = pn_fill(curthread, proc, pn, &sb, NULL);
 
 	if (proc != NULL)
 		PRELE(proc);
 
 	if (error) {
 		sbuf_delete(&sb);
 		PFS_RETURN (error);
 	}
 
 	sbuf_finish(&sb);
 	error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
 	sbuf_delete(&sb);
 	PFS_RETURN (error);
 }
 
 /*
  * Reclaim a vnode
  *
  * Hands the vnode to pfs_vncache_free() and returns its result.
  */
 static int
 pfs_reclaim(struct vop_reclaim_args *va)
 {
 	struct vnode *vn;
 	struct pfs_vdata *pvd;
 	struct pfs_node *pn;
 
 	vn = va->a_vp;
 	pvd = vn->v_data;
 	pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	return (pfs_vncache_free(vn));
 }
 
 /*
  * Set attributes
  *
  * Attribute changes are not supported: always returns EOPNOTSUPP.
  */
 static int
 pfs_setattr(struct vop_setattr_args *va)
 {
 	struct pfs_vdata *pvd;
 	struct pfs_node *pn;
 
 	pvd = va->a_vp->v_data;
 	pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	PFS_RETURN (EOPNOTSUPP);
 }
 
 /*
  * Write to a file
  *
  * Passes the caller's data to the node's fill method.  Nodes flagged
  * PFS_RAWWR receive the uio directly (with the node locked); all other
  * nodes receive the data in an sbuf built from the uio.
  *
  * Returns EINVAL for non-regular vnodes, EBADF if the node is not
  * writable (PFS_WR clear), EIO if the node has no fill method or is not
  * visible to the caller, otherwise the result of sbuf_uionew() or of the
  * fill method.
  */
 static int
 pfs_write(struct vop_write_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf sb;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	/*
 	 * A VREG vnode must be backed by a file node.  The condition is the
 	 * invariant that has to hold; it was previously inverted ("!="),
 	 * which made the assertion fire on exactly the valid case.
 	 */
 	KASSERT(pn->pn_type == pfstype_file,
 	    ("%s(): VREG vnode refers to non-file pfs_node", __func__));
 
 	if (!(pn->pn_flags & PFS_WR))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 	/* trade the process lock for a hold for the duration of the fill */
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	/* raw writers consume the uio themselves, under the node lock */
 	if (pn->pn_flags & PFS_RAWWR) {
 		pfs_lock(pn);
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		pfs_unlock(pn);
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	/* everyone else gets the data pre-copied into an sbuf */
 	sbuf_uionew(&sb, uio, &error);
 	if (error) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	error = pn_fill(curthread, proc, pn, &sb, uio);
 
 	sbuf_delete(&sb);
 	if (proc != NULL)
 		PRELE(proc);
 	PFS_RETURN (error);
 }
 
 /*
  * Vnode operations
  *
  * Operation table for pseudofs vnodes.  Operations that create, remove
  * or rename objects are wired to VOP_EOPNOTSUPP; the rest are handled by
  * the pfs_*() functions in this file.  Name lookups enter through
  * vfs_cache_lookup, with pfs_lookup registered as the cached-lookup
  * handler.  NOTE(review): operations not listed here are presumably
  * resolved through .vop_default (default_vnodeops) -- confirm.
  */
 struct vop_vector pfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		pfs_access,
 	.vop_cachedlookup =	pfs_lookup,
 	.vop_close =		pfs_close,
 	.vop_create =		VOP_EOPNOTSUPP,
 	.vop_getattr =		pfs_getattr,
 	.vop_getextattr =	pfs_getextattr,
 	.vop_ioctl =		pfs_ioctl,
 	.vop_link =		VOP_EOPNOTSUPP,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		VOP_EOPNOTSUPP,
 	.vop_mknod =		VOP_EOPNOTSUPP,
 	.vop_open =		pfs_open,
 	.vop_read =		pfs_read,
 	.vop_readdir =		pfs_readdir,
 	.vop_readlink =		pfs_readlink,
 	.vop_reclaim =		pfs_reclaim,
 	.vop_remove =		VOP_EOPNOTSUPP,
 	.vop_rename =		VOP_EOPNOTSUPP,
 	.vop_rmdir =		VOP_EOPNOTSUPP,
 	.vop_setattr =		pfs_setattr,
 	.vop_symlink =		VOP_EOPNOTSUPP,
 	.vop_write =		pfs_write,
 	/* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */
 };
Index: head/sys/fs/smbfs/smbfs_io.c
===================================================================
--- head/sys/fs/smbfs/smbfs_io.c	(revision 175201)
+++ head/sys/fs/smbfs/smbfs_io.c	(revision 175202)
@@ -1,711 +1,711 @@
 /*-
  * Copyright (c) 2000-2001, Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>	/* defines plimit structure in proc struct */
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 /*
 #include <sys/ioccom.h>
 */
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 /*#define SMBFS_RWGENERIC*/
 
 extern int smbfs_pbuf_freecnt;
 
 static int smbfs_fastlookup = 1;
 
 SYSCTL_DECL(_vfs_smbfs);
 SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, "");
 
 
 #define DE_SIZE	(sizeof(struct dirent))
 
 /*
  * Read directory entries from an SMB directory into uio.
  *
  * The directory is presented as an array of fixed-size records of
  * DE_SIZE bytes: record 0 is ".", record 1 is "..", and the following
  * records come from the server-side search context cached in
  * np->n_dirseq (re-opened whenever the requested position does not match
  * np->n_dirofs).  When smbfs_fastlookup is set, each returned entry is
  * also instantiated and entered into the name cache.
  *
  * Returns EINVAL if the request cannot hold one record or the offset is
  * negative, 0 on success or end of directory (ENOENT from the search is
  * mapped to 0), otherwise the error from the search or from uiomove().
  * On return uio->uio_offset is a multiple of DE_SIZE.
  */
 static int
 smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred)
 {
 	struct dirent de;
 	struct componentname cn;
 	struct smb_cred scred;
 	struct smbfs_fctx *ctx;
 	struct vnode *newvp;
 	struct smbnode *np = VTOSMB(vp);
 	int error;
 	long offset, limit;
 
 	SMBVDEBUG("dirname='%s'\n", np->n_name);
 	smb_makescred(&scred, uio->uio_td, cred);
 	offset = uio->uio_offset / DE_SIZE;	/* offset in the directory */
 	limit = uio->uio_resid / DE_SIZE;
 	if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0)
 		return EINVAL;
 	/* synthesize "." (record 0) and ".." (record 1) */
 	while (limit && offset < 2) {
 		limit--;
 		bzero((caddr_t)&de, DE_SIZE);
 		de.d_reclen = DE_SIZE;
 		de.d_fileno = (offset == 0) ? np->n_ino :
 		    (np->n_parent ? VTOSMB(np->n_parent)->n_ino : 2);
 		if (de.d_fileno == 0)
 			de.d_fileno = 0x7ffffffd + offset;
 		/* name is the first offset + 1 characters of ".." */
 		de.d_namlen = offset + 1;
 		de.d_name[0] = '.';
 		de.d_name[1] = '.';
 		de.d_name[offset + 1] = '\0';
 		de.d_type = DT_DIR;
 		error = uiomove(&de, DE_SIZE, uio);
 		if (error)
 			return error;
 		offset++;
 		/*
 		 * uiomove() has already advanced uio_offset by DE_SIZE, so
 		 * adding DE_SIZE again would double-count the record and make
 		 * the early return below report a position one record too
 		 * far (skipping ".." on the next call).  Pin the offset to
 		 * the record index instead.
 		 */
 		uio->uio_offset = offset * DE_SIZE;
 	}
 	if (limit == 0)
 		return 0;
 	/* (re)open the server-side search if we are not where it left off */
 	if (offset != np->n_dirofs || np->n_dirseq == NULL) {
 		SMBVDEBUG("Reopening search %ld:%ld\n", offset, np->n_dirofs);
 		if (np->n_dirseq) {
 			smbfs_findclose(np->n_dirseq, &scred);
 			np->n_dirseq = NULL;
 		}
 		np->n_dirofs = 2;
 		error = smbfs_findopen(np, "*", 1,
 		    SMB_FA_SYSTEM | SMB_FA_HIDDEN | SMB_FA_DIR,
 		    &scred, &ctx);
 		if (error) {
 			SMBVDEBUG("can not open search, error = %d", error);
 			return error;
 		}
 		np->n_dirseq = ctx;
 	} else
 		ctx = np->n_dirseq;
 	/* discard entries until the search reaches the requested record */
 	while (np->n_dirofs < offset) {
 		error = smbfs_findnext(ctx, offset - np->n_dirofs++, &scred);
 		if (error) {
 			smbfs_findclose(np->n_dirseq, &scred);
 			np->n_dirseq = NULL;
 			return error == ENOENT ? 0 : error;
 		}
 	}
 	error = 0;
 	for (; limit; limit--, offset++) {
 		error = smbfs_findnext(ctx, limit, &scred);
 		if (error)
 			break;
 		np->n_dirofs++;
 		bzero((caddr_t)&de, DE_SIZE);
 		de.d_reclen = DE_SIZE;
 		de.d_fileno = ctx->f_attr.fa_ino;
 		de.d_type = (ctx->f_attr.fa_attr & SMB_FA_DIR) ? DT_DIR : DT_REG;
 		de.d_namlen = ctx->f_nmlen;
 		bcopy(ctx->f_name, de.d_name, de.d_namlen);
 		de.d_name[de.d_namlen] = '\0';
 		if (smbfs_fastlookup) {
 			/* best effort: a failure here is overwritten below */
 			error = smbfs_nget(vp->v_mount, vp, ctx->f_name,
 			    ctx->f_nmlen, &ctx->f_attr, &newvp);
 			if (!error) {
 				cn.cn_nameptr = de.d_name;
 				cn.cn_namelen = de.d_namlen;
 				cache_enter(vp, newvp, &cn);
 				vput(newvp);
 			}
 		}
 		error = uiomove(&de, DE_SIZE, uio);
 		if (error)
 			break;
 	}
 	/* running out of entries is end-of-directory, not a failure */
 	if (error == ENOENT)
 		error = 0;
 	uio->uio_offset = offset * DE_SIZE;
 	return error;
 }
 
 /*
  * Read from a regular file or directory vnode into uiop.
  *
  * Directories are delegated to smbfs_readvdir().  For regular files the
  * attributes are refreshed first; if the node is not marked NMODIFIED
  * and the modification time differs from the cached one, the buffers
  * are invalidated before the data is read with smb_read().
  *
  * Returns EOPNOTSUPP for UIO_NOCOPY requests, EIO for unsupported vnode
  * types, EINVAL for a negative offset, 0 for a zero-length read,
  * otherwise the result of the underlying operation.
  */
 int
 smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	struct smbmount *smp = VFSTOSMBFS(vp->v_mount);
 	struct smbnode *np = VTOSMB(vp);
 	struct thread *td;
 	struct vattr vattr;
 	struct smb_cred scred;
 	int error, lks;
 
 	/*
 	 * Protect against method which is not supported for now
 	 */
 	if (uiop->uio_segflg == UIO_NOCOPY)
 		return EOPNOTSUPP;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR) {
 		SMBFSERR("vn types other than VREG or VDIR are unsupported !\n");
 		return EIO;
 	}
 	if (uiop->uio_resid == 0)
 		return 0;
 	if (uiop->uio_offset < 0)
 		return EINVAL;
 /*	if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize)
 		return EFBIG;*/
 	td = uiop->uio_td;
 	if (vp->v_type == VDIR) {
 		/*
 		 * lks is hard-wired to LK_EXCLUSIVE (the lockstatus() query
 		 * is commented out), so the upgrade/downgrade branches below
 		 * are never taken as written.
 		 */
 		lks = LK_EXCLUSIVE;/*lockstatus(vp->v_vnlock, td);*/
 		if (lks == LK_SHARED)
-			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
+			vn_lock(vp, LK_UPGRADE | LK_RETRY);
 		error = smbfs_readvdir(vp, uiop, cred);
 		if (lks == LK_SHARED)
-			vn_lock(vp, LK_DOWNGRADE | LK_RETRY, td);
+			vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 		return error;
 	}
 
 /*	biosize = SSTOCN(smp->sm_share)->sc_txmax;*/
 	if (np->n_flag & NMODIFIED) {
 		/* locally modified: drop cached attributes and refetch */
 		smbfs_attr_cacheremove(vp);
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		if (error)
 			return error;
 		np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 	} else {
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		if (error)
 			return error;
 		/* mtime moved since we last looked: cached buffers are stale */
 		if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) {
 			error = smbfs_vinvalbuf(vp, td);
 			if (error)
 				return error;
 			np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 		}
 	}
 	smb_makescred(&scred, td, cred);
 	return smb_read(smp->sm_share, np->n_fid, uiop, &scred);
 }
 
 /*
  * Write the data described by uiop to a regular file vnode.
  *
  * For IO_APPEND or IO_SYNC requests on a node marked NMODIFIED the
  * cached attributes and buffers are flushed first; IO_APPEND then moves
  * the write position to the cached file size.  The write is refused
  * with EFBIG (and SIGXFSZ is posted) if it would cross the process'
  * RLIMIT_FSIZE.  After a successful write that extended the file, the
  * cached size and the pager size are updated.
  *
  * Returns EIO for non-regular vnodes, EINVAL for a negative offset,
  * 0 for a zero-length write, otherwise the result of the flush or of
  * smb_write().
  */
 int
 smbfs_writevnode(struct vnode *vp, struct uio *uiop,
 	struct ucred *cred, int ioflag)
 {
 	struct smbmount *smp;
 	struct smbnode *np;
 	struct smb_cred scred;
 	struct proc *p;
 	struct thread *td;
 	int error;
 
 	smp = VTOSMBFS(vp);
 	np = VTOSMB(vp);
 	error = 0;
 
 	if (vp->v_type != VREG) {
 		SMBERROR("vn types other than VREG unsupported !\n");
 		return EIO;
 	}
 	SMBVDEBUG("ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid);
 	if (uiop->uio_offset < 0)
 		return EINVAL;
 /*	if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize)
 		return (EFBIG);*/
 	td = uiop->uio_td;
 	p = td->td_proc;
 
 	/* appending or synchronous writes must not race cached dirty data */
 	if ((ioflag & (IO_APPEND | IO_SYNC)) != 0 &&
 	    (np->n_flag & NMODIFIED) != 0) {
 		smbfs_attr_cacheremove(vp);
 		error = smbfs_vinvalbuf(vp, td);
 		if (error)
 			return error;
 	}
 	if (ioflag & IO_APPEND) {
 #ifdef notyet
 		/*
 		 * File size can be changed by another client
 		 */
 		smbfs_attr_cacheremove(vp);
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		if (error) return (error);
 #endif
 		uiop->uio_offset = np->n_size;
 	}
 
 	if (uiop->uio_resid == 0)
 		return 0;
 
 	/* enforce the file size resource limit */
 	if (p != NULL) {
 		PROC_LOCK(p);
 		if (uiop->uio_offset + uiop->uio_resid >
 		    lim_cur(p, RLIMIT_FSIZE)) {
 			psignal(p, SIGXFSZ);
 			PROC_UNLOCK(p);
 			return EFBIG;
 		}
 		PROC_UNLOCK(p);
 	}
 
 	smb_makescred(&scred, td, cred);
 	error = smb_write(smp->sm_share, np->n_fid, uiop, &scred);
 	SMBVDEBUG("after: ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid);
 
 	/* the file grew: record the new size and tell the pager */
 	if (error == 0 && uiop->uio_offset > np->n_size) {
 		np->n_size = uiop->uio_offset;
 		vnode_pager_setsize(vp, np->n_size);
 	}
 	return error;
 }
 
 /*
  * Do an I/O operation to/from a cache block.
  *
  * Builds a single-segment kernel-space uio over the buffer and issues
  * smb_read() (BIO_READ) or smb_write() (anything else) for it.  A short
  * read has the unread tail of the buffer zero-filled.  A write is
  * clipped to the cached file size (np->n_size) and covers only the
  * dirty range [b_dirtyoff, b_dirtyend).  The buffer is always completed
  * with bufdone(); the I/O error, if any, is both recorded in the buffer
  * and returned.
  */
 int
 smbfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
 {
 	struct smbmount *smp = VFSTOSMBFS(vp->v_mount);
 	struct smbnode *np = VTOSMB(vp);
 	struct uio uio, *uiop = &uio;
 	struct iovec io;
 	struct smb_cred scred;
 	int error = 0;
 
 	uiop->uio_iov = &io;
 	uiop->uio_iovcnt = 1;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = td;
 
 	smb_makescred(&scred, td, cr);
 
 	if (bp->b_iocmd == BIO_READ) {
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
 	    switch (vp->v_type) {
 	      case VREG:
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 		error = smb_read(smp->sm_share, np->n_fid, uiop, &scred);
 		if (error)
 			break;
 		/* short read: zero the part of the buffer that was not filled */
 		if (uiop->uio_resid) {
 			int left = uiop->uio_resid;
 			int nread = bp->b_bcount - left;
 			if (left > 0)
 			    bzero((char *)bp->b_data + nread, left);
 		}
 		break;
 	    default:
 		/* only logged: no error is set for an unexpected vnode type */
 		printf("smbfs_doio:  type %x unexpected\n",vp->v_type);
 		break;
 	    };
 	    if (error) {
 		bp->b_error = error;
 		bp->b_ioflags |= BIO_ERROR;
 	    }
 	} else { /* write */
 	    /* never write past the cached end of file */
 	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
 		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
 
 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
 		io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff;
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
 		error = smb_write(smp->sm_share, np->n_fid, uiop, &scred);
 
 		/*
 		 * For an interrupted write, the buffer is still valid
 		 * and the write hasn't been pushed to the server yet,
 		 * so we can't set BIO_ERROR and report the interruption
 		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
 		 * is not relevant, so the rpc attempt is essentially
 		 * a noop.  For the case of a V3 write rpc not being
 		 * committed to stable storage, the block is still
 		 * dirty and requires either a commit rpc or another
 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
 		 * the block is reused. This is indicated by setting
 		 * the B_DELWRI and B_NEEDCOMMIT flags.
 		 *
 		 * NOTE(review): the wording above refers to NFS concepts
 		 * (V3 write rpc, NFSV3WRITE_FILESYNC) -- presumably
 		 * inherited from the NFS client; confirm what applies here.
 		 */
 		if (error == EINTR
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 			int s;
 
 			s = splbio();
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			if ((bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 			/* keep the data around by re-marking the buffer dirty */
 			if ((bp->b_flags & B_PAGING) == 0) {
 			    bdirty(bp);
 			    bp->b_flags &= ~B_DONE;
 			}
 			/* NOTE(review): same test and flag as a few lines up */
 			if ((bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 			splx(s);
 		} else {
 			if (error) {
 				bp->b_ioflags |= BIO_ERROR;
 				bp->b_error = error;
 			}
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 		}
 	    } else {
 		/* nothing dirty to write: complete the buffer right away */
 		bp->b_resid = 0;
 		bufdone(bp);
 		return 0;
 	    }
 	}
 	bp->b_resid = uiop->uio_resid;
 	bufdone(bp);
 	return error;
 }
 
 /*
  * Vnode op for VM getpages.
  * Wish wish .... get rid from multiple IO routines
  *
  * With SMBFS_RWGENERIC defined this simply calls vop_stdgetpages().
  * Otherwise the pages in ap->a_m are mapped into the KVA of a pbuf and
  * filled with a single smb_read() starting at the offset of the first
  * page.  Afterwards each page is marked fully valid, partially valid or
  * left invalid depending on how much data arrived, and every page other
  * than the requested one is activated/deactivated (no error) or freed
  * (error).
  *
  * Returns VM_PAGER_ERROR if the vnode has no VM object or if the read
  * failed without transferring anything; otherwise returns 0.
  */
 int
 smbfs_getpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_reqpage;
 		vm_ooffset_t a_offset;
 	} */ *ap;
 {
 #ifdef SMBFS_RWGENERIC
 	return vop_stdgetpages(ap);
 #else
 	int i, error, nextoff, size, toff, npages, count, reqpage;
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct smbmount *smp;
 	struct smbnode *np;
 	struct smb_cred scred;
 	vm_object_t object;
 	vm_page_t *pages, m;
 
 	vp = ap->a_vp;
 	if ((object = vp->v_object) == NULL) {
 		printf("smbfs_getpages: called with non-merged cache vnode??\n");
 		return VM_PAGER_ERROR;
 	}
 
 	td = curthread;				/* XXX */
 	cred = td->td_ucred;		/* XXX */
 	np = VTOSMB(vp);
 	smp = VFSTOSMBFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 	npages = btoc(count);
 	reqpage = ap->a_reqpage;
 
 	/*
 	 * If the requested page is partially valid, just return it and
 	 * allow the pager to zero-out the blanks.  Partially valid pages
 	 * can only occur at the file EOF.
 	 */
 	m = pages[reqpage];
 
 	VM_OBJECT_LOCK(object);
 	if (m->valid != 0) {
 		/* handled by vm_fault now	  */
 		/* vm_page_zero_invalid(m, TRUE); */
 		/* release every page except the one that was asked for */
 		vm_page_lock_queues();
 		for (i = 0; i < npages; ++i) {
 			if (i != reqpage)
 				vm_page_free(pages[i]);
 		}
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return 0;
 	}
 	VM_OBJECT_UNLOCK(object);
 
 	smb_makescred(&scred, td, cred);
 
 	bp = getpbuf(&smbfs_pbuf_freecnt);
 
 	/* map the pages contiguously so one read can fill them all */
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, npages);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = smb_read(smp->sm_share, np->n_fid, &uio, &scred);
 	pmap_qremove(kva, npages);
 
 	relpbuf(bp, &smbfs_pbuf_freecnt);
 
 	VM_OBJECT_LOCK(object);
 	/* hard failure: the read errored and transferred nothing at all */
 	if (error && (uio.uio_resid == count)) {
 		printf("smbfs_getpages: error %d\n",error);
 		vm_page_lock_queues();
 		for (i = 0; i < npages; i++) {
 			if (reqpage != i)
 				vm_page_free(pages[i]);
 		}
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return VM_PAGER_ERROR;
 	}
 
 	/* number of bytes actually read */
 	size = count - uio.uio_resid;
 
 	vm_page_lock_queues();
 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
 		vm_page_t m;
 		nextoff = toff + PAGE_SIZE;
 		m = pages[i];
 
 		if (nextoff <= size) {
 			/*
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
 			 */
 			m->valid = 0;
 			vm_page_set_validclean(m, 0, size - toff);
 			/* handled by vm_fault now	  */
 			/* vm_page_zero_invalid(m, TRUE); */
 		} else {
 			/*
 			 * Read operation was short.  If no error occurred
 			 * we may have hit a zero-fill section.   We simply
 			 * leave valid set to 0.
 			 */
 			;
 		}
 
 		if (i != reqpage) {
 			/*
 			 * Whether or not to leave the page activated is up in
 			 * the air, but we should put the page on a page queue
 			 * somewhere (it already is in the object).  Result:
 			 * It appears that empirical results show that
 			 * deactivating pages is best.
 			 */
 
 			/*
 			 * Just in case someone was asking for this page we
 			 * now tell them that it is ok to use.
 			 */
 			if (!error) {
 				if (m->oflags & VPO_WANTED)
 					vm_page_activate(m);
 				else
 					vm_page_deactivate(m);
 				vm_page_wakeup(m);
 			} else {
 				vm_page_free(m);
 			}
 		}
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(object);
 	return 0;
 #endif /* SMBFS_RWGENERIC */
 }
 
 /*
  * Vnode op for VM putpages.
  * possible bug: all IO done in sync mode
  * Note that vop_close always invalidate pages before close, so it's
  * not necessary to open vnode.
  *
  * With SMBFS_RWGENERIC defined the work is done by vop_stdputpages()
  * bracketed by VOP_OPEN()/VOP_CLOSE().  Otherwise the pages in ap->a_m
  * are mapped into the KVA of a pbuf and written with a single
  * smb_write() starting at the offset of the first page.
  *
  * Every slot of ap->a_rtvals starts out as VM_PAGER_AGAIN; after a
  * successful write the slots covering the bytes actually written are
  * set to VM_PAGER_OK and those pages are marked clean.  The status of
  * the first page is returned.
  */
 int
 smbfs_putpages(ap)
 	struct vop_putpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_sync;
 		int *a_rtvals;
 		vm_ooffset_t a_offset;
 	} */ *ap;
 {
 	int error;
 	struct vnode *vp = ap->a_vp;
 	struct thread *td;
 	struct ucred *cred;
 
 #ifdef SMBFS_RWGENERIC
 	td = curthread;			/* XXX */
 	cred = td->td_ucred;		/* XXX */
 	VOP_OPEN(vp, FWRITE, cred, td, NULL);
 	error = vop_stdputpages(ap);
 	VOP_CLOSE(vp, FWRITE, cred, td);
 	return error;
 #else
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	int i, npages, count;
 	int *rtvals;
 	struct smbmount *smp;
 	struct smbnode *np;
 	struct smb_cred scred;
 	vm_page_t *pages;
 
 	td = curthread;			/* XXX */
 	cred = td->td_ucred;		/* XXX */
 /*	VOP_OPEN(vp, FWRITE, cred, td, NULL);*/
 	np = VTOSMB(vp);
 	smp = VFSTOSMBFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 
 	/* pessimistic default: ask the pager to retry every page */
 	for (i = 0; i < npages; i++) {
 		rtvals[i] = VM_PAGER_AGAIN;
 	}
 
 	bp = getpbuf(&smbfs_pbuf_freecnt);
 
 	/* map the pages contiguously so one write can cover them all */
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	PCPU_INC(cnt.v_vnodeout);
 	/*
 	 * Account in pages, matching the v_vnodepgsin accounting done by
 	 * smbfs_getpages(); "count" is a byte count.
 	 */
 	PCPU_ADD(cnt.v_vnodepgsout, npages);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_WRITE;
 	uio.uio_td = td;
 	SMBVDEBUG("ofs=%d,resid=%d\n",(int)uio.uio_offset, uio.uio_resid);
 
 	smb_makescred(&scred, td, cred);
 	error = smb_write(smp->sm_share, np->n_fid, &uio, &scred);
 /*	VOP_CLOSE(vp, FWRITE, cred, td);*/
 	SMBVDEBUG("paged write done: %d\n", error);
 
 	pmap_qremove(kva, npages);
 
 	relpbuf(bp, &smbfs_pbuf_freecnt);
 
 	if (!error) {
 		/* pages touched by the bytes that were actually written */
 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
 		vm_page_lock_queues();
 		for (i = 0; i < nwritten; i++) {
 			rtvals[i] = VM_PAGER_OK;
 			vm_page_undirty(pages[i]);
 		}
 		vm_page_unlock_queues();
 	}
 	return rtvals[0];
 #endif /* SMBFS_RWGENERIC */
 }
 
 /*
  * Flush and invalidate all dirty buffers. If another process is already
  * doing the flush, just wait for completion.
  *
  * NFLUSHINPROG in np->n_flag marks a flush in progress and NFLUSHWANT
  * marks a sleeper waiting on &np->n_flag.  Returns 0 when the vnode is
  * doomed or the flush completed (NMODIFIED is then cleared), EINTR when
  * the wait or the flush was interrupted.
  */
 int
 smbfs_vinvalbuf(struct vnode *vp, struct thread *td)
 {
 	struct smbnode *np = VTOSMB(vp);
 	int error, interrupted;
 
 	if (vp->v_iflag & VI_DOOMED)
 		return 0;
 
 	/* wait out a flush that somebody else has already started */
 	while (np->n_flag & NFLUSHINPROG) {
 		np->n_flag |= NFLUSHWANT;
 		(void)tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz);
 		if (smb_td_intr(td) == EINTR)
 			return EINTR;
 	}
 	np->n_flag |= NFLUSHINPROG;
 
 	if (vp->v_bufobj.bo_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
 		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
 	}
 
 	/* keep retrying until it succeeds or a signal gets in the way */
 	interrupted = 0;
 	do {
 		error = vinvalbuf(vp, V_SAVE, td, PCATCH, 0);
 		if (error == ERESTART || error == EINTR) {
 			interrupted = 1;
 			break;
 		}
 	} while (error != 0);
 
 	/* NMODIFIED is cleared only when the flush ran to completion */
 	if (interrupted)
 		np->n_flag &= ~NFLUSHINPROG;
 	else
 		np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
 	if (np->n_flag & NFLUSHWANT) {
 		np->n_flag &= ~NFLUSHWANT;
 		wakeup(&np->n_flag);
 	}
 	if (interrupted)
 		return EINTR;
 	return (error);
 }
Index: head/sys/fs/smbfs/smbfs_node.c
===================================================================
--- head/sys/fs/smbfs/smbfs_node.c	(revision 175201)
+++ head/sys/fs/smbfs/smbfs_node.c	(revision 175202)
@@ -1,447 +1,447 @@
 /*-
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 /*#include <vm/vm_page.h>
 #include <vm/vm_object.h>*/
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 #define	SMBFS_NOHASH(smp, hval)	(&(smp)->sm_hash[(hval) & (smp)->sm_hashlen])
 #define	smbfs_hash_lock(smp, td)	lockmgr(&smp->sm_hashlock, LK_EXCLUSIVE, NULL, td)
 #define	smbfs_hash_unlock(smp, td)	lockmgr(&smp->sm_hashlock, LK_RELEASE, NULL, td)
 
 
 extern struct vop_vector smbfs_vnodeops;	/* XXX -> .h file */
 
 MALLOC_DEFINE(M_SMBNODE, "smbufs_node", "SMBFS vnode private part");
 static MALLOC_DEFINE(M_SMBNODENAME, "smbufs_nname", "SMBFS node name");
 
 int smbfs_hashprint(struct mount *mp);
 
 #if 0
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_vfs_smbfs);
 #endif
 SYSCTL_PROC(_vfs_smbfs, OID_AUTO, vnprint, CTLFLAG_WR|CTLTYPE_OPAQUE,
 	    NULL, 0, smbfs_hashprint, "S,vnlist", "vnode hash");
 #endif
 
 #define	FNV_32_PRIME ((u_int32_t) 0x01000193UL)
 #define	FNV1_32_INIT ((u_int32_t) 33554467UL)
 
 /*
  * Hash the first nmlen bytes of name: starting from FNV1_32_INIT, each
  * byte multiplies the running value by FNV_32_PRIME (modulo 2^32) and
  * is then XORed in.  A length of zero yields FNV1_32_INIT.
  */
 u_int32_t
 smbfs_hash(const u_char *name, int nmlen)
 {
 	const u_char *cp = name;
 	u_int32_t hash = FNV1_32_INIT;
 
 	while (nmlen-- != 0) {
 		hash = (hash * FNV_32_PRIME) ^ (u_int32_t)*cp++;
 	}
 	return hash;
 }
 
 int
 smbfs_hashprint(struct mount *mp)
 {
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct smbnode_hashhead *nhpp;
 	struct smbnode *np;
 	int i;
 
 	for(i = 0; i <= smp->sm_hashlen; i++) {
 		nhpp = &smp->sm_hash[i];
 		LIST_FOREACH(np, nhpp, n_hash)
 			vprint("", SMBTOV(np));
 	}
 	return 0;
 }
 
 /*
  * Allocate a NUL-terminated copy of the nmlen-byte name.
  *
  * With SMBFS_NAME_DEBUG the copy is wrapped in guard data: an int
  * holding the length including the NUL and a 0xfc byte before the name,
  * and a 0xfe byte after the terminating NUL.  The returned pointer
  * addresses the first byte of the name either way.
  */
 static char *
 smbfs_name_alloc(const u_char *name, int nmlen)
 {
 	u_char *buf;
 	int size;
 
 	size = nmlen + 1;		/* name plus terminating NUL */
 #ifdef SMBFS_NAME_DEBUG
 	buf = malloc(size + 2 + sizeof(int), M_SMBNODENAME, M_WAITOK);
 	*(int*)buf = size;
 	buf += sizeof(int);
 	*buf++ = 0xfc;
 	bcopy(name, buf, nmlen);
 	buf[size] = 0xfe;
 #else
 	buf = malloc(size, M_SMBNODENAME, M_WAITOK);
 	bcopy(name, buf, nmlen);
 #endif
 	buf[nmlen] = 0;
 	return buf;
 }
 
 /*
  * Release a name obtained from smbfs_name_alloc().
  *
  * With SMBFS_NAME_DEBUG the guard bytes and the recorded length placed
  * around the name are verified first; any mismatch panics.
  */
 static void
 smbfs_name_free(u_char *name)
 {
 #ifdef SMBFS_NAME_DEBUG
 	u_char *guard, *base;
 	int recorded, actual;
 
 	/* layout: [int length][0xfc][name ... NUL][0xfe] */
 	guard = name - 1;
 	if (*guard != 0xfc)
 		panic("First byte of name entry '%s' corrupted", name);
 	base = guard - sizeof(int);
 	recorded = *(int*)base;
 	actual = strlen(name) + 1;
 	if (recorded != actual)
 		panic("Name length mismatch: was %d, now %d name '%s'",
 		    recorded, actual, name);
 	if (name[recorded] != 0xfe)
 		panic("Last byte of name entry '%s' corrupted\n", name);
 	free(base, M_SMBNODENAME);
 #else
 	free(name, M_SMBNODENAME);
 #endif
 }
 
 /*
  * Find or create the vnode for the entry "name" (nmlen bytes) in
  * directory dvp on mount mp.  dvp == NULL requests the root vnode.
  *
  * ".." is resolved through the parent recorded in dvp's smbnode; "." is
  * rejected.  Otherwise the per-mount hash is searched for a node with
  * the same parent and name.  A hit is returned locked unless its vnode
  * type no longer matches the DOS directory attribute, in which case the
  * vnode is killed and a new one is created.  On a miss a new vnode and
  * smbnode are created from the attributes in fap; with fap == NULL the
  * call is a pure lookup and fails with ENOENT.
  *
  * On success *vpp holds the vnode, locked exclusively, and 0 is
  * returned.  On failure *vpp is NULL and an errno value is returned.
  */
 static int
 smbfs_node_alloc(struct mount *mp, struct vnode *dvp,
 	const char *name, int nmlen, struct smbfattr *fap, struct vnode **vpp)
 {
 	struct vattr vattr;
 	struct thread *td = curthread;	/* XXX */
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct smbnode_hashhead *nhpp;
 	struct smbnode *np, *np2, *dnp;
 	struct vnode *vp;
 	u_long hashval;
 	int error;
 
 	*vpp = NULL;
 	if (smp->sm_root != NULL && dvp == NULL) {
 		SMBERROR("do not allocate root vnode twice!\n");
 		return EINVAL;
 	}
 	if (nmlen == 2 && bcmp(name, "..", 2) == 0) {
 		if (dvp == NULL)
 			return EINVAL;
 		vp = VTOSMB(VTOSMB(dvp)->n_parent)->n_vnode;
 		error = vget(vp, LK_EXCLUSIVE, td);
 		if (error == 0)
 			*vpp = vp;
 		return error;
 	} else if (nmlen == 1 && name[0] == '.') {
 		SMBERROR("do not call me with dot!\n");
 		return EINVAL;
 	}
 	dnp = dvp ? VTOSMB(dvp) : NULL;
 	if (dnp == NULL && dvp != NULL) {
 		vprint("smbfs_node_alloc: dead parent vnode", dvp);
 		return EINVAL;
 	}
 	hashval = smbfs_hash(name, nmlen);
 retry:
 	smbfs_hash_lock(smp, td);
 loop:
 	nhpp = SMBFS_NOHASH(smp, hashval);
 	LIST_FOREACH(np, nhpp, n_hash) {
 		vp = SMBTOV(np);
 		if (np->n_parent != dvp ||
 		    np->n_nmlen != nmlen || bcmp(name, np->n_name, nmlen) != 0)
 			continue;
 		/* take the vnode interlock before letting go of the hash lock */
 		VI_LOCK(vp);
 		smbfs_hash_unlock(smp, td);
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0)
 			goto retry;
 		/* Force cached attributes to be refreshed if stale. */
 		(void)VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 		/*
 		 * If the file type on the server is inconsistent with
 		 * what it was when we created the vnode, kill the
 		 * bogus vnode now and fall through to the code below
 		 * to create a new one with the right type.
 		 */
 		if ((vp->v_type == VDIR && (np->n_dosattr & SMB_FA_DIR) == 0) ||
 		    (vp->v_type == VREG && (np->n_dosattr & SMB_FA_DIR) != 0)) {
 			vgone(vp);
 			vput(vp);
 			/*
 			 * NOTE(review): the hash lock was already released
 			 * above, yet this break reaches the
 			 * smbfs_hash_unlock() after the loop -- looks like a
 			 * second unlock; confirm.
 			 */
 			break;
 		}
 		*vpp = vp;
 		return 0;
 	}
 	smbfs_hash_unlock(smp, td);
 	/*
 	 * If we don't have node attributes, then it is an explicit lookup
 	 * for an existing vnode.
 	 */
 	if (fap == NULL)
 		return ENOENT;
 
 	MALLOC(np, struct smbnode *, sizeof *np, M_SMBNODE, M_WAITOK);
 	error = getnewvnode("smbfs", mp, &smbfs_vnodeops, &vp);
 	if (error) {
 		FREE(np, M_SMBNODE);
 		return error;
 	}
 	error = insmntque(vp, mp);	/* XXX: Too early for mpsafe fs */
 	if (error != 0) {
 		FREE(np, M_SMBNODE);
 		return (error);
 	}
 	vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG;
 	bzero(np, sizeof(*np));
 	vp->v_data = np;
 	np->n_vnode = vp;
 	np->n_mount = VFSTOSMBFS(mp);
 	np->n_nmlen = nmlen;
 	np->n_name = smbfs_name_alloc(name, nmlen);
 	np->n_ino = fap->fa_ino;
 
 	if (dvp) {
 		ASSERT_VOP_LOCKED(dvp, "smbfs_node_alloc");
 		np->n_parent = dvp;
 		/* children of a non-root directory hold a reference on it */
 		if (/*vp->v_type == VDIR &&*/ (dvp->v_vflag & VV_ROOT) == 0) {
 			vref(dvp);
 			np->n_flag |= NREFPARENT;
 		}
 	} else if (vp->v_type == VREG)
 		SMBERROR("new vnode '%s' born without parent ?\n", np->n_name);
 
 	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * The hash lock was dropped while the node was being built, so
 	 * check again that nobody inserted the same name in the meantime.
 	 */
 	smbfs_hash_lock(smp, td);
 	LIST_FOREACH(np2, nhpp, n_hash) {
 		if (np2->n_parent != dvp ||
 		    np2->n_nmlen != nmlen || bcmp(name, np2->n_name, nmlen) != 0)
 			continue;
 		/*
 		 * Lost the race: drop our vnode and search again.
 		 * NOTE(review): np and its name are not freed here (the
 		 * explicit frees are commented out) -- presumably left to
 		 * the vnode reclaim path; confirm.
 		 */
 		vput(vp);
 /*		smb_name_free(np->n_name);
 		FREE(np, M_SMBNODE);*/
 		goto loop;
 	}
 	LIST_INSERT_HEAD(nhpp, np, n_hash);
 	smbfs_hash_unlock(smp, td);
 	*vpp = vp;
 	return 0;
 }
 
 /*
  * Obtain the vnode for `name' (nmlen bytes) under directory vnode dvp
  * on mount mp by calling smbfs_node_alloc(), then, when the caller
  * supplied attributes in fap, load them into the node's attribute cache.
  *
  * Returns 0 with the vnode stored in *vpp, or the error reported by
  * smbfs_node_alloc() with *vpp left NULL.
  */
 int
 smbfs_nget(struct mount *mp, struct vnode *dvp, const char *name, int nmlen,
 	struct smbfattr *fap, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	*vpp = NULL;
 	error = smbfs_node_alloc(mp, dvp, name, nmlen, fap, &vp);
 	if (error)
 		return error;
 	/* fap may be NULL for a pure lookup; nothing to cache in that case. */
 	if (fap)
 		smbfs_attr_cacheenter(vp, fap);
 	*vpp = vp;
 	return 0;
 }
 
 /*
  * Free smbnode, and give vnode back to system.
  *
  * Under the smbfs hash lock the vnode's VM object is destroyed, the node
  * is removed from its hash chain (if it is on one), the mount's root
  * pointer is cleared when this node is the root, and vp->v_data is
  * detached.  After the lock is dropped the name and the smbnode are
  * freed and the reference held on the parent (NREFPARENT) is released.
  * Always returns 0.
  */
 int
 smbfs_reclaim(ap)                     
         struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
         } */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct vnode *dvp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smbmount *smp = VTOSMBFS(vp);
 	
 	SMBVDEBUG("%s,%d\n", np->n_name, vrefcnt(vp));
 
 	KASSERT((np->n_flag & NOPEN) == 0, ("file not closed before reclaim"));
 
 	smbfs_hash_lock(smp, td);
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	/*
 	 * Remember the parent only if this node took a reference on it;
 	 * np is freed below, so the pointer must be captured now.
 	 */
 	dvp = (np->n_parent && (np->n_flag & NREFPARENT)) ?
 	    np->n_parent : NULL;
 
 	/* A non-NULL le_prev means the node is linked on a hash chain. */
 	if (np->n_hash.le_prev)
 		LIST_REMOVE(np, n_hash);
 	if (smp->sm_root == np) {
 		SMBVDEBUG("root vnode\n");
 		smp->sm_root = NULL;
 	}
 	vp->v_data = NULL;
 	smbfs_hash_unlock(smp, td);
 	if (np->n_name)
 		smbfs_name_free(np->n_name);
 	FREE(np, M_SMBNODE);
 	if (dvp != NULL) {
 		vrele(dvp);
 		/*
 		 * Indicate that we released something; see comment
 		 * in smbfs_unmount().
 		 */
 		smp->sm_didrele = 1;
 	}
 	return 0;
 }
 
 /*
  * Called when the vnode is no longer in use.  If the node is still
  * marked NOPEN its buffers are invalidated and the server-side handle is
  * released: regular files are closed with the cached mtime, directories
  * have any pending search sequence closed.  The NOPEN flag and the
  * attribute cache are then cleared.  Nodes marked NGONE are handed to
  * vrecycle().  Always returns 0.
  */
 int
 smbfs_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct smbnode *np = VTOSMB(vp);
 	struct ucred *cred = td->td_ucred;
 	struct smb_cred scred;
 	struct vattr va;
 
 	SMBVDEBUG("%s: %d\n", VTOSMB(vp)->n_name, vrefcnt(vp));
 	if (np->n_flag & NOPEN) {
 		smb_makescred(&scred, td, cred);
 		smbfs_vinvalbuf(vp, td);
 		switch (vp->v_type) {
 		case VREG:
 			/* Refresh attributes before closing the file handle. */
 			VOP_GETATTR(vp, &va, cred, td);
 			smbfs_smb_close(np->n_mount->sm_share, np->n_fid,
 			    &np->n_mtime, &scred);
 			break;
 		case VDIR:
 			if (np->n_dirseq != NULL) {
 				smbfs_findclose(np->n_dirseq, &scred);
 				np->n_dirseq = NULL;
 			}
 			break;
 		default:
 			break;
 		}
 		np->n_flag &= ~NOPEN;
 		smbfs_attr_cacheremove(vp);
 	}
 	if (np->n_flag & NGONE)
 		vrecycle(vp, td);
 	return (0);
 }
 /*
  * Routines to maintain the vnode attribute cache.
  *
  * smbfs_attr_cacheenter: store the attributes from fap in the smbnode
  * behind vp and stamp the cache with the current time.  Regular files
  * take their size from fap (resizing the pager when it changed);
  * directories get a fixed size.  Other vnode types are left untouched.
  */
 void
 smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap)
 {
 	struct smbnode *np = VTOSMB(vp);
 
 	switch (vp->v_type) {
 	case VREG:
 		if (np->n_size != fap->fa_size) {
 			np->n_size = fap->fa_size;
 			vnode_pager_setsize(vp, np->n_size);
 		}
 		break;
 	case VDIR:
 		np->n_size = 16384; 		/* should be a better way ... */
 		break;
 	default:
 		return;
 	}
 	np->n_mtime = fap->fa_mtime;
 	np->n_dosattr = fap->fa_attr;
 	np->n_attrage = time_second;
 }
 
 int
 smbfs_attr_cachelookup(struct vnode *vp, struct vattr *va)
 {
 	struct smbnode *np = VTOSMB(vp);
 	struct smbmount *smp = VTOSMBFS(vp);
 	int diff;
 
 	diff = time_second - np->n_attrage;
 	if (diff > 2)	/* XXX should be configurable */
 		return ENOENT;
 	va->va_type = vp->v_type;		/* vnode type (for create) */
 	if (vp->v_type == VREG) {
 		va->va_mode = smp->sm_file_mode; /* files access mode and type */
 		if (np->n_dosattr & SMB_FA_RDONLY)
 			va->va_mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH);
 	} else if (vp->v_type == VDIR) {
 		va->va_mode = smp->sm_dir_mode;	/* files access mode and type */
 	} else
 		return EINVAL;
 	va->va_size = np->n_size;
 	va->va_nlink = 1;		/* number of references to file */
 	va->va_uid = smp->sm_uid;	/* owner user id */
 	va->va_gid = smp->sm_gid;	/* owner group id */
 	va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	va->va_fileid = np->n_ino;	/* file id */
 	if (va->va_fileid == 0)
 		va->va_fileid = 2;
 	va->va_blocksize = SSTOVC(smp->sm_share)->vc_txmax;
 	va->va_mtime = np->n_mtime;
 	va->va_atime = va->va_ctime = va->va_mtime;	/* time file changed */
 	va->va_gen = VNOVAL;		/* generation number of file */
 	va->va_flags = 0;		/* flags defined for file */
 	va->va_rdev = VNOVAL;		/* device the special file represents */
 	va->va_bytes = va->va_size;	/* bytes of disk space held by file */
 	va->va_filerev = 0;		/* file modification number */
 	va->va_vaflags = 0;		/* operations flags */
 	return 0;
 }
Index: head/sys/fs/smbfs/smbfs_vnops.c
===================================================================
--- head/sys/fs/smbfs/smbfs_vnops.c	(revision 175201)
+++ head/sys/fs/smbfs/smbfs_vnops.c	(revision 175202)
@@ -1,1271 +1,1271 @@
 /*-
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/limits.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 /*
  * Prototypes for SMBFS vnode operations
  */
 static vop_create_t	smbfs_create;
 static vop_mknod_t	smbfs_mknod;
 static vop_open_t	smbfs_open;
 static vop_close_t	smbfs_close;
 static vop_access_t	smbfs_access;
 static vop_getattr_t	smbfs_getattr;
 static vop_setattr_t	smbfs_setattr;
 static vop_read_t	smbfs_read;
 static vop_write_t	smbfs_write;
 static vop_fsync_t	smbfs_fsync;
 static vop_remove_t	smbfs_remove;
 static vop_link_t	smbfs_link;
 static vop_lookup_t	smbfs_lookup;
 static vop_rename_t	smbfs_rename;
 static vop_mkdir_t	smbfs_mkdir;
 static vop_rmdir_t	smbfs_rmdir;
 static vop_symlink_t	smbfs_symlink;
 static vop_readdir_t	smbfs_readdir;
 static vop_strategy_t	smbfs_strategy;
 static vop_print_t	smbfs_print;
 static vop_pathconf_t	smbfs_pathconf;
 static vop_advlock_t	smbfs_advlock;
 static vop_getextattr_t	smbfs_getextattr;
 
 /*
  * Vnode operation vector for smbfs vnodes.  Each entry binds a VOP to
  * its smbfs handler.  .vop_default names default_vnodeops, which is
  * presumably consulted for operations not listed here (confirm against
  * the vop_vector definition).  The vop_setextattr entry is commented
  * out, so no smbfs handler is installed for it.
  */
 struct vop_vector smbfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		smbfs_access,
 	.vop_advlock =		smbfs_advlock,
 	.vop_close =		smbfs_close,
 	.vop_create =		smbfs_create,
 	.vop_fsync =		smbfs_fsync,
 	.vop_getattr =		smbfs_getattr,
 	.vop_getextattr = 	smbfs_getextattr,
 	.vop_getpages =		smbfs_getpages,
 	.vop_inactive =		smbfs_inactive,
 	.vop_ioctl =		smbfs_ioctl,
 	.vop_link =		smbfs_link,
 	.vop_lookup =		smbfs_lookup,
 	.vop_mkdir =		smbfs_mkdir,
 	.vop_mknod =		smbfs_mknod,
 	.vop_open =		smbfs_open,
 	.vop_pathconf =		smbfs_pathconf,
 	.vop_print =		smbfs_print,
 	.vop_putpages =		smbfs_putpages,
 	.vop_read =		smbfs_read,
 	.vop_readdir =		smbfs_readdir,
 	.vop_reclaim =		smbfs_reclaim,
 	.vop_remove =		smbfs_remove,
 	.vop_rename =		smbfs_rename,
 	.vop_rmdir =		smbfs_rmdir,
 	.vop_setattr =		smbfs_setattr,
 /*	.vop_setextattr =	smbfs_setextattr,*/
 	.vop_strategy =		smbfs_strategy,
 	.vop_symlink =		smbfs_symlink,
 	.vop_write =		smbfs_write,
 };
 
 /*
  * Check whether the credentials in ap->a_cred may access vp in the
  * requested mode.  Write access to files, directories and symlinks on a
  * read-only mount yields EROFS; everything else is decided by vaccess()
  * using the mount-wide file or directory mode, uid and gid.
  */
 static int
 smbfs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbmount *smp = VTOSMBFS(vp);
 	mode_t mode = ap->a_mode;
 	mode_t mpmode;
 
 	SMBVDEBUG("\n");
 	if ((mode & VWRITE) != 0 &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0 &&
 	    (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VLNK))
 		return EROFS;
 
 	/* Permissions are synthesized from the mount options. */
 	if (vp->v_type == VREG)
 		mpmode = smp->sm_file_mode;
 	else
 		mpmode = smp->sm_dir_mode;
 	return (vaccess(vp->v_type, mpmode, smp->sm_uid,
 	    smp->sm_gid, ap->a_mode, ap->a_cred, NULL));
 }
 
 /*
  * Open a vnode.  Directories are only flagged NOPEN.  For regular files
  * the buffers are invalidated when the node was modified locally or when
  * the cached mtime differs from the one VOP_GETATTR reports.  Unless the
  * node is already open, the file is then opened on the server read-write
  * (read-only on a read-only mount), falling back to a read-only open
  * when that fails and the caller did not request FWRITE.
  *
  * Returns 0 on success, EACCES for other vnode types or a failed open
  * for writing, otherwise the error from the failing call.
  */
 /* ARGSUSED */
 static int
 smbfs_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred scred;
 	struct vattr vattr;
 	int mode = ap->a_mode;
 	int error, accmode;
 
 	SMBVDEBUG("%s,%d\n", np->n_name, (np->n_flag & NOPEN) != 0);
 	if (vp->v_type != VREG && vp->v_type != VDIR) { 
 		SMBFSERR("open eacces vtype=%d\n", vp->v_type);
 		return EACCES;
 	}
 	/* No server-side open is issued for directories here. */
 	if (vp->v_type == VDIR) {
 		np->n_flag |= NOPEN;
 		return 0;
 	}
 	if (np->n_flag & NMODIFIED) {
 		/* Only EINTR aborts; other invalidation errors are ignored. */
 		if ((error = smbfs_vinvalbuf(vp, ap->a_td)) == EINTR)
 			return error;
 		smbfs_attr_cacheremove(vp);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return error;
 		np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 	} else {
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return error;
 		/* mtime moved since we cached it: drop stale buffers. */
 		if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) {
 			error = smbfs_vinvalbuf(vp, ap->a_td);
 			if (error == EINTR)
 				return error;
 			np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 		}
 	}
 	/* Already open on the server: nothing more to do. */
 	if ((np->n_flag & NOPEN) != 0)
 		return 0;
 	/*
 	 * Use DENYNONE to give unixy semantics of permitting
 	 * everything not forbidden by permissions.  Ie denial
 	 * is up to server with clients/openers needing to use
 	 * advisory locks for further control.
 	 */
 	accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
 		accmode = SMB_SM_DENYNONE|SMB_AM_OPENRW;
 	smb_makescred(&scred, ap->a_td, ap->a_cred);
 	error = smbfs_smb_open(np, accmode, &scred);
 	if (error) {
 		if (mode & FWRITE)
 			return EACCES;
 		else if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			/* The read-write attempt failed; retry read-only. */
 			accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD;
 			error = smbfs_smb_open(np, accmode, &scred);
 		}
 	}
 	if (error == 0) {
 		np->n_flag |= NOPEN;
 		vnode_create_vobject(ap->a_vp, vattr.va_size, ap->a_td);
 	}
 	smbfs_attr_cacheremove(vp);
 	return error;
 }
 
 /*
  * Close a vnode.  The only work done here is for an open directory with
  * a pending search sequence, which is closed on the server and
  * forgotten.  Always returns 0.
  */
 static int
 smbfs_close(ap)
 	struct vop_close_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred scred;
 
 	if (vp->v_type != VDIR || (np->n_flag & NOPEN) == 0 ||
 	    np->n_dirseq == NULL)
 		return 0;
 
 	smb_makescred(&scred, ap->a_td, ap->a_cred);
 	smbfs_findclose(np->n_dirseq, &scred);
 	np->n_dirseq = NULL;
 	return 0;
 }
 
 /*
  * smbfs_getattr call from vfs.
  *
  * Answers from the attribute cache when it is fresh; otherwise queries
  * the server, refills the cache and answers from it.  While the node is
  * open the locally tracked size is restored after the refresh.
  * Returns 0 or the error from the server lookup.
  */
 static int
 smbfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *va = ap->a_vap;
 	struct smbnode *np = VTOSMB(vp);
 	struct smbfattr fattr;
 	struct smb_cred scred;
 	u_quad_t oldsize;
 	int error;
 
 	SMBVDEBUG("%lx: '%s' %d\n", (long)vp, np->n_name, (vp->v_vflag & VV_ROOT) != 0);
 	if (smbfs_attr_cachelookup(vp, va) == 0)
 		return 0;
 
 	SMBVDEBUG("not in the cache\n");
 	smb_makescred(&scred, ap->a_td, ap->a_cred);
 	oldsize = np->n_size;
 	error = smbfs_smb_lookup(np, NULL, 0, &fattr, &scred);
 	if (error != 0) {
 		SMBVDEBUG("error %d\n", error);
 		return error;
 	}
 	smbfs_attr_cacheenter(vp, &fattr);
 	smbfs_attr_cachelookup(vp, va);
 	/* Keep the size we track ourselves while the file is open. */
 	if ((np->n_flag & NOPEN) != 0)
 		np->n_size = oldsize;
 	return 0;
 }
 
 /*
  * Set attributes on a vnode.  Handled, in order: truncation/extension
  * via va_size (regular files only), the DOS read-only bit derived from
  * S_IWUSR in va_mode, and mtime/atime updates, which use handle-based
  * calls when the file is open and path-based calls otherwise.  The
  * attribute cache is invalidated and reloaded before returning.
  *
  * Returns EOPNOTSUPP when va_flags is set, EROFS for changes on a
  * read-only mount, EISDIR/EINVAL for a size change on a non-regular
  * vnode, otherwise the status of the last operation attempted.
  */
 static int
 smbfs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct vattr *vap = ap->a_vap;
 	struct timespec *mtime, *atime;
 	struct smb_cred scred;
 	struct smb_share *ssp = np->n_mount->sm_share;
 	struct smb_vc *vcp = SSTOVC(ssp);
 	u_quad_t tsize = 0;
 	int isreadonly, doclose, error = 0;
 	int old_n_dosattr;
 
 	SMBVDEBUG("\n");
 	if (vap->va_flags != VNOVAL)
 		return EOPNOTSUPP;
 	isreadonly = (vp->v_mount->mnt_flag & MNT_RDONLY);
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
   	if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || 
 	     vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
 	     vap->va_mode != (mode_t)VNOVAL) && isreadonly)
 		return EROFS;
 	smb_makescred(&scred, ap->a_td, ap->a_cred);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		    case VDIR:
  			return EISDIR;
  		    case VREG:
 			break;
  		    default:
 			return EINVAL;
   		};
 		if (isreadonly)
 			return EROFS;
 		doclose = 0;
 		/*
 		 * Apply the new size locally first; tsize keeps the old
 		 * value so it can be restored if the server call fails.
 		 */
 		vnode_pager_setsize(vp, (u_long)vap->va_size);
  		tsize = np->n_size;
  		np->n_size = vap->va_size;
 		/* Setting the size needs a handle: open temporarily. */
 		if ((np->n_flag & NOPEN) == 0) {
 			error = smbfs_smb_open(np,
 					       SMB_SM_DENYNONE|SMB_AM_OPENRW,
 					       &scred);
 			if (error == 0)
 				doclose = 1;
 		}
 		if (error == 0)
 			error = smbfs_smb_setfsize(np, vap->va_size, &scred);
 		if (doclose)
 			smbfs_smb_close(ssp, np->n_fid, NULL, &scred);
 		if (error) {
 			np->n_size = tsize;
 			vnode_pager_setsize(vp, (u_long)tsize);
 			return error;
 		}
   	}
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only the owner write bit is mapped, onto DOS read-only. */
 		old_n_dosattr = np->n_dosattr;
 		if (vap->va_mode & S_IWUSR)
 			np->n_dosattr &= ~SMB_FA_RDONLY;
 		else
 			np->n_dosattr |= SMB_FA_RDONLY;
 		if (np->n_dosattr != old_n_dosattr) {
 			error = smbfs_smb_setpattr(np, np->n_dosattr, NULL, &scred);
 			if (error)
 				return error;
 		}
 	}
 	mtime = atime = NULL;
 	if (vap->va_mtime.tv_sec != VNOVAL)
 		mtime = &vap->va_mtime;
 	if (vap->va_atime.tv_sec != VNOVAL)
 		atime = &vap->va_atime;
 	/* Both are NULL (hence equal) when no time change was requested. */
 	if (mtime != atime) {
 		/*
 		 * NOTE(review): the result of these access checks is
 		 * overwritten by most branches below without being
 		 * tested -- confirm whether an early return was intended.
 		 */
 		if (vap->va_vaflags & VA_UTIMES_NULL) {
 			error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td);
 			if (error)
 				error = VOP_ACCESS(vp, VWRITE, ap->a_cred,
 				    ap->a_td);
 		} else
 			error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td);
 #if 0
 		if (mtime == NULL)
 			mtime = &np->n_mtime;
 		if (atime == NULL)
 			atime = &np->n_atime;
 #endif
 		/*
 		 * If file is opened, then we can use handle based calls.
 		 * If not, use path based ones.
 		 */
 		if ((np->n_flag & NOPEN) == 0) {
 			if (vcp->vc_flags & SMBV_WIN95) {
 				/*
 				 * For SMBV_WIN95 servers no time is sent;
 				 * the file is opened and closed and only
 				 * the cached mtime is updated.
 				 */
 				error = VOP_OPEN(vp, FWRITE, ap->a_cred, ap->a_td, NULL);
 				if (!error) {
 /*				error = smbfs_smb_setfattrNT(np, 0, mtime, atime, &scred);
 				VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);*/
 				if (mtime)
 					np->n_mtime = *mtime;
 				VOP_CLOSE(vp, FWRITE, ap->a_cred, ap->a_td);
 				}
 			} else if ((vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS)) {
 				error = smbfs_smb_setptime2(np, mtime, atime, 0, &scred);
 /*				error = smbfs_smb_setpattrNT(np, 0, mtime, atime, &scred);*/
 			} else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN2_0) {
 				error = smbfs_smb_setptime2(np, mtime, atime, 0, &scred);
 			} else {
 				error = smbfs_smb_setpattr(np, 0, mtime, &scred);
 			}
 		} else {
 			if (vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS) {
 				error = smbfs_smb_setfattrNT(np, 0, mtime, atime, &scred);
 			} else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN1_0) {
 				error = smbfs_smb_setftime(np, mtime, atime, &scred);
 			} else {
 				/*
 				 * I have no idea how to handle this for core
 				 * level servers. The possible solution is to
 				 * update mtime after file is closed.
 				 */
 				 SMBERROR("can't update times on an opened file\n");
 			}
 		}
 	}
 	/*
 	 * Invalidate attribute cache in case if server doesn't set
 	 * required attributes.
 	 */
 	smbfs_attr_cacheremove(vp);	/* invalidate cache */
 	/* Reload into the caller's vattr; the result is not checked. */
 	VOP_GETATTR(vp, vap, ap->a_cred, ap->a_td);
 	np->n_mtime.tv_sec = vap->va_mtime.tv_sec;
 	return error;
 }
 /*
  * smbfs_read call.
  *
  * Regular files and directories are read through smbfs_readvnode();
  * any other vnode type is refused with EPERM.
  */
 static int
 smbfs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	SMBVDEBUG("\n");
 	switch (vp->v_type) {
 	case VREG:
 	case VDIR:
 		return smbfs_readvnode(vp, ap->a_uio, ap->a_cred);
 	default:
 		return EPERM;
 	}
 }
 
 /*
  * smbfs_write call.
  *
  * Only regular files may be written; the request is passed on to
  * smbfs_writevnode() together with the caller's I/O flags.  Other
  * vnode types yield EPERM.
  */
 static int
 smbfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	SMBVDEBUG("%d,ofs=%d,sz=%d\n",vp->v_type, (int)uio->uio_offset, uio->uio_resid);
 	if (vp->v_type == VREG)
 		return smbfs_writevnode(vp, uio, ap->a_cred,ap->a_ioflag);
 	return (EPERM);
 }
 /*
  * smbfs_create call
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 smbfs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp=ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct vnode *vp;
 	struct vattr vattr;
 	struct smbfattr fattr;
 	struct smb_cred scred;
 	char *name = cnp->cn_nameptr;
 	int nmlen = cnp->cn_namelen;
 	int error;
 	
 
 	SMBVDEBUG("\n");
 	*vpp = NULL;
 	if (vap->va_type != VREG)
 		return EOPNOTSUPP;
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)))
 		return error;
 	smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred);
 	
 	error = smbfs_smb_create(dnp, name, nmlen, &scred);
 	if (error)
 		return error;
 	error = smbfs_smb_lookup(dnp, name, nmlen, &fattr, &scred);
 	if (error)
 		return error;
 	error = smbfs_nget(VTOVFS(dvp), dvp, name, nmlen, &fattr, &vp);
 	if (error)
 		return error;
 	*vpp = vp;
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, vp, cnp);
 	return error;
 }
 
 /*
  * Remove a file.  Directories, files that are still open and vnodes
  * with more than one reference are refused with EPERM.  Otherwise the
  * file is deleted on the server, the node is marked NGONE on success,
  * and the name cache entries for the vnode are purged either way.
  */
 static int
 smbfs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_dvp;
 		struct vnode * a_vp;
 		struct componentname * a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred scred;
 	int error;
 
 	if (vp->v_type == VDIR)
 		return EPERM;
 	if ((np->n_flag & NOPEN) != 0)
 		return EPERM;
 	if (vrefcnt(vp) != 1)
 		return EPERM;
 
 	smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_delete(np, &scred);
 	if (!error)
 		np->n_flag |= NGONE;
 	cache_purge(vp);
 	return error;
 }
 
 /*
  * smbfs_file rename call
  *
  * Renames fvp to the name in tcnp inside tdvp.  An existing target
  * (tvp) is deleted on the server first, then the rename is issued.
  * Cross-mount renames fail with EXDEV, a target with extra references
  * with EBUSY, and sources that are neither files nor directories with
  * EINVAL.  On every path the references/locks on tdvp, tvp, fdvp and
  * fvp are released before returning.
  */
 static int
 smbfs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 /*	struct componentname *fcnp = ap->a_fcnp;*/
 	struct smb_cred scred;
 	/* flags is only consumed by the "notnow" SMB_COM_MOVE path below. */
 	u_int16_t flags = 6;
 	int error=0;
 
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	/* Refuse to replace a target somebody else still references. */
 	if (tvp && vrefcnt(tvp) > 1) {
 		error = EBUSY;
 		goto out;
 	}
 	flags = 0x10;			/* verify all writes */
 	if (fvp->v_type == VDIR) {
 		flags |= 2;
 	} else if (fvp->v_type == VREG) {
 		flags |= 1;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 	smb_makescred(&scred, tcnp->cn_thread, tcnp->cn_cred);
 	/*
 	 * It seems that Samba doesn't implement SMB_COM_MOVE call...
 	 */
 #ifdef notnow
 	if (SMB_DIALECT(SSTOCN(smp->sm_share)) >= SMB_DIALECT_LANMAN1_0) {
 		error = smbfs_smb_move(VTOSMB(fvp), VTOSMB(tdvp),
 		    tcnp->cn_nameptr, tcnp->cn_namelen, flags, &scred);
 	} else
 #endif
 	{
 		/*
 		 * We have to do the work atomically
 		 */
 		if (tvp && tvp != fvp) {
 			error = smbfs_smb_delete(VTOSMB(tvp), &scred);
 			if (error)
 				goto out_cacherem;
 			/*
 			 * NOTE(review): the deleted object is tvp, yet
 			 * NGONE is set on fvp -- confirm this is intended.
 			 */
 			VTOSMB(fvp)->n_flag |= NGONE;
 		}
 		error = smbfs_smb_rename(VTOSMB(fvp), VTOSMB(tdvp),
 		    tcnp->cn_nameptr, tcnp->cn_namelen, &scred);
 	}
 
 	/* Directory renames purge the name cache of the directories involved. */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out_cacherem:
 	smbfs_attr_cacheremove(fdvp);
 	smbfs_attr_cacheremove(tdvp);
 out:
 	/* tdvp is only vrele'd when it is the same vnode as tvp, which is vput below. */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 #ifdef possible_mistake
 	vgone(fvp);
 	if (tvp)
 		vgone(tvp);
 #endif
 	return error;
 }
 
 /*
  * Hard links are not implemented; every request is rejected with
  * EOPNOTSUPP and the arguments are not examined.
  */
 static int
 smbfs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * smbfs_symlink link create call.
  * Symbolic links are not implemented; every request is rejected with
  * EOPNOTSUPP and the arguments are not examined.
  */
 static int
 smbfs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * Special files cannot be created; always returns EOPNOTSUPP without
  * looking at the arguments.
  */
 static int
 smbfs_mknod(ap) 
 	struct vop_mknod_args /* {
 	} */ *ap;
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 smbfs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 /*	struct vattr *vap = ap->a_vap;*/
 	struct vnode *vp;
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct vattr vattr;
 	struct smb_cred scred;
 	struct smbfattr fattr;
 	char *name = cnp->cn_nameptr;
 	int len = cnp->cn_namelen;
 	int error;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread))) {
 		return error;
 	}	
 	if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.'))))
 		return EEXIST;
 	smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_mkdir(dnp, name, len, &scred);
 	if (error)
 		return error;
 	error = smbfs_smb_lookup(dnp, name, len, &fattr, &scred);
 	if (error)
 		return error;
 	error = smbfs_nget(VTOVFS(dvp), dvp, name, len, &fattr, &vp);
 	if (error)
 		return error;
 	*ap->a_vpp = vp;
 	return 0;
 }
 
 /*
  * smbfs_remove directory call
  *
  * Removes directory ap->a_vp from ap->a_dvp on the server.  Removing a
  * directory from itself is rejected with EINVAL.  On success the node
  * is marked NGONE; in all cases the parent is flagged NMODIFIED, its
  * attribute cache is dropped and the name cache for vp is purged.
  */
 static int
 smbfs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred scred;
 	int error;
 
 	if (dvp == vp)
 		return EINVAL;
 
 	smb_makescred(&scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_rmdir(np, &scred);
 	if (!error)
 		np->n_flag |= NGONE;
 
 	/* The parent's contents may have changed even on failure. */
 	dnp->n_flag |= NMODIFIED;
 	smbfs_attr_cacheremove(dvp);
 	cache_purge(vp);
 	return error;
 }
 
 /*
  * smbfs_readdir call
  *
  * Reads directory entries through smbfs_readvnode().  Vnodes that are
  * not directories are refused with EPERM.  Cookie support is compiled
  * only when "notnow" is defined, and then merely rejects the request.
  */
 static int
 smbfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		u_long *a_cookies;
 		int a_ncookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type != VDIR)
 		return (EPERM);
 #ifdef notnow
 	if (ap->a_ncookies) {
 		printf("smbfs_readdir: no support for cookies now...");
 		return (EOPNOTSUPP);
 	}
 #endif
 	return smbfs_readvnode(vp, ap->a_uio, ap->a_cred);
 }
 
 /*
  * Nothing is flushed here: the call reports success without touching
  * its arguments.  A former smb_flush() call was left disabled:
  *	smb_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1)
  */
 /* ARGSUSED */
 static int
 smbfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_vp;
 		struct ucred * a_cred;
 		int  a_waitfor;
 		struct thread * a_td;
 	} */ *ap;
 {
 	return 0;
 }
 
 /*
  * Print the smbfs-private state of a vnode: its name, parent vnode
  * pointer and whether it is marked open.  A vnode without an smbnode
  * attached is reported as such.  Always returns 0.
  */
 static 
 int smbfs_print (ap) 
 	struct vop_print_args /* {
 	struct vnode *a_vp;
 	} */ *ap;
 {
 	struct smbnode *np = VTOSMB(ap->a_vp);
 
 	if (np != NULL) {
 		printf("\tname = %s, parent = %p, open = %d\n", np->n_name,
 		    np->n_parent, (np->n_flag & NOPEN) != 0);
 	} else {
 		printf("no smbnode data\n");
 	}
 	return (0);
 }
 
 /*
  * Report configurable pathname limits.  _PC_LINK_MAX is 0,
  * _PC_NAME_MAX is 255 when the connection has
  * SMB_FLAGS2_KNOWS_LONG_NAMES set and 12 otherwise, and _PC_PATH_MAX
  * is fixed at 800.  Any other name yields EINVAL.
  */
 static int
 smbfs_pathconf (ap)
 	struct vop_pathconf_args  /* {
 	struct vnode *vp;
 	int name;
 	register_t *retval;
 	} */ *ap;
 {
 	struct smbmount *smp = VFSTOSMBFS(VTOVFS(ap->a_vp));
 	struct smb_vc *vcp = SSTOVC(smp->sm_share);
 	register_t *retval = ap->a_retval;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*retval = 0;
 		return 0;
 	case _PC_NAME_MAX:
 		if (vcp->vc_hflags2 & SMB_FLAGS2_KNOWS_LONG_NAMES)
 			*retval = 255;
 		else
 			*retval = 12;
 		return 0;
 	case _PC_PATH_MAX:
 		*retval = 800;	/* XXX: a correct one ? */
 		return 0;
 	default:
 		return EINVAL;
 	}
 }
 
 static int
 smbfs_strategy (ap) 
 	struct vop_strategy_args /* {
 	struct buf *a_bp
 	} */ *ap;
 {
 	struct buf *bp=ap->a_bp;
 	struct ucred *cr;
 	struct thread *td;
 	int error = 0;
 
 	SMBVDEBUG("\n");
 	if (bp->b_flags & B_ASYNC)
 		td = (struct thread *)0;
 	else
 		td = curthread;	/* XXX */
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 
 	if ((bp->b_flags & B_ASYNC) == 0 )
 		error = smbfs_doio(ap->a_vp, bp, cr, td);
 	return error;
 }
 
 /*
  * No ioctls are supported: every request is answered with ENOTTY and
  * the arguments are not examined.
  */
 int
 smbfs_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	return (ENOTTY);
 }
 
 /* One letter per DOS attribute bit, lowest bit first. */
 static char smbfs_atl[] = "rhsvda";
 
 /*
  * Read an extended attribute.  Only the name "dosattr" is recognised:
  * it yields six characters, one per low attribute bit, showing the
  * letter from smbfs_atl when the bit is set and '-' otherwise.
  * The caller needs VREAD access; other names return EINVAL.
  */
 static int
 smbfs_getextattr(struct vop_getextattr_args *ap)
 /* {
         IN struct vnode *a_vp;
         IN char *a_name;
         INOUT struct uio *a_uio;
         IN struct ucred *a_cred;
         IN struct thread *a_td;
 };
 */
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	struct uio *uio = ap->a_uio;
 	const char *name = ap->a_name;
 	struct smbnode *np = VTOSMB(vp);
 	struct vattr vattr;
 	char buf[10];
 	int bit, attr, error;
 
 	error = VOP_ACCESS(vp, VREAD, cred, td);
 	if (error)
 		return error;
 	/* Called to refresh the node's attributes; vattr itself is unused. */
 	error = VOP_GETATTR(vp, &vattr, cred, td);
 	if (error)
 		return error;
 	if (strcmp(name, "dosattr") != 0)
 		return EINVAL;
 
 	attr = np->n_dosattr;
 	for (bit = 0; bit < 6; bit++) {
 		if (attr & (1 << bit))
 			buf[bit] = smbfs_atl[bit];
 		else
 			buf[bit] = '-';
 	}
 	buf[6] = 0;
 	/* The terminating NUL is not copied out. */
 	return uiomove(buf, 6, uio);
 }
 
 /*
  * Since we expected to support F_GETLK (and SMB protocol has no such function),
  * it is necessary to use lf_advlock(). It would be nice if this function had
  * a callback mechanism because it will help to improve a level of consistency.
  *
  * The flock range is first converted to absolute [start, end] offsets
  * (end == -1 when l_len is 0).  F_SETLK records the lock locally with
  * lf_advlock() and then on the server, undoing the local lock if the
  * server refuses; F_UNLCK releases both; F_GETLK is answered locally.
  * Directories are rejected with EOPNOTSUPP.
  */
 int
 smbfs_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct flock *fl = ap->a_fl;
 	/* A fixed id is sent to the server instead of ap->a_id. */
 	caddr_t id = (caddr_t)1 /* ap->a_id */;
 /*	int flags = ap->a_flags;*/
 	struct thread *td = curthread;
 	struct smb_cred scred;
 	u_quad_t size;
 	off_t start, end, oadd;
 	int error, lkop;
 
 	if (vp->v_type == VDIR) {
 		/*
 		 * SMB protocol have no support for directory locking.
 		 * Although locks can be processed on local machine, I don't
 		 * think that this is a good idea, because some programs
 		 * can work wrong assuming directory is locked. So, we just
 		 * return 'operation not supported
 		 */
 		 return EOPNOTSUPP;
 	}
 	size = np->n_size;
 	switch (fl->l_whence) {
 
 	/* SEEK_CUR is handled exactly like SEEK_SET here. */
 	case SEEK_SET:
 	case SEEK_CUR:
 		start = fl->l_start;
 		break;
 
 	case SEEK_END:
 		/* Guard size + l_start against off_t overflow. */
 		if (size > OFF_MAX ||
 		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
 			return EOVERFLOW;
 		start = size + fl->l_start;
 		break;
 
 	default:
 		return EINVAL;
 	}
 	if (start < 0)
 		return EINVAL;
 	if (fl->l_len < 0) {
 		/* Negative length: the range ends just before start. */
 		if (start == 0)
 			return EINVAL;
 		end = start - 1;
 		start += fl->l_len;
 		if (start < 0)
 			return EINVAL;
 	} else if (fl->l_len == 0)
 		end = -1;
 	else {
 		oadd = fl->l_len - 1;
 		if (oadd > OFF_MAX - start)
 			return EOVERFLOW;
 		end = start + oadd;
 	}
 	smb_makescred(&scred, td, td->td_ucred);
 	switch (ap->a_op) {
 	    case F_SETLK:
 		switch (fl->l_type) {
 		    case F_WRLCK:
 			lkop = SMB_LOCK_EXCL;
 			break;
 		    case F_RDLCK:
 			lkop = SMB_LOCK_SHARED;
 			break;
 		    case F_UNLCK:
 			lkop = SMB_LOCK_RELEASE;
 			break;
 		    default:
 			return EINVAL;
 		}
 		error = lf_advlock(ap, &np->n_lockf, size);
 		if (error)
 			break;
 		/*
 		 * NOTE(review): this overrides the lkop chosen from
 		 * l_type above, so the server always sees an exclusive
 		 * lock request -- confirm this is intended.
 		 */
 		lkop = SMB_LOCK_EXCL;
 		error = smbfs_smb_lock(np, lkop, id, start, end, &scred);
 		if (error) {
 			/* Server refused: roll back the local lock. */
 			ap->a_op = F_UNLCK;
 			lf_advlock(ap, &np->n_lockf, size);
 		}
 		break;
 	    case F_UNLCK:
 		lf_advlock(ap, &np->n_lockf, size);
 		error = smbfs_smb_lock(np, SMB_LOCK_RELEASE, id, start, end, &scred);
 		break;
 	    case F_GETLK:
 		error = lf_advlock(ap, &np->n_lockf, size);
 		break;
 	    default:
 		return EINVAL;
 	}
 	return error;
 }
 
 /*
  * Validate one path component before it is handed to the server.
  *
  * 'name' is only guaranteed to be 'nmlen' bytes long: it points into the
  * caller's path buffer and is not NUL-terminated at the end of the
  * component (smbfs_lookup() has to patch a terminator in for its debug
  * output).  Every scan below is therefore bounded by 'nmlen' so that
  * characters belonging to later components are never taken into account.
  *
  * Returns 0 when the name is acceptable, ENOENT when it contains a
  * forbidden character or breaks the 8.3 layout, and ENAMETOOLONG when it
  * is longer than 12 bytes on a dialect older than LANMAN2.0.
  */
 static int
 smbfs_pathcheck(struct smbmount *smp, const char *name, int nmlen, int nameiop)
 {
 	static const char *badchars = "*/:<>;?";
 	static const char *badchars83 = " +|,[]=";
 	const char *cp, *dot;
 	int i, ndots, error;
 
 	/*
 	 * Backslash characters, being a path delimiter, are prohibited
 	 * within a path component even for LOOKUP operations.
 	 */
 	for (cp = name, i = 0; i < nmlen; i++, cp++)
 		if (*cp == '\\')
 			return ENOENT;
 
 	/* Plain lookups are subject to no further restrictions. */
 	if (nameiop == LOOKUP)
 		return 0;
 	error = ENOENT;
 	if (SMB_DIALECT(SSTOVC(smp->sm_share)) < SMB_DIALECT_LANMAN2_0) {
 		/*
 		 * Name should conform to the 8.3 format.
 		 */
 		if (nmlen > 12)
 			return ENAMETOOLONG;
 
 		/* Locate the first dot and count all dots in the component. */
 		dot = NULL;
 		ndots = 0;
 		for (cp = name, i = 0; i < nmlen; i++, cp++) {
 			if (*cp != '.')
 				continue;
 			if (dot == NULL)
 				dot = cp;
 			ndots++;
 		}
 		/*
 		 * Exactly one dot is accepted, and the base part in front
 		 * of it must be 1 to 8 characters long.
 		 * NOTE(review): as before, a name without any dot is
 		 * rejected and the extension length is not checked --
 		 * confirm that this is the intended 8.3 policy.
 		 */
 		if (ndots != 1)
 			return error;
 		if (dot == name || (dot - name) > 8)
 			return error;
 		for (cp = name, i = 0; i < nmlen; i++, cp++)
 			if (index(badchars83, *cp) != NULL)
 				return error;
 	}
 	/* Characters that are refused for every dialect. */
 	for (cp = name, i = 0; i < nmlen; i++, cp++)
 		if (index(badchars, *cp) != NULL)
 			return error;
 	return 0;
 }
 
 /*
  * Things get even weirder without fixed inode numbers...
  *
  * Lookup entry point: resolves the component described by ap->a_cnp inside
  * the directory ap->a_dvp.  The name cache is consulted first; a cached
  * vnode is used only if VOP_GETATTR() succeeds and its type still agrees
  * with the directory attribute bit kept in the smbnode.  Otherwise the
  * server is queried through smbfs_smb_lookup() and a vnode is obtained
  * with smbfs_nget().
  *
  * On success the vnode is stored in *ap->a_vpp and 0 is returned.  When
  * the last component does not exist and the operation is CREATE or RENAME,
  * EJUSTRETURN is returned (after a VWRITE access check on the directory).
  * Other failures return an errno value.
  */
 int
 smbfs_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *vp;
 	struct smbmount *smp;
 	struct mount *mp = dvp->v_mount;
 	struct smbnode *dnp;
 	struct smbfattr fattr, *fap;
 	struct smb_cred scred;
 	char *name = cnp->cn_nameptr;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	int nmlen = cnp->cn_namelen;
 	int error, islastcn, isdot;
 	int killit;
 	
 	SMBVDEBUG("\n");
 	if (dvp->v_type != VDIR)
 		return ENOTDIR;
 	/* '..' cannot be resolved from the root vnode of the mount. */
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) {
 		SMBFSERR("invalid '..'\n");
 		return EIO;
 	}
 #ifdef SMB_VNODE_DEBUG
 	{
 		char *cp, c;
 
 		/*
 		 * The component is not NUL-terminated at nmlen, so a
 		 * terminator is patched in for printing and restored after.
 		 */
 		cp = name + nmlen;
 		c = *cp;
 		*cp = 0;
 		SMBVDEBUG("%d '%s' in '%s' id=d\n", nameiop, name, 
 			VTOSMB(dvp)->n_name);
 		*cp = c;
 	}
 #endif
 	islastcn = flags & ISLASTCN;
 	/* Modifying operations on the last component need a writable mount. */
 	if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP))
 		return EROFS;
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
 		return error;
 	smp = VFSTOSMBFS(mp);
 	dnp = VTOSMB(dvp);
 	isdot = (nmlen == 1 && name[0] == '.');
 
 	error = smbfs_pathcheck(smp, cnp->cn_nameptr, cnp->cn_namelen, nameiop);
 
 	/* Any pathcheck failure (including ENAMETOOLONG) is reported as ENOENT. */
 	if (error) 
 		return ENOENT;
 
 	/*
 	 * A positive cache_lookup() result is returned to the caller as is;
 	 * any other non-zero result is treated as a cache hit with *vpp set;
 	 * zero means the name has to be looked up on the server.
 	 */
 	error = cache_lookup(dvp, vpp, cnp);
 	SMBVDEBUG("cache_lookup returned %d\n", error);
 	if (error > 0)
 		return error;
 	if (error) {		/* name was found */
 		struct vattr vattr;
 
 		killit = 0;
 		vp = *vpp;
 		error = VOP_GETATTR(vp, &vattr, cnp->cn_cred, td);
 		/*
 		 * If the file type on the server is inconsistent
 		 * with what it was when we created the vnode,
 		 * kill the bogus vnode now and fall through to
 		 * the code below to create a new one with the
 		 * right type.
 		 */
 		if (error == 0 &&
 		   ((vp->v_type == VDIR &&
 		   (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) == 0) ||
 		   (vp->v_type == VREG &&
 		   (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) != 0)))
 		   killit = 1;
 		else if (error == 0
 	     /*    && vattr.va_ctime.tv_sec == VTOSMB(vp)->n_ctime*/) {
 		     if (nameiop != LOOKUP && islastcn)
 			     cnp->cn_flags |= SAVENAME;
 		     SMBVDEBUG("use cached vnode\n");
 		     return (0);
 		}
 		/* Cached vnode is unusable: drop its cache entries and our hold. */
 		cache_purge(vp);
 		/*
 		 * XXX This is not quite right, if '.' is
 		 * inconsistent, we really need to start the lookup
 		 * all over again.  Hopefully there is some other
 		 * guarantee that prevents this case from happening.
 		 */
 		if (killit && vp != dvp)
 			vgone(vp);
 		if (vp != dvp)
 			vput(vp);
 		else
 			vrele(vp);
 		*vpp = NULLVP;
 	}
 	/* 
 	 * entry is not in the cache or has been expired
 	 */
 	error = 0;
 	*vpp = NULLVP;
 	smb_makescred(&scred, td, cnp->cn_cred);
 	fap = &fattr;
 	if (flags & ISDOTDOT) {
 		/* '..': query the parent directory itself, with no name. */
 		error = smbfs_smb_lookup(VTOSMB(dnp->n_parent), NULL, 0, fap,
 		    &scred);
 		SMBVDEBUG("result of dotdot lookup: %d\n", error);
 	} else {
 		fap = &fattr;
 		error = smbfs_smb_lookup(dnp, name, nmlen, fap, &scred);
 /*		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')*/
 		SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error);
 	}
 	/* Only ENOENT is handled below; every other error is final. */
 	if (error && error != ENOENT)
 		return error;
 	if (error) {			/* entry not found */
 		/*
 		 * Handle RENAME or CREATE case...
 		 */
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error)
 				return error;
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return ENOENT;
 	}/* else {
 		SMBVDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum);
 	}*/
 	/*
 	 * handle DELETE case ...
 	 */
 	if (nameiop == DELETE && islastcn) { 	/* delete last component */
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return error;
 		if (isdot) {
 			/* Deleting '.': hand back the directory with an extra reference. */
 			VREF(dvp);
 			*vpp = dvp;
 			return 0;
 		}
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			return error;
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;
 		return 0;
 	}
 	/* Rename source lookup: '.' is refused with EISDIR. */
 	if (nameiop == RENAME && islastcn) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return error;
 		if (isdot)
 			return EISDIR;
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			return error;
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;
 		return 0;
 	}
 	if (flags & ISDOTDOT) {
 		/*
 		 * dvp is unlocked while the '..' vnode is obtained and is
 		 * locked again before the result of smbfs_nget() is checked.
 		 */
 		VOP_UNLOCK(dvp, 0, td);
 		error = smbfs_nget(mp, dvp, name, nmlen, NULL, &vp);
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error)
 			return error;
 		*vpp = vp;
 	} else if (isdot) {
 		vref(dvp);
 		*vpp = dvp;
 	} else {
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			return error;
 		*vpp = vp;
 		SMBVDEBUG("lookup: getnewvp!\n");
 	}
 	/* Remember the successful translation in the name cache if asked to. */
 	if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) {
 /*		VTOSMB(*vpp)->n_ctime = VTOSMB(*vpp)->n_vattr.va_ctime.tv_sec;*/
 		cache_enter(dvp, *vpp, cnp);
 	}
 	return 0;
 }
Index: head/sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_subr.c	(revision 175201)
+++ head/sys/fs/tmpfs/tmpfs_subr.c	(revision 175202)
@@ -1,1301 +1,1301 @@
 /*	$NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $	*/
 
 /*
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *        This product includes software developed by the NetBSD
  *        Foundation, Inc. and its contributors.
  * 4. Neither the name of The NetBSD Foundation nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Efficient memory file system supporting functions.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
 #include <fs/tmpfs/tmpfs.h>
 #include <fs/tmpfs/tmpfs_fifoops.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Allocates a new node of type 'type' inside the 'tmp' mount point, with
  * its owner set to 'uid', its group to 'gid' and its mode set to 'mode'.
  * The thread argument 'p' is part of the interface but is not referenced
  * by this function.
  *
  * If the node type is set to 'VDIR', then the parent parameter must point
  * to the parent directory of the node being created.  It may only be NULL
  * while allocating the root node, in which case the node becomes its own
  * parent.
  *
  * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
  * specifies the device the node represents.
  *
  * If the node type is set to 'VLNK', then the parameter target specifies
  * the file name of the target file for the symbolic link that is being
  * created.  The target is stored without a terminating NUL; its length is
  * kept in tn_size.
  *
  * The node is taken from the mount's node zone (tm_node_pool) and linked
  * onto the tm_nodes_used list.  No node is allocated once tm_nodes_inuse
  * has reached tm_nodes_max.
  *
  * NOTE(review): tn_links and tn_size are only incremented or conditionally
  * assigned here, so they presumably come out of the zone already zeroed --
  * confirm against the zone constructor.
  *
  * Returns zero on success (the node is handed back in *node) or ENOSPC
  * when the node limit of the mount has been reached.
  */
 int
 tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type,
     uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
     char *target, dev_t rdev, struct thread *p, struct tmpfs_node **node)
 {
 	struct tmpfs_node *nnode;
 
 	/* If the root directory of the 'tmp' file system is not yet
 	 * allocated, this must be the request to do it. */
 	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
 
 	MPASS(IFF(type == VLNK, target != NULL));
 	MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));
 
 	/* Refuse the request as soon as the limit is reached; comparing
 	 * with '>' would let one node more than tm_nodes_max through. */
 	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
 		return (ENOSPC);
 
 	nnode = (struct tmpfs_node *)uma_zalloc_arg(
 				tmp->tm_node_pool, tmp, M_WAITOK);
 
 	/* Generic initialization. */
 	nnode->tn_type = type;
 	vfs_timestamp(&nnode->tn_atime);
 	nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime =
 	    nnode->tn_atime;
 	nnode->tn_uid = uid;
 	nnode->tn_gid = gid;
 	nnode->tn_mode = mode;
 	nnode->tn_id = alloc_unr(tmp->tm_ino_unr);
 
 	/* Type-specific initialization. */
 	switch (nnode->tn_type) {
 	case VBLK:
 	case VCHR:
 		nnode->tn_rdev = rdev;
 		break;
 
 	case VDIR:
 		TAILQ_INIT(&nnode->tn_dir.tn_dirhead);
 		MPASS(parent != nnode);
 		MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL));
 		/* The root directory is its own parent. */
 		nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent;
 		nnode->tn_dir.tn_readdir_lastn = 0;
 		nnode->tn_dir.tn_readdir_lastp = NULL;
 		/* One link for the directory itself, one on its parent. */
 		nnode->tn_links++;
 		nnode->tn_dir.tn_parent->tn_links++;
 		break;
 
 	case VFIFO:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 
 	case VLNK:
 		MPASS(strlen(target) < MAXPATHLEN);
 		nnode->tn_size = strlen(target);
 		nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME,
 		    M_WAITOK);
 		memcpy(nnode->tn_link, target, nnode->tn_size);
 		break;
 
 	case VREG:
 		nnode->tn_reg.tn_aobj =
 		    vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0);
 		nnode->tn_reg.tn_aobj_pages = 0;
 		break;
 
 	default:
 		panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type);
 	}
 
 	/* Publish the node on the mount's in-use list. */
 	TMPFS_LOCK(tmp);
 	LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries);
 	tmp->tm_nodes_inuse++;
 	TMPFS_UNLOCK(tmp);
 
 	*node = nnode;
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Releases 'node', which must belong to the mount point 'tmp'.
  *
  * The node is unlinked from the mount's list of nodes in use, its
  * type-specific storage is released (the link target of a symbolic link,
  * the backing VM object of a regular file), its inode number is returned
  * to tm_ino_unr and the node itself is handed back to tm_node_pool.  The
  * pages a regular file accounted for are subtracted from tm_pages_used.
  *
  * The node must no longer have a vnode attached.  Directory entries are
  * not touched here; there is no recursive removal.
  */
 void
 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
 {
 	size_t freed_pages;
 
 #ifdef INVARIANTS
 	TMPFS_NODE_LOCK(node);
 	MPASS(node->tn_vnode == NULL);
 	TMPFS_NODE_UNLOCK(node);
 #endif
 
 	/* Take the node off the in-use list first. */
 	TMPFS_LOCK(tmp);
 	LIST_REMOVE(node, tn_entries);
 	tmp->tm_nodes_inuse--;
 	TMPFS_UNLOCK(tmp);
 
 	/* Release whatever the node type owns. */
 	freed_pages = 0;
 	switch (node->tn_type) {
 	case VREG:
 		if (node->tn_reg.tn_aobj != NULL)
 			vm_object_deallocate(node->tn_reg.tn_aobj);
 		freed_pages = node->tn_reg.tn_aobj_pages;
 		break;
 
 	case VLNK:
 		free(node->tn_link, M_TMPFSNAME);
 		break;
 
 	case VNON:	/* lets the allocation path clean up after itself */
 	case VBLK:
 	case VCHR:
 	case VDIR:
 	case VFIFO:
 	case VSOCK:
 		/* Nothing type-specific to release. */
 		break;
 
 	default:
 		panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
 	}
 
 	free_unr(tmp->tm_ino_unr, node->tn_id);
 	uma_zfree(tmp->tm_node_pool, node);
 
 	/* Give the pages of a regular file back to the mount's budget. */
 	TMPFS_LOCK(tmp);
 	tmp->tm_pages_used -= freed_pages;
 	TMPFS_UNLOCK(tmp);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Creates a directory entry called 'name' ('len' bytes, stored without a
  * terminating NUL) that refers to 'node' and hands it back in *de.
  *
  * The link count of 'node' is incremented to account for the new
  * reference.  The entry is not inserted into any directory here.
  *
  * Always returns zero; both allocations are made with M_WAITOK.
  */
 int
 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
     const char *name, uint16_t len, struct tmpfs_dirent **de)
 {
 	struct tmpfs_dirent *entry;
 
 	entry = uma_zalloc(tmp->tm_dirent_pool, M_WAITOK);
 
 	/* Private copy of the component name. */
 	entry->td_namelen = len;
 	entry->td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
 	memcpy(entry->td_name, name, len);
 
 	/* Tie the entry to its node and account for the reference. */
 	entry->td_node = node;
 	node->tn_links++;
 
 	*de = entry;
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Releases the directory entry 'de' together with its name buffer.  The
  * node the entry points to is never destroyed here; that remains the
  * caller's job.
  *
  * When 'node_exists' is true the link count of the referenced node is
  * decremented.  When it is false the node is not dereferenced at all,
  * because it may already have been released by the caller.
  */
 void
 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de,
     boolean_t node_exists)
 {
 	struct tmpfs_node *target;
 
 	if (node_exists) {
 		target = de->td_node;
 		MPASS(target->tn_links > 0);
 		target->tn_links--;
 	}
 
 	/* Name first, then the entry that owns it. */
 	free(de->td_name, M_TMPFSNAME);
 	uma_zfree(tmp->tm_dirent_pool, de);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Allocates a new vnode for the node 'node' or returns a new reference to
  * an existing one if the node already had a vnode referencing it.  The
  * resulting vnode, locked according to 'lkflag', is returned in *vpp.
  *
  * Only one thread at a time creates the vnode for a given node: the
  * TMPFS_VNODE_ALLOCATING bit in tn_vpstate marks a creation in progress
  * and TMPFS_VNODE_WANT records that somebody is sleeping on it.
  *
  * Returns zero on success or an error code on failure.  On the failure
  * paths that reach the end of the function *vpp is set to NULL; when the
  * interruptible sleep fails the function returns straight away and *vpp
  * is left untouched.
  */
 int
 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
     struct vnode **vpp, struct thread *td)
 {
 	int error = 0;
 	struct vnode *vp;
 
 loop:
 	TMPFS_NODE_LOCK(node);
 	if ((vp = node->tn_vnode) != NULL) {
 		/*
 		 * A vnode is already attached.  Take its interlock before
 		 * dropping the node lock, and hold it across vget().
 		 */
 		VI_LOCK(vp);
 		TMPFS_NODE_UNLOCK(node);
 		vholdl(vp);
 		(void) vget(vp, lkflag | LK_INTERLOCK | LK_RETRY, td);
 		vdrop(vp);
 
 		/*
 		 * Make sure the vnode is still there after
 		 * getting the interlock to avoid racing a free.
 		 */
 		if (node->tn_vnode == NULL || node->tn_vnode != vp) {
 			vput(vp);
 			goto loop;
 		}
 
 		goto out;
 	}
 
 	/*
 	 * otherwise lock the vp list while we call getnewvnode
 	 * since that can block.
 	 */
 	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
 		/*
 		 * Another thread is creating the vnode: wait for it and
 		 * retry.  NOTE(review): PDROP presumably means the node
 		 * mutex is not held when msleep() returns, which is why
 		 * neither path below unlocks it -- confirm.
 		 */
 		node->tn_vpstate |= TMPFS_VNODE_WANT;
 		error = msleep((caddr_t) &node->tn_vpstate,
 		    TMPFS_NODE_MTX(node), PDROP | PCATCH,
 		    "tmpfs_alloc_vp", 0);
 		if (error)
 			return error;
 
 		goto loop;
 	} else
 		node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;
 	
 	TMPFS_NODE_UNLOCK(node);
 
 	/* Get a new vnode and associate it with our node. */
 	error = getnewvnode("tmpfs", mp, &tmpfs_vnodeop_entries, &vp);
 	if (error != 0)
 		goto unlock;
 	MPASS(vp != NULL);
 
-	(void) vn_lock(vp, lkflag | LK_RETRY, td);
+	(void) vn_lock(vp, lkflag | LK_RETRY);
 
 	vp->v_data = node;
 	vp->v_type = node->tn_type;
 
 	/* Type-specific initialization. */
 	switch (node->tn_type) {
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VLNK:
 		/* FALLTHROUGH */
 	case VREG:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 	case VFIFO:
 		/* FIFOs use their own vnode operations vector. */
 		vp->v_op = &tmpfs_fifoop_entries;
 		break;
 	case VDIR:
 		/* A directory that is its own parent is the root. */
 		if (node->tn_dir.tn_parent == node)
 			vp->v_vflag |= VV_ROOT;
 		break;
 
 	default:
 		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
 	}
 
 	vnode_pager_setsize(vp, node->tn_size);
 	error = insmntque(vp, mp);
 	if (error) {
 		/* Could not attach to the mount: dispose of the new vnode. */
 		vgone(vp);
 		vput(vp);
 		vp = NULL;
 	}
 
 unlock:
 	TMPFS_NODE_LOCK(node);
 
 	/*
 	 * Creation is over, successfully or not: record the result (vp is
 	 * NULL on failure) and wake up anybody waiting for it.
 	 */
 	MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
 	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
 	node->tn_vnode = vp;
 
 	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
 		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
 		TMPFS_NODE_UNLOCK(node);
 		wakeup((caddr_t) &node->tn_vpstate);
 	} else
 		TMPFS_NODE_UNLOCK(node);
 
 out:
 	*vpp = vp;
 
 	MPASS(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp, td)));
 #ifdef INVARIANTS
 	TMPFS_NODE_LOCK(node);
 	MPASS(*vpp == node->tn_vnode);
 	TMPFS_NODE_UNLOCK(node);
 #endif
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Breaks the tie between the vnode 'vp' and its tmpfs node: under the
  * node lock, the node forgets the vnode and the vnode forgets the node.
  */
 void
 tmpfs_free_vp(struct vnode *vp)
 {
 	struct tmpfs_node *owner = VP_TO_TMPFS_NODE(vp);
 
 	TMPFS_NODE_LOCK(owner);
 	owner->tn_vnode = NULL;
 	vp->v_data = NULL;
 	TMPFS_NODE_UNLOCK(owner);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Creates a new file of the type given in 'vap' and enters it into the
  * directory 'dvp' under the component name found in 'cnp'.
  *
  * The owner is taken from the credentials in 'cnp', the group from the
  * parent directory and the mode (and device number) from 'vap'.  'target'
  * is passed on to tmpfs_alloc_node() as the symbolic link target.
  *
  * On success *vpp holds the vnode of the new file and zero is returned.
  * Otherwise *vpp is NULL and an error code is returned; everything that
  * was allocated up to the point of failure is released again.
  */
 int
 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
     struct componentname *cnp, char *target)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode, *node, *parent;
 	struct tmpfs_dirent *de;
 	int error;
 
 	MPASS(VOP_ISLOCKED(dvp, cnp->cn_thread));
 	MPASS(cnp->cn_flags & HASBUF);
 
 	tmp = VFS_TO_TMPFS(dvp->v_mount);
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	*vpp = NULL;
 
 	/* A new directory adds a link to its parent, so the parent must
 	 * still have room below the system-wide link limit. */
 	parent = NULL;
 	if (vap->va_type == VDIR) {
 		MPASS(dnode->tn_links <= LINK_MAX);
 		if (dnode->tn_links == LINK_MAX)
 			return EMLINK;
 
 		parent = dnode;
 		MPASS(parent != NULL);
 	}
 
 	/* Step 1: the node that represents the new file. */
 	error = tmpfs_alloc_node(tmp, vap->va_type, cnp->cn_cred->cr_uid,
 	    dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev,
 	    cnp->cn_thread, &node);
 	if (error != 0)
 		return error;
 
 	/* Step 2: the directory entry naming it. */
 	error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen,
 	    &de);
 	if (error != 0) {
 		tmpfs_free_node(tmp, node);
 		return error;
 	}
 
 	/* Step 3: the vnode handed back to the caller. */
 	error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp,
 	    cnp->cn_thread);
 	if (error != 0) {
 		tmpfs_free_dirent(tmp, de, TRUE);
 		tmpfs_free_node(tmp, node);
 		return error;
 	}
 
 	/* Everything is in place; attaching the entry cannot fail. */
 	tmpfs_dir_attach(dvp, de);
 
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Appends the directory entry 'de' to the directory behind 'vp', which
  * must be exclusively locked.  The directory grows by the size of one
  * tmpfs_dirent and is marked accessed, changed and modified.  The link
  * count of the node the entry points to is not touched here;
  * tmpfs_alloc_dirent() has already accounted for it.
  */
 void
 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *dir;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	dir = VP_TO_TMPFS_DIR(vp);
 
 	TAILQ_INSERT_TAIL(&dir->tn_dir.tn_dirhead, de, td_entries);
 	dir->tn_size += sizeof(*de);
 
 	dir->tn_status |= TMPFS_NODE_ACCESSED;
 	dir->tn_status |= TMPFS_NODE_CHANGED;
 	dir->tn_status |= TMPFS_NODE_MODIFIED;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Removes the directory entry 'de' from the directory behind 'vp', which
  * must be exclusively locked.  If the readdir cache of the directory
  * points at 'de' it is reset first.  The directory shrinks by the size
  * of one tmpfs_dirent and is marked accessed, changed and modified.  The
  * link count of the node the entry points to is not touched here; that
  * is the business of tmpfs_free_dirent().
  */
 void
 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *dir;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	dir = VP_TO_TMPFS_DIR(vp);
 
 	/* Do not leave a cached readdir position dangling. */
 	if (de == dir->tn_dir.tn_readdir_lastp) {
 		dir->tn_dir.tn_readdir_lastp = NULL;
 		dir->tn_dir.tn_readdir_lastn = 0;
 	}
 
 	TAILQ_REMOVE(&dir->tn_dir.tn_dirhead, de, td_entries);
 	dir->tn_size -= sizeof(*de);
 
 	dir->tn_status |= TMPFS_NODE_ACCESSED;
 	dir->tn_status |= TMPFS_NODE_CHANGED;
 	dir->tn_status |= TMPFS_NODE_MODIFIED;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Searches the directory 'node' for an entry whose name equals the
  * component described by 'cnp' (same length, same bytes).  The names '.'
  * and '..' must not be passed in: they are not stored as entries.
  *
  * The directory is marked accessed whether or not a match is found.
  *
  * Returns the matching entry, or NULL if there is none.
  */
 struct tmpfs_dirent *
 tmpfs_dir_lookup(struct tmpfs_node *node, struct componentname *cnp)
 {
 	struct tmpfs_dirent *de;
 
 	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
 	MPASS(IMPLIES(cnp->cn_namelen == 2, cnp->cn_nameptr[0] != '.' ||
 	    cnp->cn_nameptr[1] != '.'));
 	TMPFS_VALIDATE_DIR(node);
 
 	/* The iterator leaves 'de' NULL when the list is exhausted, so
 	 * the loop variable doubles as the result. */
 	TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) {
 		MPASS(cnp->cn_namelen < 0xffff);
 		if (de->td_namelen != (uint16_t)cnp->cn_namelen)
 			continue;
 		if (memcmp(de->td_name, cnp->cn_nameptr, de->td_namelen) == 0)
 			break;
 	}
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	return de;
 }
 
 struct tmpfs_dirent *
 tmpfs_dir_search(struct tmpfs_node *node, struct tmpfs_node *f)
 {
 	struct tmpfs_dirent *de;
 
 	TMPFS_VALIDATE_DIR(node);
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 	TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) {
 		if (de->td_node == f)
 			return (de);
 	}
 	return (NULL);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '.' entry for the given
  * directory and returns it in the uio space.  uio->uio_offset must be
  * TMPFS_DIRCOOKIE_DOT on entry and is advanced to TMPFS_DIRCOOKIE_DOTDOT
  * once the entry has been copied out.
  *
  * The function returns 0 on success, -1 if there was not enough space in
  * the uio structure to hold the directory entry, or the error code
  * reported by uiomove().  The directory is marked accessed in all cases.
  */
 int
 tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio)
 {
 	int error;
 	struct dirent dent;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);
 
 	/*
 	 * Start from an all-zero record.  d_reclen bytes of this on-stack
 	 * structure are copied out below; GENERIC_DIRSIZ() rounds the
 	 * record length up, so without this the bytes behind the name
 	 * terminator would carry whatever was on the stack.
 	 */
 	bzero(&dent, sizeof(dent));
 	dent.d_fileno = node->tn_id;
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 1;
 	dent.d_name[0] = '.';
 	dent.d_name[1] = '\0';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = -1;
 	else {
 		error = uiomove(&dent, dent.d_reclen, uio);
 		if (error == 0)
 			uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
 	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '..' entry for the given
  * directory, using the id of its parent, and returns it in the uio space.
  * uio->uio_offset must be TMPFS_DIRCOOKIE_DOTDOT on entry; once the entry
  * has been copied out it is advanced to the cookie of the first real
  * entry, or to TMPFS_DIRCOOKIE_EOF if the directory is empty.
  *
  * The function returns 0 on success, -1 if there was not enough space in
  * the uio structure to hold the directory entry, or the error code
  * reported by uiomove().  The directory is marked accessed in all cases.
  */
 int
 tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio)
 {
 	int error;
 	struct dirent dent;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);
 
 	/*
 	 * Start from an all-zero record.  d_reclen bytes of this on-stack
 	 * structure are copied out below; GENERIC_DIRSIZ() rounds the
 	 * record length up, so without this the bytes behind the name
 	 * terminator would carry whatever was on the stack.
 	 */
 	bzero(&dent, sizeof(dent));
 	dent.d_fileno = node->tn_dir.tn_parent->tn_id;
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 2;
 	dent.d_name[0] = '.';
 	dent.d_name[1] = '.';
 	dent.d_name[2] = '\0';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = -1;
 	else {
 		error = uiomove(&dent, dent.d_reclen, uio);
 		if (error == 0) {
 			struct tmpfs_dirent *de;
 
 			/* Continue with the first stored entry, if any. */
 			de = TAILQ_FIRST(&node->tn_dir.tn_dirhead);
 			if (de == NULL)
 				uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
 			else
 				uio->uio_offset = tmpfs_dircookie(de);
 		}
 	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Returns the entry of directory 'node' whose cookie equals 'cookie', or
  * NULL if there is none.  The position remembered from the last readdir
  * is tried first so that the list only has to be walked on a cache miss.
  */
 struct tmpfs_dirent *
 tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie)
 {
 	struct tmpfs_dirent *de;
 
 	/* Fast path: the cached readdir position matches. */
 	de = node->tn_dir.tn_readdir_lastp;
 	if (de != NULL && cookie == node->tn_dir.tn_readdir_lastn)
 		return de;
 
 	/* Slow path: linear scan over all entries. */
 	TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries)
 		if (tmpfs_dircookie(de) == cookie)
 			return de;
 
 	return NULL;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Helper function for tmpfs_readdir.  Returns as many directory entries
  * as can fit in the uio space.  The read starts at the entry whose cookie
  * is uio->uio_offset; '.' and '..' are not handled here.  *cntp is
  * incremented once for every entry that was actually copied out.
  *
  * On return uio->uio_offset and the readdir cache of the node identify
  * the first entry that has not been delivered yet (TMPFS_DIRCOOKIE_EOF
  * when the directory is exhausted).  An entry whose copy-out failed is
  * neither counted nor skipped.
  *
  * The function returns 0 on success, -1 if there was not enough space
  * in the uio structure to hold the next directory entry, EINVAL if the
  * start cookie does not name an entry, or the error code reported by
  * uiomove().
  */
 int
 tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp)
 {
 	int error;
 	off_t startcookie;
 	struct tmpfs_dirent *de;
 
 	TMPFS_VALIDATE_DIR(node);
 
 	/* Locate the first directory entry we have to return.  We have cached
 	 * the last readdir in the node, so use those values if appropriate.
 	 * Otherwise do a linear scan to find the requested entry. */
 	startcookie = uio->uio_offset;
 	MPASS(startcookie != TMPFS_DIRCOOKIE_DOT);
 	MPASS(startcookie != TMPFS_DIRCOOKIE_DOTDOT);
 	if (startcookie == TMPFS_DIRCOOKIE_EOF)
 		return 0;
 	de = tmpfs_dir_lookupbycookie(node, startcookie);
 	if (de == NULL)
 		return EINVAL;
 
 	/* Read as many entries as possible; i.e., until we reach the end of
 	 * the directory or we exhaust uio space. */
 	do {
 		struct dirent d;
 
 		/* Start from an all-zero record: d_reclen bytes of this
 		 * on-stack structure are copied out, and GENERIC_DIRSIZ()
 		 * rounds that length up past the name terminator. */
 		bzero(&d, sizeof(d));
 
 		/* Create a dirent structure representing the current
 		 * tmpfs_node and fill it. */
 		d.d_fileno = de->td_node->tn_id;
 		switch (de->td_node->tn_type) {
 		case VBLK:
 			d.d_type = DT_BLK;
 			break;
 
 		case VCHR:
 			d.d_type = DT_CHR;
 			break;
 
 		case VDIR:
 			d.d_type = DT_DIR;
 			break;
 
 		case VFIFO:
 			d.d_type = DT_FIFO;
 			break;
 
 		case VLNK:
 			d.d_type = DT_LNK;
 			break;
 
 		case VREG:
 			d.d_type = DT_REG;
 			break;
 
 		case VSOCK:
 			d.d_type = DT_SOCK;
 			break;
 
 		default:
 			panic("tmpfs_dir_getdents: type %p %d",
 			    de->td_node, (int)de->td_node->tn_type);
 		}
 		d.d_namlen = de->td_namelen;
 		MPASS(de->td_namelen < sizeof(d.d_name));
 		(void)memcpy(d.d_name, de->td_name, de->td_namelen);
 		d.d_name[de->td_namelen] = '\0';
 		d.d_reclen = GENERIC_DIRSIZ(&d);
 
 		/* Stop reading if the directory entry we are treating is
 		 * bigger than the amount of data that can be returned. */
 		if (d.d_reclen > uio->uio_resid) {
 			error = -1;
 			break;
 		}
 
 		/* Copy the new dirent structure into the output buffer and
 		 * advance to the next entry only if the copy succeeded, so
 		 * that a failed entry is not counted and is offered again
 		 * on the next call. */
 		error = uiomove(&d, d.d_reclen, uio);
 		if (error == 0) {
 			(*cntp)++;
 			de = TAILQ_NEXT(de, td_entries);
 		}
 	} while (error == 0 && uio->uio_resid > 0 && de != NULL);
 
 	/* Update the offset and cache. */
 	if (de == NULL) {
 		uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
 		node->tn_dir.tn_readdir_lastn = 0;
 		node->tn_dir.tn_readdir_lastp = NULL;
 	} else {
 		node->tn_dir.tn_readdir_lastn = uio->uio_offset = tmpfs_dircookie(de);
 		node->tn_dir.tn_readdir_lastp = de;
 	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Resizes the aobj associated with the regular file pointed to by vp to
  * the size newsize.  'vp' must point to a vnode that represents a regular
  * file.  'newsize' must not be negative.
  *
  * The page accounting of both the node (tn_aobj_pages) and the mount
  * (tm_pages_used) is adjusted.  When the file shrinks, the pages past the
  * new end are released and the tail of the last remaining page is zeroed.
  *
  * Returns zero on success, or ENOSPC if growing the file would need more
  * pages than TMPFS_PAGES_AVAIL() reports for the mount.
  */
 int
 tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 {
 	int error;
 	size_t newpages, oldpages;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	off_t oldsize;
 
 	MPASS(vp->v_type == VREG);
 	MPASS(newsize >= 0);
 
 	node = VP_TO_TMPFS_NODE(vp);
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 
 	/* Convert the old and new sizes to the number of pages needed to
 	 * store them.  It may happen that we do not need to do anything
 	 * because the last allocated page can accommodate the change on
 	 * its own. */
 	oldsize = node->tn_size;
 	oldpages = round_page(oldsize) / PAGE_SIZE;
 	MPASS(oldpages == node->tn_reg.tn_aobj_pages);
 	newpages = round_page(newsize) / PAGE_SIZE;
 
 	/* Growing: make sure the mount can supply the extra pages. */
 	if (newpages > oldpages &&
 	    newpages - oldpages > TMPFS_PAGES_AVAIL(tmp)) {
 		error = ENOSPC;
 		goto out;
 	}
 
 	node->tn_reg.tn_aobj_pages = newpages;
 
 	/* The difference is computed in unsigned arithmetic; when the file
 	 * shrinks it wraps, and adding it still yields the smaller total. */
 	TMPFS_LOCK(tmp);
 	tmp->tm_pages_used += (newpages - oldpages);
 	TMPFS_UNLOCK(tmp);
 
 	node->tn_size = newsize;
 	vnode_pager_setsize(vp, newsize);
 	if (newsize < oldsize) {
 		/* Number of bytes between the new end of file and the end
 		 * of the page that contains it. */
 		size_t zerolen = round_page(newsize) - newsize;
 		vm_object_t uobj = node->tn_reg.tn_aobj;
 		vm_page_t m;
 
 		/*
 		 * free "backing store"
 		 */
 		VM_OBJECT_LOCK(uobj);
 		if (newpages < oldpages) {
 			swap_pager_freespace(uobj,
 						newpages, oldpages - newpages);
 			/* NOTE(review): an end index of 0 presumably means
 			 * "up to the end of the object" -- confirm. */
 			vm_object_page_remove(uobj,
 				OFF_TO_IDX(newsize + PAGE_MASK), 0, FALSE);
 		}
 
 		/*
 		 * zero out the truncated part of the last page.
 		 */
 
 		if (zerolen > 0) {
 			m = vm_page_grab(uobj, OFF_TO_IDX(newsize),
 					VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 			pmap_zero_page_area(m, PAGE_SIZE - zerolen,
 				zerolen);
 			vm_page_wakeup(m);
 		}
 		VM_OBJECT_UNLOCK(uobj);
 
 	}
 
 	error = 0;
 
 out:
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Change flags of the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  *
  * Returns EROFS on a read-only mount, the VOP_ACCESS(VADMIN) error if the
  * caller lacks admin rights, EPERM or the securelevel_gt() error when the
  * requested change is not permitted, and zero once tn_flags has been
  * updated and the node marked TMPFS_NODE_CHANGED.
  */
 int
 tmpfs_chflags(struct vnode *vp, int flags, struct ucred *cred, struct thread *p)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/*
 	 * Callers may only modify the file flags on objects they
 	 * have VADMIN rights for.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, p)))
 		return (error);
 	/*
 	 * Unprivileged processes are not permitted to unset system
 	 * flags, or modify flags if any system flags are set.
 	 *
 	 * priv_check_cred() returns zero when the privilege is held, so the
 	 * first branch below is the privileged path and the else branch is
 	 * the unprivileged one.
 	 */
 	if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 		/* With system flags already set, even a privileged caller
 		 * is subject to the securelevel_gt(cred, 0) check. */
 		if (node->tn_flags
 		  & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 			error = securelevel_gt(cred, 0);
 			if (error)
 				return (error);
 		}
 		/* Snapshot flag cannot be set or cleared */
 		if (((flags & SF_SNAPSHOT) != 0 &&
 		  (node->tn_flags & SF_SNAPSHOT) == 0) ||
 		  ((flags & SF_SNAPSHOT) == 0 &&
 		  (node->tn_flags & SF_SNAPSHOT) != 0))
 			return (EPERM);
 		/* Privileged callers replace the whole flag word. */
 		node->tn_flags = flags;
 	} else {
 		/* Refuse if a system flag is set on the node, or if the
 		 * request contains any bit outside UF_SETTABLE. */
 		if (node->tn_flags
 		  & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 		  (flags & UF_SETTABLE) != flags)
 			return (EPERM);
 		/* Keep the existing SF_SETTABLE bits and take the user
 		 * bits from the request. */
 		node->tn_flags &= SF_SETTABLE;
 		node->tn_flags |= (flags & UF_SETTABLE);
 	}
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Replace the permission bits (ALLPERMS) of the node behind vp with
  * those in mode.
  *
  * The vnode must be locked on entry and is still locked on return.  On
  * success the node is flagged TMPFS_NODE_CHANGED; the caller is expected
  * to run tmpfs_update on vp afterwards.
  *
  * Returns EROFS for a read-only mount, EPERM for an immutable or
  * append-only node, the VOP_ACCESS(VADMIN) error when the caller may not
  * administer the file, EFTYPE when the sticky bit is requested on a
  * non-directory without PRIV_VFS_STICKYFILE, or the PRIV_VFS_SETGID error
  * when setgid is requested for a group the caller is not a member of.
  */
 int
 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
 {
 	struct tmpfs_node *tn;
 	int rv;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 
 	/* Nothing may be changed on a read-only mount. */
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	/* Immutable and append-only nodes keep their mode. */
 	if ((tn->tn_flags & (IMMUTABLE | APPEND)) != 0)
 		return (EPERM);
 
 	/* Changing permissions requires VADMIN on the file. */
 	rv = VOP_ACCESS(vp, VADMIN, cred, p);
 	if (rv != 0)
 		return (rv);
 
 	/* The sticky bit on a non-directory needs PRIV_VFS_STICKYFILE. */
 	if (vp->v_type != VDIR && (mode & S_ISTXT) != 0 &&
 	    priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0) != 0)
 		return (EFTYPE);
 
 	/* Setgid for a foreign group needs PRIV_VFS_SETGID. */
 	if (!groupmember(tn->tn_gid, cred) && (mode & S_ISGID) != 0) {
 		rv = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (rv != 0)
 			return (rv);
 	}
 
 	tn->tn_mode = (tn->tn_mode & ~ALLPERMS) | (mode & ALLPERMS);
 	tn->tn_status |= TMPFS_NODE_CHANGED;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Change the owner and/or group of the node behind vp.  A uid or gid
  * equal to VNOVAL means "leave that attribute alone"; at least one of
  * the two must carry a real value.
  *
  * The vnode must be locked on entry and is still locked on return.  On
  * success the node is flagged TMPFS_NODE_CHANGED; the caller is expected
  * to run tmpfs_update on vp afterwards.  If ownership actually changed
  * and the caller lacks PRIV_VFS_RETAINSUGID, the setuid and setgid bits
  * are stripped from the mode.
  */
 int
 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *p)
 {
 	struct tmpfs_node *tn;
 	uid_t prev_uid;
 	gid_t prev_gid;
 	int rv;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 
 	/* Fill in whichever id the caller left unspecified. */
 	MPASS(uid != VNOVAL || gid != VNOVAL);
 	if (uid == VNOVAL)
 		uid = tn->tn_uid;
 	if (gid == VNOVAL)
 		gid = tn->tn_gid;
 	MPASS(uid != VNOVAL && gid != VNOVAL);
 
 	/* Nothing may be changed on a read-only mount. */
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	/* Immutable and append-only nodes keep their ownership. */
 	if ((tn->tn_flags & (IMMUTABLE | APPEND)) != 0)
 		return (EPERM);
 
 	/* Changing ownership requires VADMIN on the file. */
 	rv = VOP_ACCESS(vp, VADMIN, cred, p);
 	if (rv != 0)
 		return (rv);
 
 	/*
 	 * Handing the file to another user, or to a group the caller does
 	 * not belong to, additionally requires PRIV_VFS_CHOWN.
 	 */
 	if (uid != tn->tn_uid ||
 	    (gid != tn->tn_gid && !groupmember(gid, cred))) {
 		rv = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
 		if (rv != 0)
 			return (rv);
 	}
 
 	prev_uid = tn->tn_uid;
 	prev_gid = tn->tn_gid;
 	tn->tn_uid = uid;
 	tn->tn_gid = gid;
 	tn->tn_status |= TMPFS_NODE_CHANGED;
 
 	/* Drop set-id bits on a real ownership change unless privileged. */
 	if ((tn->tn_mode & (S_ISUID | S_ISGID)) != 0 &&
 	    (prev_uid != uid || prev_gid != gid) &&
 	    priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0) != 0)
 		tn->tn_mode &= ~(S_ISUID | S_ISGID);
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Change the size of the node behind vp.
  *
  * The vnode must be locked on entry and is still locked on return.
  * Directories yield EISDIR.  Block devices, character devices and FIFOs
  * succeed without doing anything, even on a read-only mount.  Any type
  * other than a regular file yields EOPNOTSUPP.  Regular files are
  * rejected with EROFS on a read-only mount and with EPERM when immutable
  * or append-only; otherwise the result of tmpfs_truncate() is returned.
  */
 int
 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred,
     struct thread *p)
 {
 	struct tmpfs_node *tn;
 	int rv;
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 
 	/* Sort out the file types that never reach the truncate call. */
 	if (vp->v_type == VDIR)
 		return (EISDIR);
 	if (vp->v_type == VBLK || vp->v_type == VCHR || vp->v_type == VFIFO)
 		return (0);
 	if (vp->v_type != VREG)
 		return (EOPNOTSUPP);
 
 	/* From here on vp is a regular file. */
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 	if ((tn->tn_flags & (IMMUTABLE | APPEND)) != 0)
 		return (EPERM);
 
 	/* tmpfs_truncate takes care of tn_status; nothing to add here. */
 	rv = tmpfs_truncate(vp, size);
 
 	MPASS(VOP_ISLOCKED(vp, p));
 
 	return (rv);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Change access, modification and birth times of the given vnode.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  *
  * A timestamp is only applied when both its tv_sec and tv_nsec differ
  * from VNOVAL.  Returns EROFS on a read-only mount, EPERM for immutable
  * or append-only nodes, the VOP_ACCESS error when the caller lacks the
  * required rights, and zero on success.
  */
 int
 tmpfs_chtimes(struct vnode *vp, struct timespec *atime, struct timespec *mtime,
 	struct timespec *birthtime, int vaflags, struct ucred *cred, struct thread *l)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(vp, l));
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Disallow this operation if the file system is mounted read-only. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return EROFS;
 
 	/* Immutable or append-only files cannot be modified, either. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return EPERM;
 
 	/* Determine if the user have proper privilege to update time.
 	 * With VA_UTIMES_NULL, write access is accepted as a fallback when
 	 * the VADMIN check fails; otherwise VADMIN is mandatory. */
 	if (vaflags & VA_UTIMES_NULL) {
 		error = VOP_ACCESS(vp, VADMIN, cred, l);
 		if (error)
 			error = VOP_ACCESS(vp, VWRITE, cred, l);
 	} else
 		error = VOP_ACCESS(vp, VADMIN, cred, l);
 	if (error)
 		return (error);
 
 	if (atime->tv_sec != VNOVAL && atime->tv_nsec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	if (mtime->tv_sec != VNOVAL && mtime->tv_nsec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_MODIFIED;
 
 	/* Check both fields of birthtime, exactly as for atime and mtime;
 	 * testing tv_nsec twice would let an unset tv_sec slip through. */
 	if (birthtime->tv_sec != VNOVAL && birthtime->tv_nsec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_MODIFIED;
 
 	/* Applies atime/mtime according to the status bits set above and
 	 * clears those bits again. */
 	tmpfs_itimes(vp, atime, mtime);
 
 	if (birthtime->tv_sec != VNOVAL && birthtime->tv_nsec != VNOVAL)
 		node->tn_birthtime = *birthtime;
 	MPASS(VOP_ISLOCKED(vp, l));
 
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 /* Sync timestamps */
 void
 tmpfs_itimes(struct vnode *vp, const struct timespec *acc,
     const struct timespec *mod)
 {
 	struct tmpfs_node *node;
 	struct timespec now;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED)) == 0)
 		return;
 
 	vfs_timestamp(&now);
 	if (node->tn_status & TMPFS_NODE_ACCESSED) {
 		if (acc == NULL)
 			 acc = &now;
 		node->tn_atime = *acc;
 	}
 	if (node->tn_status & TMPFS_NODE_MODIFIED) {
 		if (mod == NULL)
 			mod = &now;
 		node->tn_mtime = *mod;
 	}
 	if (node->tn_status & TMPFS_NODE_CHANGED) {
 		node->tn_ctime = now;
 	}
 	node->tn_status &=
 	    ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Apply any pending timestamp updates on vp using the current time for
  * both the access and the modification stamp.
  */
 void
 tmpfs_update(struct vnode *vp)
 {
 	const struct timespec *use_now = NULL;
 
 	tmpfs_itimes(vp, use_now, use_now);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Set the size of the file behind vp to length bytes.
  *
  * Returns EINVAL for a negative length, zero when the file already has
  * the requested size, EFBIG when length exceeds the mount's
  * tm_maxfilesize, and otherwise the result of tmpfs_reg_resize().  A
  * successful resize marks the node changed and modified.  Every path
  * except the EFBIG one finishes by calling tmpfs_update() on vp.
  */
 int
 tmpfs_truncate(struct vnode *vp, off_t length)
 {
 	int error;
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (length < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Nothing to resize; still fall through to the timestamp flush. */
 	if (node->tn_size == length) {
 		error = 0;
 		goto out;
 	}
 
 	/* Returns directly, without the tmpfs_update() call at out. */
 	if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
 		return (EFBIG);
 
 	error = tmpfs_reg_resize(vp, length);
 	if (error == 0) {
 		node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
 	}
 
 out:
 	tmpfs_update(vp);
 
 	return error;
 }
Index: head/sys/fs/tmpfs/tmpfs_vfsops.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_vfsops.c	(revision 175201)
+++ head/sys/fs/tmpfs/tmpfs_vfsops.c	(revision 175202)
@@ -1,473 +1,473 @@
 /*	$NetBSD: tmpfs_vfsops.c,v 1.10 2005/12/11 12:24:29 christos Exp $	*/
 
 /*
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *        This product includes software developed by the NetBSD
  *        Foundation, Inc. and its contributors.
  * 4. Neither the name of The NetBSD Foundation nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Efficient memory file system.
  *
  * tmpfs is a file system that uses NetBSD's virtual memory sub-system
  * (the well-known UVM) to store file data and metadata in an efficient
  * way.  This means that it does not follow the structure of an on-disk
  * file system because it simply does not need to.  Instead, it uses
  * memory-specific data structures and algorithms to automatically
  * allocate and release resources.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/stat.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 
 #include <fs/tmpfs/tmpfs.h>
 
 /*
  * Default permission for root node
  */
 #define TMPFS_DEFAULT_ROOT_MODE	(S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
 
 MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures");
 MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names");
 
 /* --------------------------------------------------------------------- */
 
 static int	tmpfs_mount(struct mount *, struct thread *);
 static int	tmpfs_unmount(struct mount *, int, struct thread *);
 static int	tmpfs_root(struct mount *, int flags, struct vnode **,
 		    struct thread *);
 static int	tmpfs_fhtovp(struct mount *, struct fid *, struct vnode **);
 static int	tmpfs_statfs(struct mount *, struct statfs *, struct thread *);
 
 /* --------------------------------------------------------------------- */
 
 static const char *tmpfs_opts[] = {
 	"from", "size", "inodes", "uid", "gid", "mode", "export",
 	NULL
 };
 
 /* --------------------------------------------------------------------- */
 
 #define SWI_MAXMIB	3
 
 /*
  * Sums, over all swap devices reported by the "vm.nswapdev" sysctl, the
  * value (xsw_nblks - dmmax) taken from each device's "vm.swap_info"
  * record, and returns that sum.
  *
  * If any sysctl query fails the function returns whatever has been
  * accumulated so far, which is zero when the failure happens before the
  * per-device loop.
  */
 static u_int
 get_swpgtotal(void)
 {
 	struct xswdev xsd;
 	char *sname = "vm.swap_info";
 	int soid[SWI_MAXMIB], oid[2];
 	u_int unswdev, total, dmmax, nswapdev;
 	size_t mibi, len;
 
 	total = 0;
 
 	len = sizeof(dmmax);
 	if (kernel_sysctlbyname(curthread, "vm.dmmax", &dmmax, &len,
 				NULL, 0, NULL, 0) != 0)
 		return total;
 
 	len = sizeof(nswapdev);
 	if (kernel_sysctlbyname(curthread, "vm.nswapdev",
 				&nswapdev, &len,
 				NULL, 0, NULL, 0) != 0)
 		return total;
 
 	/* Translate the name in sname into a numeric MIB stored in soid.
 	 * mibi is first the size in bytes of the part of soid offered as
 	 * output buffer (all but the last slot).
 	 * NOTE(review): oid {0, 3} is presumably the sysctl name-to-OID
 	 * service -- confirm against the sysctl implementation. */
 	mibi = (SWI_MAXMIB - 1) * sizeof(int);
 	oid[0] = 0;
 	oid[1] = 3;
 
 	if (kernel_sysctl(curthread, oid, 2,
 			soid, &mibi, (void *)sname, strlen(sname),
 			NULL, 0) != 0)
 		return total;
 
 	/* From here on mibi is reused as the index of the last soid slot,
 	 * which receives the swap device number for each query. */
 	mibi = (SWI_MAXMIB - 1);
 	for (unswdev = 0; unswdev < nswapdev; ++unswdev) {
 		soid[mibi] = unswdev;
 		len = sizeof(struct xswdev);
 		if (kernel_sysctl(curthread,
 				soid, mibi + 1, &xsd, &len, NULL, 0,
 				NULL, 0) != 0)
 			return total;
 		/* Records of an unexpected size are skipped. */
 		if (len == sizeof(struct xswdev))
 			total += (xsd.xsw_nblks - dmmax);
 	}
 
 	/* Reached once every swap device has been queried. */
 	return total;
 }
 
 /* --------------------------------------------------------------------- */
 /*
  * Constructor callback for the "TMPFS node" zone: bumps the node's
  * generation number and resets the per-use fields.  Always returns 0.
  */
 static int
 tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct tmpfs_node *tn = mem;
 
 	tn->tn_gen++;
 
 	tn->tn_vnode = NULL;
 	tn->tn_lockf = NULL;
 	tn->tn_vpstate = 0;
 	tn->tn_links = 0;
 	tn->tn_flags = 0;
 	tn->tn_status = 0;
 	tn->tn_size = 0;
 
 	return (0);
 }
 
 /*
  * Destructor callback for the "TMPFS node" zone: marks the node as
  * having no type (VNON).
  */
 static void
 tmpfs_node_dtor(void *mem, int size, void *arg)
 {
 	struct tmpfs_node *tn = mem;
 
 	tn->tn_type = VNON;
 }
 
 /*
  * Init callback for the "TMPFS node" zone: clears the node id, sets up
  * the node interlock mutex and seeds the generation number with a
  * random value.  Always returns 0.
  */
 static int
 tmpfs_node_init(void *mem, int size, int flags)
 {
 	struct tmpfs_node *tn = mem;
 
 	tn->tn_id = 0;
 	mtx_init(&tn->tn_interlock, "tmpfs node interlock", NULL, MTX_DEF);
 	tn->tn_gen = arc4random();
 
 	return (0);
 }
 
 /*
  * Fini callback for the "TMPFS node" zone: destroys the node interlock
  * mutex created by tmpfs_node_init().
  */
 static void
 tmpfs_node_fini(void *mem, int size)
 {
 	struct tmpfs_node *tn = mem;
 
 	mtx_destroy(&tn->tn_interlock);
 }
 
 /*
  * Mount entry point.  Parses the mount options, sizes the file system,
  * allocates and fills the tmpfs_mount structure, creates the root
  * directory node and attaches everything to mp.
  *
  * Returns EINVAL for unknown options, EOPNOTSUPP for MNT_UPDATE, the
  * VOP_GETATTR error for the covered vnode, ENOSPC when less than
  * TMPFS_PAGES_RESERVED pages are available, the tmpfs_alloc_node()
  * result when the root node cannot be created, and zero on success.
  */
 static int
 tmpfs_mount(struct mount *mp, struct thread *td)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *root;
 	size_t pages, mem_size;
 	ino_t nodes;
 	int error;
 	/* Size counters. */
 	ino_t	nodes_max;
 	size_t	size_max;
 
 	/* Root node attributes. */
 	uid_t	root_uid;
 	gid_t	root_gid;
 	mode_t	root_mode;
 
 	struct vattr	va;
 
 	if (vfs_filteropt(mp->mnt_optnew, tmpfs_opts))
 		return (EINVAL);
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/* XXX: There is no support yet to update file system
 		 * settings.  Should be added. */
 
 		return EOPNOTSUPP;
 	}
 
 	printf("WARNING: TMPFS is considered to be a highly experimental "
 	    "feature in FreeBSD.\n");
 
 	/* Fetch the attributes of the covered directory; they serve as
 	 * defaults for the root node's uid, gid and mode below. */
-	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY, td);
+	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred, td);
 	VOP_UNLOCK(mp->mnt_vnodecovered, 0, td);
 	if (error)
 		return (error);
 
 	/* The uid/gid/mode options are honoured only when the mounting
 	 * credential has real uid 0; otherwise the covered directory's
 	 * attributes are used.
 	 * NOTE(review): the scan formats below ("%d", "%ho", "%qu") are
 	 * not obviously matched to uid_t/gid_t/mode_t/ino_t/size_t on
 	 * every architecture -- verify. */
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1)
 		root_gid = va.va_gid;
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1)
 		root_uid = va.va_uid;
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1)
 		root_mode = va.va_mode;
 	if (vfs_scanopt(mp->mnt_optnew, "inodes", "%d", &nodes_max) != 1)
 		nodes_max = 0;
 	if (vfs_scanopt(mp->mnt_optnew, "size", "%qu", &size_max) != 1)
 		size_max = 0;
 
 	/* Do not allow mounts if we do not have enough memory to preserve
 	 * the minimum reserved pages.  The subtraction of the wired page
 	 * count is clamped so that mem_size cannot go below zero. */
 	mem_size = cnt.v_free_count + cnt.v_inactive_count + get_swpgtotal();
 	mem_size -= mem_size > cnt.v_wire_count ? cnt.v_wire_count : mem_size;
 	if (mem_size < TMPFS_PAGES_RESERVED)
 		return ENOSPC;
 
 	/* Get the maximum number of memory pages this file system is
 	 * allowed to use, based on the maximum size the user passed in
 	 * the mount structure.  A value of zero is treated as if the
 	 * maximum available space was requested. */
 	if (size_max < PAGE_SIZE || size_max >= SIZE_MAX)
 		pages = SIZE_MAX;
 	else
 		pages = howmany(size_max, PAGE_SIZE);
 	MPASS(pages > 0);
 
 	/* Without a usable "inodes" option, derive the node limit from
 	 * the page limit. */
 	if (nodes_max <= 3)
 		nodes = 3 + pages * PAGE_SIZE / 1024;
 	else
 		nodes = nodes_max;
 	MPASS(nodes >= 3);
 
 	/* Allocate the tmpfs mount structure and fill it. */
 	tmp = (struct tmpfs_mount *)malloc(sizeof(struct tmpfs_mount),
 	    M_TMPFSMNT, M_WAITOK | M_ZERO);
 
 	mtx_init(&tmp->allnode_lock, "tmpfs allnode lock", NULL, MTX_DEF);
 	tmp->tm_nodes_max = nodes;
 	tmp->tm_nodes_inuse = 0;
 	tmp->tm_maxfilesize = (u_int64_t)(cnt.v_page_count + get_swpgtotal()) * PAGE_SIZE;
 	LIST_INIT(&tmp->tm_nodes_used);
 
 	tmp->tm_pages_max = pages;
 	tmp->tm_pages_used = 0;
 	/* Inode numbers are handed out starting at 2. */
 	tmp->tm_ino_unr = new_unrhdr(2, INT_MAX, &tmp->allnode_lock);
 	tmp->tm_dirent_pool = uma_zcreate("TMPFS dirent",
 	    sizeof(struct tmpfs_dirent),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	tmp->tm_node_pool = uma_zcreate("TMPFS node",
 	    sizeof(struct tmpfs_node),
 	    tmpfs_node_ctor, tmpfs_node_dtor,
 	    tmpfs_node_init, tmpfs_node_fini,
 	    UMA_ALIGN_PTR, 0);
 
 	/* Allocate the root node. */
 	error = tmpfs_alloc_node(tmp, VDIR, root_uid,
 	    root_gid, root_mode & ALLPERMS, NULL, NULL,
 	    VNOVAL, td, &root);
 
 	/* NOTE(review): this cleanup path does not mtx_destroy()
 	 * allnode_lock before freeing tmp, and it returns error even
 	 * when error == 0 but root == NULL -- both look unintended. */
 	if (error != 0 || root == NULL) {
 	    uma_zdestroy(tmp->tm_node_pool);
 	    uma_zdestroy(tmp->tm_dirent_pool);
 	    delete_unrhdr(tmp->tm_ino_unr);
 	    free(tmp, M_TMPFSMNT);
 	    return error;
 	}
 	KASSERT(root->tn_id == 2, ("tmpfs root with invalid ino: %d", root->tn_id));
 	tmp->tm_root = root;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 
 	mp->mnt_data = tmp;
 	mp->mnt_stat.f_namemax = MAXNAMLEN;
 	vfs_getnewfsid(mp);
 	vfs_mountedfrom(mp, "tmpfs");
 
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Unmount entry point.  Flushes the mount's vnodes, frees every node and
  * directory entry still on the used list, destroys the two UMA zones,
  * the inode-number allocator and the allnode lock, and finally releases
  * the tmpfs_mount structure.
  *
  * Returns the vflush() error if flushing fails (nothing has been freed
  * at that point), zero otherwise.
  */
 /* ARGSUSED2 */
 static int
 tmpfs_unmount(struct mount *mp, int mntflags, struct thread *l)
 {
 	int error;
 	int flags = 0;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 
 	/* Handle forced unmounts. */
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	/* Finalize all pending I/O. */
 	error = vflush(mp, 0, flags, l);
 	if (error != 0)
 		return error;
 
 	tmp = VFS_TO_TMPFS(mp);
 
 	/* Free all associated data.  The loop iterates over the linked list
 	 * we have containing all used nodes.  For each of them that is
 	 * a directory, we free all its directory entries before freeing
 	 * the node itself.  The successor of each element is fetched
 	 * before the element is released. */
 	node = LIST_FIRST(&tmp->tm_nodes_used);
 	while (node != NULL) {
 		struct tmpfs_node *next;
 
 		if (node->tn_type == VDIR) {
 			struct tmpfs_dirent *de;
 
 			de = TAILQ_FIRST(&node->tn_dir.tn_dirhead);
 			while (de != NULL) {
 				struct tmpfs_dirent *nde;
 
 				nde = TAILQ_NEXT(de, td_entries);
 				/* NOTE(review): the entry is freed without
 				 * being unlinked from tn_dirhead; harmless
 				 * here only because the directory node is
 				 * freed right after -- confirm. */
 				tmpfs_free_dirent(tmp, de, FALSE);
 				de = nde;
 				node->tn_size -= sizeof(struct tmpfs_dirent);
 			}
 		}
 
 		next = LIST_NEXT(node, tn_entries);
 		tmpfs_free_node(tmp, node);
 		node = next;
 	}
 
 	uma_zdestroy(tmp->tm_dirent_pool);
 	uma_zdestroy(tmp->tm_node_pool);
 	delete_unrhdr(tmp->tm_ino_unr);
 
 	mtx_destroy(&tmp->allnode_lock);
 	/* After the loop above nothing may be accounted as in use. */
 	MPASS(tmp->tm_pages_used == 0);
 	MPASS(tmp->tm_nodes_inuse == 0);
 
 	/* Throw away the tmpfs_mount structure. */
 	free(mp->mnt_data, M_TMPFSMNT);
 	mp->mnt_data = NULL;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Hand out a vnode for the root node of the mount in *vpp, obtained
  * through tmpfs_alloc_vp() with the caller's lock flags.  On success
  * the vnode is tagged VV_ROOT; on failure the tmpfs_alloc_vp() error
  * is returned unchanged.
  */
 static int
 tmpfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
 {
 	int rv;
 
 	rv = tmpfs_alloc_vp(mp, VFS_TO_TMPFS(mp)->tm_root, flags, vpp, td);
 	if (rv != 0)
 		return (rv);
 
 	(*vpp)->v_vflag |= VV_ROOT;
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Translate a file handle into a vnode.
  *
  * The handle is rejected with EINVAL when its length is not that of a
  * struct tmpfs_fid, when its id is not below the mount's node limit, or
  * when no node on the used list carries both the same id and the same
  * generation number.  Otherwise the result of tmpfs_alloc_vp() for the
  * matching node, requested with LK_EXCLUSIVE, is returned.
  */
 static int
 tmpfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
 {
 	struct tmpfs_mount *tmp = VFS_TO_TMPFS(mp);
 	struct tmpfs_fid *tfhp = (struct tmpfs_fid *)fhp;
 	struct tmpfs_node *node;
 	boolean_t matched = FALSE;
 
 	if (tfhp->tf_len != sizeof(struct tmpfs_fid))
 		return (EINVAL);
 	if (tfhp->tf_id >= tmp->tm_nodes_max)
 		return (EINVAL);
 
 	/* Search the used-node list under the mount lock. */
 	TMPFS_LOCK(tmp);
 	LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) {
 		if (node->tn_id != tfhp->tf_id)
 			continue;
 		if (node->tn_gen != tfhp->tf_gen)
 			continue;
 		matched = TRUE;
 		break;
 	}
 	TMPFS_UNLOCK(tmp);
 
 	if (!matched)
 		return (EINVAL);
 
 	return (tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, vpp, curthread));
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Fill sbp with the mount's statistics.  Block and I/O size are
  * PAGE_SIZE, block counts come from TMPFS_PAGES_MAX and
  * TMPFS_PAGES_AVAIL, and the free node count is the smaller of the
  * unused node slots and the number of tmpfs_node structures that would
  * fit into the available pages.  Always returns 0.
  */
 /* ARGSUSED2 */
 static int
 tmpfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *l)
 {
 	struct tmpfs_mount *tmp = VFS_TO_TMPFS(mp);
 	fsfilcnt_t nfree;
 
 	sbp->f_iosize = PAGE_SIZE;
 	sbp->f_bsize = PAGE_SIZE;
 
 	sbp->f_blocks = TMPFS_PAGES_MAX(tmp);
 	sbp->f_bfree = TMPFS_PAGES_AVAIL(tmp);
 	sbp->f_bavail = sbp->f_bfree;
 
 	nfree = MIN(tmp->tm_nodes_max - tmp->tm_nodes_inuse,
 	    TMPFS_PAGES_AVAIL(tmp) * PAGE_SIZE / sizeof(struct tmpfs_node));
 
 	sbp->f_ffree = nfree;
 	sbp->f_files = nfree + tmp->tm_nodes_inuse;
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * tmpfs vfs operations.
  *
  * Only the entry points implemented in this file are listed: mount,
  * unmount, root, statfs and fhtovp.  The table is registered under the
  * name "tmpfs" with VFS_SET below.
  */
 
 struct vfsops tmpfs_vfsops = {
 	.vfs_mount =			tmpfs_mount,
 	.vfs_unmount =			tmpfs_unmount,
 	.vfs_root =			tmpfs_root,
 	.vfs_statfs =			tmpfs_statfs,
 	.vfs_fhtovp =			tmpfs_fhtovp,
 };
 VFS_SET(tmpfs_vfsops, tmpfs, 0);
Index: head/sys/fs/tmpfs/tmpfs_vnops.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_vnops.c	(revision 175201)
+++ head/sys/fs/tmpfs/tmpfs_vnops.c	(revision 175202)
@@ -1,1499 +1,1499 @@
 /*	$NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $	*/
 
 /*
  * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *        This product includes software developed by the NetBSD
  *        Foundation, Inc. and its contributors.
  * 4. Neither the name of The NetBSD Foundation nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * tmpfs vnode interface.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 #include <sys/systm.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <machine/_inttypes.h>
 
 #include <fs/fifofs/fifo.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 #include <fs/tmpfs/tmpfs.h>
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Cached-lookup entry point: resolves the component described by
  * v->a_cnp inside directory v->a_dvp and stores the resulting vnode in
  * *v->a_vpp (NULLVP when there is none).
  *
  * Handles "..", "." and ordinary names.  For a missing name on the last
  * component of a CREATE or RENAME the function returns EJUSTRETURN
  * after checking write access to the directory; otherwise ENOENT.
  * Returns ENOTDIR when a non-final component is neither a directory nor
  * a symbolic link, EPERM when the sticky-bit check on a DELETE/RENAME
  * fails, and otherwise the result of the access checks or of
  * tmpfs_alloc_vp().
  */
 static int
 tmpfs_lookup(struct vop_cachedlookup_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode **vpp = v->a_vpp;
 	struct componentname *cnp = v->a_cnp;
 	struct thread *td = cnp->cn_thread;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_node *dnode;
 
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	*vpp = NULLVP;
 
 	/* Check accessibility of requested node as a first step. */
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error != 0)
 		goto out;
 
 	/* We cannot be requesting the parent directory of the root node. */
 	MPASS(IMPLIES(dnode->tn_type == VDIR &&
 	    dnode->tn_dir.tn_parent == dnode,
 	    !(cnp->cn_flags & ISDOTDOT)));
 
 	if (cnp->cn_flags & ISDOTDOT) {
 		int ltype = 0;
 
 		/* Remember how dvp is locked, drop that lock while the
 		 * parent's vnode is obtained, then take it again.  The
 		 * hold reference keeps dvp around in between. */
 		ltype = VOP_ISLOCKED(dvp, td);
 		vhold(dvp);
 		VOP_UNLOCK(dvp, 0, td);
 		/* Allocate a new vnode on the matching entry. */
 		error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent,
 		    cnp->cn_lkflags, vpp, td);
 
-		vn_lock(dvp, ltype | LK_RETRY, td);
+		vn_lock(dvp, ltype | LK_RETRY);
 		vdrop(dvp);
 	} else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		/* "." resolves to the directory itself with an extra
 		 * reference. */
 		VREF(dvp);
 		*vpp = dvp;
 		error = 0;
 	} else {
 		de = tmpfs_dir_lookup(dnode, cnp);
 		if (de == NULL) {
 			/* The entry was not found in the directory.
 			 * This is OK if we are creating or renaming an
 			 * entry and are working on the last component of
 			 * the path name. */
 			if ((cnp->cn_flags & ISLASTCN) &&
 			    (cnp->cn_nameiop == CREATE || \
 			    cnp->cn_nameiop == RENAME)) {
 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
 				    cnp->cn_thread);
 				if (error != 0)
 					goto out;
 
 				/* Keep the component name in the buffer for
 				 * future uses. */
 				cnp->cn_flags |= SAVENAME;
 
 				error = EJUSTRETURN;
 			} else
 				error = ENOENT;
 		} else {
 			struct tmpfs_node *tnode;
 
 			/* The entry was found, so get its associated
 			 * tmpfs_node. */
 			tnode = de->td_node;
 
 			/* If we are not at the last path component and
 			 * found a non-directory or non-link entry (which
 			 * may itself be pointing to a directory), raise
 			 * an error. */
 			if ((tnode->tn_type != VDIR &&
 			    tnode->tn_type != VLNK) &&
 			    !(cnp->cn_flags & ISLASTCN)) {
 				error = ENOTDIR;
 				goto out;
 			}
 
 			/* If we are deleting or renaming the entry, keep
 			 * track of its tmpfs_dirent so that it can be
 			 * easily deleted later. */
 			if ((cnp->cn_flags & ISLASTCN) &&
 			    (cnp->cn_nameiop == DELETE ||
 			    cnp->cn_nameiop == RENAME)) {
 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
 				    cnp->cn_thread);
 				if (error != 0)
 					goto out;
 
 				/* Allocate a new vnode on the matching entry. */
 				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
 						cnp->cn_lkflags, vpp, td);
 				if (error != 0)
 					goto out;
 
 				/* In a sticky directory the caller needs
 				 * VADMIN on either the directory or the
 				 * entry; failing both checks gives EPERM
 				 * and the freshly obtained vnode is
 				 * released again. */
 				if ((dnode->tn_mode & S_ISTXT) &&
 				  VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) &&
 				  VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) {
 					error = EPERM;
 					vput(*vpp);
 					*vpp = NULL;
 					goto out;
 				}
 				cnp->cn_flags |= SAVENAME;
 			} else {
 				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
 						cnp->cn_lkflags, vpp, td);
 			}
 		}
 	}
 
 	/* Store the result of this lookup in the cache.  Avoid this if the
 	 * request was for creation, as it does not improve timings on
 	 * empirical tests.
 	 * NOTE(review): this point is also reached with error != 0 and
 	 * *vpp == NULLVP (ENOENT, EJUSTRETURN for RENAME, or a failed
 	 * tmpfs_alloc_vp); confirm that entering such results into the
 	 * cache is intended in every one of those cases. */
 	if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE)
 		cache_enter(dvp, *vpp, cnp);
 
 out:
 	/* If there were no errors, *vpp cannot be null and it must be
 	 * locked. */
 	MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp, td)));
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_CREATE entry point: creates a regular file or socket by handing
  * the directory, result pointer, attributes and component name to
  * tmpfs_alloc_file() with no link target.
  */
 static int
 tmpfs_create(struct vop_create_args *v)
 {
 
 	MPASS(v->a_vap->va_type == VREG || v->a_vap->va_type == VSOCK);
 
 	return (tmpfs_alloc_file(v->a_dvp, v->a_vpp, v->a_vap, v->a_cnp,
 	    NULL));
 }
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_MKNOD entry point: creates a block device, character device or
  * FIFO through tmpfs_alloc_file().  Any other requested type yields
  * EINVAL.
  */
 static int
 tmpfs_mknod(struct vop_mknod_args *v)
 {
 	struct vattr *attrs = v->a_vap;
 
 	switch (attrs->va_type) {
 	case VBLK:
 	case VCHR:
 	case VFIFO:
 		return (tmpfs_alloc_file(v->a_dvp, v->a_vpp, attrs, v->a_cnp,
 		    NULL));
 	default:
 		return (EINVAL);
 	}
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_OPEN entry point.  The vnode must be locked on entry and is still
  * locked on return.
  *
  * Returns ENOENT when the node has no links left, EPERM when an
  * append-only node is opened for writing without O_APPEND, and zero
  * otherwise, in which case the vnode's VM object is set up with the
  * node's current size.
  */
 static int
 tmpfs_open(struct vop_open_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct tmpfs_node *tn;
 	int rv;
 
 	MPASS(VOP_ISLOCKED(vp, v->a_td));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 
 	/* A node whose last name is gone (e.g. after "rmdir $(pwd)") is
 	 * about to disappear and can no longer be opened. */
 	if (tn->tn_links < 1)
 		return (ENOENT);
 
 	/* Append-only nodes accept writers only in append mode. */
 	if ((tn->tn_flags & APPEND) != 0 &&
 	    (v->a_mode & (FWRITE | O_APPEND)) == FWRITE) {
 		rv = EPERM;
 	} else {
 		rv = 0;
 		vnode_create_vobject(vp, tn->tn_size, v->a_td);
 	}
 
 	MPASS(VOP_ISLOCKED(vp, v->a_td));
 	return (rv);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_CLOSE entry point: refresh the node times through tmpfs_update()
  * as long as the node still has at least one link.  Always returns 0.
  */
 static int
 tmpfs_close(struct vop_close_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct tmpfs_node *tn;
 
 	MPASS(VOP_ISLOCKED(vp, v->a_td));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 
 	/* A node without links is about to vanish; updating its times
 	 * would be wasted work. */
 	if (tn->tn_links > 0)
 		tmpfs_update(vp);
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 int
 tmpfs_access(struct vop_access_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	int mode = v->a_mode;
 	struct ucred *cred = v->a_cred;
 
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(vp, v->a_td));
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	switch (vp->v_type) {
 	case VDIR:
 		/* FALLTHROUGH */
 	case VLNK:
 		/* FALLTHROUGH */
 	case VREG:
 		if (mode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) {
 			error = EROFS;
 			goto out;
 		}
 		break;
 
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VSOCK:
 		/* FALLTHROUGH */
 	case VFIFO:
 		break;
 
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 	if (mode & VWRITE && node->tn_flags & IMMUTABLE) {
 		error = EPERM;
 		goto out;
 	}
 
 	error = vaccess(vp->v_type, node->tn_mode, node->tn_uid,
 	    node->tn_gid, mode, cred, NULL);
 
 out:
 	MPASS(VOP_ISLOCKED(vp, v->a_td));
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_GETATTR entry point: fill *a_vap from the tmpfs node behind a_vp.
  * The node times are refreshed with tmpfs_update() before they are
  * copied out.  Always returns 0.
  */
 int
 tmpfs_getattr(struct vop_getattr_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct vattr *out = v->a_vap;
 	struct tmpfs_node *tn = VP_TO_TMPFS_NODE(vp);
 	int is_device;
 
 	VATTR_NULL(out);
 
 	tmpfs_update(vp);
 
 	/* Identity and ownership. */
 	out->va_type = vp->v_type;
 	out->va_mode = tn->tn_mode;
 	out->va_nlink = tn->tn_links;
 	out->va_uid = tn->tn_uid;
 	out->va_gid = tn->tn_gid;
 	out->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	out->va_fileid = tn->tn_id;
 	out->va_gen = tn->tn_gen;
 	out->va_flags = tn->tn_flags;
 
 	/* Sizes. */
 	out->va_size = tn->tn_size;
 	out->va_blocksize = PAGE_SIZE;
 	out->va_bytes = round_page(tn->tn_size);
 
 	/* Timestamps. */
 	out->va_atime = tn->tn_atime;
 	out->va_mtime = tn->tn_mtime;
 	out->va_ctime = tn->tn_ctime;
 	out->va_birthtime = tn->tn_birthtime;
 
 	/* The device number is only reported for device nodes. */
 	is_device = (vp->v_type == VBLK || vp->v_type == VCHR);
 	out->va_rdev = is_device ? tn->tn_rdev : VNOVAL;
 
 	out->va_filerev = VNOVAL;
 	out->va_vaflags = 0;
 	out->va_spare = VNOVAL; /* XXX */
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /* XXX Should this operation be atomic?  I think it should, but code in
  * XXX other places (e.g., ufs) doesn't seem to be... */
 int
 tmpfs_setattr(struct vop_setattr_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct vattr *vap = v->a_vap;
 	struct ucred *cred = v->a_cred;
 	struct thread *l = v->a_td;
 
 	int error;
 
 	MPASS(VOP_ISLOCKED(vp, l));
 
 	error = 0;
 
 	/* Abort if any unsettable attribute is given. */
 	if (vap->va_type != VNON ||
 	    vap->va_nlink != VNOVAL ||
 	    vap->va_fsid != VNOVAL ||
 	    vap->va_fileid != VNOVAL ||
 	    vap->va_blocksize != VNOVAL ||
 	    vap->va_gen != VNOVAL ||
 	    vap->va_rdev != VNOVAL ||
 	    vap->va_bytes != VNOVAL)
 		error = EINVAL;
 
 	if (error == 0 && (vap->va_flags != VNOVAL))
 		error = tmpfs_chflags(vp, vap->va_flags, cred, l);
 
 	if (error == 0 && (vap->va_size != VNOVAL))
 		error = tmpfs_chsize(vp, vap->va_size, cred, l);
 
 	if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL))
 		error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    l);
 
 	if (error == 0 && (vap->va_mode != (mode_t)VNOVAL))
 		error = tmpfs_chmod(vp, vap->va_mode, cred, l);
 
 	if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL &&
 	    vap->va_atime.tv_nsec != VNOVAL) ||
 	    (vap->va_mtime.tv_sec != VNOVAL &&
 	    vap->va_mtime.tv_nsec != VNOVAL) ||
 	    (vap->va_birthtime.tv_sec != VNOVAL &&
 	    vap->va_birthtime.tv_nsec != VNOVAL)))
 		error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime,
 			&vap->va_birthtime, vap->va_vaflags, cred, l);
 
 	/* Update the node times.  We give preference to the error codes
 	 * generated by this function rather than the ones that may arise
 	 * from tmpfs_update. */
 	tmpfs_update(vp);
 
 	MPASS(VOP_ISLOCKED(vp, l));
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Copy data for one read step from a file's pages into uio.
  *
  * At most the remainder of the page containing uio->uio_offset is
  * transferred per call (tlen is capped at PAGE_SIZE - offset and at len).
  * tmpfs_read() passes vp->v_object as vobj and node->tn_reg.tn_aobj as
  * tobj.  If vobj has a resident page that is valid for the requested
  * range, the data is copied from it; otherwise the page is grabbed from
  * tobj, paged in or zero-filled as needed, and copied from there.
  *
  * Returns the result of uiomove(), or the vm_pager_get_pages() error.
  */
 static int
 tmpfs_mappedread(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio)
 {
 	vm_pindex_t	idx;
 	vm_page_t	m;
 	struct sf_buf	*sf;
 	off_t		offset, addr;
 	size_t		tlen;
 	caddr_t		va;
 	int		error;
 
 	/* Split the file offset into a page index and an in-page offset. */
 	addr = uio->uio_offset;
 	idx = OFF_TO_IDX(addr);
 	offset = addr & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	/* Nothing resident in vobj: go straight to tobj. */
 	if ((vobj == NULL) || (vobj->resident_page_count == 0))
 		goto nocache;
 
 	VM_OBJECT_LOCK(vobj);
 lookupvpg:
 	if (((m = vm_page_lookup(vobj, idx)) != NULL) &&
 	    vm_page_is_valid(m, offset, tlen)) {
 		/* Retry the lookup if we had to sleep on a busy page. */
 		if (vm_page_sleep_if_busy(m, FALSE, "tmfsmr"))
 			goto lookupvpg;
 		vm_page_busy(m);
 		VM_OBJECT_UNLOCK(vobj);
 		/* Map the page through an sf_buf and copy out of it. */
 		sched_pin();
 		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 		va = (caddr_t)sf_buf_kva(sf);
 		error = uiomove(va + offset, tlen, uio);
 		sf_buf_free(sf);
 		sched_unpin();
 		VM_OBJECT_LOCK(vobj);
 		vm_page_wakeup(m);
 		VM_OBJECT_UNLOCK(vobj);
 		return	(error);
 	}
 	VM_OBJECT_UNLOCK(vobj);
 nocache:
 	VM_OBJECT_LOCK(tobj);
 	vm_object_pip_add(tobj, 1);
 	m = vm_page_grab(tobj, idx, VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		int behind, ahead;
 		/* Page in from the pager if it has the page, else zero-fill. */
 		if (vm_pager_has_page(tobj, idx, &behind, &ahead)) {
 			error = vm_pager_get_pages(tobj, &m, 1, 0);
 			if (error != 0) {
 				printf("tmpfs get pages from pager error [read]\n");
 				/* tobj is still locked here, as "out" expects. */
 				goto out;
 			}
 		} else
 			vm_page_zero_invalid(m, TRUE);
 	}
 	VM_OBJECT_UNLOCK(tobj);
 	sched_pin();
 	sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 	va = (caddr_t)sf_buf_kva(sf);
 	error = uiomove(va + offset, tlen, uio);
 	sf_buf_free(sf);
 	sched_unpin();
 	VM_OBJECT_LOCK(tobj);
 out:
 	/* Undo the wiring done by vm_page_grab() and release the page. */
 	vm_page_lock_queues();
 	vm_page_unwire(m, 0);
 	vm_page_activate(m);
 	vm_page_unlock_queues();
 	vm_page_wakeup(m);
 	vm_object_pip_subtract(tobj, 1);
 	VM_OBJECT_UNLOCK(tobj);
 
 	return	(error);
 }
 
 static int
 tmpfs_read(struct vop_read_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct uio *uio = v->a_uio;
 
 	struct tmpfs_node *node;
 	vm_object_t uobj;
 	size_t len;
 	int resid;
 
 	int error = 0;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (vp->v_type != VREG) {
 		error = EISDIR;
 		goto out;
 	}
 
 	if (uio->uio_offset < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
 	uobj = node->tn_reg.tn_aobj;
 	while ((resid = uio->uio_resid) > 0) {
 		error = 0;
 		if (node->tn_size <= uio->uio_offset)
 			break;
 		len = MIN(node->tn_size - uio->uio_offset, resid);
 		if (len == 0)
 			break;
 		error = tmpfs_mappedread(vp->v_object, uobj, len, uio);
 		if ((error != 0) || (resid == uio->uio_resid))
 			break;
 	}
 
 out:
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * Copy data for one write step from uio into a file's pages.
  *
  * At most the remainder of the page containing uio->uio_offset is
  * transferred per call (tlen is capped at PAGE_SIZE - offset and at len).
  * tmpfs_write() passes vp->v_object as vobj and node->tn_reg.tn_aobj as
  * tobj.  If vobj holds a resident page valid for the range, the user data
  * is copied into that page and the whole page is then copied into the
  * tobj page; otherwise the user data is copied directly into the tobj
  * page.
  *
  * Returns 0 on success or an error from uiomove()/vm_pager_get_pages().
  */
 static int
 tmpfs_mappedwrite(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio)
 {
 	vm_pindex_t	idx;
 	vm_page_t	vpg, tpg;
 	struct sf_buf	*sf;
 	off_t		offset, addr;
 	size_t		tlen;
 	caddr_t		va;
 	int		error;
 
 	error = 0;
 	
 	/* Split the file offset into a page index and an in-page offset. */
 	addr = uio->uio_offset;
 	idx = OFF_TO_IDX(addr);
 	offset = addr & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	/* Nothing resident in vobj: only the tobj page needs updating. */
 	if ((vobj == NULL) || (vobj->resident_page_count == 0)) {
 		vpg = NULL;
 		goto nocache;
 	}
 
 	VM_OBJECT_LOCK(vobj);
 lookupvpg:
 	if (((vpg = vm_page_lookup(vobj, idx)) != NULL) &&
 	    vm_page_is_valid(vpg, offset, tlen)) {
 		/* Retry the lookup if we had to sleep on a busy page. */
 		if (vm_page_sleep_if_busy(vpg, FALSE, "tmfsmw"))
 			goto lookupvpg;
 		vm_page_busy(vpg);
 		vm_page_lock_queues();
 		vm_page_undirty(vpg);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(vobj);
 		sched_pin();
 		sf = sf_buf_alloc(vpg, SFB_CPUPRIVATE);
 		va = (caddr_t)sf_buf_kva(sf);
 		/* NOTE(review): if this uiomove() fails, execution still
 		 * continues below, where vm_pager_get_pages() may overwrite
 		 * error with 0 -- confirm that this is intended. */
 		error = uiomove(va + offset, tlen, uio);
 		sf_buf_free(sf);
 		sched_unpin();
 	} else {
 		VM_OBJECT_UNLOCK(vobj);
 		vpg = NULL;
 	}
 nocache:
 	VM_OBJECT_LOCK(tobj);
 	vm_object_pip_add(tobj, 1);
 	tpg = vm_page_grab(tobj, idx, VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if (tpg->valid != VM_PAGE_BITS_ALL) {
 		int behind, ahead;
 		/* Page in from the pager if it has the page, else zero-fill. */
 		if (vm_pager_has_page(tobj, idx, &behind, &ahead)) {
 			error = vm_pager_get_pages(tobj, &tpg, 1, 0);
 			if (error != 0) {
 				printf("tmpfs get pages from pager error [write]\n");
 				/* tobj is still locked here, as "out" expects. */
 				goto out;
 			}
 		} else
 			vm_page_zero_invalid(tpg, TRUE);
 	}
 	VM_OBJECT_UNLOCK(tobj);
 	if (vpg == NULL) {
 		/* No vobj page: copy the user data straight into tpg. */
 		sched_pin();
 		sf = sf_buf_alloc(tpg, SFB_CPUPRIVATE);
 		va = (caddr_t)sf_buf_kva(sf);
 		error = uiomove(va + offset, tlen, uio);
 		sf_buf_free(sf);
 		sched_unpin();
 	} else {
 		/* The vobj page already received the data; mirror it. */
 		KASSERT(vpg->valid == VM_PAGE_BITS_ALL, ("parts of vpg invalid"));
 		pmap_copy_page(vpg, tpg);
 	}
 	VM_OBJECT_LOCK(tobj);
 out:
 	/* Entered with tobj locked; vobj is taken on top of it. */
 	if (vobj != NULL)
 		VM_OBJECT_LOCK(vobj);
 	vm_page_lock_queues();
 	/* Only a successful copy marks the written range valid and dirty. */
 	if (error == 0) {
 		vm_page_set_validclean(tpg, offset, tlen);
 		vm_page_zero_invalid(tpg, TRUE);
 		vm_page_dirty(tpg);
 	}
 	vm_page_unwire(tpg, 0);
 	vm_page_activate(tpg);
 	vm_page_unlock_queues();
 	vm_page_wakeup(tpg);
 	if (vpg != NULL)
 		vm_page_wakeup(vpg);
 	if (vobj != NULL)
 		VM_OBJECT_UNLOCK(vobj);
 	vm_object_pip_subtract(tobj, 1);
 	VM_OBJECT_UNLOCK(tobj);
 
 	return	(error);
 }
 
 /*
  * VOP_WRITE entry point for regular files.
  *
  * Returns EINVAL for a negative offset or a non-regular vnode, and EFBIG
  * when the write would exceed the mount's tm_maxfilesize or the process'
  * RLIMIT_FSIZE (SIGXFSZ is also posted in the latter case).  When the
  * write extends the file, the file is resized first; if the data transfer
  * then fails, the file is resized back to its original size.
  */
 static int
 tmpfs_write(struct vop_write_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct uio *uio = v->a_uio;
 	int ioflag = v->a_ioflag;
 	struct thread *td = uio->uio_td;
 
 	boolean_t extended;
 	int error = 0;
 	off_t oldsize;
 	struct tmpfs_node *node;
 	vm_object_t uobj;
 	size_t len;
 	int resid;
 
 	node = VP_TO_TMPFS_NODE(vp);
 	/* Remembered so that a failed write can be rolled back. */
 	oldsize = node->tn_size;
 
 	if (uio->uio_offset < 0 || vp->v_type != VREG) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* A zero-length write is a successful no-op. */
 	if (uio->uio_resid == 0) {
 		error = 0;
 		goto out;
 	}
 
 	/* Appending writes always start at the current end of file. */
 	if (ioflag & IO_APPEND)
 		uio->uio_offset = node->tn_size;
 
 	/* Note: the EFBIG returns below bypass the assertions at "out". */
 	if (uio->uio_offset + uio->uio_resid >
 	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
 		return (EFBIG);
 
 	/* Enforce the per-process file size limit. */
 	if (vp->v_type == VREG && td != NULL) {
 		PROC_LOCK(td->td_proc);
 		if (uio->uio_offset + uio->uio_resid >
 		  lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 			psignal(td->td_proc, SIGXFSZ);
 			PROC_UNLOCK(td->td_proc);
 			return (EFBIG);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 
 	/* Grow the file up front if the write ends past the current size. */
 	extended = uio->uio_offset + uio->uio_resid > node->tn_size;
 	if (extended) {
 		error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid);
 		if (error != 0)
 			goto out;
 	}
 
 	/* Transfer the data through tmpfs_mappedwrite(); stop on error or
 	 * when a call makes no progress. */
 	uobj = node->tn_reg.tn_aobj;
 	while ((resid = uio->uio_resid) > 0) {
 		if (node->tn_size <= uio->uio_offset)
 			break;
 		len = MIN(node->tn_size - uio->uio_offset, resid);
 		if (len == 0)
 			break;
 		error = tmpfs_mappedwrite(vp->v_object, uobj, len, uio);
 		if ((error != 0) || (resid == uio->uio_resid))
 			break;
 	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    (extended ? TMPFS_NODE_CHANGED : 0);
 
 	/* Drop set-user-ID/set-group-ID bits unless the credential holds
 	 * PRIV_VFS_RETAINSUGID (priv_check_cred() returns 0 in that case). */
 	if (node->tn_mode & (S_ISUID | S_ISGID)) {
 		if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID, 0))
 			node->tn_mode &= ~(S_ISUID | S_ISGID);
 	}
 
 	/* On failure, restore the original file size. */
 	if (error != 0)
 		(void)tmpfs_reg_resize(vp, oldsize);
 
 out:
 	MPASS(IMPLIES(error == 0, uio->uio_resid == 0));
 	MPASS(IMPLIES(error != 0, oldsize == node->tn_size));
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_FSYNC entry point: the only action taken is refreshing the node
  * times through tmpfs_update().  Always returns 0.
  */
 static int
 tmpfs_fsync(struct vop_fsync_args *v)
 {
 	struct vnode *target = v->a_vp;
 
 	MPASS(VOP_ISLOCKED(target, v->a_td));
 	tmpfs_update(target);
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_REMOVE entry point: remove the directory entry that refers to a_vp
  * from the directory a_dvp.
  *
  * Returns EISDIR when a_vp is a directory, EPERM when the flags of the
  * file or of the directory forbid the removal, and 0 on success.
  */
 static int
 tmpfs_remove(struct vop_remove_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode *vp = v->a_vp;
 	struct tmpfs_dirent *entry;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *parent;
 	struct tmpfs_node *tn;
 
 	MPASS(VOP_ISLOCKED(dvp, v->a_cnp->cn_thread));
 	MPASS(VOP_ISLOCKED(vp, v->a_cnp->cn_thread));
 
 	if (vp->v_type == VDIR)
 		return (EISDIR);
 
 	parent = VP_TO_TMPFS_DIR(dvp);
 	tn = VP_TO_TMPFS_NODE(vp);
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 	entry = tmpfs_dir_search(parent, tn);
 	MPASS(entry != NULL);
 
 	/* Files marked as immutable or append-only cannot be deleted. */
 	if ((tn->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
 	    (parent->tn_flags & APPEND))
 		return (EPERM);
 
 	/* Unhook the entry from the directory.  Since it names a file, the
 	 * directory's own hard link count is not affected. */
 	tmpfs_dir_detach(dvp, entry);
 
 	/* Release the entry itself.  The node it referred to stays around
 	 * until its vnode is really reclaimed. */
 	tmpfs_free_dirent(tmp, entry, TRUE);
 
 	if (tn->tn_links > 0)
 		tn->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
 		    TMPFS_NODE_MODIFIED;
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 static int
 tmpfs_link(struct vop_link_args *v)
 {
 	struct vnode *dvp = v->a_tdvp;
 	struct vnode *vp = v->a_vp;
 	struct componentname *cnp = v->a_cnp;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(dvp, cnp->cn_thread));
 	MPASS(cnp->cn_flags & HASBUF);
 	MPASS(dvp != vp); /* XXX When can this be false? */
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* XXX: Why aren't the following two tests done by the caller? */
 
 	/* Hard links of directories are forbidden. */
 	if (vp->v_type == VDIR) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* Cannot create cross-device links. */
 	if (dvp->v_mount != vp->v_mount) {
 		error = EXDEV;
 		goto out;
 	}
 
 	/* Ensure that we do not overflow the maximum number of links imposed
 	 * by the system. */
 	MPASS(node->tn_links <= LINK_MAX);
 	if (node->tn_links == LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 
 	/* We cannot create links of files marked immutable or append-only. */
 	if (node->tn_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* Allocate a new directory entry to represent the node. */
 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node,
 	    cnp->cn_nameptr, cnp->cn_namelen, &de);
 	if (error != 0)
 		goto out;
 
 	/* Insert the new directory entry into the appropriate directory. */
 	tmpfs_dir_attach(dvp, de);
 
 	/* vp link count has changed, so update node times. */
 	node->tn_status |= TMPFS_NODE_CHANGED;
 	tmpfs_update(vp);
 
 	error = 0;
 
 out:
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_RENAME entry point: rename the entry for a_fvp in directory a_fdvp
  * to the name a_tcnp in directory a_tdvp, replacing a_tvp if it exists.
  *
  * Returns EXDEV for cross-mount renames, EINVAL when no directory entry is
  * found for the source or when a directory would be moved below itself,
  * EPERM when node flags forbid the operation, and ENOTEMPTY, ENOTDIR or
  * EISDIR when the existing target is incompatible with the source.
  *
  * On every path the target vnodes are released with vput()/vrele() and the
  * source vnodes with vrele() before returning.
  */
 static int
 tmpfs_rename(struct vop_rename_args *v)
 {
 	struct vnode *fdvp = v->a_fdvp;
 	struct vnode *fvp = v->a_fvp;
 	struct componentname *fcnp = v->a_fcnp;
 	struct vnode *tdvp = v->a_tdvp;
 	struct vnode *tvp = v->a_tvp;
 	struct componentname *tcnp = v->a_tcnp;
 
 	char *newname;
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_node *fdnode;
 	struct tmpfs_node *fnode;
 	struct tmpfs_node *tnode;
 	struct tmpfs_node *tdnode;
 
 	MPASS(VOP_ISLOCKED(tdvp, tcnp->cn_thread));
 	MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp, tcnp->cn_thread)));
 	MPASS(fcnp->cn_flags & HASBUF);
 	MPASS(tcnp->cn_flags & HASBUF);
 
   	tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp);
 
 	/* Disallow cross-device renames.
 	 * XXX Why isn't this done by the caller? */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 		goto out;
 	}
 
 	tdnode = VP_TO_TMPFS_DIR(tdvp);
 
 	/* If source and target are the same file, there is nothing to do. */
 	if (fvp == tvp) {
 		error = 0;
 		goto out;
 	}
 
 	/* If we need to move the directory between entries, lock the
 	 * source so that we can safely operate on it. */
 	if (tdvp != fdvp) {
-		error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, tcnp->cn_thread);
+		error = vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0)
 			goto out;
 	}
 	fdnode = VP_TO_TMPFS_DIR(fdvp);
 	fnode = VP_TO_TMPFS_NODE(fvp);
 	de = tmpfs_dir_search(fdnode, fnode);
 
 	/* Avoid manipulating '.' and '..' entries. */
 	if (de == NULL) {
 		MPASS(fvp->v_type == VDIR);
 		error = EINVAL;
 		goto out_locked;
 	}
 	MPASS(de->td_node == fnode);
 
 	/* If re-naming a directory to another preexisting directory
 	 * ensure that the target directory is empty so that its
 	 * removal causes no side effects.
 	 * Kern_rename guarantees the destination to be a directory
 	 * if the source is one. */
 	if (tvp != NULL) {
 		MPASS(tnode != NULL);
 
 		if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 		    (tdnode->tn_flags & (APPEND | IMMUTABLE))) {
 			error = EPERM;
 			goto out_locked;
 		}
 
 		if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) {
 			if (tnode->tn_size > 0) {
 				error = ENOTEMPTY;
 				goto out_locked;
 			}
 		} else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) {
 			error = ENOTDIR;
 			goto out_locked;
 		} else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) {
 			error = EISDIR;
 			goto out_locked;
 		} else {
 			MPASS(fnode->tn_type != VDIR &&
 				tnode->tn_type != VDIR);
 		}
 	}
 
 	/* The source node and its directory must allow the removal too. */
 	if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (fdnode->tn_flags & (APPEND | IMMUTABLE))) {
 		error = EPERM;
 		goto out_locked;
 	}
 
 	/* Ensure that we have enough memory to hold the new name, if it
 	 * has to be changed. */
 	if (fcnp->cn_namelen != tcnp->cn_namelen ||
 	    memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) {
 		newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK);
 	} else
 		newname = NULL;
 
 	/* If the node is being moved to another directory, we have to do
 	 * the move. */
 	if (fdnode != tdnode) {
 		/* In case we are moving a directory, we have to adjust its
 		 * parent to point to the new parent. */
 		if (de->td_node->tn_type == VDIR) {
 			struct tmpfs_node *n;
 
 			/* Ensure the target directory is not a child of the
 			 * directory being moved.  Otherwise, we'd end up
 			 * with stale nodes. */
 			n = tdnode;
 			/* Walk up until a node that is its own parent. */
 			while (n != n->tn_dir.tn_parent) {
 				if (n == fnode) {
 					error = EINVAL;
 					if (newname != NULL)
 						    free(newname, M_TMPFSNAME);
 					goto out_locked;
 				}
 				n = n->tn_dir.tn_parent;
 			}
 
 			/* Adjust the parent pointer. */
 			TMPFS_VALIDATE_DIR(fnode);
 			de->td_node->tn_dir.tn_parent = tdnode;
 
 			/* As a result of changing the target of the '..'
 			 * entry, the link count of the source and target
 			 * directories has to be adjusted. */
 			fdnode->tn_links--;
 			tdnode->tn_links++;
 		}
 
 		/* Do the move: just remove the entry from the source directory
 		 * and insert it into the target one. */
 		tmpfs_dir_detach(fdvp, de);
 		tmpfs_dir_attach(tdvp, de);
 	}
 
 	/* If the name has changed, we need to make it effective by changing
 	 * it in the directory entry. */
 	if (newname != NULL) {
 		MPASS(tcnp->cn_namelen <= MAXNAMLEN);
 
 		free(de->td_name, M_TMPFSNAME);
 		de->td_namelen = (uint16_t)tcnp->cn_namelen;
 		memcpy(newname, tcnp->cn_nameptr, tcnp->cn_namelen);
 		de->td_name = newname;
 
 		fnode->tn_status |= TMPFS_NODE_CHANGED;
 		tdnode->tn_status |= TMPFS_NODE_MODIFIED;
 	}
 
 	/* If we are overwriting an entry, we have to remove the old one
 	 * from the target directory. */
 	if (tvp != NULL) {
 		/* Remove the old entry from the target directory. */
 		de = tmpfs_dir_search(tdnode, tnode);
 		tmpfs_dir_detach(tdvp, de);
 
 		/* Free the directory entry we just deleted.  Note that the
 		 * node referred by it will not be removed until the vnode is
 		 * really reclaimed. */
 		tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), de, TRUE);
 	}
 
 	error = 0;
 
 out_locked:
 	/* fdvp was locked above only when it differs from tdvp. */
 	if (fdnode != tdnode)
 		VOP_UNLOCK(fdvp, 0, tcnp->cn_thread);
 
 out:
 	/* Release target nodes. */
 	/* XXX: I don't understand when tdvp can be the same as tvp, but
 	 * other code takes care of this... */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp != NULL)
 		vput(tvp);
 
 	/* Release source nodes. */
 	vrele(fdvp);
 	vrele(fvp);
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_MKDIR entry point: create a directory inside a_dvp.  All of the
  * work is delegated to tmpfs_alloc_file().
  */
 static int
 tmpfs_mkdir(struct vop_mkdir_args *v)
 {
 	struct vattr *attrs = v->a_vap;
 
 	/* The caller is expected to request a directory. */
 	MPASS(attrs->va_type == VDIR);
 
 	return (tmpfs_alloc_file(v->a_dvp, v->a_vpp, attrs, v->a_cnp, NULL));
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_RMDIR entry point: remove the empty directory a_vp from its parent
  * directory a_dvp.
  *
  * Returns ENOTEMPTY when the directory still has entries, EPERM when the
  * flags of the directory or of its parent forbid the removal, and 0 on
  * success.  The vnode itself is not released here.
  */
 static int
 tmpfs_rmdir(struct vop_rmdir_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode *vp = v->a_vp;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(dvp, v->a_cnp->cn_thread));
 	MPASS(VOP_ISLOCKED(vp, v->a_cnp->cn_thread));
 
 	tmp = VFS_TO_TMPFS(dvp->v_mount);
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	node = VP_TO_TMPFS_DIR(vp);
 
 	/* Directories with more than two entries ('.' and '..') cannot be
 	 * removed. */
 	if (node->tn_size > 0) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 
 	/* Check flags to see if we are allowed to remove the directory.
 	 * Nothing below modifies these flags, so one check is enough. */
 	if ((dnode->tn_flags & APPEND)
 	    || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* This invariant holds only if we are not trying to remove "..". */
 	MPASS(node->tn_dir.tn_parent == dnode);
 
 	/* Get the directory entry associated with node (vp).  This was
 	 * filled by tmpfs_lookup while looking up the entry. */
 	de = tmpfs_dir_search(dnode, node);
 	MPASS(TMPFS_DIRENT_MATCHES(de,
 	    v->a_cnp->cn_nameptr,
 	    v->a_cnp->cn_namelen));
 
 	/* Detach the directory entry from the directory (dnode). */
 	tmpfs_dir_detach(dvp, de);
 
 	/* The removed directory and its parent each lose one link. */
 	node->tn_links--;
 	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
 	    TMPFS_NODE_MODIFIED;
 	node->tn_dir.tn_parent->tn_links--;
 	node->tn_dir.tn_parent->tn_status |= TMPFS_NODE_ACCESSED | \
 	    TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
 
 	/* Drop any name cache entries that involve either directory. */
 	cache_purge(dvp);
 	cache_purge(vp);
 
 	/* Free the directory entry we just deleted.  Note that the node
 	 * referred by it will not be removed until the vnode is really
 	 * reclaimed. */
 	tmpfs_free_dirent(tmp, de, TRUE);
 
 	dnode->tn_status |= TMPFS_NODE_CHANGED;
 	tmpfs_update(dvp);
 
 	error = 0;
 
 out:
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_SYMLINK entry point: create a symbolic link inside a_dvp that points
  * to a_target.  All of the work is delegated to tmpfs_alloc_file().
  */
 static int
 tmpfs_symlink(struct vop_symlink_args *v)
 {
 	struct vattr *attrs = v->a_vap;
 
 	/* Until the caller sets the type itself, force it to VLNK here. */
 #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */
 	MPASS(attrs->va_type == VLNK);
 #else
 	attrs->va_type = VLNK;
 #endif
 
 	return (tmpfs_alloc_file(v->a_dvp, v->a_vpp, attrs, v->a_cnp,
 	    v->a_target));
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_READDIR entry point: copy directory entries of a_vp into a_uio,
  * starting at the cookie stored in uio_offset.
  *
  * Returns ENOTDIR when a_vp is not a directory.  The helpers report -1
  * when they stop early without failing; that value is mapped to 0 here.
  * *a_eofflag, when provided, is set when the end of the directory was
  * reached without error.  When both a_cookies and a_ncookies are
  * provided, an array with one cookie per returned entry is allocated from
  * M_TEMP and handed back through them; nothing in this function frees it.
  */
 static int
 tmpfs_readdir(struct vop_readdir_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct uio *uio = v->a_uio;
 	int *eofflag = v->a_eofflag;
 	u_long **cookies = v->a_cookies;
 	int *ncookies = v->a_ncookies;
 
 	int error;
 	off_t startoff;
 	off_t cnt = 0;
 	struct tmpfs_node *node;
 
 	/* This operation only makes sense on directory nodes. */
 	if (vp->v_type != VDIR)
 		return ENOTDIR;
 
 	node = VP_TO_TMPFS_DIR(vp);
 
 	/* Remember where we started so the cookies can be rebuilt below. */
 	startoff = uio->uio_offset;
 
 	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) {
 		error = tmpfs_dir_getdotdent(node, uio);
 		if (error != 0)
 			goto outok;
 		cnt++;
 	}
 
 	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) {
 		error = tmpfs_dir_getdotdotdent(node, uio);
 		if (error != 0)
 			goto outok;
 		cnt++;
 	}
 
 	error = tmpfs_dir_getdents(node, uio, &cnt);
 
 outok:
 	MPASS(error >= -1);
 
 	/* -1 is not a failure: report it to the caller as success. */
 	if (error == -1)
 		error = 0;
 
 	if (eofflag != NULL)
 		*eofflag =
 		    (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF);
 
 	/* Update NFS-related variables. */
 	if (error == 0 && cookies != NULL && ncookies != NULL) {
 		off_t i;
 		off_t off = startoff;
 		struct tmpfs_dirent *de = NULL;
 
 		*ncookies = cnt;
 		/* Size the array by its element type (u_long), not off_t. */
 		*cookies = malloc(cnt * sizeof(**cookies), M_TEMP, M_WAITOK);
 
 		/* Replay the walk over the returned entries, recording for
 		 * each one the cookie of the entry that follows it. */
 		for (i = 0; i < cnt; i++) {
 			MPASS(off != TMPFS_DIRCOOKIE_EOF);
 			if (off == TMPFS_DIRCOOKIE_DOT) {
 				off = TMPFS_DIRCOOKIE_DOTDOT;
 			} else {
 				if (off == TMPFS_DIRCOOKIE_DOTDOT) {
 					de = TAILQ_FIRST(&node->tn_dir.tn_dirhead);
 				} else if (de != NULL) {
 					de = TAILQ_NEXT(de, td_entries);
 				} else {
 					de = tmpfs_dir_lookupbycookie(node,
 					    off);
 					MPASS(de != NULL);
 					de = TAILQ_NEXT(de, td_entries);
 				}
 				if (de == NULL)
 					off = TMPFS_DIRCOOKIE_EOF;
 				else
 					off = tmpfs_dircookie(de);
 			}
 
 			(*cookies)[i] = off;
 		}
 		MPASS(uio->uio_offset == off);
 	}
 
 	return error;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_READLINK entry point: copy the target of the symbolic link a_vp
  * into a_uio, limited to the space available in the uio, and mark the
  * node as accessed.  Returns the result of uiomove().
  */
 static int
 tmpfs_readlink(struct vop_readlink_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct uio *uio = v->a_uio;
 	struct tmpfs_node *link;
 	int rv;
 
 	MPASS(uio->uio_offset == 0);
 	MPASS(vp->v_type == VLNK);
 
 	link = VP_TO_TMPFS_NODE(vp);
 
 	rv = uiomove(link->tn_link, MIN(link->tn_size, uio->uio_resid), uio);
 	link->tn_status |= TMPFS_NODE_ACCESSED;
 
 	return (rv);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_INACTIVE entry point: when the node has no links left, ask for the
  * vnode to be recycled.  Always returns 0.
  */
 static int
 tmpfs_inactive(struct vop_inactive_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct thread *td = v->a_td;
 	struct tmpfs_node *tn;
 
 	MPASS(VOP_ISLOCKED(vp, td));
 
 	tn = VP_TO_TMPFS_NODE(vp);
 	if (tn->tn_links == 0)
 		vrecycle(vp, td);
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_RECLAIM entry point: tear down the association between the vnode
  * and its tmpfs node.  When the node has no links left, the node itself
  * is freed as well.  Always returns 0.
  */
 int
 tmpfs_reclaim(struct vop_reclaim_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct tmpfs_node *tn = VP_TO_TMPFS_NODE(vp);
 	struct tmpfs_mount *mnt = VFS_TO_TMPFS(vp->v_mount);
 
 	vnode_destroy_vobject(vp);
 	cache_purge(vp);
 	tmpfs_free_vp(vp);
 
 	/* A node deleted by the user could not be freed while this vnode
 	 * still referenced it; now that the vnode is being reclaimed, its
 	 * data structures can go too. */
 	if (tn->tn_links == 0)
 		tmpfs_free_node(mnt, tn);
 
 	MPASS(vp->v_data == NULL);
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_PRINT entry point: print a description of the tmpfs node behind
  * a_vp (flags, link count, mode, owner, group, size and status), plus the
  * FIFO details for FIFO vnodes.  Always returns 0.
  */
 static int
 tmpfs_print(struct vop_print_args *v)
 {
 	struct vnode *vp = v->a_vp;
 
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n",
 	    node, node->tn_flags, node->tn_links);
 	/* PRIdMAX is a signed conversion, so the size is cast to intmax_t. */
 	printf("\tmode 0%o, owner %d, group %d, size %" PRIdMAX
 	    ", status 0x%x\n",
 	    node->tn_mode, node->tn_uid, node->tn_gid,
 	    (intmax_t)node->tn_size, node->tn_status);
 
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 
 	printf("\n");
 
 	return 0;
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_PATHCONF entry point: store the value of the configuration variable
  * a_name in *a_retval.  Returns EINVAL, leaving *a_retval untouched, for
  * names that are not handled here.
  */
 static int
 tmpfs_pathconf(struct vop_pathconf_args *v)
 {
 	register_t value;
 
 	switch (v->a_name) {
 	case _PC_LINK_MAX:
 		value = LINK_MAX;
 		break;
 
 	case _PC_NAME_MAX:
 		value = NAME_MAX;
 		break;
 
 	case _PC_PATH_MAX:
 		value = PATH_MAX;
 		break;
 
 	case _PC_PIPE_BUF:
 		value = PIPE_BUF;
 		break;
 
 	case _PC_CHOWN_RESTRICTED:
 	case _PC_NO_TRUNC:
 	case _PC_SYNC_IO:
 		value = 1;
 		break;
 
 	case _PC_FILESIZEBITS:
 		value = 0; /* XXX Don't know which value should I return. */
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	*v->a_retval = value;
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_ADVLOCK entry point: hand the request to lf_advlock() together with
  * the node's lock list and its current size.
  */
 static int
 tmpfs_advlock(struct vop_advlock_args *v)
 {
 	struct tmpfs_node *tn = VP_TO_TMPFS_NODE(v->a_vp);
 
 	return (lf_advlock(v, &tn->tn_lockf, tn->tn_size));
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * VOP_VPTOFH entry point: fill the file handle a_fhp, viewed as a struct
  * tmpfs_fid, with the id and generation number of the node behind a_vp.
  * Always returns 0.
  */
 static int
 tmpfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct tmpfs_node *tn = VP_TO_TMPFS_NODE(ap->a_vp);
 	struct tmpfs_fid *fid = (struct tmpfs_fid *)ap->a_fhp;
 
 	fid->tf_len = sizeof(struct tmpfs_fid);
 	fid->tf_id = tn->tn_id;
 	fid->tf_gen = tn->tn_gen;
 
 	return (0);
 }
 
 /* --------------------------------------------------------------------- */
 
 /*
  * vnode operations vector used for files stored in a tmpfs file system.
  */
 struct vop_vector tmpfs_vnodeop_entries = {
 	.vop_default =			&default_vnodeops,
 	.vop_lookup =			vfs_cache_lookup,
 	.vop_cachedlookup =		tmpfs_lookup,
 	.vop_create =			tmpfs_create,
 	.vop_mknod =			tmpfs_mknod,
 	.vop_open =			tmpfs_open,
 	.vop_close =			tmpfs_close,
 	.vop_access =			tmpfs_access,
 	.vop_getattr =			tmpfs_getattr,
 	.vop_setattr =			tmpfs_setattr,
 	.vop_read =			tmpfs_read,
 	.vop_write =			tmpfs_write,
 	.vop_fsync =			tmpfs_fsync,
 	.vop_remove =			tmpfs_remove,
 	.vop_link =			tmpfs_link,
 	.vop_rename =			tmpfs_rename,
 	.vop_mkdir =			tmpfs_mkdir,
 	.vop_rmdir =			tmpfs_rmdir,
 	.vop_symlink =			tmpfs_symlink,
 	.vop_readdir =			tmpfs_readdir,
 	.vop_readlink =			tmpfs_readlink,
 	.vop_inactive =			tmpfs_inactive,
 	.vop_reclaim =			tmpfs_reclaim,
 	.vop_print =			tmpfs_print,
 	.vop_pathconf =			tmpfs_pathconf,
 	.vop_advlock =			tmpfs_advlock,
 	.vop_vptofh =			tmpfs_vptofh,
 	.vop_bmap =			VOP_EOPNOTSUPP,
 };
 
Index: head/sys/fs/udf/udf_vnops.c
===================================================================
--- head/sys/fs/udf/udf_vnops.c	(revision 175201)
+++ head/sys/fs/udf/udf_vnops.c	(revision 175202)
@@ -1,1218 +1,1218 @@
 /*-
  * Copyright (c) 2001, 2002 Scott Long <scottl@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /* udf_vnops.c */
 /* Take care of the vnode side of things */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/iconv.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/queue.h>
 #include <sys/unistd.h>
 #include <sys/endian.h>
 
 #include <vm/uma.h>
 
 #include <fs/udf/ecma167-udf.h>
 #include <fs/udf/osta.h>
 #include <fs/udf/udf.h>
 #include <fs/udf/udf_mount.h>
 
 extern struct iconv_functions *udf_iconv;
 
 static vop_access_t	udf_access;
 static vop_getattr_t	udf_getattr;
 static vop_open_t	udf_open;
 static vop_ioctl_t	udf_ioctl;
 static vop_pathconf_t	udf_pathconf;
 static vop_read_t	udf_read;
 static vop_readdir_t	udf_readdir;
 static vop_readlink_t	udf_readlink;
 static vop_strategy_t	udf_strategy;
 static vop_bmap_t	udf_bmap;
 static vop_cachedlookup_t	udf_lookup;
 static vop_reclaim_t	udf_reclaim;
 static vop_vptofh_t	udf_vptofh;
 static int udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data);
 static int udf_bmap_internal(struct udf_node *node, off_t offset,
     daddr_t *sector, uint32_t *max_size);
 
 /*
  * vnode operations vector installed on every UDF vnode by udf_allocv().
  * Only read-side operations are provided here; default_vnodeops is named
  * as the .vop_default vector.
  */
 static struct vop_vector udf_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		udf_access,
 	.vop_bmap =		udf_bmap,
 	.vop_cachedlookup =	udf_lookup,
 	.vop_getattr =		udf_getattr,
 	.vop_ioctl =		udf_ioctl,
 	/* vop_lookup is vfs_cache_lookup; udf_lookup is the cachedlookup hook. */
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_open =		udf_open,
 	.vop_pathconf =		udf_pathconf,
 	.vop_read =		udf_read,
 	.vop_readdir =		udf_readdir,
 	.vop_readlink =		udf_readlink,
 	.vop_reclaim =		udf_reclaim,
 	.vop_strategy =		udf_strategy,
 	.vop_vptofh =		udf_vptofh,
 };
 
 MALLOC_DEFINE(M_UDFFID, "udf_fid", "UDF FileId structure");
 MALLOC_DEFINE(M_UDFDS, "udf_ds", "UDF Dirstream structure");
 
 #define UDF_INVALID_BMAP	-1
 
 /*
  * Allocate a new vnode for mount 'mp' that uses the UDF operations vector.
  * On success the vnode is stored in *vpp and 0 is returned; otherwise the
  * getnewvnode() error is logged and returned and *vpp is left untouched.
  * 'td' is not used.
  */
 int
 udf_allocv(struct mount *mp, struct vnode **vpp, struct thread *td)
 {
 	struct vnode *newvp;
 	int rc;
 
 	rc = getnewvnode("udf", mp, &udf_vnodeops, &newvp);
 	if (rc == 0) {
 		*vpp = newvp;
 		return (0);
 	}
 
 	printf("udf_allocv: failed to allocate new vnode\n");
 	return (rc);
 }
 
 /*
  * Build a mode_t from a file entry.  The permission word holds 5 bits per
  * owner/group/user class, so the group and owner fields are shifted down
  * to close the gaps; the sticky, setgid and setuid bits are taken from the
  * ICB tag flags and shifted up into place.
  */
 static mode_t
 udf_permtomode(struct udf_node *node)
 {
 	uint32_t perm = le32toh(node->fentry->perm);
 	uint16_t flags = le16toh(node->fentry->icbtag.flags);
 
 	return ((perm & UDF_FENTRY_PERM_USER_MASK) |
 	    ((perm & UDF_FENTRY_PERM_GRP_MASK) >> 2) |
 	    ((perm & UDF_FENTRY_PERM_OWNER_MASK) >> 4) |
 	    ((flags & UDF_ICB_TAG_FLAGS_STICKY) << 4) |
 	    ((flags & UDF_ICB_TAG_FLAGS_SETGID) << 6) |
 	    ((flags & UDF_ICB_TAG_FLAGS_SETUID) << 8));
 }
 
 /*
  * VOP_ACCESS for UDF.
  *
  * Write access to directories, symlinks and regular files is refused with
  * EROFS.  Everything else is decided by vaccess() using the mode derived
  * from the file entry and the entry's owner and group.
  */
 static int
 udf_access(struct vop_access_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	mode_t a_mode, mode;
 
 	vp = a->a_vp;
 	node = VTON(vp);
 	a_mode = a->a_mode;
 
 	if (a_mode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			return (EROFS);
 			/* NOT REACHED */
 		default:
 			break;
 		}
 	}
 
 	mode = udf_permtomode(node);
 
 	/*
 	 * The ids are stored little-endian in the file entry, so convert
 	 * them to host order exactly as udf_getattr() does before they are
 	 * compared against the credential.
 	 */
 	return (vaccess(vp->v_type, mode, le32toh(node->fentry->uid),
 	    le32toh(node->fentry->gid), a_mode, a->a_cred, NULL));
 }
 
 /*
  * VOP_OPEN for UDF: create the vnode's VM object, sized to the file
  * entry's inf_len.  Always returns 0.
  */
 static int
 udf_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct udf_node *unode = VTON(vp);
 
 	vnode_create_vobject(vp, le64toh(unode->fentry->inf_len), ap->a_td);
 	return (0);
 }
 
 /*
  * Days in each month, January first.  Row 0 is a common year and row 1 a
  * leap year, so the row can be selected with udf_isaleapyear().
  */
 static int mon_lens[2][12] = {
 	{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
 	{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}
 };
 
 /*
  * Return 1 when 'year' is a leap year and 0 otherwise: divisible by 4 but
  * not by 100, or divisible by 400.
  */
 static int
 udf_isaleapyear(int year)
 {
 	int by4 = (year % 4 == 0);
 	int by100 = (year % 100 == 0);
 	int by400 = (year % 400 == 0);
 
 	return ((by4 && !by100) || by400);
 }
 
 /*
  * Convert an on-disk UDF timestamp into a struct timespec whose tv_sec is
  * counted from the start of 1970.  Timestamps with a year before 1970 or
  * a month outside 1..12 yield tv_sec == 0.
  *
  * XXX This is just a rough hack.  Daylight savings isn't calculated and tv_nsec
  * is ignored.
  * Timezone calculation compliments of Julian Elischer <julian@elischer.org>.
  */
 static void
 udf_timetotimespec(struct timestamp *time, struct timespec *t)
 {
 	int i, lpyear, daysinyear, year, startyear;
 	uint16_t type_tz;
 	union {
 		uint16_t	u_tz_offset;
 		int16_t		s_tz_offset;
 	} tz;
 
 	t->tv_nsec = 0;
 
 	/*
 	 * DirectCD seems to like using bogus year values.  The month is
 	 * checked too because it bounds the walk over mon_lens[] below.
 	 */
 	year = le16toh(time->year);
 	if (year < 1970 || time->month < 1 || time->month > 12) {
 		t->tv_sec = 0;
 		return;
 	}
 
 	/*
 	 * Time of day plus the whole days already elapsed in this month.
 	 * The day of the month starts at 1, so the first day adds nothing.
 	 */
 	t->tv_sec = time->second;
 	t->tv_sec += time->minute * 60;
 	t->tv_sec += time->hour * 3600;
 	t->tv_sec += (time->day - 1) * 3600 * 24;
 
 	/*
 	 * Whole months already elapsed in this year.  The month starts at 1
 	 * while mon_lens[] is indexed from 0 (January).
 	 */
 	lpyear = udf_isaleapyear(year);
 	for (i = 1; i < time->month; i++)
 		t->tv_sec += mon_lens[lpyear][i - 1] * 3600 * 24;
 
 	/*
 	 * Speed up the calculation: jump straight to the latest decade
 	 * boundary that is not after 'year', then add the remaining years
 	 * one by one from that boundary.
 	 */
 	startyear = 1970;
 	if (year > 1999) {
 		t->tv_sec += 946684800;		/* 1970 through 1999 */
 		startyear = 2000;
 	} else if (year > 1989) {
 		t->tv_sec += 631152000;		/* 1970 through 1989 */
 		startyear = 1990;
 	} else if (year > 1979) {
 		t->tv_sec += 315532800;		/* 1970 through 1979 */
 		startyear = 1980;
 	}
 	for (i = startyear; i < year; i++) {
 		daysinyear = udf_isaleapyear(i) + 365;
 		t->tv_sec += daysinyear * 3600 * 24;
 	}
 
 	/*
 	 * Calculate the time zone.  The timezone is 12 bit signed 2's
 	 * complement, so we gotta do some extra magic to handle it right.
 	 * The offset is in minutes; -2047 means that no offset is recorded.
 	 */
 	type_tz = le16toh(time->type_tz);
 	tz.u_tz_offset = type_tz & 0x0fff;
 	if (tz.u_tz_offset & 0x0800)
 		tz.u_tz_offset |= 0xf000;	/* extend the sign to 16 bits */
 	if ((type_tz & 0x1000) && (tz.s_tz_offset != -2047))
 		t->tv_sec -= tz.s_tz_offset * 60;
 
 	return;
 }
 
 static int
 udf_getattr(struct vop_getattr_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	struct vattr *vap;
 	struct file_entry *fentry;
 	struct timespec ts;
 
 	ts.tv_sec = 0;
 
 	vp = a->a_vp;
 	vap = a->a_vap;
 	node = VTON(vp);
 	fentry = node->fentry;
 
 	vap->va_fsid = dev2udev(node->udfmp->im_dev);
 	vap->va_fileid = node->hash_id;
 	vap->va_mode = udf_permtomode(node);
 	vap->va_nlink = le16toh(fentry->link_cnt);
 	/*
 	 * XXX The spec says that -1 is valid for uid/gid and indicates an
 	 * invalid uid/gid.  How should this be represented?
 	 */
 	vap->va_uid = (le32toh(fentry->uid) == -1) ? 0 : le32toh(fentry->uid);
 	vap->va_gid = (le32toh(fentry->gid) == -1) ? 0 : le32toh(fentry->gid);
 	udf_timetotimespec(&fentry->atime, &vap->va_atime);
 	udf_timetotimespec(&fentry->mtime, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime; /* XXX Stored as an Extended Attribute */
 	vap->va_rdev = 0; /* XXX */
 	if (vp->v_type & VDIR) {
 		/*
 		 * Directories that are recorded within their ICB will show
 		 * as having 0 blocks recorded.  Since tradition dictates
 		 * that directories consume at least one logical block,
 		 * make it appear so.
 		 */
 		if (fentry->logblks_rec != 0) {
 			vap->va_size =
 			    le64toh(fentry->logblks_rec) * node->udfmp->bsize;
 		} else {
 			vap->va_size = node->udfmp->bsize;
 		}
 	} else {
 		vap->va_size = le64toh(fentry->inf_len);
 	}
 	vap->va_flags = 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = node->udfmp->bsize;
 	vap->va_bytes = le64toh(fentry->inf_len);
 	vap->va_type = vp->v_type;
 	vap->va_filerev = 0; /* XXX */
 	return (0);
 }
 
 /*
  * File specific ioctls.  None are implemented: the call is logged and
  * ENOTTY is returned.
  */
 static int
 udf_ioctl(struct vop_ioctl_args *a)
 {
 	const int unsupported = ENOTTY;
 
 	printf("%s called\n", __func__);
 	return (unsupported);
 }
 
 /*
  * I'm not sure that this has much value in a read-only filesystem, but
  * cd9660 has it too.
  *
  * Stores the limit for a->a_name in *a->a_retval and returns 0, or returns
  * EINVAL (leaving *a->a_retval untouched) for any other name.
  */
 static int
 udf_pathconf(struct vop_pathconf_args *a)
 {
 	int limit;
 
 	switch (a->a_name) {
 	case _PC_LINK_MAX:
 		limit = 65535;
 		break;
 	case _PC_NAME_MAX:
 		limit = NAME_MAX;
 		break;
 	case _PC_PATH_MAX:
 		limit = PATH_MAX;
 		break;
 	case _PC_NO_TRUNC:
 		limit = 1;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*a->a_retval = limit;
 	return (0);
 }
 
 /*
  * Block arithmetic helpers.  'udfmp' is any pointer whose target has the
  * bshift and bmask members; 'loc' is a byte offset and 'blk' a logical
  * block number.  Every macro uses only its own arguments.
  */
 #define lblkno(udfmp, loc)	((loc) >> (udfmp)->bshift)
 #define blkoff(udfmp, loc)	((loc) & (udfmp)->bmask)
 #define lblktosize(udfmp, blk)	((blk) << (udfmp)->bshift)
 
 static int
 udf_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct udf_node *node = VTON(vp);
 	struct udf_mnt *udfmp;
 	struct buf *bp;
 	daddr_t lbn, rablock;
 	off_t diff, fsize;
 	int error = 0;
 	long size, n, on;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	fsize = le64toh(node->fentry->inf_len);
 	udfmp = node->udfmp;
 	do {
 		lbn = lblkno(udfmp, uio->uio_offset);
 		on = blkoff(udfmp, uio->uio_offset);
 		n = min((u_int)(udfmp->bsize - on),
 			uio->uio_resid);
 		diff = fsize - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = udfmp->bsize;
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			if (lblktosize(udfmp, rablock) < fsize) {
 				error = cluster_read(vp, fsize, lbn, size, NOCRED,
 					uio->uio_resid, (ap->a_ioflag >> 16), &bp);
 			} else {
 				error = bread(vp, lbn, size, NOCRED, &bp);
 			}
 		} else {
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		n = min(n, size - bp->b_resid);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Call the OSTA routines to translate the name from a CS0 dstring to a
  * 16-bit Unicode String.  Hooks need to be placed in here to translate from
  * Unicode to the encoding that the kernel/user expects.  Return the length
  * of the translated string.
  *
  * 'cs0string'/'len' describe the on-disk name; 'destname' receives the
  * NUL-terminated result.  0 is returned when decompression fails.
  * NOTE(review): destname is presumably at least MAXNAMLEN + 1 bytes, as
  * the iconv path may write MAXNAMLEN bytes plus the terminator - confirm
  * against callers.
  */
 static int
 udf_transname(char *cs0string, char *destname, int len, struct udf_mnt *udfmp)
 {
 	unicode_t *transname;
 	char *unibuf, *unip;
 	int i, destlen;
 	ssize_t unilen = 0;
 	size_t destleft = MAXNAMLEN;
 
 	/* Convert 16-bit Unicode to destname */
 	if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		unibuf = uma_zalloc(udf_zone_trans, M_WAITOK);
 		/* Remember the allocation; unibuf is advanced during conversion. */
 		unip = unibuf;
 		if ((unilen = (ssize_t)udf_UncompressUnicodeByte(len, cs0string, unibuf)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, unibuf);
 			return 0;
 		}
 
 		/* Convert until the input is used up or the output is full. */
 		while (unilen > 0 && destleft > 0) {
 			udf_iconv->conv(udfmp->im_d2l, (const char **)&unibuf,
 				(size_t *)&unilen, (char **)&destname, &destleft);
 			/* Unconverted character found */
 			if (unilen > 0 && destleft > 0) {
 				/* Emit '?' and skip the 2 input bytes. */
 				*destname++ = '?';
 				destleft--;
 				unibuf += 2;
 				unilen -= 2;
 			}
 		}
 		uma_zfree(udf_zone_trans, unip);
 		*destname = '\0';
 		/* Bytes produced = initial capacity minus what is left. */
 		destlen = MAXNAMLEN - (int)destleft;
 	} else {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		transname = uma_zalloc(udf_zone_trans, M_WAITOK);
 
 		if ((unilen = (ssize_t)udf_UncompressUnicode(len, cs0string, transname)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, transname);
 			return 0;
 		}
 
 		/* Keep characters that fit in 8 bits; replace the rest. */
 		for (i = 0; i < unilen ; i++) {
 			if (transname[i] & 0xff00) {
 				destname[i] = '.';	/* Fudge the 16bit chars */
 			} else {
 				destname[i] = transname[i] & 0xff;
 			}
 		}
 		uma_zfree(udf_zone_trans, transname);
 		destname[unilen] = 0;
 		destlen = (int)unilen;
 	}
 
 	return (destlen);
 }
 
 /*
  * Compare a CS0 dstring with a name passed in from the VFS layer.  The
  * dstring is first translated with udf_transname().  Returns 0 when the
  * translated name has the same non-zero length as 'cmpname' and the bytes
  * match, nonzero otherwise.
  */
 static int
 udf_cmpname(char *cs0string, char *cmpname, int cs0len, int cmplen, struct udf_mnt *udfmp)
 {
 	char *decoded;
 	int translen, result;
 
 	/* A translation-zone buffer is larger than needed but already exists. */
 	decoded = uma_zalloc(udf_zone_trans, M_WAITOK);
 	translen = udf_transname(cs0string, decoded, cs0len, udfmp);
 
 	/* Only names of identical, non-zero length need a byte comparison. */
 	if (translen != 0 && translen == cmplen)
 		result = bcmp(decoded, cmpname, cmplen);
 	else
 		result = -1;
 
 	uma_zfree(udf_zone_trans, decoded);
 	return (result);
 }
 
 /* Per-call state shared between udf_readdir() and udf_uiodir(). */
 struct udf_uiodir {
 	struct dirent *dirent;	/* entry to copy out next */
 	u_long *cookies;	/* next free cookie slot, or NULL if unused */
 	int ncookies;		/* capacity of the cookie array */
 	int acookies;		/* cookie counter maintained by udf_uiodir() */
 	int eofflag;		/* cleared when an entry could not be emitted */
 };
 
 /*
  * Copy one directory entry of 'de_size' bytes out to 'uio' and, when the
  * caller asked for cookies, record 'cookie' for it.
  *
  * Returns -1 with eofflag cleared when either the caller's buffer or the
  * cookie array has no room left; in that case nothing is recorded, so
  * acookies always equals the number of cookies actually stored.
  * Otherwise returns the result of uiomove().
  */
 static int
 udf_uiodir(struct udf_uiodir *uiodir, int de_size, struct uio *uio, long cookie)
 {
 	/* Check for buffer space first so that no cookie is spent in vain. */
 	if (uio->uio_resid < de_size) {
 		uiodir->eofflag = 0;
 		return (-1);
 	}
 
 	if (uiodir->cookies != NULL) {
 		if (uiodir->acookies >= uiodir->ncookies) {
 			uiodir->eofflag = 0;
 			return (-1);
 		}
 		*uiodir->cookies++ = cookie;
 		uiodir->acookies++;
 	}
 
 	return (uiomove(uiodir->dirent, de_size, uio));
 }
 
 /*
  * Allocate a zeroed directory stream for 'node' that starts at byte
  * 'offset' of a directory 'fsize' bytes long.  The stream is released
  * with udf_closedir().
  */
 static struct udf_dirstream *
 udf_opendir(struct udf_node *node, int offset, int fsize, struct udf_mnt *udfmp)
 {
 	struct udf_dirstream *stream;
 
 	stream = uma_zalloc(udf_zone_ds, M_WAITOK | M_ZERO);
 
 	stream->udfmp = udfmp;
 	stream->node = node;
 	stream->fsize = fsize;
 	stream->offset = offset;
 
 	return (stream);
 }
 
 /*
  * Return the next file identifier descriptor of the directory stream, or
  * NULL at end of directory (ds->error == 0) or on failure (ds->error set).
  *
  * A descriptor that straddles two extents is reassembled in ds->buf; that
  * buffer stays owned by the stream (fid_fragment set) until the next call
  * or udf_closedir() frees it.
  */
 static struct fileid_desc *
 udf_getfid(struct udf_dirstream *ds)
 {
 	struct fileid_desc *fid;
 	int error, frag_size = 0, total_fid_size;
 
 	/* End of directory? */
 	if (ds->offset + ds->off >= ds->fsize) {
 		ds->error = 0;
 		return (NULL);
 	}
 
 	/* Grab the first extent of the directory */
 	if (ds->off == 0) {
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			if (ds->bp != NULL) {
 				brelse(ds->bp);
 				/* Keep udf_closedir() from releasing it again. */
 				ds->bp = NULL;
 			}
 			return (NULL);
 		}
 	}
 
 	/*
 	 * Clean up from a previous fragmented FID.
 	 * XXX Is this the right place for this?
 	 */
 	if (ds->fid_fragment && ds->buf != NULL) {
 		ds->fid_fragment = 0;
 		FREE(ds->buf, M_UDFFID);
 		ds->buf = NULL;
 	}
 
 	fid = (struct fileid_desc*)&ds->data[ds->off];
 
 	/*
 	 * Check to see if the fid is fragmented. The first test
 	 * ensures that we don't wander off the end of the buffer
 	 * looking for the l_iu and l_fi fields.
 	 */
 	if (ds->off + UDF_FID_SIZE > ds->size ||
 	    ds->off + le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE > ds->size){
 
 		/* Copy what we have of the fid into a buffer */
 		frag_size = ds->size - ds->off;
 		if (frag_size >= ds->udfmp->bsize) {
 			printf("udf: invalid FID fragment\n");
 			ds->error = EINVAL;
 			return (NULL);
 		}
 
 		/*
 		 * File ID descriptors can only be at most one
 		 * logical sector in size.
 		 */
 		MALLOC(ds->buf, uint8_t*, ds->udfmp->bsize, M_UDFFID,
 		     M_WAITOK | M_ZERO);
 		/*
 		 * Mark the buffer as owned by the stream right away so that
 		 * the error exits below do not leak it.
 		 */
 		ds->fid_fragment = 1;
 		bcopy(fid, ds->buf, frag_size);
 
 		/* Reduce all of the casting magic */
 		fid = (struct fileid_desc*)ds->buf;
 
 		if (ds->bp != NULL)
 			brelse(ds->bp);
 
 		/* Fetch the next allocation */
 		ds->offset += ds->size;
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			return (NULL);
 		}
 
 		/*
 		 * If the fragment was so small that we didn't get
 		 * the l_iu and l_fi fields, copy those in.
 		 */
 		if (frag_size < UDF_FID_SIZE)
 			bcopy(ds->data, &ds->buf[frag_size],
 			    UDF_FID_SIZE - frag_size);
 
 		/*
 		 * Now that we have enough of the fid to work with,
 		 * copy in the rest of the fid from the new
 		 * allocation.
 		 */
 		total_fid_size = UDF_FID_SIZE + le16toh(fid->l_iu) + fid->l_fi;
 		if (total_fid_size > ds->udfmp->bsize) {
 			printf("udf: invalid FID\n");
 			ds->error = EIO;
 			return (NULL);
 		}
 		bcopy(ds->data, &ds->buf[frag_size],
 		    total_fid_size - frag_size);
 	} else {
 		total_fid_size = le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE;
 	}
 
 	/*
 	 * Update the offset. Align on a 4 byte boundary because the
 	 * UDF spec says so.
 	 */
 	ds->this_off = ds->off;
 	if (!ds->fid_fragment) {
 		ds->off += (total_fid_size + 3) & ~0x03;
 	} else {
 		/* Position within the new extent, past the tail just copied. */
 		ds->off = (total_fid_size - frag_size + 3) & ~0x03;
 	}
 
 	return (fid);
 }
 
 /*
  * Release everything a directory stream holds: the fragment buffer (when
  * one is owned), the current disk buffer, and the stream itself.
  */
 static void
 udf_closedir(struct udf_dirstream *ds)
 {
 
 	if (ds->buf != NULL && ds->fid_fragment)
 		FREE(ds->buf, M_UDFFID);
 
 	if (ds->bp != NULL)
 		brelse(ds->bp);
 
 	uma_zfree(udf_zone_ds, ds);
 }
 
 /*
  * VOP_READDIR for UDF directories.
  *
  * Walks the file identifier descriptors starting at uio_offset and copies
  * out one struct dirent per live entry; the parent descriptor produces
  * both "." and "..".  Running out of room in the caller's buffer (or in
  * the cookie array) is not an error: the call returns 0 with *a_eofflag
  * cleared and uio_offset left at the first descriptor that was not fully
  * delivered, so the next call resumes there.
  */
 static int
 udf_readdir(struct vop_readdir_args *a)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct dirent dir;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid;
 	struct udf_uiodir uiodir;
 	struct udf_dirstream *ds;
 	u_long *cookies = NULL;
 	int ncookies;
 	int error = 0;
 
 	vp = a->a_vp;
 	uio = a->a_uio;
 	node = VTON(vp);
 	udfmp = node->udfmp;
 	uiodir.eofflag = 1;
 
 	if (a->a_ncookies != NULL) {
 		/*
 		 * Guess how many entries are needed.  If we run out, this
 		 * function will be called again and things will pick up where
 		 * they left off.
 		 */
 		ncookies = uio->uio_resid / 8;
 		MALLOC(cookies, u_long *, sizeof(u_long) * ncookies,
 		    M_TEMP, M_WAITOK);
 		if (cookies == NULL)
 			return (ENOMEM);
 		uiodir.ncookies = ncookies;
 		uiodir.cookies = cookies;
 		uiodir.acookies = 0;
 	} else {
 		uiodir.cookies = NULL;
 	}
 
 	/*
 	 * Iterate through the file id descriptors.  Give the parent dir
 	 * entry special attention.
 	 */
 	ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
 	    node->udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("Invalid FID tag\n");
 			hexdump(fid, UDF_FID_SIZE, NULL, 0);
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			/* Do up the '.' and '..' entries.  Dummy values are
 			 * used for the cookies since the offset here is
 			 * usually zero, and NFS doesn't like that value
 			 */
 			dir.d_fileno = node->hash_id;
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_name[1] = '\0';
 			dir.d_namlen = 1;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1);
 			if (error)
 				break;
 
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_name[1] = '.';
 			dir.d_name[2] = '\0';
 			dir.d_namlen = 2;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2);
 		} else {
 			dir.d_namlen = udf_transname(&fid->data[fid->l_iu],
 			    &dir.d_name[0], fid->l_fi, udfmp);
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ?
 			    DT_DIR : DT_UNKNOWN;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio,
 			    ds->this_off);
 		}
 		if (error) {
 			/* Only genuine uiomove() failures are worth logging. */
 			if (error > 0)
 				printf("uiomove returned %d\n", error);
 			break;
 		}
 
 		/*
 		 * The descriptor was delivered in full, so a later call may
 		 * resume right behind it.  When the loop stops early the
 		 * offset stays at the descriptor that did not fit.
 		 */
 		uio->uio_offset = ds->offset + ds->off;
 	}
 
 	/* The whole directory was consumed: report the final position. */
 	if (error == 0)
 		uio->uio_offset = ds->offset + ds->off;
 
 	/*
 	 * udf_uiodir() returns -1 when it runs out of room.  That is the
 	 * normal way for a partial read to end and must not reach the caller
 	 * as an error code.
 	 */
 	if (error < 0)
 		error = 0;
 
 	/* tell the calling layer whether we need to be called again */
 	*a->a_eofflag = uiodir.eofflag;
 
 	if (!error)
 		error = ds->error;
 
 	udf_closedir(ds);
 
 	if (a->a_ncookies != NULL) {
 		if (error)
 			FREE(cookies, M_TEMP);
 		else {
 			*a->a_ncookies = uiodir.acookies;
 			*a->a_cookies = cookies;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Are there any implementations out there that do soft-links?
  * Until one shows up the call is only logged and EOPNOTSUPP is returned.
  */
 static int
 udf_readlink(struct vop_readlink_args *ap)
 {
 	const int unsupported = EOPNOTSUPP;
 
 	printf("%s called\n", __func__);
 	return (unsupported);
 }
 
 static int
 udf_strategy(struct vop_strategy_args *a)
 {
 	struct buf *bp;
 	struct vnode *vp;
 	struct udf_node *node;
 	int maxsize;
 	daddr_t sector;
 	struct bufobj *bo;
 	int multiplier;
 
 	bp = a->a_bp;
 	vp = a->a_vp;
 	node = VTON(vp);
 
 	if (bp->b_blkno == bp->b_lblkno) {
 		/*
 		 * Files that are embedded in the fentry don't translate well
 		 * to a block number.  Reject.
 		 */
 		if (udf_bmap_internal(node, bp->b_lblkno * node->udfmp->bsize,
 		    &sector, &maxsize)) {
 			clrbuf(bp);
 			bp->b_blkno = -1;
 		}
 
 		/* bmap gives sector numbers, bio works with device blocks */
 		multiplier = node->udfmp->bsize / DEV_BSIZE;
 		bp->b_blkno = sector * multiplier;
 
 	}
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	bo = node->udfmp->im_bo;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 static int
 udf_bmap(struct vop_bmap_args *a)
 {
 	struct udf_node *node;
 	uint32_t max_size;
 	daddr_t lsector;
 	int error;
 
 	node = VTON(a->a_vp);
 
 	if (a->a_bop != NULL)
 		*a->a_bop = &node->udfmp->im_devvp->v_bufobj;
 	if (a->a_bnp == NULL)
 		return (0);
 	if (a->a_runb)
 		*a->a_runb = 0;
 
 	error = udf_bmap_internal(node, a->a_bn * node->udfmp->bsize, &lsector,
 	    &max_size);
 	if (error)
 		return (error);
 
 	/* Translate logical to physical sector number */
 	*a->a_bnp = lsector << (node->udfmp->bshift - DEV_BSHIFT);
 
 	/* Punt on read-ahead for now */
 	if (a->a_runp)
 		*a->a_runp = 0;
 
 	return (0);
 }
 
 /*
  * The all powerful VOP_LOOKUP().
  *
  * Scans the directory's file identifier descriptors for the component in
  * a_cnp.  On a match the vnode is fetched with udf_vget() and stored in
  * *a_vpp.  Without a match the result is EROFS for the last component of
  * a CREATE or RENAME and ENOENT otherwise.  Errors found while reading
  * the directory are returned as they are.
  */
 static int
 udf_lookup(struct vop_cachedlookup_args *a)
 {
 	struct vnode *dvp;
 	struct vnode *tdp = NULL;
 	struct vnode **vpp = a->a_vpp;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid = NULL;
 	struct udf_dirstream *ds;
 	struct thread *td;
 	u_long nameiop;
 	u_long flags;
 	char *nameptr;
 	long namelen;
 	ino_t id = 0;
 	int offset, error = 0;
 	int numdirpasses, fsize;
 
 	dvp = a->a_dvp;
 	node = VTON(dvp);
 	udfmp = node->udfmp;
 	nameiop = a->a_cnp->cn_nameiop;
 	flags = a->a_cnp->cn_flags;
 	nameptr = a->a_cnp->cn_nameptr;
 	namelen = a->a_cnp->cn_namelen;
 	fsize = le64toh(node->fentry->inf_len);
 	td = a->a_cnp->cn_thread;
 
 	/*
 	 * If this is a LOOKUP and we've already partially searched through
 	 * the directory, pick up where we left off and flag that the
 	 * directory may need to be searched twice.  For a full description,
 	 * see /sys/fs/cd9660/cd9660_lookup.c:cd9660_lookup()
 	 */
 	if (nameiop != LOOKUP || node->diroff == 0 || node->diroff > fsize) {
 		offset = 0;
 		numdirpasses = 1;
 	} else {
 		offset = node->diroff;
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 
 lookloop:
 	ds = udf_opendir(node, offset, fsize, udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("udf_lookup: Invalid tag\n");
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		/* The parent descriptor only ever satisfies a ".." lookup. */
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			if (flags & ISDOTDOT) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		} else {
 			if (!(udf_cmpname(&fid->data[fid->l_iu],
 			    nameptr, fid->l_fi, namelen, udfmp))) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		}
 	}
 
 	if (!error)
 		error = ds->error;
 
 	/* XXX Bail out here? */
 	if (error) {
 		udf_closedir(ds);
 		return (error);
 	}
 
 	/* Did we have a match? */
 	if (id) {
 		/* For ".." the directory lock is dropped around udf_vget(). */
 		if (flags & ISDOTDOT)
 			VOP_UNLOCK(dvp, 0, a->a_cnp->cn_thread);
 		error = udf_vget(udfmp->im_mountp, id, LK_EXCLUSIVE, &tdp);
 		if (flags & ISDOTDOT)
-			vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY, a->a_cnp->cn_thread);
+			vn_lock(dvp, LK_EXCLUSIVE|LK_RETRY);
 		if (!error) {
 			/*
 			 * Remember where this entry was if it's the final
 			 * component.
 			 */
 			if ((flags & ISLASTCN) && nameiop == LOOKUP)
 				node->diroff = ds->offset + ds->off;
 			if (numdirpasses == 2)
 				nchstats.ncs_pass2++;
 			*vpp = tdp;
 			/* Put this entry in the cache */
 			if (flags & MAKEENTRY)
 				cache_enter(dvp, *vpp, a->a_cnp);
 		}
 	} else {
 		/* Name wasn't found on this pass.  Do another pass? */
 		if (numdirpasses == 2) {
 			numdirpasses--;
 			offset = 0;
 			udf_closedir(ds);
 			goto lookloop;
 		}
 
 		/*
 		 * Enter name into cache as non-existent.
 		 * NOTE(review): *vpp is not assigned on this path; this
 		 * presumably relies on the caller having cleared it - confirm.
 		 */
 		if (flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, a->a_cnp);
 
 		if ((flags & ISLASTCN) &&
 		    (nameiop == CREATE || nameiop == RENAME)) {
 			error = EROFS;
 		} else {
 			error = ENOENT;
 		}
 	}
 
 	udf_closedir(ds);
 	return (error);
 }
 
 /*
  * VOP_RECLAIM for UDF: destroy the vnode's VM object and, when a UDF node
  * is still attached, take the vnode out of the vfs hash and free the node
  * together with its file entry.  Always returns 0.
  */
 static int
 udf_reclaim(struct vop_reclaim_args *a)
 {
 	struct vnode *vp = a->a_vp;
 	struct udf_node *unode = VTON(vp);
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 
 	if (unode == NULL)
 		return (0);
 
 	vfs_hash_remove(vp);
 	if (unode->fentry != NULL)
 		FREE(unode->fentry, M_UDFFENTRY);
 	uma_zfree(udf_zone_node, unode);
 	vp->v_data = NULL;
 
 	return (0);
 }
 
 /*
  * VOP_VPTOFH for UDF: fill the caller's file handle, viewed as a struct
  * ifid, with its own size and the node's hash_id.  Always returns 0.
  */
 static int
 udf_vptofh(struct vop_vptofh_args *a)
 {
 	struct ifid *handle = (struct ifid *)a->a_fhp;
 	struct udf_node *unode = VTON(a->a_vp);
 
 	handle->ifid_ino = unode->hash_id;
 	handle->ifid_len = sizeof(*handle);
 
 	return (0);
 }
 
 /*
  * Read the block and then set the data pointer to correspond with the
  * offset passed in.  Only read in at most 'size' bytes, and then set 'size'
  * to the number of bytes pointed to.  If 'size' is zero, try to read in a
  * whole extent.
  *
  * Note that *bp may be assigned error or not.
  *
  * Returns 0 on success or the error from udf_bmap_internal() or
  * udf_readlblks().  *bp is reset to NULL on entry and stays NULL when the
  * data lives inside the file entry, so callers must test it before
  * releasing it.
  */
 static int
 udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data)
 {
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry = NULL;
 	struct buf *bp1;
 	uint32_t max_size;
 	daddr_t sector;
 	int error;
 
 	udfmp = node->udfmp;
 
 	*bp = NULL;
 	error = udf_bmap_internal(node, offset, &sector, &max_size);
 	if (error == UDF_INVALID_BMAP) {
 		/*
 		 * This error means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 *
 		 * NOTE(review): 'offset' is not applied here; *data always
 		 * points at the start of the embedded data - confirm that
 		 * callers only use offset 0 for such files.
 		 */
 		fentry = node->fentry;
 		*data = &fentry->data[le32toh(fentry->l_ea)];
 		*size = le32toh(fentry->l_ad);
 		return (0);
 	} else if (error != 0) {
 		return (error);
 	}
 
 	/* Adjust the size so that it is within range */
 	if (*size == 0 || *size > max_size)
 		*size = max_size;
 	*size = min(*size, MAXBSIZE);
 
 	/* Read from the start of the sector, then step to 'offset' in it. */
 	if ((error = udf_readlblks(udfmp, sector, *size + (offset & udfmp->bmask), bp))) {
 		printf("warning: udf_readlblks returned error %d\n", error);
 		/* note: *bp may be non-NULL */
 		return (error);
 	}
 
 	bp1 = *bp;
 	*data = (uint8_t *)&bp1->b_data[offset & udfmp->bmask];
 	return (0);
 }
 
 /*
  * Translate a file offset into a logical block and then into a physical
  * block.
  * max_size - maximum number of bytes that can be read starting from given
  * offset, rather than beginning of calculated sector number
  *
  * Returns 0 with *sector and *max_size set, UDF_INVALID_BMAP when the file
  * data is embedded in the file entry, EINVAL when the offset lies beyond
  * the recorded allocation descriptors, or ENODEV for an unsupported
  * strategy or descriptor type.
  */
 static int
 udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector,
     uint32_t *max_size)
 {
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry;
 	void *icb;
 	struct icb_tag *tag;
 	uint32_t icblen = 0;
 	daddr_t lsector;
 	int ad_offset, ad_num = 0;
 	int i, p_offset;
 
 	udfmp = node->udfmp;
 	fentry = node->fentry;
 	tag = &fentry->icbtag;
 
 	switch (le16toh(tag->strat_type)) {
 	case 4:
 		break;
 
 	case 4096:
 		printf("Cannot deal with strategy4096 yet!\n");
 		return (ENODEV);
 
 	default:
 		printf("Unknown strategy type %d\n", le16toh(tag->strat_type));
 		return (ENODEV);
 	}
 
 	switch (le16toh(tag->flags) & 0x7) {
 	case 0:
 		/*
 		 * The allocation descriptor field is filled with short_ad's.
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct short_ad) * ad_num;
 			/* The whole descriptor has to fit inside l_ad bytes. */
 			if ((uint32_t)ad_offset + sizeof(struct short_ad) >
 			    le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(short_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(short_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset  >> udfmp->bshift) +
 		    le32toh(((struct short_ad *)(icb))->pos);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 1:
 		/*
 		 * The allocation descriptor field is filled with long_ad's
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct long_ad) * ad_num;
 			/* The whole descriptor has to fit inside l_ad bytes. */
 			if ((uint32_t)ad_offset + sizeof(struct long_ad) >
 			    le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(long_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(long_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset >> udfmp->bshift) +
 		    le32toh(((struct long_ad *)(icb))->loc.lb_num);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 3:
 		/*
 		 * This type means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 */
 		*max_size = 0;
 		*sector = node->hash_id + udfmp->part_start;
 
 		return (UDF_INVALID_BMAP);
 	case 2:
 		/* DirectCD does not use extended_ad's */
 	default:
 		printf("Unsupported allocation descriptor %d\n",
 		       le16toh(tag->flags) & 0x7);
 		return (ENODEV);
 	}
 
 	*sector = lsector + udfmp->part_start;
 
 	/*
 	 * Check the sparing table.  Each entry represents the beginning of
 	 * a packet.
 	 */
 	if (udfmp->s_table != NULL) {
 		for (i = 0; i< udfmp->s_table_entries; i++) {
 			p_offset =
 			    lsector - le32toh(udfmp->s_table->entries[i].org);
 			if ((p_offset < udfmp->p_sectors) && (p_offset >= 0)) {
 				*sector =
 				   le32toh(udfmp->s_table->entries[i].map) +
 				    p_offset;
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
Index: head/sys/fs/unionfs/union_subr.c
===================================================================
--- head/sys/fs/unionfs/union_subr.c	(revision 175201)
+++ head/sys/fs/unionfs/union_subr.c	(revision 175202)
@@ -1,1093 +1,1093 @@
 /*-
  * Copyright (c) 1994 Jan-Simon Pendry
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
  * Copyright (c) 2006 Daichi Goto <daichi@freebsd.org>
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/stat.h>
 #include <sys/resourcevar.h>
 
 #ifdef MAC
 #include <sys/mac.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <fs/unionfs/union.h>
 
 MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
 MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");
 
 /*
  * Initialize
  */
 int 
 unionfs_init(struct vfsconf *vfsp)
 {
 	UNIONFSDEBUG("unionfs_init\n");	/* printed during system boot */
 	return (0);
 }
 
 /*
  * Uninitialize
  */
 int 
 unionfs_uninit(struct vfsconf *vfsp)
 {
 	return (0);
 }
 
 /*
  * Make a new or get existing unionfs node.
  * 
  * uppervp and lowervp should be unlocked. Because if new unionfs vnode is
  * locked, uppervp or lowervp is locked too. In order to prevent dead lock,
  * you should not lock plurality simultaneously.
  */
 int
 unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
 		struct vnode *lowervp, struct vnode *dvp,
 		struct vnode **vpp, struct componentname *cnp,
 		struct thread *td)
 {
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	int		error;
 	int		lkflags;
 	char	       *path;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	lkflags = (cnp ? cnp->cn_lkflags : 0);
 	path = (cnp ? cnp->cn_nameptr : NULL);
 
 	if (uppervp == NULLVP && lowervp == NULLVP)
 		panic("unionfs_nodeget: upper and lower is null");
 
 	/* If it has no ISLASTCN flag, path check is skipped. */
 	if (cnp && !(cnp->cn_flags & ISLASTCN))
 		path = NULL;
 
 	if ((uppervp == NULLVP || ump->um_uppervp != uppervp) ||
 	    (lowervp == NULLVP || ump->um_lowervp != lowervp)) {
 		if (dvp == NULLVP)
 			return (EINVAL);
 	}
 
 	/*
 	 * Do the MALLOC before the getnewvnode since doing so afterward
 	 * might cause a bogus v_data pointer to get dereferenced elsewhere
 	 * if MALLOC should block.
 	 */
 	MALLOC(unp, struct unionfs_node *, sizeof(struct unionfs_node),
 	    M_UNIONFSNODE, M_WAITOK | M_ZERO);
 
 	error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
 	if (error != 0) {
 		FREE(unp, M_UNIONFSNODE);
 		return (error);
 	}
 	error = insmntque(vp, mp);	/* XXX: Too early for mpsafe fs */
 	if (error != 0) {
 		FREE(unp, M_UNIONFSNODE);
 		return (error);
 	}
 	if (dvp != NULLVP)
 		vref(dvp);
 	if (uppervp != NULLVP)
 		vref(uppervp);
 	if (lowervp != NULLVP)
 		vref(lowervp);
 
 	unp->un_vnode = vp;
 	unp->un_uppervp = uppervp;
 	unp->un_lowervp = lowervp;
 	unp->un_dvp = dvp;
 	if (uppervp != NULLVP)
 		vp->v_vnlock = uppervp->v_vnlock;
 	else
 		vp->v_vnlock = lowervp->v_vnlock;
 
 	if (path != NULL) {
 		unp->un_path = (char *)
 		    malloc(cnp->cn_namelen +1, M_UNIONFSPATH, M_WAITOK|M_ZERO);
 		bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
 		unp->un_path[cnp->cn_namelen] = '\0';
 	}
 	vp->v_type = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);
 	vp->v_data = unp;
 
 	if ((uppervp != NULLVP && ump->um_uppervp == uppervp) &&
 	    (lowervp != NULLVP && ump->um_lowervp == lowervp))
 		vp->v_vflag |= VV_ROOT;
 
 	if (lkflags & LK_TYPE_MASK)
-		vn_lock(vp, lkflags | LK_RETRY, td);
+		vn_lock(vp, lkflags | LK_RETRY);
 
 	*vpp = vp;
 
 	return (0);
 }
 
 /*
  * Clean up the unionfs node.
  */
 void
 unionfs_noderem(struct vnode *vp, struct thread *td)
 {
 	int		vfslocked;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp, *unsp_tmp;
 	struct vnode   *lvp;
 	struct vnode   *uvp;
 
 	/*
 	 * Use the interlock to protect the clearing of v_data to
 	 * prevent faults in unionfs_lock().
 	 */
 	VI_LOCK(vp);
 	unp = VTOUNIONFS(vp);
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 	unp->un_lowervp = unp->un_uppervp = NULLVP;
 
 	vp->v_vnlock = &(vp->v_lock);
 	vp->v_data = NULL;
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(vp), td);
 	if (lvp != NULLVP)
 		VOP_UNLOCK(lvp, 0, td);
 	if (uvp != NULLVP)
 		VOP_UNLOCK(uvp, 0, td);
 	vp->v_object = NULL;
 
 	if (lvp != NULLVP) {
 		vfslocked = VFS_LOCK_GIANT(lvp->v_mount);
 		vrele(lvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (uvp != NULLVP) {
 		vfslocked = VFS_LOCK_GIANT(uvp->v_mount);
 		vrele(uvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (unp->un_dvp != NULLVP) {
 		vfslocked = VFS_LOCK_GIANT(unp->un_dvp->v_mount);
 		vrele(unp->un_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		unp->un_dvp = NULLVP;
 	}
 	if (unp->un_path) {
 		free(unp->un_path, M_UNIONFSPATH);
 		unp->un_path = NULL;
 	}
 
 	LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
 		LIST_REMOVE(unsp, uns_list);
 		free(unsp, M_TEMP);
 	}
 	FREE(unp, M_UNIONFSNODE);
 }
 
 /*
  * Get the unionfs node status.
  * You need exclusive lock this vnode.
  */
 void
 unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
 			struct unionfs_node_status **unspp)
 {
 	struct unionfs_node_status *unsp;
 
 	KASSERT(NULL != unspp, ("null pointer"));
 	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), "unionfs_get_node_status");
 
 	LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
 		if (unsp->uns_tid == td->td_tid) {
 			*unspp = unsp;
 			return;
 		}
 	}
 
 	/* create a new unionfs node status */
 	MALLOC(unsp, struct unionfs_node_status *,
 	    sizeof(struct unionfs_node_status), M_TEMP, M_WAITOK | M_ZERO);
 
 	unsp->uns_tid = td->td_tid;
 	LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
 
 	*unspp = unsp;
 }
 
 /*
  * Remove the unionfs node status, if you can.
  * You need exclusive lock this vnode.
  */
 void
 unionfs_tryrem_node_status(struct unionfs_node *unp, struct thread *td,
 			   struct unionfs_node_status *unsp)
 {
 	KASSERT(NULL != unsp, ("null pointer"));
 	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), "unionfs_get_node_status");
 
 	if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
 		return;
 
 	LIST_REMOVE(unsp, uns_list);
 	free(unsp, M_TEMP);
 }
 
 /*
  * Create upper node attr.
  */
 void
 unionfs_create_uppervattr_core(struct unionfs_mount *ump,
 			       struct vattr *lva,
 			       struct vattr *uva,
 			       struct thread *td)
 {
 	VATTR_NULL(uva);
 	uva->va_type = lva->va_type;
 	uva->va_atime = lva->va_atime;
 	uva->va_mtime = lva->va_mtime;
 	uva->va_ctime = lva->va_ctime;
 
 	switch (ump->um_copymode) {
 	case UNIONFS_TRANSPARENT:
 		uva->va_mode = lva->va_mode;
 		uva->va_uid = lva->va_uid;
 		uva->va_gid = lva->va_gid;
 		break;
 	case UNIONFS_MASQUERADE:
 		if (ump->um_uid == lva->va_uid) {
 			uva->va_mode = lva->va_mode & 077077;
 			uva->va_mode |= (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile) & 0700;
 			uva->va_uid = lva->va_uid;
 			uva->va_gid = lva->va_gid;
 		} else {
 			uva->va_mode = (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile);
 			uva->va_uid = ump->um_uid;
 			uva->va_gid = ump->um_gid;
 		}
 		break;
 	default:		/* UNIONFS_TRADITIONAL */
 		FILEDESC_SLOCK(td->td_proc->p_fd);
 		uva->va_mode = 0777 & ~td->td_proc->p_fd->fd_cmask;
 		FILEDESC_SUNLOCK(td->td_proc->p_fd);
 		uva->va_uid = ump->um_uid;
 		uva->va_gid = ump->um_gid;
 		break;
 	}
 }
 
 /*
  * Create upper node attr.
  */
 int
 unionfs_create_uppervattr(struct unionfs_mount *ump,
 			  struct vnode *lvp,
 			  struct vattr *uva,
 			  struct ucred *cred,
 			  struct thread *td)
 {
 	int		error;
 	struct vattr	lva;
 
 	if ((error = VOP_GETATTR(lvp, &lva, cred, td)))
 		return (error);
 
 	unionfs_create_uppervattr_core(ump, &lva, uva, td);
 
 	return (error);
 }
 
 /*
  * relookup
  * 
  * dvp should be locked on entry and will be locked on return.
  * 
  * If an error is returned, *vpp will be invalid, otherwise it will hold a
  * locked, referenced vnode. If *vpp == dvp then remember that only one
  * LK_EXCLUSIVE lock is held.
  */
 static int
 unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
 		 struct componentname *cnp, struct componentname *cn,
 		 struct thread *td, char *path, int pathlen, u_long nameiop)
 {
 	int	error;
 
 	cn->cn_namelen = pathlen;
 	cn->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	bcopy(path, cn->cn_pnbuf, pathlen);
 	cn->cn_pnbuf[pathlen] = '\0';
 
 	cn->cn_nameiop = nameiop;
 	cn->cn_flags = (LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | ISLASTCN);
 	cn->cn_lkflags = LK_EXCLUSIVE;
 	cn->cn_thread = td;
 	cn->cn_cred = cnp->cn_cred;
 
 	cn->cn_nameptr = cn->cn_pnbuf;
 	cn->cn_consume = cnp->cn_consume;
 
 	if (nameiop == DELETE)
 		cn->cn_flags |= (cnp->cn_flags & (DOWHITEOUT | SAVESTART));
 	else if (RENAME == nameiop)
 		cn->cn_flags |= (cnp->cn_flags & SAVESTART);
 
 	vref(dvp);
 	VOP_UNLOCK(dvp, 0, td);
 
 	if ((error = relookup(dvp, vpp, cn))) {
 		uma_zfree(namei_zone, cn->cn_pnbuf);
 		cn->cn_flags &= ~HASBUF;
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 	} else
 		vrele(dvp);
 
 	return (error);
 }
 
 /*
  * relookup for CREATE namei operation.
  *
  * dvp is unionfs vnode. dvp should be locked.
  *
  * If it called 'unionfs_copyfile' function by unionfs_link etc,
  * VOP_LOOKUP information is broken.
  * So it need relookup in order to create link etc.
  */
 int
 unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp,
 			    struct thread *td)
 {
 	int	error;
 	struct vnode *udvp;
 	struct vnode *vp;
 	struct componentname cn;
 
 	udvp = UNIONFSVPTOUPPERVP(dvp);
 	vp = NULLVP;
 
 	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
 	    strlen(cnp->cn_nameptr), CREATE);
 	if (error)
 		return (error);
 
 	if (vp != NULLVP) {
 		if (udvp == vp)
 			vrele(vp);
 		else
 			vput(vp);
 
 		error = EEXIST;
 	}
 
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 	if (!error) {
 		cn.cn_flags |= (cnp->cn_flags & HASBUF);
 		cnp->cn_flags = cn.cn_flags;
 	}
 
 	return (error);
 }
 
 /*
  * relookup for DELETE namei operation.
  *
  * dvp is unionfs vnode. dvp should be locked.
  */
 int
 unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp,
 			    struct thread *td)
 {
 	int	error;
 	struct vnode *udvp;
 	struct vnode *vp;
 	struct componentname cn;
 
 	udvp = UNIONFSVPTOUPPERVP(dvp);
 	vp = NULLVP;
 
 	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
 	    strlen(cnp->cn_nameptr), DELETE);
 	if (error)
 		return (error);
 
 	if (vp == NULLVP)
 		error = ENOENT;
 	else {
 		if (udvp == vp)
 			vrele(vp);
 		else
 			vput(vp);
 	}
 
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 	if (!error) {
 		cn.cn_flags |= (cnp->cn_flags & HASBUF);
 		cnp->cn_flags = cn.cn_flags;
 	}
 
 	return (error);
 }
 
 /*
  * relookup for RENAME namei operation.
  *
  * dvp is unionfs vnode. dvp should be locked.
  */
 int
 unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp,
 			    struct thread *td)
 {
 	int error;
 	struct vnode *udvp;
 	struct vnode *vp;
 	struct componentname cn;
 
 	udvp = UNIONFSVPTOUPPERVP(dvp);
 	vp = NULLVP;
 
 	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
 	    strlen(cnp->cn_nameptr), RENAME);
 	if (error)
 		return (error);
 
 	if (vp != NULLVP) {
 		if (udvp == vp)
 			vrele(vp);
 		else
 			vput(vp);
 	}
 
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 	if (!error) {
 		cn.cn_flags |= (cnp->cn_flags & HASBUF);
 		cnp->cn_flags = cn.cn_flags;
 	}
 
 	return (error);
 
 }
 
 /*
  * Update the unionfs_node.
  * 
  * uvp is new locked upper vnode. unionfs vnode's lock will be exchanged to the
  * uvp's lock and lower's lock will be unlocked.
  */
 static void
 unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
 		    struct thread *td)
 {
 	int		count, lockcnt;
 	struct vnode   *vp;
 	struct vnode   *lvp;
 
 	vp = UNIONFSTOV(unp);
 	lvp = unp->un_lowervp;
 
 	/*
 	 * lock update
 	 */
 	VI_LOCK(vp);
 	unp->un_uppervp = uvp;
 	vp->v_vnlock = uvp->v_vnlock;
 	lockcnt = lvp->v_vnlock->lk_exclusivecount;
 	if (lockcnt <= 0)
 		panic("unionfs: no exclusive lock");
 	VI_UNLOCK(vp);
 	for (count = 1; count < lockcnt; count++)
-		vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY, td);
+		vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
 }
 
 /*
  * Create a new shadow dir.
  * 
  * udvp should be locked on entry and will be locked on return.
  * 
  * If no error returned, unp will be updated.
  */
 int
 unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
 		    struct unionfs_node *unp, struct componentname *cnp,
 		    struct thread *td)
 {
 	int		error;
 	struct vnode   *lvp;
 	struct vnode   *uvp;
 	struct vattr	va;
 	struct vattr	lva;
 	struct componentname cn;
 	struct mount   *mp;
 	struct ucred   *cred;
 	struct ucred   *credbk;
 	struct uidinfo *rootinfo;
 
 	if (unp->un_uppervp != NULLVP)
 		return (EEXIST);
 
 	lvp = unp->un_lowervp;
 	uvp = NULLVP;
 	credbk = cnp->cn_cred;
 
 	/* Authority change to root */
 	rootinfo = uifind((uid_t)0);
 	cred = crdup(cnp->cn_cred);
 	chgproccnt(cred->cr_ruidinfo, 1, 0);
 	change_euid(cred, rootinfo);
 	change_ruid(cred, rootinfo);
 	change_svuid(cred, (uid_t)0);
 	uifree(rootinfo);
 	cnp->cn_cred = cred;
 
 	memset(&cn, 0, sizeof(cn));
 
 	if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred, td)))
 		goto unionfs_mkshadowdir_abort;
 
 	if ((error = unionfs_relookup(udvp, &uvp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, CREATE)))
 		goto unionfs_mkshadowdir_abort;
 	if (uvp != NULLVP) {
 		if (udvp == uvp)
 			vrele(uvp);
 		else
 			vput(uvp);
 
 		error = EEXIST;
 		goto unionfs_mkshadowdir_free_out;
 	}
 
 	if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH)))
 		goto unionfs_mkshadowdir_free_out;
 	if ((error = VOP_LEASE(udvp, td, cn.cn_cred, LEASE_WRITE))) {
 		vn_finished_write(mp);
 		goto unionfs_mkshadowdir_free_out;
 	}
 	unionfs_create_uppervattr_core(ump, &lva, &va, td);
 
 	error = VOP_MKDIR(udvp, &uvp, &cn, &va);
 
 	if (!error) {
 		unionfs_node_update(unp, uvp, td);
 
 		/*
 		 * XXX The bug which cannot set uid/gid was corrected.
 		 * Ignore errors.
 		 */
 		va.va_type = VNON;
 		VOP_SETATTR(uvp, &va, cn.cn_cred, td);
 	}
 	vn_finished_write(mp);
 
 unionfs_mkshadowdir_free_out:
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 unionfs_mkshadowdir_abort:
 	cnp->cn_cred = credbk;
 	chgproccnt(cred->cr_ruidinfo, -1, 0);
 	crfree(cred);
 
 	return (error);
 }
 
 /*
  * Create a new whiteout.
  * 
  * dvp should be locked on entry and will be locked on return.
  */
 int
 unionfs_mkwhiteout(struct vnode *dvp, struct componentname *cnp,
 		   struct thread *td, char *path)
 {
 	int		error;
 	struct vnode   *wvp;
 	struct componentname cn;
 	struct mount   *mp;
 
 	if (path == NULL)
 		path = cnp->cn_nameptr;
 
 	wvp = NULLVP;
 	if ((error = unionfs_relookup(dvp, &wvp, cnp, &cn, td, path, strlen(path), CREATE)))
 		return (error);
 	if (wvp != NULLVP) {
 		if (cn.cn_flags & HASBUF) {
 			uma_zfree(namei_zone, cn.cn_pnbuf);
 			cn.cn_flags &= ~HASBUF;
 		}
 		if (dvp == wvp)
 			vrele(wvp);
 		else
 			vput(wvp);
 
 		return (EEXIST);
 	}
 
 	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)))
 		goto unionfs_mkwhiteout_free_out;
 	if (!(error = VOP_LEASE(dvp, td, td->td_ucred, LEASE_WRITE)))
 		error = VOP_WHITEOUT(dvp, &cn, CREATE);
 
 	vn_finished_write(mp);
 
 unionfs_mkwhiteout_free_out:
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 	return (error);
 }
 
 /*
  * Create a new vnode for create a new shadow file.
  * 
  * If an error is returned, *vpp will be invalid, otherwise it will hold a
  * locked, referenced and opened vnode.
  * 
  * unp is never updated.
  */
 static int
 unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
 			   struct unionfs_node *unp, struct vattr *uvap,
 			   struct thread *td)
 {
 	struct unionfs_mount *ump;
 	struct vnode   *vp;
 	struct vnode   *lvp;
 	struct ucred   *cred;
 	struct vattr	lva;
 	int		fmode;
 	int		error;
 	struct componentname cn;
 
 	ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
 	vp = NULLVP;
 	lvp = unp->un_lowervp;
 	cred = td->td_ucred;
 	fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
 	error = 0;
 
 	if ((error = VOP_GETATTR(lvp, &lva, cred, td)) != 0)
 		return (error);
 	unionfs_create_uppervattr_core(ump, &lva, uvap, td);
 
 	if (unp->un_path == NULL)
 		panic("unionfs: un_path is null");
 
 	cn.cn_namelen = strlen(unp->un_path);
 	cn.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	bcopy(unp->un_path, cn.cn_pnbuf, cn.cn_namelen + 1);
 	cn.cn_nameiop = CREATE;
 	cn.cn_flags = (LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | ISLASTCN);
 	cn.cn_lkflags = LK_EXCLUSIVE;
 	cn.cn_thread = td;
 	cn.cn_cred = cred;
 	cn.cn_nameptr = cn.cn_pnbuf;
 	cn.cn_consume = 0;
 
 	vref(udvp);
 	if ((error = relookup(udvp, &vp, &cn)) != 0)
 		goto unionfs_vn_create_on_upper_free_out2;
 	vrele(udvp);
 
 	if (vp != NULLVP) {
 		if (vp == udvp)
 			vrele(vp);
 		else
 			vput(vp);
 		error = EEXIST;
 		goto unionfs_vn_create_on_upper_free_out1;
 	}
 
 	if ((error = VOP_LEASE(udvp, td, cred, LEASE_WRITE)) != 0)
 		goto unionfs_vn_create_on_upper_free_out1;
 
 	if ((error = VOP_CREATE(udvp, &vp, &cn, uvap)) != 0)
 		goto unionfs_vn_create_on_upper_free_out1;
 
 	if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) {
 		vput(vp);
 		goto unionfs_vn_create_on_upper_free_out1;
 	}
 	vp->v_writecount++;
 	*vpp = vp;
 
 unionfs_vn_create_on_upper_free_out1:
 	VOP_UNLOCK(udvp, 0, td);
 
 unionfs_vn_create_on_upper_free_out2:
 	if (cn.cn_flags & HASBUF) {
 		uma_zfree(namei_zone, cn.cn_pnbuf);
 		cn.cn_flags &= ~HASBUF;
 	}
 
 	return (error);
 }
 
 /*
  * Copy from lvp to uvp.
  * 
  * lvp and uvp should be locked and opened on entry and will be locked and
  * opened on return.
  */
 static int
 unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
 		      struct ucred *cred, struct thread *td)
 {
 	int		error;
 	off_t		offset;
 	int		count;
 	int		bufoffset;
 	char           *buf;
 	struct uio	uio;
 	struct iovec	iov;
 
 	error = 0;
 	memset(&uio, 0, sizeof(uio));
 
 	uio.uio_td = td;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_offset = 0;
 
 	if ((error = VOP_LEASE(lvp, td, cred, LEASE_READ)) != 0)
 		return (error);
 	if ((error = VOP_LEASE(uvp, td, cred, LEASE_WRITE)) != 0)
 		return (error);
 	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
 
 	while (error == 0) {
 		offset = uio.uio_offset;
 
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		iov.iov_base = buf;
 		iov.iov_len = MAXBSIZE;
 		uio.uio_resid = iov.iov_len;
 		uio.uio_rw = UIO_READ;
 
 		if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
 			break;
 		if ((count = MAXBSIZE - uio.uio_resid) == 0)
 			break;
 
 		bufoffset = 0;
 		while (bufoffset < count) {
 			uio.uio_iov = &iov;
 			uio.uio_iovcnt = 1;
 			iov.iov_base = buf + bufoffset;
 			iov.iov_len = count - bufoffset;
 			uio.uio_offset = offset + bufoffset;
 			uio.uio_resid = iov.iov_len;
 			uio.uio_rw = UIO_WRITE;
 
 			if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
 				break;
 
 			bufoffset += (count - bufoffset) - uio.uio_resid;
 		}
 
 		uio.uio_offset = offset + bufoffset;
 	}
 
 	free(buf, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Copy file from lower to upper.
  * 
  * If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to
  * docopy.
  * 
  * If no error returned, unp will be updated.
  */
 int
 unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
 		 struct thread *td)
 {
 	int		error;
 	struct mount   *mp;
 	struct vnode   *udvp;
 	struct vnode   *lvp;
 	struct vnode   *uvp;
 	struct vattr	uva;
 
 	lvp = unp->un_lowervp;
 	uvp = NULLVP;
 
 	if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (unp->un_dvp == NULLVP)
 		return (EINVAL);
 	if (unp->un_uppervp != NULLVP)
 		return (EEXIST);
 	udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp;
 	if (udvp == NULLVP)
 		return (EROFS);
 	if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 
 	error = VOP_ACCESS(lvp, VREAD, cred, td);
 	if (error != 0)
 		return (error);
 
 	if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td);
 	if (error != 0) {
 		vn_finished_write(mp);
 		return (error);
 	}
 
 	if (docopy != 0) {
 		error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
 		if (error == 0) {
 			error = unionfs_copyfile_core(lvp, uvp, cred, td);
 			VOP_CLOSE(lvp, FREAD, cred, td);
 		}
 	}
 	VOP_CLOSE(uvp, FWRITE, cred, td);
 	uvp->v_writecount--;
 
 	vn_finished_write(mp);
 
 	if (error == 0) {
 		/* Reset the attributes. Ignore errors. */
 		uva.va_type = VNON;
 		VOP_SETATTR(uvp, &uva, cred, td);
 	}
 
 	unionfs_node_update(unp, uvp, td);
 
 	return (error);
 }
 
 /*
  * It checks whether vp can rmdir. (check empty)
  *
  * vp is unionfs vnode.
  * vp should be locked.
  */
 int
 unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
 {
 	int		error;
 	int		eofflag;
 	int		lookuperr;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *tvp;
 	struct vattr	va;
 	struct componentname cn;
 	/*
 	 * The size of buf needs to be larger than DIRBLKSIZ.
 	 */
 	char		buf[256 * 6];
 	struct dirent  *dp;
 	struct dirent  *edp;
 	struct uio	uio;
 	struct iovec	iov;
 
 	ASSERT_VOP_ELOCKED(vp, "unionfs_check_rmdir");
 
 	eofflag = 0;
 	uvp = UNIONFSVPTOUPPERVP(vp);
 	lvp = UNIONFSVPTOLOWERVP(vp);
 
 	/* check opaque */
 	if ((error = VOP_GETATTR(uvp, &va, cred, td)) != 0)
 		return (error);
 	if (va.va_flags & OPAQUE)
 		return (0);
 
 	/* open vnode */
 #ifdef MAC
 	if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0)
 		return (error);
 #endif
 	if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0)
 		return (error);
 	if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0)
 		return (error);
 
 	uio.uio_rw = UIO_READ;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_td = td;
 	uio.uio_offset = 0;
 
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, lvp);
 #endif
 	while (!error && !eofflag) {
 		iov.iov_base = buf;
 		iov.iov_len = sizeof(buf);
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_resid = iov.iov_len;
 
 		error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL);
 		if (error)
 			break;
 
 		edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
 		for (dp = (struct dirent*)buf; !error && dp < edp;
 		     dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
 			if (dp->d_type == DT_WHT ||
 			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
 			    (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
 				continue;
 
 			cn.cn_namelen = dp->d_namlen;
 			cn.cn_pnbuf = NULL;
 			cn.cn_nameptr = dp->d_name;
 			cn.cn_nameiop = LOOKUP;
 			cn.cn_flags = (LOCKPARENT | LOCKLEAF | SAVENAME | RDONLY | ISLASTCN);
 			cn.cn_lkflags = LK_EXCLUSIVE;
 			cn.cn_thread = td;
 			cn.cn_cred = cred;
 			cn.cn_consume = 0;
 
 			/*
 			 * check entry in lower.
 			 * Sometimes, readdir function returns
 			 * wrong entry.
 			 */
 			lookuperr = VOP_LOOKUP(lvp, &tvp, &cn);
 
 			if (!lookuperr)
 				vput(tvp);
 			else
 				continue; /* skip entry */
 
 			/*
 			 * check entry
 			 * If it has no exist/whiteout entry in upper,
 			 * directory is not empty.
 			 */
 			cn.cn_flags = (LOCKPARENT | LOCKLEAF | SAVENAME | RDONLY | ISLASTCN);
 			lookuperr = VOP_LOOKUP(uvp, &tvp, &cn);
 
 			if (!lookuperr)
 				vput(tvp);
 
 			/* ignore exist or whiteout entry */
 			if (!lookuperr ||
 			    (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT)))
 				continue;
 
 			error = ENOTEMPTY;
 		}
 	}
 
 	/* close vnode */
 	VOP_CLOSE(vp, FREAD, cred, td);
 
 	return (error);
 }
 
 #ifdef DIAGNOSTIC
 
 struct vnode   *
 unionfs_checkuppervp(struct vnode *vp, char *fil, int lno)
 {
 	struct unionfs_node *unp;
 
 	unp = VTOUNIONFS(vp);
 
 #ifdef notyet
 	if (vp->v_op != unionfs_vnodeop_p) {
 		printf("unionfs_checkuppervp: on non-unionfs-node.\n");
 #ifdef KDB
 		kdb_enter(KDB_WHY_UNIONFS,
 		    "unionfs_checkuppervp: on non-unionfs-node.\n");
 #endif
 		panic("unionfs_checkuppervp");
 	};
 #endif
 	return (unp->un_uppervp);
 }
 
 struct vnode   *
 unionfs_checklowervp(struct vnode *vp, char *fil, int lno)
 {
 	struct unionfs_node *unp;
 
 	unp = VTOUNIONFS(vp);
 
 #ifdef notyet
 	if (vp->v_op != unionfs_vnodeop_p) {
 		printf("unionfs_checklowervp: on non-unionfs-node.\n");
 #ifdef KDB
 		kdb_enter(KDB_WHY_UNIONFS,
 		    "unionfs_checklowervp: on non-unionfs-node.\n");
 #endif
 		panic("unionfs_checklowervp");
 	};
 #endif
 	return (unp->un_lowervp);
 }
 #endif
Index: head/sys/fs/unionfs/union_vfsops.c
===================================================================
--- head/sys/fs/unionfs/union_vfsops.c	(revision 175201)
+++ head/sys/fs/unionfs/union_vfsops.c	(revision 175202)
@@ -1,562 +1,562 @@
 /*-
  * Copyright (c) 1994, 1995 The Regents of the University of California.
  * Copyright (c) 1994, 1995 Jan-Simon Pendry.
  * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
  * Copyright (c) 2006 Daichi Goto <daichi@freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)union_vfsops.c	8.20 (Berkeley) 5/20/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 
 #include <fs/unionfs/union.h>
 
 static MALLOC_DEFINE(M_UNIONFSMNT, "UNIONFS mount", "UNIONFS mount structure");
 
 static vfs_fhtovp_t	unionfs_fhtovp;
 static vfs_checkexp_t	unionfs_checkexp;
 static vfs_mount_t	unionfs_domount;
 static vfs_quotactl_t	unionfs_quotactl;
 static vfs_root_t	unionfs_root;
 static vfs_sync_t	unionfs_sync;
 static vfs_statfs_t	unionfs_statfs;
 static vfs_unmount_t	unionfs_unmount;
 static vfs_vget_t	unionfs_vget;
 static vfs_extattrctl_t	unionfs_extattrctl;
 
 static struct vfsops unionfs_vfsops;
 
 /*
  * Exchange from userland file mode to vmode.
  */
 static u_short 
 mode2vmode(mode_t mode)
 {
 	u_short		ret;
 
 	ret = 0;
 
 	/* other */
 	if (mode & S_IXOTH)
 		ret |= VEXEC >> 6;
 	if (mode & S_IWOTH)
 		ret |= VWRITE >> 6;
 	if (mode & S_IROTH)
 		ret |= VREAD >> 6;
 
 	/* group */
 	if (mode & S_IXGRP)
 		ret |= VEXEC >> 3;
 	if (mode & S_IWGRP)
 		ret |= VWRITE >> 3;
 	if (mode & S_IRGRP)
 		ret |= VREAD >> 3;
 
 	/* owner */
 	if (mode & S_IXUSR)
 		ret |= VEXEC;
 	if (mode & S_IWUSR)
 		ret |= VWRITE;
 	if (mode & S_IRUSR)
 		ret |= VREAD;
 
 	return (ret);
 }
 
 /*
  * Mount unionfs layer.
  */
 static int
 unionfs_domount(struct mount *mp, struct thread *td)
 {
 	int		error;
 	struct vnode   *lowerrootvp;
 	struct vnode   *upperrootvp;
 	struct unionfs_mount *ump;
 	char           *target;
 	char           *tmp;
 	char           *ep;
 	int		len;
 	size_t		done;
 	int		below;
 	uid_t		uid;
 	gid_t		gid;
 	u_short		udir;
 	u_short		ufile;
 	unionfs_copymode copymode;
 	unionfs_whitemode whitemode;
 	struct componentname fakecn;
 	struct nameidata nd, *ndp;
 	struct vattr	va;
 
 	UNIONFSDEBUG("unionfs_mount(mp = %p)\n", (void *)mp);
 
 	error = 0;
 	below = 0;
 	uid = 0;
 	gid = 0;
 	udir = 0;
 	ufile = 0;
 	copymode = UNIONFS_TRANSPARENT;	/* default */
 	whitemode = UNIONFS_WHITE_ALWAYS;
 	ndp = &nd;
 
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		vfs_mount_error(mp, "Cannot union mount root filesystem");
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Update is a no operation.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		vfs_mount_error(mp, "unionfs does not support mount update");
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Get argument
 	 */
 	error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len);
 	if (error)
 		error = vfs_getopt(mp->mnt_optnew, "from", (void **)&target,
 		    &len);
 	if (error || target[len - 1] != '\0') {
 		vfs_mount_error(mp, "Invalid target");
 		return (EINVAL);
 	}
 	if (vfs_getopt(mp->mnt_optnew, "below", NULL, NULL) == 0)
 		below = 1;
 	if (vfs_getopt(mp->mnt_optnew, "udir", (void **)&tmp, NULL) == 0) {
 		if (tmp != NULL)
 			udir = (mode_t)strtol(tmp, &ep, 8);
 		if (tmp == NULL || *ep) {
 			vfs_mount_error(mp, "Invalid udir");
 			return (EINVAL);
 		}
 		udir = mode2vmode(udir);
 	}
 	if (vfs_getopt(mp->mnt_optnew, "ufile", (void **)&tmp, NULL) == 0) {
 		if (tmp != NULL)
 			ufile = (mode_t)strtol(tmp, &ep, 8);
 		if (tmp == NULL || *ep) {
 			vfs_mount_error(mp, "Invalid ufile");
 			return (EINVAL);
 		}
 		ufile = mode2vmode(ufile);
 	}
 	/* check umask, uid and gid */
 	if (udir == 0 && ufile != 0)
 		udir = ufile;
 	if (ufile == 0 && udir != 0)
 		ufile = udir;
 
-	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY, td);
+	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred, td);
 	if (!error) {
 		if (udir == 0)
 			udir = va.va_mode;
 		if (ufile == 0)
 			ufile = va.va_mode;
 		uid = va.va_uid;
 		gid = va.va_gid;
 	}
 	VOP_UNLOCK(mp->mnt_vnodecovered, 0, td);
 	if (error)
 		return (error);
 
 	if (mp->mnt_cred->cr_ruid == 0) {	/* root only */
 		if (vfs_getopt(mp->mnt_optnew, "uid", (void **)&tmp,
 		    NULL) == 0) {
 			if (tmp != NULL)
 				uid = (uid_t)strtol(tmp, &ep, 10);
 			if (tmp == NULL || *ep) {
 				vfs_mount_error(mp, "Invalid uid");
 				return (EINVAL);
 			}
 		}
 		if (vfs_getopt(mp->mnt_optnew, "gid", (void **)&tmp,
 		    NULL) == 0) {
 			if (tmp != NULL)
 				gid = (gid_t)strtol(tmp, &ep, 10);
 			if (tmp == NULL || *ep) {
 				vfs_mount_error(mp, "Invalid gid");
 				return (EINVAL);
 			}
 		}
 		if (vfs_getopt(mp->mnt_optnew, "copymode", (void **)&tmp,
 		    NULL) == 0) {
 			if (tmp == NULL) {
 				vfs_mount_error(mp, "Invalid copymode");
 				return (EINVAL);
 			} else if (strcasecmp(tmp, "traditional") == 0)
 				copymode = UNIONFS_TRADITIONAL;
 			else if (strcasecmp(tmp, "transparent") == 0)
 				copymode = UNIONFS_TRANSPARENT;
 			else if (strcasecmp(tmp, "masquerade") == 0)
 				copymode = UNIONFS_MASQUERADE;
 			else {
 				vfs_mount_error(mp, "Invalid copymode");
 				return (EINVAL);
 			}
 		}
 		if (vfs_getopt(mp->mnt_optnew, "whiteout", (void **)&tmp,
 		    NULL) == 0) {
 			if (tmp == NULL) {
 				vfs_mount_error(mp, "Invalid whiteout mode");
 				return (EINVAL);
 			} else if (strcasecmp(tmp, "always") == 0)
 				whitemode = UNIONFS_WHITE_ALWAYS;
 			else if (strcasecmp(tmp, "whenneeded") == 0)
 				whitemode = UNIONFS_WHITE_WHENNEEDED;
 			else {
 				vfs_mount_error(mp, "Invalid whiteout mode");
 				return (EINVAL);
 			}
 		}
 	}
 	/* If copymode is UNIONFS_TRADITIONAL, uid/gid is mounted user. */
 	if (copymode == UNIONFS_TRADITIONAL) {
 		uid = mp->mnt_cred->cr_ruid;
 		gid = mp->mnt_cred->cr_rgid;
 	}
 
 	UNIONFSDEBUG("unionfs_mount: uid=%d, gid=%d\n", uid, gid);
 	UNIONFSDEBUG("unionfs_mount: udir=0%03o, ufile=0%03o\n", udir, ufile);
 	UNIONFSDEBUG("unionfs_mount: copymode=%d\n", copymode);
 
 	/*
 	 * Find upper node
 	 */
 	NDINIT(ndp, LOOKUP, FOLLOW | WANTPARENT | LOCKLEAF, UIO_SYSSPACE, target, td);
 	if ((error = namei(ndp)))
 		return (error);
 
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 
 	/* get root vnodes */
 	lowerrootvp = mp->mnt_vnodecovered;
 	upperrootvp = ndp->ni_vp;
 
 	vrele(ndp->ni_dvp);
 	ndp->ni_dvp = NULLVP;
 
 	/* create unionfs_mount */
 	ump = (struct unionfs_mount *)malloc(sizeof(struct unionfs_mount),
 	    M_UNIONFSMNT, M_WAITOK | M_ZERO);
 
 	/*
 	 * Save reference
 	 */
 	if (below) {
 		VOP_UNLOCK(upperrootvp, 0, td);
-		vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY);
 		ump->um_lowervp = upperrootvp;
 		ump->um_uppervp = lowerrootvp;
 	} else {
 		ump->um_lowervp = lowerrootvp;
 		ump->um_uppervp = upperrootvp;
 	}
 	ump->um_rootvp = NULLVP;
 	ump->um_uid = uid;
 	ump->um_gid = gid;
 	ump->um_udir = udir;
 	ump->um_ufile = ufile;
 	ump->um_copymode = copymode;
 	ump->um_whitemode = whitemode;
 
 	MNT_ILOCK(mp);
 	if ((lowerrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE) &&
 	    (upperrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE))
 		mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 	mp->mnt_data = ump;
 
 	/*
 	 * Copy upper layer's RDONLY flag.
 	 */
 	mp->mnt_flag |= ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY;
 
 	/*
 	 * Check whiteout
 	 */
 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 		memset(&fakecn, 0, sizeof(fakecn));
 		fakecn.cn_nameiop = LOOKUP;
 		fakecn.cn_thread = td;
 		error = VOP_WHITEOUT(ump->um_uppervp, &fakecn, LOOKUP);
 		if (error) {
 			if (below) {
 				VOP_UNLOCK(ump->um_uppervp, 0, td);
 				vrele(upperrootvp);
 			} else
 				vput(ump->um_uppervp);
 			free(ump, M_UNIONFSMNT);
 			mp->mnt_data = NULL;
 			return (error);
 		}
 	}
 
 	/*
 	 * Unlock the node
 	 */
 	VOP_UNLOCK(ump->um_uppervp, 0, td);
 
 	/*
 	 * Get the unionfs root vnode.
 	 */
 	error = unionfs_nodeget(mp, ump->um_uppervp, ump->um_lowervp,
 	    NULLVP, &(ump->um_rootvp), NULL, td);
 	vrele(upperrootvp);
 	if (error) {
 		free(ump, M_UNIONFSMNT);
 		mp->mnt_data = NULL;
 		return (error);
 	}
 
 	/*
 	 * Check mnt_flag
 	 */
 	if ((ump->um_lowervp->v_mount->mnt_flag & MNT_LOCAL) &&
 	    (ump->um_uppervp->v_mount->mnt_flag & MNT_LOCAL))
 		mp->mnt_flag |= MNT_LOCAL;
 
 	/*
 	 * Get new fsid
 	 */
 	vfs_getnewfsid(mp);
 
 	len = MNAMELEN - 1;
 	tmp = mp->mnt_stat.f_mntfromname;
 	copystr((below ? "<below>:" : "<above>:"), tmp, len, &done);
 	len -= done - 1;
 	tmp += done - 1;
 	copystr(target, tmp, len, NULL);
 
 	UNIONFSDEBUG("unionfs_mount: from %s, on %s\n",
 	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
 
 	return (0);
 }
 
 /*
  * Free reference to unionfs layer
  */
 static int
 unionfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
 	struct unionfs_mount *ump;
 	int		error;
 	int		num;
 	int		freeing;
 	int		flags;
 
 	UNIONFSDEBUG("unionfs_unmount: mp = %p\n", (void *)mp);
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	flags = 0;
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	/* vflush (no need to call vrele) */
 	for (freeing = 0; (error = vflush(mp, 1, flags, td)) != 0;) {
 		num = mp->mnt_nvnodelistsize;
 		if (num == freeing)
 			break;
 		freeing = num;
 	}
 
 	if (error)
 		return (error);
 
 	free(ump, M_UNIONFSMNT);
 	mp->mnt_data = 0;
 
 	return (0);
 }
 
 static int
 unionfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
 {
 	struct unionfs_mount *ump;
 	struct vnode   *vp;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	vp = ump->um_rootvp;
 
 	UNIONFSDEBUG("unionfs_root: rootvp=%p locked=%x\n",
 	    vp, VOP_ISLOCKED(vp, td));
 
 	vref(vp);
 	if (flags & LK_TYPE_MASK)
-		vn_lock(vp, flags, td);
+		vn_lock(vp, flags);
 
 	*vpp = vp;
 
 	return (0);
 }
 
 static int
 unionfs_quotactl(struct mount *mp, int cmd, uid_t uid, void *arg,
     struct thread *td)
 {
 	struct unionfs_mount *ump;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 
 	/*
 	 * Writing is always performed to upper vnode.
 	 */
 	return (VFS_QUOTACTL(ump->um_uppervp->v_mount, cmd, uid, arg, td));
 }
 
 static int
 unionfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 	struct unionfs_mount *ump;
 	int		error;
 	struct statfs	mstat;
 	uint64_t	lbsize;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 
 	UNIONFSDEBUG("unionfs_statfs(mp = %p, lvp = %p, uvp = %p)\n",
 	    (void *)mp, (void *)ump->um_lowervp, (void *)ump->um_uppervp);
 
 	bzero(&mstat, sizeof(mstat));
 
 	error = VFS_STATFS(ump->um_lowervp->v_mount, &mstat, td);
 	if (error)
 		return (error);
 
 	/* now copy across the "interesting" information and fake the rest */
 	sbp->f_blocks = mstat.f_blocks;
 	sbp->f_files = mstat.f_files;
 
 	lbsize = mstat.f_bsize;
 
 	error = VFS_STATFS(ump->um_uppervp->v_mount, &mstat, td);
 	if (error)
 		return (error);
 
 	/*
 	 * The FS type etc is copy from upper vfs.
 	 * (write able vfs have priority)
 	 */
 	sbp->f_type = mstat.f_type;
 	sbp->f_flags = mstat.f_flags;
 	sbp->f_bsize = mstat.f_bsize;
 	sbp->f_iosize = mstat.f_iosize;
 
 	if (mstat.f_bsize != lbsize)
 		sbp->f_blocks = ((off_t)sbp->f_blocks * lbsize) / mstat.f_bsize;
 
 	sbp->f_blocks += mstat.f_blocks;
 	sbp->f_bfree = mstat.f_bfree;
 	sbp->f_bavail = mstat.f_bavail;
 	sbp->f_files += mstat.f_files;
 	sbp->f_ffree = mstat.f_ffree;
 	return (0);
 }
 
 static int
 unionfs_sync(struct mount *mp, int waitfor, struct thread *td)
 {
 	/* nothing to do */
 	return (0);
 }
 
 static int
 unionfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 unionfs_fhtovp(struct mount *mp, struct fid *fidp, struct vnode **vpp)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 unionfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
 		 struct ucred **credanonp)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 unionfs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
     int namespace, const char *attrname, struct thread *td)
 {
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	unp = VTOUNIONFS(filename_vp);
 
 	if (unp->un_uppervp != NULLVP) {
 		return (VFS_EXTATTRCTL(ump->um_uppervp->v_mount, cmd,
 		    unp->un_uppervp, namespace, attrname, td));
 	} else {
 		return (VFS_EXTATTRCTL(ump->um_lowervp->v_mount, cmd,
 		    unp->un_lowervp, namespace, attrname, td));
 	}
 }
 
 static struct vfsops unionfs_vfsops = {
 	.vfs_checkexp =		unionfs_checkexp,
 	.vfs_extattrctl =	unionfs_extattrctl,
 	.vfs_fhtovp =		unionfs_fhtovp,
 	.vfs_init =		unionfs_init,
 	.vfs_mount =		unionfs_domount,
 	.vfs_quotactl =		unionfs_quotactl,
 	.vfs_root =		unionfs_root,
 	.vfs_statfs =		unionfs_statfs,
 	.vfs_sync =		unionfs_sync,
 	.vfs_uninit =		unionfs_uninit,
 	.vfs_unmount =		unionfs_unmount,
 	.vfs_vget =		unionfs_vget,
 };
 
 VFS_SET(unionfs_vfsops, unionfs, VFCF_LOOPBACK);
Index: head/sys/fs/unionfs/union_vnops.c
===================================================================
--- head/sys/fs/unionfs/union_vnops.c	(revision 175201)
+++ head/sys/fs/unionfs/union_vnops.c	(revision 175202)
@@ -1,2330 +1,2331 @@
 /*-
  * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry.
  * Copyright (c) 1992, 1993, 1994, 1995
  *      The Regents of the University of California.
  * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
  * Copyright (c) 2006 Daichi Goto <daichi@freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)union_vnops.c	8.32 (Berkeley) 6/23/95
  * $FreeBSD$
  *
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/kdb.h>
 #include <sys/fcntl.h>
 #include <sys/stat.h>
 #include <sys/dirent.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 
 #include <fs/unionfs/union.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vnode_pager.h>
 
 #if 0
 #define UNIONFS_INTERNAL_DEBUG(msg, args...)    printf(msg, ## args)
 #define UNIONFS_IDBG_RENAME
 #else
 #define UNIONFS_INTERNAL_DEBUG(msg, args...)
 #endif
 
 /* lockmgr lock <-> reverse table */
 struct lk_lr_table {
 	int	lock;
 	int	revlock;
 };
 
 static struct lk_lr_table un_llt[] = {
 	{LK_SHARED, LK_RELEASE},
 	{LK_EXCLUSIVE, LK_RELEASE},
 	{LK_UPGRADE, LK_DOWNGRADE},
 	{LK_DOWNGRADE, LK_UPGRADE},
 	{0, 0}
 };
 
 
 static int
 unionfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	int		iswhiteout;
 	int		lockflag;
 	int		error , uerror, lerror;
 	u_long		nameiop;
 	u_long		cnflags, cnflagsbk;
 	struct unionfs_node *dunp;
 	struct vnode   *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp;
 	struct vattr	va;
 	struct componentname *cnp;
 	struct thread  *td;
 
 	iswhiteout = 0;
 	lockflag = 0;
 	error = uerror = lerror = ENOENT;
 	cnp = ap->a_cnp;
 	nameiop = cnp->cn_nameiop;
 	cnflags = cnp->cn_flags;
 	dvp = ap->a_dvp;
 	dunp = VTOUNIONFS(dvp);
 	udvp = dunp->un_uppervp;
 	ldvp = dunp->un_lowervp;
 	vp = uvp = lvp = NULLVP;
 	td = curthread;
 	*(ap->a_vpp) = NULLVP;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/*
 	 * If read-only and op is not LOOKUP, will return EROFS.
 	 */
 	if ((cnflags & ISLASTCN) &&
 	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    LOOKUP != nameiop)
 		return (EROFS);
 
 	/*
 	 * lookup dotdot
 	 */
 	if (cnflags & ISDOTDOT) {
 		if (LOOKUP != nameiop && udvp == NULLVP)
 			return (EROFS);
 
 		if (udvp != NULLVP) {
 			dtmpvp = udvp;
 			if (ldvp != NULLVP)
 				VOP_UNLOCK(ldvp, 0, td);
 		}
 		else
 			dtmpvp = ldvp;
 
 		error = VOP_LOOKUP(dtmpvp, &vp, cnp);
 
 		if (dtmpvp == udvp && ldvp != NULLVP) {
 			VOP_UNLOCK(udvp, 0, td);
-			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		}
 
 		if (error == 0) {
 			/*
 			 * Exchange lock and reference from vp to
 			 * dunp->un_dvp. vp is upper/lower vnode, but it
 			 * will need to return the unionfs vnode.
 			 */
 			if (nameiop == DELETE  || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
 				VOP_UNLOCK(vp, 0, td);
 			vrele(vp);
 
 			VOP_UNLOCK(dvp, 0, td);
 			*(ap->a_vpp) = dunp->un_dvp;
 			vref(dunp->un_dvp);
 
 			if (nameiop == DELETE || nameiop == RENAME)
-				vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY, td);
+				vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY);
 			else if (cnp->cn_lkflags & LK_TYPE_MASK)
-				vn_lock(dunp->un_dvp, cnp->cn_lkflags | LK_RETRY, td);
+				vn_lock(dunp->un_dvp, cnp->cn_lkflags |
+				    LK_RETRY);
 
-			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		} else if (error == ENOENT && (cnflags & MAKEENTRY) &&
 		    nameiop != CREATE)
 			cache_enter(dvp, NULLVP, cnp);
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error);
 
 		return (error);
 	}
 
 	/*
 	 * lookup upper layer
 	 */
 	if (udvp != NULLVP) {
 		uerror = VOP_LOOKUP(udvp, &uvp, cnp);
 
 		if (uerror == 0) {
 			if (udvp == uvp) {	/* is dot */
 				vrele(uvp);
 				*(ap->a_vpp) = dvp;
 				vref(dvp);
 
 				UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", uerror);
 
 				return (uerror);
 			}
 			if (nameiop == DELETE || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
 				VOP_UNLOCK(uvp, 0, td);
 		}
 
 		/* check whiteout */
 		if (uerror == ENOENT || uerror == EJUSTRETURN)
 			if (cnp->cn_flags & ISWHITEOUT)
 				iswhiteout = 1;	/* don't lookup lower */
 		if (iswhiteout == 0 && ldvp != NULLVP)
 			if (VOP_GETATTR(udvp, &va, cnp->cn_cred, td) == 0 &&
 			    (va.va_flags & OPAQUE))
 				iswhiteout = 1;	/* don't lookup lower */
 #if 0
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr);
 #endif
 	}
 
 	/*
 	 * lookup lower layer
 	 */
 	if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) {
 		/* always op is LOOKUP */
 		cnp->cn_nameiop = LOOKUP;
 		cnflagsbk = cnp->cn_flags;
 		cnp->cn_flags = cnflags;
 
 		lerror = VOP_LOOKUP(ldvp, &lvp, cnp);
 
 		cnp->cn_nameiop = nameiop;
 		if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN))
 			cnp->cn_flags = cnflagsbk;
 
 		if (lerror == 0) {
 			if (ldvp == lvp) {	/* is dot */
 				if (uvp != NULLVP)
 					vrele(uvp);	/* no need? */
 				vrele(lvp);
 				*(ap->a_vpp) = dvp;
 				vref(dvp);
 
 				UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", lerror);
 
 				return (lerror);
 			}
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
 				VOP_UNLOCK(lvp, 0, td);
 		}
 	}
 
 	/*
 	 * check lookup result
 	 */
 	if (uvp == NULLVP && lvp == NULLVP) {
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n",
 		    (udvp != NULLVP ? uerror : lerror));
 		return (udvp != NULLVP ? uerror : lerror);
 	}
 
 	/*
 	 * check vnode type
 	 */
 	if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) {
 		vrele(lvp);
 		lvp = NULLVP;
 	}
 
 	/*
 	 * check shadow dir
 	 */
 	if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP &&
 	    lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR &&
 	    !(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) {
 		/* get unionfs vnode in order to create a new shadow dir. */
 		error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp,
 		    cnp, td);
 		if (error != 0)
 			goto unionfs_lookup_out;
 
 		if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK))
 			VOP_UNLOCK(vp, 0, td);
 		if (LK_EXCLUSIVE != VOP_ISLOCKED(vp, td)) {
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			lockflag = 1;
 		}
 		error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount),
 		    udvp, VTOUNIONFS(vp), cnp, td);
 		if (lockflag != 0)
 			VOP_UNLOCK(vp, 0, td);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir.");
 			if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE)
 				vput(vp);
 			else
 				vrele(vp);
 			goto unionfs_lookup_out;
 		}
 		if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED)
-			vn_lock(vp, LK_SHARED | LK_RETRY, td);
+			vn_lock(vp, LK_SHARED | LK_RETRY);
 	}
 	/*
 	 * get unionfs vnode.
 	 */
 	else {
 		if (uvp != NULLVP)
 			error = uerror;
 		else
 			error = lerror;
 		if (error != 0)
 			goto unionfs_lookup_out;
 		error = unionfs_nodeget(dvp->v_mount, uvp, lvp, dvp, &vp,
 		    cnp, td);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create unionfs vnode.");
 			goto unionfs_lookup_out;
 		}
 		if ((nameiop == DELETE || nameiop == RENAME) &&
 		    (cnp->cn_lkflags & LK_TYPE_MASK) == 0)
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	*(ap->a_vpp) = vp;
 
 	if (cnflags & MAKEENTRY)
 		cache_enter(dvp, vp, cnp);
 
 unionfs_lookup_out:
 	if (uvp != NULLVP)
 		vrele(uvp);
 	if (lvp != NULLVP)
 		vrele(lvp);
 
 	if (error == ENOENT && (cnflags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(dvp, NULLVP, cnp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_create(struct vop_create_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *vp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n");
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 	error = EROFS;
 
 	if (udvp != NULLVP) {
 		if ((error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap)) == 0) {
 			VOP_UNLOCK(vp, 0, td);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			vrele(vp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_whiteout(struct vop_whiteout_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct vnode   *udvp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n");
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	udvp = dunp->un_uppervp;
 	error = EOPNOTSUPP;
 
 	if (udvp != NULLVP) {
 		switch (ap->a_flags) {
 		case CREATE:
 		case DELETE:
 		case LOOKUP:
 			error = VOP_WHITEOUT(udvp, cnp, ap->a_flags);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_mknod(struct vop_mknod_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *vp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n");
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 	error = EROFS;
 
 	if (udvp != NULLVP) {
 		if ((error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap)) == 0) {
 			VOP_UNLOCK(vp, 0, td);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			vrele(vp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_open(struct vop_open_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *targetvp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n");
 
 	error = 0;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	targetvp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	unionfs_get_node_status(unp, td, &unsp);
 
 	if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) {
 		/* vnode is already opend. */
 		if (unsp->uns_upper_opencnt > 0)
 			targetvp = uvp;
 		else
 			targetvp = lvp;
 
 		if (targetvp == lvp &&
 		    (ap->a_mode & FWRITE) && lvp->v_type == VREG)
 			targetvp = NULLVP;
 	}
 	if (targetvp == NULLVP) {
 		if (uvp == NULLVP) {
 			if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) {
 				error = unionfs_copyfile(unp,
 				    !(ap->a_mode & O_TRUNC), cred, td);
 				if (error != 0)
 					goto unionfs_open_abort;
 				targetvp = uvp = unp->un_uppervp;
 			} else
 				targetvp = lvp;
 		} else
 			targetvp = uvp;
 	}
 
 	error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp);
 	if (error == 0) {
 		if (targetvp == uvp) {
 			if (uvp->v_type == VDIR && lvp != NULLVP &&
 			    unsp->uns_lower_opencnt <= 0) {
 				/* open lower for readdir */
 				error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
 				if (error != 0) {
 					VOP_CLOSE(uvp, ap->a_mode, cred, td);
 					goto unionfs_open_abort;
 				}
 				unsp->uns_node_flag |= UNS_OPENL_4_READDIR;
 				unsp->uns_lower_opencnt++;
 			}
 			unsp->uns_upper_opencnt++;
 		} else {
 			unsp->uns_lower_opencnt++;
 			unsp->uns_lower_openmode = ap->a_mode;
 		}
 		ap->a_vp->v_object = targetvp->v_object;
 	}
 
 unionfs_open_abort:
 	if (error != 0)
 		unionfs_tryrem_node_status(unp, td, unsp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_close(struct vop_close_args *ap)
 {
 	int		error;
 	int		locked;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct ucred   *cred;
 	struct thread  *td;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n");
 
 	locked = 0;
 	unp = VTOUNIONFS(ap->a_vp);
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 		locked = 1;
 	}
 	unionfs_get_node_status(unp, td, &unsp);
 
 	if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) {
 #ifdef DIAGNOSTIC
 		printf("unionfs_close: warning: open count is 0\n");
 #endif
 		if (unp->un_uppervp != NULLVP)
 			ovp = unp->un_uppervp;
 		else
 			ovp = unp->un_lowervp;
 	} else if (unsp->uns_upper_opencnt > 0)
 		ovp = unp->un_uppervp;
 	else
 		ovp = unp->un_lowervp;
 
 	error = VOP_CLOSE(ovp, ap->a_fflag, cred, td);
 
 	if (error != 0)
 		goto unionfs_close_abort;
 
 	ap->a_vp->v_object = ovp->v_object;
 
 	if (ovp == unp->un_uppervp) {
 		unsp->uns_upper_opencnt--;
 		if (unsp->uns_upper_opencnt == 0) {
 			if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) {
 				VOP_CLOSE(unp->un_lowervp, FREAD, cred, td);
 				unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR;
 				unsp->uns_lower_opencnt--;
 			}
 			if (unsp->uns_lower_opencnt > 0)
 				ap->a_vp->v_object = unp->un_lowervp->v_object;
 		}
 	} else
 		unsp->uns_lower_opencnt--;
 
 unionfs_close_abort:
 	unionfs_tryrem_node_status(unp, td, unsp);
 
 	if (locked != 0)
 		VOP_UNLOCK(ap->a_vp, 0, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * Check the access mode toward shadow file/dir.
  */
 static int
 unionfs_check_corrected_access(u_short mode,
 			     struct vattr *va,
 			     struct ucred *cred)
 {
 	int		count;
 	uid_t		uid;	/* upper side vnode's uid */
 	gid_t		gid;	/* upper side vnode's gid */
 	u_short		vmode;	/* upper side vnode's mode */
 	gid_t          *gp;
 	u_short		mask;
 
 	mask = 0;
 	uid = va->va_uid;
 	gid = va->va_gid;
 	vmode = va->va_mode;
 
 	/* check owner */
 	if (cred->cr_uid == uid) {
 		if (mode & VEXEC)
 			mask |= S_IXUSR;
 		if (mode & VREAD)
 			mask |= S_IRUSR;
 		if (mode & VWRITE)
 			mask |= S_IWUSR;
 		return ((vmode & mask) == mask ? 0 : EACCES);
 	}
 
 	/* check group */
 	count = 0;
 	gp = cred->cr_groups;
 	for (; count < cred->cr_ngroups; count++, gp++) {
 		if (gid == *gp) {
 			if (mode & VEXEC)
 				mask |= S_IXGRP;
 			if (mode & VREAD)
 				mask |= S_IRGRP;
 			if (mode & VWRITE)
 				mask |= S_IWGRP;
 			return ((vmode & mask) == mask ? 0 : EACCES);
 		}
 	}
 
 	/* check other */
 	if (mode & VEXEC)
 		mask |= S_IXOTH;
 	if (mode & VREAD)
 		mask |= S_IROTH;
 	if (mode & VWRITE)
 		mask |= S_IWOTH;
 
 	return ((vmode & mask) == mask ? 0 : EACCES);
 }
 
 static int
 unionfs_access(struct vop_access_args *ap)
 {
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr	va;
 	int		mode;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n");
 
 	ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 	mode = ap->a_mode;
 	error = EACCES;
 
 	if ((mode & VWRITE) &&
 	    (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (ap->a_vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 
 	if (uvp != NULLVP) {
 		error = VOP_ACCESS(uvp, mode, ap->a_cred, td);
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error);
 
 		return (error);
 	}
 
 	if (lvp != NULLVP) {
 		if (mode & VWRITE) {
 			if (ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY) {
 				switch (ap->a_vp->v_type) {
 				case VREG:
 				case VDIR:
 				case VLNK:
 					return (EROFS);
 				default:
 					break;
 				}
 			} else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) {
 				/* check shadow file/dir */
 				if (ump->um_copymode != UNIONFS_TRANSPARENT) {
 					error = unionfs_create_uppervattr(ump,
 					    lvp, &va, ap->a_cred, td);
 					if (error != 0)
 						return (error);
 
 					error = unionfs_check_corrected_access(
 					    mode, &va, ap->a_cred);
 					if (error != 0)
 						return (error);
 				}
 			}
 			mode &= ~VWRITE;
 			mode |= VREAD; /* will copy to upper */
 		}
 		error = VOP_ACCESS(lvp, mode, ap->a_cred, td);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_getattr(struct vop_getattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr	va;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n");
 
 	unp = VTOUNIONFS(ap->a_vp);
 	ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 
 	if (uvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred, td)) == 0)
 			ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n",
 		    ap->a_vap->va_mode, ap->a_vap->va_uid,
 		    ap->a_vap->va_gid, error);
 
 		return (error);
 	}
 
 	error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred, td);
 
 	if (error == 0 && !(ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY)) {
 		/* correct the attr toward shadow file/dir. */
 		if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) {
 			unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td);
 			ap->a_vap->va_mode = va.va_mode;
 			ap->a_vap->va_uid = va.va_uid;
 			ap->a_vap->va_gid = va.va_gid;
 		}
 	}
 
 	if (error == 0)
 		ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n",
 	    ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error);
 
 	return (error);
 }
 
 /*
  * VOP_SETATTR for a unionfs vnode.
  *
  * Attribute changes other than size are rejected with EROFS on a read-only
  * unionfs mount.  A lower-only regular file is first copied to the upper
  * layer (with its data only when the requested size is non-zero); the
  * attributes are then set on the upper vnode.  EROFS is returned when no
  * upper vnode is available.
  */
 static int
 unionfs_setattr(struct vop_setattr_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct thread  *td;
 	struct vattr   *vap;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n");
 
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	td = ap->a_td;
 	vap = ap->a_vap;
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) {
 		if (vap->va_flags != VNOVAL ||
 		    vap->va_uid != (uid_t)VNOVAL ||
 		    vap->va_gid != (gid_t)VNOVAL ||
 		    vap->va_atime.tv_sec != VNOVAL ||
 		    vap->va_mtime.tv_sec != VNOVAL ||
 		    vap->va_mode != (mode_t)VNOVAL)
 			return (EROFS);
 	}
 
 	if (uvp == NULLVP && unp->un_lowervp->v_type == VREG) {
 		error = unionfs_copyfile(unp, (vap->va_size != 0),
 		    ap->a_cred, td);
 		if (error != 0)
 			return (error);
 		uvp = unp->un_uppervp;
 	}
 
 	if (uvp == NULLVP)
 		error = EROFS;
 	else
 		error = VOP_SETATTR(uvp, vap, ap->a_cred, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * VOP_READ: forward to the upper vnode when it exists, otherwise to the
  * lower vnode.
  */
 static int
 unionfs_read(struct vop_read_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *tvp;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	tvp = unp->un_uppervp;
 	if (tvp == NULLVP)
 		tvp = unp->un_lowervp;
 
 	return (VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 }
 
 /*
  * VOP_WRITE: forward to the upper vnode when it exists, otherwise to the
  * lower vnode.
  */
 static int
 unionfs_write(struct vop_write_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *tvp;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	tvp = unp->un_uppervp;
 	if (tvp == NULLVP)
 		tvp = unp->un_lowervp;
 
 	return (VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 }
 
 /*
  * VOP_LEASE: forward to the upper vnode when it exists, otherwise to the
  * lower vnode, and return that call's result.
  */
 static int
 unionfs_lease(struct vop_lease_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_lease: enter\n");
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_LEASE(vp, ap->a_td, ap->a_cred, ap->a_flag);
 
 	/* Exit trace; spelled "leave" like every other vnode op here. */
 	UNIONFS_INTERNAL_DEBUG("unionfs_lease: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * VOP_IOCTL for a unionfs vnode.
  *
  * With the unionfs vnode exclusively locked, the node status selects the
  * upper vnode when uns_upper_opencnt is non-zero and the lower vnode
  * otherwise.  Returns EBADF when the selected vnode is NULLVP, else the
  * result of VOP_IOCTL() on it (issued after the unionfs vnode is unlocked).
  */
 static int
 unionfs_ioctl(struct vop_ioctl_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n");
 
- 	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td);
+ 	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 	unp = VTOUNIONFS(ap->a_vp);
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, ap->a_td, unsp);
 	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
 
 	if (ovp == NULLVP)
 		return (EBADF);
 
 	error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag,
 	    ap->a_cred, ap->a_td);
 
 	/* Exit trace; spelled "leave" like every other vnode op here. */
 	UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * VOP_POLL for a unionfs vnode.
  *
  * With the unionfs vnode exclusively locked, the node status selects the
  * upper vnode when uns_upper_opencnt is non-zero and the lower vnode
  * otherwise.  Returns EBADF when the selected vnode is NULLVP, else the
  * result of VOP_POLL() on it.
  */
 static int
 unionfs_poll(struct vop_poll_args *ap)
 {
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *ovp;
 
- 	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, ap->a_td);
+ 	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 	unp = VTOUNIONFS(ap->a_vp);
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	/* Pick the layer based on the upper open count. */
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, ap->a_td, unsp);
 	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
 
 	if (ovp == NULLVP)
 		return (EBADF);
 
 	return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td));
 }
 
 /*
  * VOP_FSYNC for a unionfs vnode.
  *
  * The node status selects the upper vnode when uns_upper_opencnt is
  * non-zero and the lower vnode otherwise.  Returns EBADF when the selected
  * vnode is NULLVP, else the result of VOP_FSYNC() on it.
  */
 static int
 unionfs_fsync(struct vop_fsync_args *ap)
 {
 	struct unionfs_node_status *unsp;
 	struct unionfs_node *unp;
 	struct vnode   *target;
 	struct thread  *td;
 
 	td = ap->a_td;
 	unp = VTOUNIONFS(ap->a_vp);
 
 	unionfs_get_node_status(unp, td, &unsp);
 	if (unsp->uns_upper_opencnt)
 		target = unp->un_uppervp;
 	else
 		target = unp->un_lowervp;
 	unionfs_tryrem_node_status(unp, td, unsp);
 
 	if (target == NULLVP)
 		return (EBADF);
 
 	return (VOP_FSYNC(target, ap->a_waitfor, td));
 }
 
 static int
 unionfs_remove(struct vop_remove_args *ap)
 {
 	int		error;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct componentname *cnp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n");
 
 	error = 0;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	unp = VTOUNIONFS(ap->a_vp);
 	udvp = dunp->un_uppervp;
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	cnp = ap->a_cnp;
 	td = curthread;
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (uvp != NULLVP) {
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
 		error = VOP_REMOVE(udvp, uvp, cnp);
 	} else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_link(struct vop_link_args *ap)
 {
 	int		error;
 	int		needrelookup;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct componentname *cnp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n");
 
 	error = 0;
 	needrelookup = 0;
 	dunp = VTOUNIONFS(ap->a_tdvp);
 	unp = NULL;
 	udvp = dunp->un_uppervp;
 	uvp = NULLVP;
 	cnp = ap->a_cnp;
 	td = curthread;
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (ap->a_vp->v_op != &unionfs_vnodeops)
 		uvp = ap->a_vp;
 	else {
 		unp = VTOUNIONFS(ap->a_vp);
 
 		if (unp->un_uppervp == NULLVP) {
 			if (ap->a_vp->v_type != VREG)
 				return (EOPNOTSUPP);
 
 			error = unionfs_copyfile(unp, 1, cnp->cn_cred, td);
 			if (error != 0)
 				return (error);
 			needrelookup = 1;
 		}
 		uvp = unp->un_uppervp;
 	}
 
 	if (needrelookup != 0)
 		error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td);
 
 	if (error == 0)
 		error = VOP_LINK(udvp, uvp, cnp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * VOP_RENAME for unionfs.
  *
  * The four unionfs vnodes (from dir, from file, to dir, to file) are
  * translated to their upper-layer vnodes (rfdvp, rfvp, rtdvp, rtvp) and the
  * rename is delegated to VOP_RENAME() on those.
  *
  *  - Returns EXDEV when the vnodes are not all on the same mount.
  *  - Returns ENODEV when either directory has no upper vnode, or when the
  *    source has no upper vnode and is neither a regular file nor a
  *    directory.
  *  - A lower-only source is first brought to the upper layer
  *    (unionfs_copyfile() for VREG, unionfs_mkshadowdir() for VDIR), after
  *    which both component lookups are redone.
  *  - Returns EINVAL when the target has an upper vnode and is a directory.
  *
  * All failure paths go through unionfs_rename_abort, which releases the
  * references taken here as well as the vnodes passed in by the caller.
  */
 static int
 unionfs_rename(struct vop_rename_args *ap)
 {
 	int		error;
 	struct vnode   *fdvp;
 	struct vnode   *fvp;
 	struct componentname *fcnp;
 	struct vnode   *tdvp;
 	struct vnode   *tvp;
 	struct componentname *tcnp;
 	struct vnode   *ltdvp;
 	struct vnode   *ltvp;
 	struct thread  *td;
 
 	/* rename target vnodes */
 	struct vnode   *rfdvp;
 	struct vnode   *rfvp;
 	struct vnode   *rtdvp;
 	struct vnode   *rtvp;
 
 	int		needrelookup;
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n");
 
 	error = 0;
 	fdvp = ap->a_fdvp;
 	fvp = ap->a_fvp;
 	fcnp = ap->a_fcnp;
 	tdvp = ap->a_tdvp;
 	tvp = ap->a_tvp;
 	tcnp = ap->a_tcnp;
 	ltdvp = NULLVP;
 	ltvp = NULLVP;
 	td = curthread;
 	/*
 	 * The r* vnodes start out equal to the unionfs vnodes; the cleanup
 	 * code compares them to tell which ones were replaced (and
 	 * referenced) below.
 	 */
 	rfdvp = fdvp;
 	rfvp = fvp;
 	rtdvp = tdvp;
 	rtvp = tvp;
 	needrelookup = 0;
 
 #ifdef DIAGNOSTIC
 	if (!(fcnp->cn_flags & HASBUF) || !(tcnp->cn_flags & HASBUF))
 		panic("unionfs_rename: no name");
 #endif
 
 	/* check for cross device rename */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 		goto unionfs_rename_abort;
 	}
 
 	/* Renaming a file to itself has no effect. */
 	if (fvp == tvp)
 		goto unionfs_rename_abort;
 
 	/*
 	 * from/to vnode is unionfs node.
 	 */
 
 	/* from directory: must have an upper vnode */
 	unp = VTOUNIONFS(fdvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	if (unp->un_uppervp == NULLVP) {
 		error = ENODEV;
 		goto unionfs_rename_abort;
 	}
 	rfdvp = unp->un_uppervp;
 	vref(rfdvp);
 
 	/* from file: create the upper copy/shadow if it is lower-only */
 	unp = VTOUNIONFS(fvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount);
 	if (unp->un_uppervp == NULLVP) {
 		switch (fvp->v_type) {
 		case VREG:
-			if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0)
+			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td);
 			VOP_UNLOCK(fvp, 0, td);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
 		case VDIR:
-			if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0)
+			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td);
 			VOP_UNLOCK(fvp, 0, td);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
 		default:
 			error = ENODEV;
 			goto unionfs_rename_abort;
 		}
 
 		/* The upper layer changed; the lookups must be redone. */
 		needrelookup = 1;
 	}
 
 	/* A lower vnode would reappear after the rename; ask for a whiteout. */
 	if (unp->un_lowervp != NULLVP)
 		fcnp->cn_flags |= DOWHITEOUT;
 	rfvp = unp->un_uppervp;
 	vref(rfvp);
 
 	/* to directory: must have an upper vnode */
 	unp = VTOUNIONFS(tdvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	if (unp->un_uppervp == NULLVP) {
 		error = ENODEV;
 		goto unionfs_rename_abort;
 	}
 	rtdvp = unp->un_uppervp;
 	ltdvp = unp->un_lowervp;
 	vref(rtdvp);
 
 	/* to file (optional) */
 	if (tdvp == tvp) {
 		rtvp = rtdvp;
 		vref(rtvp);
 	} else if (tvp != NULLVP) {
 		unp = VTOUNIONFS(tvp);
 #ifdef UNIONFS_IDBG_RENAME
 		UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 		if (unp->un_uppervp == NULLVP)
 			rtvp = NULLVP;
 		else {
 			if (tvp->v_type == VDIR) {
 				error = EINVAL;
 				goto unionfs_rename_abort;
 			}
 			rtvp = unp->un_uppervp;
 			ltvp = unp->un_lowervp;
 			vref(rtvp);
 		}
 	}
 
 	if (needrelookup != 0) {
-		if ((error = vn_lock(fdvp, LK_EXCLUSIVE, td)) != 0)
+		if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0)
 			goto unionfs_rename_abort;
 		error = unionfs_relookup_for_delete(fdvp, fcnp, td);
 		VOP_UNLOCK(fdvp, 0, td);
 		if (error != 0)
 			goto unionfs_rename_abort;
 
 		/* The lock on tvp is released to avoid a recursive lock. */
 		if (tvp != NULLVP && tvp != tdvp)
 			VOP_UNLOCK(tvp, 0, td);
 		error = unionfs_relookup_for_rename(tdvp, tcnp, td);
 		if (tvp != NULLVP && tvp != tdvp)
-			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0)
 			goto unionfs_rename_abort;
 	}
 
 	error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp);
 
 	if (error == 0) {
 		if (rtvp != NULLVP && rtvp->v_type == VDIR)
 			cache_purge(tdvp);
 		if (fvp->v_type == VDIR && fdvp != tdvp)
 			cache_purge(fdvp);
 	}
 
 	/*
 	 * Release the unionfs vnodes whose upper counterparts were handed to
 	 * VOP_RENAME(), and unlock the lower vnodes recorded above.
 	 */
 	if (fdvp != rfdvp)
 		vrele(fdvp);
 	if (fvp != rfvp)
 		vrele(fvp);
 	if (ltdvp != NULLVP)
 		VOP_UNLOCK(ltdvp, 0, td);
 	if (tdvp != rtdvp)
 		vrele(tdvp);
 	if (ltvp != NULLVP)
 		VOP_UNLOCK(ltvp, 0, td);
 	if (tvp != rtvp && tvp != NULLVP) {
 		if (rtvp == NULLVP)
 			vput(tvp);
 		else
 			vrele(tvp);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error);
 
 	return (error);
 
 unionfs_rename_abort:
 	/* Drop the references taken on the upper vnodes so far ... */
 	if (fdvp != rfdvp)
 		vrele(rfdvp);
 	if (fvp != rfvp)
 		vrele(rfvp);
 	if (tdvp != rtdvp)
 		vrele(rtdvp);
 	/* ... then release the vnodes passed in by the caller. */
 	vput(tdvp);
 	if (tvp != rtvp && rtvp != NULLVP)
 		vrele(rtvp);
 	if (tvp != NULLVP) {
 		if (tdvp != tvp)
 			vput(tvp);
 		else
 			vrele(tvp);
 	}
 	vrele(fdvp);
 	vrele(fvp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_mkdir(struct vop_mkdir_args *ap)
 {
 	int		error;
 	int		lkflags;
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vattr	va;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n");
 
 	error = EROFS;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	lkflags = cnp->cn_lkflags;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 
 	if (udvp != NULLVP) {
 		/* check opaque */
 		if (!(cnp->cn_flags & ISWHITEOUT)) {
 			error = VOP_GETATTR(udvp, &va, cnp->cn_cred, td);
 			if (error != 0)
 				return (error);
 			if (va.va_flags & OPAQUE) 
 				cnp->cn_flags |= ISWHITEOUT;
 		}
 
 		if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) {
 			VOP_UNLOCK(uvp, 0, td);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			cnp->cn_lkflags = lkflags;
 			vrele(uvp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_rmdir(struct vop_rmdir_args *ap)
 {
 	int		error;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n");
 
 	error = 0;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	unp = VTOUNIONFS(ap->a_vp);
 	cnp = ap->a_cnp;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (udvp == uvp)
 		return (EOPNOTSUPP);
 
 	if (uvp != NULLVP) {
 		if (lvp != NULLVP) {
 			error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td);
 			if (error != 0)
 				return (error);
 		}
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
 		error = VOP_RMDIR(udvp, uvp, cnp);
 	}
 	else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path);
 
 	if (error == 0) {
 		cache_purge(ap->a_dvp);
 		cache_purge(ap->a_vp);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_symlink(struct vop_symlink_args *ap)
 {
 	int		error;
 	int		lkflags;
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n");
 
 	error = EROFS;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	lkflags = cnp->cn_lkflags;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 
 	if (udvp != NULLVP) {
 		error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target);
 		if (error == 0) {
 			VOP_UNLOCK(uvp, 0, td);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			cnp->cn_lkflags = lkflags;
 			vrele(uvp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_readdir(struct vop_readdir_args *ap)
 {
 	int		error;
 	int		eofflag;
 	int		locked;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct uio     *uio;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr    va;
 
 	int		ncookies_bk;
 	u_long         *cookies_bk;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n");
 
 	error = 0;
 	eofflag = 0;
 	locked = 0;
 	unp = VTOUNIONFS(ap->a_vp);
 	uio = ap->a_uio;
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = uio->uio_td;
 	ncookies_bk = 0;
 	cookies_bk = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* check opaque */
 	if (uvp != NULLVP && lvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, &va, ap->a_cred, td)) != 0)
 			goto unionfs_readdir_exit;
 		if (va.va_flags & OPAQUE)
 			lvp = NULLVP;
 	}
 
 	/* check the open count. unionfs needs to open before readdir. */
 	if (VOP_ISLOCKED(ap->a_vp, td) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY, td);
+		vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY);
 		locked = 1;
 	}
 	unionfs_get_node_status(unp, td, &unsp);
 	if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
 	    (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
 		unionfs_tryrem_node_status(unp, td, unsp);
 		error = EBADF;
 	}
 	if (locked == 1)
-		vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY, td);
+		vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY);
 	if (error != 0)
 		goto unionfs_readdir_exit;
 
 	/* upper only */
 	if (uvp != NULLVP && lvp == NULLVP) {
 		error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag,
 		    ap->a_ncookies, ap->a_cookies);
 		unsp->uns_readdir_status = 0;
 
 		goto unionfs_readdir_exit;
 	}
 
 	/* lower only */
 	if (uvp == NULLVP && lvp != NULLVP) {
 		error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 		    ap->a_ncookies, ap->a_cookies);
 		unsp->uns_readdir_status = 2;
 
 		goto unionfs_readdir_exit;
 	}
 
 	/*
 	 * readdir upper and lower
 	 */
 	KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp"));
 	KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp"));
 	if (uio->uio_offset == 0)
 		unsp->uns_readdir_status = 0;
 
 	if (unsp->uns_readdir_status == 0) {
 		/* read upper */
 		error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag,
 				    ap->a_ncookies, ap->a_cookies);
 
 		if (error != 0 || eofflag == 0)
 			goto unionfs_readdir_exit;
 		unsp->uns_readdir_status = 1;
 
 		/*
 		 * ufs(and other fs) needs size of uio_resid larger than
 		 * DIRBLKSIZ.
 		 * size of DIRBLKSIZ equals DEV_BSIZE.
 		 * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h)
 		 */
 		if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1)))
 			goto unionfs_readdir_exit;
 
 		/*
 		 * backup cookies
 		 * It prepares to readdir in lower.
 		 */
 		if (ap->a_ncookies != NULL) {
 			ncookies_bk = *(ap->a_ncookies);
 			*(ap->a_ncookies) = 0;
 		}
 		if (ap->a_cookies != NULL) {
 			cookies_bk = *(ap->a_cookies);
 			*(ap->a_cookies) = NULL;
 		}
 	}
 
 	/* initialize for readdir in lower */
 	if (unsp->uns_readdir_status == 1) {
 		unsp->uns_readdir_status = 2;
 		uio->uio_offset = 0;
 	}
 
 	if (lvp == NULLVP) {
 		error = EBADF;
 		goto unionfs_readdir_exit;
 	}
 	/* read lower */
 	error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 			    ap->a_ncookies, ap->a_cookies);
 
 	if (cookies_bk != NULL) {
 		/* merge cookies */
 		int		size;
 		u_long         *newcookies, *pos;
 
 		size = *(ap->a_ncookies) + ncookies_bk;
 		newcookies = (u_long *) malloc(size * sizeof(u_long),
 		    M_TEMP, M_WAITOK);
 		pos = newcookies;
 
 		memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long));
 		pos += ncookies_bk * sizeof(u_long);
 		memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long));
 		free(cookies_bk, M_TEMP);
 		free(*(ap->a_cookies), M_TEMP);
 		*(ap->a_ncookies) = size;
 		*(ap->a_cookies) = newcookies;
 	}
 
 unionfs_readdir_exit:
 	if (error != 0 && ap->a_eofflag != NULL)
 		*(ap->a_eofflag) = 1;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_readlink(struct vop_readlink_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n");
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_READLINK(vp, ap->a_uio, ap->a_cred);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_getwritemount(struct vop_getwritemount_args *ap)
 {
 	int		error;
 	struct vnode   *uvp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n");
 
 	error = 0;
 	vp = ap->a_vp;
 
 	if (vp == NULLVP || (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EACCES);
 
 	uvp = UNIONFSVPTOUPPERVP(vp);
 	if (uvp == NULLVP && VREG == vp->v_type)
 		uvp = UNIONFSVPTOUPPERVP(VTOUNIONFS(vp)->un_dvp);
 
 	if (uvp != NULLVP)
 		error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp);
 	else {
 		VI_LOCK(vp);
 		if (vp->v_iflag & VI_FREE)
 			error = EOPNOTSUPP;
 		else
 			error = EACCES;
 		VI_UNLOCK(vp);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_inactive(struct vop_inactive_args *ap)
 {
 	ap->a_vp->v_object = NULL;
 	vrecycle(ap->a_vp, ap->a_td);
 	return (0);
 }
 
 /*
  * VOP_RECLAIM: tear down the unionfs node via unionfs_noderem().
  * Always returns 0.
  */
 static int
 unionfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode   *vp;
 	struct thread  *td;
 
 	vp = ap->a_vp;
 	td = ap->a_td;
 
 	unionfs_noderem(vp, td);
 
 	return (0);
 }
 
 /*
  * VOP_PRINT: print the unionfs vnode together with its upper and lower
  * vnode pointers, then vprint() each layer that is present.
  * Always returns 0.
  */
 static int
 unionfs_print(struct vop_print_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *upper;
 	struct vnode   *lower;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	upper = unp->un_uppervp;
 	lower = unp->un_lowervp;
 
 	printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n",
 	    ap->a_vp, upper, lower);
 
 	if (upper != NULLVP)
 		vprint("unionfs: upper", upper);
 	if (lower != NULLVP)
 		vprint("unionfs: lower", lower);
 
 	return (0);
 }
 
 /*
  * Look up the lock type of `flags' (masked with LK_TYPE_MASK) in the
  * un_llt table and return the matching revlock value, or 0 when the type
  * is not listed.  The table is terminated by an entry whose lock is 0.
  */
 static int
 unionfs_get_llt_revlock(int flags)
 {
 	int idx;
 	int type;
 
 	type = flags & LK_TYPE_MASK;
 
 	idx = 0;
 	while (un_llt[idx].lock != 0) {
 		if (un_llt[idx].lock == type)
 			return (un_llt[idx].revlock);
 		idx++;
 	}
 
 	return (0);
 }
 
 /*
  * VOP_LOCK1 for a unionfs vnode.
  *
  * A release request (or one without a lock type) is passed to VOP_UNLOCK().
  * Otherwise the lower vnode is locked first and then the upper vnode, each
  * with a hold taken across the call.  If the upper lock fails after the
  * lower one succeeded, the lower lock is reverted using the "revlock"
  * operation looked up for the requested type.  When the vnode has no mount
  * or no unionfs node (including when the node goes away while sleeping in
  * a lock call), the request falls back to vop_stdlock().
  *
  * Panics when the requested lock type is not in the un_llt table.
  */
 static int
 unionfs_lock(struct vop_lock1_args *ap)
 {
 	int		error;
 	int		flags;
 	int		revlock;
 	int		uhold;
 	struct mount   *mp;
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 
 	error = 0;
 	uhold = 0;
 	flags = ap->a_flags;
 	vp = ap->a_vp;
 	td = ap->a_td;
 
 	if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK))
 		return (VOP_UNLOCK(vp, flags, td));
 
 	if ((revlock = unionfs_get_llt_revlock(flags)) == 0)
 		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
 
 	/* From here on the vnode interlock is held. */
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		goto unionfs_lock_null_vnode;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	unp = VTOUNIONFS(vp);
 	if (ump == NULL || unp == NULL)
 		goto unionfs_lock_null_vnode;
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
 	if ((mp->mnt_kern_flag & MNTK_MPSAFE) != 0 &&
 	    (vp->v_iflag & VI_OWEINACT) != 0)
 		flags |= LK_NOWAIT;
 
 	/*
 	 * Sometimes, lower or upper is already exclusive locked.
 	 * (ex. vfs_domount: mounted vnode is already locked.)
 	 */
 	if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE &&
 	    vp == ump->um_rootvp)
 		flags |= LK_CANRECURSE;
 
 	if (lvp != NULLVP) {
 		/*
 		 * Take the lower interlock and a hold before dropping our own
 		 * interlock, then lock lower with LK_INTERLOCK set.
 		 */
 		VI_LOCK_FLAGS(lvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(lvp);
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_LOCK(lvp, flags, td);
 
 		/* Re-check the node: it may be gone after the lock call. */
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
 			VI_UNLOCK(vp);
 			if (error == 0)
 				VOP_UNLOCK(lvp, 0, td);
 			vdrop(lvp);
 			return (vop_stdlock(ap));
 		}
 	}
 
 	if (error == 0 && uvp != NULLVP) {
 		/* Same sequence for the upper vnode. */
 		VI_LOCK_FLAGS(uvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(uvp);
 		uhold = 1;
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_LOCK(uvp, flags, td);
 
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
 			VI_UNLOCK(vp);
 			if (error == 0) {
 				VOP_UNLOCK(uvp, 0, td);
 				if (lvp != NULLVP)
 					VOP_UNLOCK(lvp, 0, td);
 			}
 			if (lvp != NULLVP)
 				vdrop(lvp);
 			vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
 
 		/* Upper failed: put the lower lock back the way it was. */
 		if (error != 0 && lvp != NULLVP) {
 			VI_UNLOCK(vp);
 			if ((revlock & LK_TYPE_MASK) == LK_RELEASE)
 				VOP_UNLOCK(lvp, revlock, td);
 			else
-				vn_lock(lvp, revlock | LK_RETRY, td);
+				vn_lock(lvp, revlock | LK_RETRY);
 			goto unionfs_lock_abort;
 		}
 	}
 
 	VI_UNLOCK(vp);
 unionfs_lock_abort:
 	/* Drop the holds taken above. */
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
 		vdrop(uvp);
 
 	return (error);
 
 unionfs_lock_null_vnode:
 	/* The interlock is held here; tell vop_stdlock() so. */
 	ap->a_flags |= LK_INTERLOCK;
 	return (vop_stdlock(ap));
 }
 
 /*
  * VOP_UNLOCK for a unionfs vnode.
  *
  * Unlocks the lower vnode and then, if that succeeded, the upper vnode,
  * taking a hold on each across the call.  When the vnode has no unionfs
  * node the request falls back to vop_stdunlock().
  *
  * mtxlkflag records how the vnode interlock was obtained:
  *   1 - the caller passed LK_INTERLOCK,
  *   2 - it was not owned and is taken here,
  *   0 - it was already owned without LK_INTERLOCK; it is re-taken before
  *       returning.
  */
 static int
 unionfs_unlock(struct vop_unlock_args *ap)
 {
 	int		error;
 	int		flags;
 	int		mtxlkflag;
 	int		uhold;
 	struct vnode   *vp;
 	struct vnode   *lvp;
 	struct vnode   *uvp;
 	struct unionfs_node *unp;
 
 	error = 0;
 	mtxlkflag = 0;
 	uhold = 0;
 	flags = ap->a_flags | LK_RELEASE;
 	vp = ap->a_vp;
 
 	if ((flags & LK_INTERLOCK) != 0)
 		mtxlkflag = 1;
 	else if (mtx_owned(VI_MTX(vp)) == 0) {
 		VI_LOCK(vp);
 		mtxlkflag = 2;
 	}
 
 	unp = VTOUNIONFS(vp);
 	if (unp == NULL)
 		goto unionfs_unlock_null_vnode;
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
 	if (lvp != NULLVP) {
 		/*
 		 * Take the lower interlock and a hold before dropping our own
 		 * interlock, then unlock lower with LK_INTERLOCK set.
 		 */
 		VI_LOCK_FLAGS(lvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(lvp);
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_UNLOCK(lvp, flags, ap->a_td);
 
 		VI_LOCK(vp);
 	}
 
 	if (error == 0 && uvp != NULLVP) {
 		/* Same sequence for the upper vnode. */
 		VI_LOCK_FLAGS(uvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(uvp);
 		uhold = 1;
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_UNLOCK(uvp, flags, ap->a_td);
 
 		VI_LOCK(vp);
 	}
 
 	VI_UNLOCK(vp);
 	/* Drop the holds taken above. */
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
 		vdrop(uvp);
 	/* The interlock was owned on entry without LK_INTERLOCK: re-take it. */
 	if (mtxlkflag == 0)
 		VI_LOCK(vp);
 
 	return error;
 
 unionfs_unlock_null_vnode:
 	/* Only release the interlock if it was taken in this function. */
 	if (mtxlkflag == 2)
 		VI_UNLOCK(vp);
 	return (vop_stdunlock(ap));
 }
 
 /*
  * VOP_PATHCONF: forward to the upper vnode when it exists, otherwise to
  * the lower vnode.
  */
 static int
 unionfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *target;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	target = unp->un_uppervp;
 	if (target == NULLVP)
 		target = unp->un_lowervp;
 
 	return (VOP_PATHCONF(target, ap->a_name, ap->a_retval));
 }
 
 /*
  * VOP_ADVLOCK for a unionfs vnode.
  *
  * Advisory locks are always applied to the upper vnode.  When the file has
  * no upper vnode yet it is copied up with unionfs_copyfile(); if the lower
  * vnode was open, the upper vnode is opened with the same mode and the
  * lower one is closed, moving the open count from lower to upper.  The
  * unionfs vnode is exclusively locked while this is done and unlocked
  * before VOP_ADVLOCK() is called on the upper vnode.
  */
 static int
 unionfs_advlock(struct vop_advlock_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n");
 
 	vp = ap->a_vp;
 	td = curthread;
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 
 	if (uvp == NULLVP) {
 		/* No upper vnode yet: copy the file up first. */
 		error = unionfs_copyfile(unp, 1, td->td_ucred, td);
 		if (error != 0)
 			goto unionfs_advlock_abort;
 		uvp = unp->un_uppervp;
 
 		unionfs_get_node_status(unp, td, &unsp);
 		if (unsp->uns_lower_opencnt > 0) {
 			/* try reopen the vnode */
 			error = VOP_OPEN(uvp, unsp->uns_lower_openmode,
 				td->td_ucred, td, NULL);
 			if (error)
 				goto unionfs_advlock_abort;
 			/* Move one open from the lower to the upper vnode. */
 			unsp->uns_upper_opencnt++;
 			VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td);
 			unsp->uns_lower_opencnt--;
 		} else
 			unionfs_tryrem_node_status(unp, td, unsp);
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 
 	error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
 	return error;
 
 unionfs_advlock_abort:
 	/* Failure while preparing the upper vnode: just unlock and report. */
 	VOP_UNLOCK(vp, 0, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
 	return error;
 }
 
 /*
  * VOP_STRATEGY: forward the buffer to the upper vnode when it exists,
  * otherwise to the lower vnode.  With DIAGNOSTIC, panics when neither
  * vnode exists or when a write would go to the lower vnode.
  */
 static int
 unionfs_strategy(struct vop_strategy_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *target;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	target = unp->un_uppervp;
 	if (target == NULLVP)
 		target = unp->un_lowervp;
 
 #ifdef DIAGNOSTIC
 	if (target == NULLVP)
 		panic("unionfs_strategy: nullvp");
 
 	if (ap->a_bp->b_iocmd == BIO_WRITE && target == unp->un_lowervp)
 		panic("unionfs_strategy: writing to lowervp");
 #endif
 
 	return (VOP_STRATEGY(target, ap->a_bp));
 }
 
 static int
 unionfs_getacl(struct vop_getacl_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n");
 
 	error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_setacl(struct vop_setacl_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n");
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	if (uvp == NULLVP && lvp->v_type == VREG) {
 		if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0)
 			return (error);
 		uvp = unp->un_uppervp;
 	}
 
 	if (uvp != NULLVP)
 		error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_aclcheck(struct vop_aclcheck_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n");
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * VOP_OPENEXTATTR for a unionfs vnode.
  *
  * The operation targets the upper vnode when it exists, otherwise the
  * lower vnode.  Returns EBUSY when the node is already flagged as having
  * extended attributes open on that layer (UNIONFS_OPENEXTU/OPENEXTL).  On
  * success the matching flag is set in un_flag, with the vnode lock
  * upgraded around the update.
  */
 static int
 unionfs_openextattr(struct vop_openextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *tvp;
 
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	if ((tvp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) ||
 	    (tvp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL)))
 		return (EBUSY);
 
 	error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
 		/* Record which layer was opened. */
-		vn_lock(vp, LK_UPGRADE | LK_RETRY, ap->a_td);
+		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag |= UNIONFS_OPENEXTU;
 		else
 			unp->un_flag |= UNIONFS_OPENEXTL;
-		vn_lock(vp, LK_DOWNGRADE | LK_RETRY, ap->a_td);
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 	}
 
 	return (error);
 }
 
 /*
  * Close the extended-attribute session on whichever layer un_flag says
  * holds it (upper is checked first) and clear the matching flag on
  * success.  Returns EOPNOTSUPP when no session is recorded or the recorded
  * layer has no vnode.
  */
 static int
 unionfs_closeextattr(struct vop_closeextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *tvp;
 
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	tvp = NULLVP;
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		tvp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		tvp = unp->un_lowervp;
 
 	if (tvp == NULLVP)
 		return (EOPNOTSUPP);
 
 	error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
 		/* un_flag is updated between a lock upgrade and a downgrade. */
-		vn_lock(vp, LK_UPGRADE | LK_RETRY, ap->a_td);
+		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag &= ~UNIONFS_OPENEXTU;
 		else
 			unp->un_flag &= ~UNIONFS_OPENEXTL;
-		vn_lock(vp, LK_DOWNGRADE | LK_RETRY, ap->a_td);
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 	}
 
 	return (error);
 }
 
 /*
  * Read an extended attribute from the layer on which an extattr session
  * is currently open.  Without an open session (or with no vnode on the
  * recorded layer) the operation is rejected with EOPNOTSUPP.
  */
 static int
 unionfs_getextattr(struct vop_getextattr_args *ap)
 {
 	struct unionfs_node *node;
 	struct vnode *target;
 
 	node = VTOUNIONFS(ap->a_vp);
 
 	if ((node->un_flag & UNIONFS_OPENEXTU) != 0)
 		target = node->un_uppervp;
 	else if ((node->un_flag & UNIONFS_OPENEXTL) != 0)
 		target = node->un_lowervp;
 	else
 		return (EOPNOTSUPP);
 
 	if (target == NULLVP)
 		return (EOPNOTSUPP);
 
 	return (VOP_GETEXTATTR(target, ap->a_attrnamespace, ap->a_name,
 	    ap->a_uio, ap->a_size, ap->a_cred, ap->a_td));
 }
 
 /*
  * Set an extended attribute.  Writes only go to the upper vnode.  If the
  * extattr session is currently open on a lower regular file, the lower
  * session is closed, the file is copied up when no upper vnode exists yet,
  * and a session is opened on the upper vnode before the attribute is
  * written.
  *
  * Returns EROFS for a read-only mount (and when the session is on a lower
  * vnode that is not a regular file), EOPNOTSUPP when no session is open,
  * otherwise the error from the copy-up, the upper open, or
  * VOP_SETEXTATTR().
  */
 static int
 unionfs_setextattr(struct vop_setextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *ovp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	ovp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* ovp is the vnode on which the extattr session is currently open. */
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		ovp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		ovp = unp->un_lowervp;
 
 	if (ovp == NULLVP)
 		return (EOPNOTSUPP);
 
 	if (ovp == lvp && lvp->v_type == VREG) {
 		/* Move the session from the lower to the upper vnode. */
 		VOP_CLOSEEXTATTR(lvp, 0, cred, td);
 		if (uvp == NULLVP &&
 		    (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
 			/*
 			 * Recovery path, also entered by goto when opening
 			 * the upper session fails: try to reopen the lower
 			 * session.  If that fails too, panic under
 			 * DIAGNOSTIC, otherwise forget the lower session.
 			 */
 unionfs_setextattr_reopen:
 			if ((unp->un_flag & UNIONFS_OPENEXTL) &&
 			    VOP_OPENEXTATTR(lvp, cred, td)) {
 #ifdef DIAGNOSTIC
 				panic("unionfs: VOP_OPENEXTATTR failed");
 #endif
 				unp->un_flag &= ~UNIONFS_OPENEXTL;
 			}
 			goto unionfs_setextattr_abort;
 		}
 		uvp = unp->un_uppervp;
 		if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0)
 			goto unionfs_setextattr_reopen;
 		unp->un_flag &= ~UNIONFS_OPENEXTL;
 		unp->un_flag |= UNIONFS_OPENEXTU;
 		ovp = uvp;
 	}
 
 	/* Only write when the session is on the upper vnode. */
 	if (ovp == uvp)
 		error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name,
 		    ap->a_uio, cred, td);
 
 unionfs_setextattr_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * List extended attributes from the layer on which an extattr session is
  * currently open.  Without an open session (or with no vnode on the
  * recorded layer) the operation is rejected with EOPNOTSUPP.
  */
 static int
 unionfs_listextattr(struct vop_listextattr_args *ap)
 {
 	struct unionfs_node *node;
 	struct vnode *target;
 
 	node = VTOUNIONFS(ap->a_vp);
 
 	if ((node->un_flag & UNIONFS_OPENEXTU) != 0)
 		target = node->un_uppervp;
 	else if ((node->un_flag & UNIONFS_OPENEXTL) != 0)
 		target = node->un_lowervp;
 	else
 		return (EOPNOTSUPP);
 
 	if (target == NULLVP)
 		return (EOPNOTSUPP);
 
 	return (VOP_LISTEXTATTR(target, ap->a_attrnamespace, ap->a_uio,
 	    ap->a_size, ap->a_cred, ap->a_td));
 }
 
 /*
  * Delete an extended attribute.  Deletions only go to the upper vnode.  If
  * the extattr session is currently open on a lower regular file, the lower
  * session is closed, the file is copied up when no upper vnode exists yet,
  * and a session is opened on the upper vnode before the attribute is
  * removed.
  *
  * Returns EROFS for a read-only mount (and when the session is on a lower
  * vnode that is not a regular file), EOPNOTSUPP when no session is open,
  * otherwise the error from the copy-up, the upper open, or
  * VOP_DELETEEXTATTR().
  */
 static int
 unionfs_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *ovp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	ovp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: enter (un_flag=%x)\n", unp->un_flag);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* ovp is the vnode on which the extattr session is currently open. */
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		ovp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		ovp = unp->un_lowervp;
 
 	if (ovp == NULLVP)
 		return (EOPNOTSUPP);
 
 	if (ovp == lvp && lvp->v_type == VREG) {
 		/* Move the session from the lower to the upper vnode. */
 		VOP_CLOSEEXTATTR(lvp, 0, cred, td);
 		if (uvp == NULLVP &&
 		    (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
 			/*
 			 * Recovery path, also entered by goto when opening
 			 * the upper session fails: try to reopen the lower
 			 * session.  If that fails too, panic under
 			 * DIAGNOSTIC, otherwise forget the lower session.
 			 */
 unionfs_deleteextattr_reopen:
 			if ((unp->un_flag & UNIONFS_OPENEXTL) &&
 			    VOP_OPENEXTATTR(lvp, cred, td)) {
 #ifdef DIAGNOSTIC
 				panic("unionfs: VOP_OPENEXTATTR failed");
 #endif
 				unp->un_flag &= ~UNIONFS_OPENEXTL;
 			}
 			goto unionfs_deleteextattr_abort;
 		}
 		uvp = unp->un_uppervp;
 		if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0)
 			goto unionfs_deleteextattr_reopen;
 		unp->un_flag &= ~UNIONFS_OPENEXTL;
 		unp->un_flag |= UNIONFS_OPENEXTU;
 		ovp = uvp;
 	}
 
 	/* Only delete when the session is on the upper vnode. */
 	if (ovp == uvp)
 		error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name,
 		    ap->a_cred, ap->a_td);
 
 unionfs_deleteextattr_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * Set a MAC label on a unionfs vnode.  Labels are only ever written to the
  * upper layer; a regular file that exists only in the lower layer is
  * copied up first.  Returns EROFS when the mount is read-only or when no
  * upper vnode is available to receive the label.
  */
 static int
 unionfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct unionfs_node *node;
 	struct vnode *upper;
 	struct vnode *lower;
 	struct thread *curtd;
 	int rv;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n");
 
 	node = VTOUNIONFS(ap->a_vp);
 	upper = node->un_uppervp;
 	lower = node->un_lowervp;
 	curtd = ap->a_td;
 
 	if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	/* Lower-only regular file: bring it into the upper layer first. */
 	if (upper == NULLVP && lower->v_type == VREG) {
 		rv = unionfs_copyfile(node, 1, ap->a_cred, curtd);
 		if (rv != 0)
 			return (rv);
 		upper = node->un_uppervp;
 	}
 
 	if (upper == NULLVP)
 		rv = EROFS;
 	else
 		rv = VOP_SETLABEL(upper, ap->a_label, ap->a_cred, curtd);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", rv);
 
 	return (rv);
 }
 
 /*
  * File handle generation is not provided for unionfs vnodes: the request
  * is always refused with EOPNOTSUPP and the argument is not examined.
  */
 static int
 unionfs_vptofh(struct vop_vptofh_args *ap)
 {
 	(void)ap;
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Vnode operations vector for unionfs.  Operations not listed here fall
  * through to default_vnodeops via vop_default.  vop_lookup goes through
  * vfs_cache_lookup, with unionfs_lookup installed as vop_cachedlookup;
  * vop_bmap is explicitly unsupported.
  */
 struct vop_vector unionfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		unionfs_access,
 	.vop_aclcheck =		unionfs_aclcheck,
 	.vop_advlock =		unionfs_advlock,
 	.vop_bmap =		VOP_EOPNOTSUPP,
 	.vop_cachedlookup =	unionfs_lookup,
 	.vop_close =		unionfs_close,
 	.vop_closeextattr =	unionfs_closeextattr,
 	.vop_create =		unionfs_create,
 	.vop_deleteextattr =	unionfs_deleteextattr,
 	.vop_fsync =		unionfs_fsync,
 	.vop_getacl =		unionfs_getacl,
 	.vop_getattr =		unionfs_getattr,
 	.vop_getextattr =	unionfs_getextattr,
 	.vop_getwritemount =	unionfs_getwritemount,
 	.vop_inactive =		unionfs_inactive,
 	.vop_ioctl =		unionfs_ioctl,
 	.vop_lease =		unionfs_lease,
 	.vop_link =		unionfs_link,
 	.vop_listextattr =	unionfs_listextattr,
 	.vop_lock1 =		unionfs_lock,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		unionfs_mkdir,
 	.vop_mknod =		unionfs_mknod,
 	.vop_open =		unionfs_open,
 	.vop_openextattr =	unionfs_openextattr,
 	.vop_pathconf =		unionfs_pathconf,
 	.vop_poll =		unionfs_poll,
 	.vop_print =		unionfs_print,
 	.vop_read =		unionfs_read,
 	.vop_readdir =		unionfs_readdir,
 	.vop_readlink =		unionfs_readlink,
 	.vop_reclaim =		unionfs_reclaim,
 	.vop_remove =		unionfs_remove,
 	.vop_rename =		unionfs_rename,
 	.vop_rmdir =		unionfs_rmdir,
 	.vop_setacl =		unionfs_setacl,
 	.vop_setattr =		unionfs_setattr,
 	.vop_setextattr =	unionfs_setextattr,
 	.vop_setlabel =		unionfs_setlabel,
 	.vop_strategy =		unionfs_strategy,
 	.vop_symlink =		unionfs_symlink,
 	.vop_unlock =		unionfs_unlock,
 	.vop_whiteout =		unionfs_whiteout,
 	.vop_write =		unionfs_write,
 	.vop_vptofh =		unionfs_vptofh,
 };
Index: head/sys/gnu/fs/ext2fs/ext2_lookup.c
===================================================================
--- head/sys/gnu/fs/ext2fs/ext2_lookup.c	(revision 175201)
+++ head/sys/gnu/fs/ext2fs/ext2_lookup.c	(revision 175202)
@@ -1,1082 +1,1082 @@
 /*-
  *  modified for Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.6 (Berkeley) 4/1/94
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <sys/sysctl.h>
 
 #include <ufs/ufs/dir.h>
 
 #include <gnu/fs/ext2fs/inode.h>
 #include <gnu/fs/ext2fs/ext2_mount.h>
 #include <gnu/fs/ext2fs/ext2_extern.h>
 #include <gnu/fs/ext2fs/ext2_fs.h>
 #include <gnu/fs/ext2fs/ext2_fs_sb.h>
 
 /*
  * Switch for the full directory-entry consistency check performed during
  * lookup.  It defaults to on when the kernel is built with DIAGNOSTIC and
  * to off otherwise.
  */
 #ifdef DIAGNOSTIC
 static int dirchk = 1;
 #else
 static int dirchk = 0;
 #endif
 
 /* Expose dirchk as the read-write integer "dircheck" under vfs.e2fs. */
 static SYSCTL_NODE(_vfs, OID_AUTO, e2fs, CTLFLAG_RD, 0, "EXT2FS filesystem");
 SYSCTL_INT(_vfs_e2fs, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
 
 /*
    DIRBLKSIZE in ffs is DEV_BSIZE (in most cases 512)
    while it is the native blocksize in ext2fs - thus, a #define
    is no longer appropriate
 */
 #undef  DIRBLKSIZ
 
 /*
  * Translation table from the ext2 on-disk directory entry file type
  * (EXT2_FT_*, used as the index) to the dirent d_type value (DT_*).
  */
 static u_char ext2_ft_to_dt[] = {
 	DT_UNKNOWN,		/* EXT2_FT_UNKNOWN */
 	DT_REG,			/* EXT2_FT_REG_FILE */
 	DT_DIR,			/* EXT2_FT_DIR */
 	DT_CHR,			/* EXT2_FT_CHRDEV */
 	DT_BLK,			/* EXT2_FT_BLKDEV */
 	DT_FIFO,		/* EXT2_FT_FIFO */
 	DT_SOCK,		/* EXT2_FT_SOCK */
 	DT_LNK,			/* EXT2_FT_SYMLINK */
 };
 /*
  * Convert an on-disk file type to a DT_* value; anything outside the table
  * yields DT_UNKNOWN.  The bound test must be ">=": an index equal to the
  * element count is already one past the last valid slot, and the value
  * comes straight from disk.  Note that ft is evaluated twice.
  */
 #define	FTTODT(ft)						\
     ((ft) >= sizeof(ext2_ft_to_dt) / sizeof(ext2_ft_to_dt[0]) ?	\
     DT_UNKNOWN : ext2_ft_to_dt[(ft)])
 
 /*
  * Translation table from the dirent d_type value (DT_*, used as the index)
  * to the ext2 on-disk directory entry file type (EXT2_FT_*).  Slots that
  * do not correspond to a DT_* value map to EXT2_FT_UNKNOWN.
  */
 static u_char dt_to_ext2_ft[] = {
 	EXT2_FT_UNKNOWN,	/* DT_UNKNOWN */
 	EXT2_FT_FIFO,		/* DT_FIFO */
 	EXT2_FT_CHRDEV,		/* DT_CHR */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_DIR,		/* DT_DIR */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_BLKDEV,		/* DT_BLK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_REG_FILE,	/* DT_REG */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_SYMLINK,	/* DT_LNK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_SOCK,		/* DT_SOCK */
 	EXT2_FT_UNKNOWN,	/* unused */
 	EXT2_FT_UNKNOWN,	/* DT_WHT */
 };
 /*
  * Convert a DT_* value to an on-disk file type; anything outside the table
  * yields EXT2_FT_UNKNOWN.  The bound test must be ">=": an index equal to
  * the element count is already one past the last valid slot.  Note that dt
  * is evaluated twice.
  */
 #define	DTTOFT(dt)						\
     ((dt) >= sizeof(dt_to_ext2_ft) / sizeof(dt_to_ext2_ft[0]) ?	\
     EXT2_FT_UNKNOWN : dt_to_ext2_ft[(dt)])
 
 static int	ext2_dirbadentry(struct vnode *dp, struct ext2_dir_entry_2 *de,
 		    int entryoffsetinblock);
 
 /*
  * Vnode op for reading directories.
  *
  * The routine below assumes that the on-disk format of a directory
  * is the same as that defined by <sys/dirent.h>. If the on-disk
  * format changes, then it will be necessary to do a conversion
  * from the on-disk format that read returns to the format defined
  * by <sys/dirent.h>.
  */
 /*
  * this is exactly what we do here - the problem is that the conversion
  * will blow up some entries by four bytes, so it can't be done in place.
  * This is too bad. Right now the conversion is done entry by entry, the
  * converted entry is sent via uiomove.
  *
  * XXX allocate a buffer, convert as many entries as possible, then send
  * the whole buffer to uiomove
  */
 /*
  * ext2_readdir: read raw ext2 directory entries into a temporary kernel
  * buffer with VOP_READ(), convert them one at a time to struct dirent and
  * copy them to the caller's uio.
  *
  * On return uio->uio_offset is the directory offset of the first entry
  * that was not transferred.  If ap->a_ncookies is non-NULL a cookie array
  * (one offset per transferred entry) is allocated from M_TEMP and handed
  * back through ap->a_cookies.  If ap->a_eofflag is non-NULL it is set when
  * the resulting offset is at or past the directory size.
  *
  * Returns 0, the error from VOP_READ() or uiomove(), or EIO when an entry
  * with a zero record length is encountered.
  */
 int
 ext2_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct uio *uio = ap->a_uio;
 	int count, error;
 
 	struct ext2_dir_entry_2 *edp, *dp;
 	int ncookies;
 	struct dirent dstdp;
 	struct uio auio;
 	struct iovec aiov;
 	caddr_t dirbuf;
 	int DIRBLKSIZ = VTOI(ap->a_vp)->i_e2fs->s_blocksize;
 	int readcnt;
 	off_t startoffset = uio->uio_offset;
 
 	count = uio->uio_resid;
 	/*
 	 * Avoid complications for partial directory entries by adjusting
 	 * the i/o to end at a block boundary.  Don't give up (like ufs
 	 * does) if the initial adjustment gives a negative count, since
 	 * many callers don't supply a large enough buffer.  The correct
 	 * size is a little larger than DIRBLKSIZ to allow for expansion
 	 * of directory entries, but some callers just use 512.
 	 */
 	count -= (uio->uio_offset + count) & (DIRBLKSIZ -1);
 	if (count <= 0)
 		count += DIRBLKSIZ;
 
 #ifdef EXT2FS_DEBUG
 	printf("ext2_readdir: uio_offset = %lld, uio_resid = %d, count = %d\n",
 	    uio->uio_offset, uio->uio_resid, count);
 #endif
 
 	/*
 	 * Build a private uio that reads "count" bytes into a kernel
 	 * buffer; the caller's uio is only advanced by uiomove() below.
 	 */
 	auio = *uio;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = count;
 	auio.uio_segflg = UIO_SYSSPACE;
 	aiov.iov_len = count;
 	MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK);
 	aiov.iov_base = dirbuf;
 	error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
 	if (error == 0) {
 		/* edp marks the end of the data actually read. */
 		readcnt = count - auio.uio_resid;
 		edp = (struct ext2_dir_entry_2 *)&dirbuf[readcnt];
 		ncookies = 0;
 		bzero(&dstdp, offsetof(struct dirent, d_name));
 		for (dp = (struct ext2_dir_entry_2 *)dirbuf;
 		    !error && uio->uio_resid > 0 && dp < edp; ) {
 			/*-
 			 * "New" ext2fs directory entries differ in 3 ways
 			 * from ufs on-disk ones:
 			 * - the name is not necessarily NUL-terminated.
 			 * - the file type field always exists and always
 			 *   follows the name length field.
 			 * - the file type is encoded in a different way.
 			 *
 			 * "Old" ext2fs directory entries need no special
 			 * conversions, since they are binary compatible
 			 * with "new" entries having a file type of 0 (i.e.,
 			 * EXT2_FT_UNKNOWN).  Splitting the old name length
 			 * field didn't make a mess like it did in ufs,
 			 * because ext2fs uses a machine-independent disk
 			 * layout.
 			 */
 			dstdp.d_fileno = dp->inode;
 			dstdp.d_type = FTTODT(dp->file_type);
 			dstdp.d_namlen = dp->name_len;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->name, dstdp.d_name, dstdp.d_namlen);
 			/* Zero the tail of the record after the name. */
 			bzero(dstdp.d_name + dstdp.d_namlen,
 			    dstdp.d_reclen - offsetof(struct dirent, d_name) -
 			    dstdp.d_namlen);
 
 			if (dp->rec_len > 0) {
 				if(dstdp.d_reclen <= uio->uio_resid) {
 					/* advance dp */
 					dp = (struct ext2_dir_entry_2 *)
 					    ((char *)dp + dp->rec_len);
 					error =
 					  uiomove(&dstdp, dstdp.d_reclen, uio);
 					if (!error)
 						ncookies++;
 				} else
 					break;	/* caller's buffer is full */
 			} else {
 				/* A zero record length would never advance. */
 				error = EIO;
 				break;
 			}
 		}
 		/* we need to correct uio_offset */
 		uio->uio_offset = startoffset + (caddr_t)dp - dirbuf;
 
 		if (!error && ap->a_ncookies != NULL) {
 			u_long *cookiep, *cookies, *ecookies;
 			off_t off;
 
 			if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 				panic("ext2_readdir: unexpected uio from NFS server");
 			MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP,
 			       M_WAITOK);
 			/*
 			 * Walk the raw entries again; each cookie is the
 			 * directory offset just past the corresponding entry.
 			 */
 			off = startoffset;
 			for (dp = (struct ext2_dir_entry_2 *)dirbuf,
 			     cookiep = cookies, ecookies = cookies + ncookies;
 			     cookiep < ecookies;
 			     dp = (struct ext2_dir_entry_2 *)((caddr_t) dp + dp->rec_len)) {
 				off += dp->rec_len;
 				*cookiep++ = (u_long) off;
 			}
 			*ap->a_ncookies = ncookies;
 			*ap->a_cookies = cookies;
 		}
 	}
 	FREE(dirbuf, M_TEMP);
 	if (ap->a_eofflag)
 		*ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Convert a component of a pathname into a pointer to a locked inode.
  * This is a very central and rather complicated routine.
  * If the file system is not maintained in a strict tree hierarchy,
  * this can result in a deadlock situation (see comments in code below).
  *
  * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
  * on whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it and the target of the pathname
  * exists, lookup returns both the target and its parent directory locked.
  * When creating or renaming and LOCKPARENT is specified, the target may
  * not be ".".  When deleting and LOCKPARENT is specified, the target may
  * be ".", but the caller must check to ensure it does a vrele and vput
  * instead of two vputs.
  *
  * Overall outline of ext2_lookup:
  *
  *	search for name in directory, to found or notfound
  * notfound:
  *	if creating, return locked directory, leaving info on available slots
  *	else return error
  * found:
  *	if at end of path and deleting, return information to allow delete
  *	if at end of path and rewriting (RENAME and LOCKPARENT), lock target
  *	  inode and return info to allow rewrite
  *	if not at end, add name to cache; if at end and neither creating
  *	  nor deleting, add name to cache
  */
 /*
  * Return values visible in this function:
  *	0		entry found; *vpp holds the target vnode (the
  *			directory itself, with an extra reference, for ".").
  *	EJUSTRETURN	entry not found, but the operation is CREATE or
  *			RENAME on the last component, the directory still
  *			has links and is writable; dp->i_offset, dp->i_count
  *			and dp->i_endoff describe where a new entry can go.
  *	ENOENT		entry not found otherwise.
  *	EPERM		DELETE in a sticky directory by a caller who is not
  *			root and owns neither the directory nor the target.
  *	EISDIR		RENAME whose target is the directory itself.
  *	other		errors passed up from ext2_blkatoff(), VOP_ACCESS()
  *			or VFS_VGET().
  */
 int
 ext2_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vdp;		/* vnode for directory being searched */
 	struct inode *dp;		/* inode for directory being searched */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct ext2_dir_entry_2 *ep;	/* the current directory entry */
 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
 	enum {NONE, COMPACT, FOUND} slotstatus;
 	doff_t slotoffset;		/* offset of area with free space */
 	int slotsize;			/* size of area at slotoffset */
 	int slotfreespace;		/* amount of space free in slot */
 	int slotneeded;			/* size of the entry we're seeking */
 	int numdirpasses;		/* strategy for directory search */
 	doff_t endsearch;		/* offset to end directory search */
 	doff_t prevoff;			/* prev entry dp->i_offset */
 	struct vnode *pdp;		/* saved dp during symlink work */
 	struct vnode *tdp;		/* returned by VFS_VGET */
 	doff_t enduseful;		/* pointer past last used dir slot */
 	u_long bmask;			/* block offset mask */
 	int namlen, error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	struct thread *td = cnp->cn_thread;
 	ino_t saved_ino;
 
 	int	DIRBLKSIZ = VTOI(ap->a_dvp)->i_e2fs->s_blocksize;
 
 	bp = NULL;
 	slotoffset = -1;
 	*vpp = NULL;
 	vdp = ap->a_dvp;
 	dp = VTOI(vdp);
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 
 	/*
 	 * Suppress search for slots unless creating
 	 * file and at end of pathname, in which case
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 */
 	slotstatus = FOUND;
 	slotfreespace = slotsize = slotneeded = 0;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN)) {
 		slotstatus = NONE;
 		slotneeded = EXT2_DIR_REC_LEN(cnp->cn_namelen);
 		/* was
 		slotneeded = (sizeof(struct direct) - MAXNAMLEN +
 			cnp->cn_namelen + 3) &~ 3; */
 	}
 
 	/*
 	 * If there is cached information on a previous search of
 	 * this directory, pick up where we last left off.
 	 * We cache only lookups as these are the most common
 	 * and have the greatest payoff. Caching CREATE has little
 	 * benefit as it usually must search the entire directory
 	 * to determine that the entry does not exist. Caching the
 	 * location of the last DELETE or RENAME has not reduced
 	 * profiling time and hence has been removed in the interest
 	 * of simplicity.
 	 */
 	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 	if (nameiop != LOOKUP || dp->i_diroff == 0 ||
 	    dp->i_diroff > dp->i_size) {
 		entryoffsetinblock = 0;
 		dp->i_offset = 0;
 		numdirpasses = 1;
 	} else {
 		dp->i_offset = dp->i_diroff;
 		if ((entryoffsetinblock = dp->i_offset & bmask) &&
 		    (error = ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL,
 		    &bp)))
 			return (error);
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 	prevoff = dp->i_offset;
 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
 	enduseful = 0;
 
 searchloop:
 	while (dp->i_offset < endsearch) {
 		/*
 		 * If necessary, get the next directory block.
 		 */
 		if ((dp->i_offset & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			if ((error =
 			    ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL,
 			    &bp)) != 0)
 				return (error);
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * If still looking for a slot, and at a DIRBLKSIZE
 		 * boundary, have to start looking for free space again.
 		 */
 		if (slotstatus == NONE &&
 		    (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) {
 			slotoffset = -1;
 			slotfreespace = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 * Full validation checks are slow, so we only check
 		 * enough to insure forward progress through the
 		 * directory. Complete checks can be run by setting
 		 * "vfs.e2fs.dircheck" to be true.
 		 */
 		ep = (struct ext2_dir_entry_2 *)
 			((char *)bp->b_data + entryoffsetinblock);
 		if (ep->rec_len == 0 ||
 		    (dirchk && ext2_dirbadentry(vdp, ep, entryoffsetinblock))) {
 			int i;
 			ext2_dirbad(dp, dp->i_offset, "mangled entry");
 			/* Skip the rest of this DIRBLKSIZ-sized block. */
 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
 			dp->i_offset += i;
 			entryoffsetinblock += i;
 			continue;
 		}
 
 		/*
 		 * If an appropriate sized slot has not yet been found,
 		 * check to see if one is available. Also accumulate space
 		 * in the current block so that we can determine if
 		 * compaction is viable.
 		 */
 		if (slotstatus != FOUND) {
 			int size = ep->rec_len;
 
 			if (ep->inode != 0)
 				size -= EXT2_DIR_REC_LEN(ep->name_len);
 			if (size > 0) {
 				if (size >= slotneeded) {
 					slotstatus = FOUND;
 					slotoffset = dp->i_offset;
 					slotsize = ep->rec_len;
 				} else if (slotstatus == NONE) {
 					slotfreespace += size;
 					if (slotoffset == -1)
 						slotoffset = dp->i_offset;
 					if (slotfreespace >= slotneeded) {
 						slotstatus = COMPACT;
 						slotsize = dp->i_offset +
 						      ep->rec_len - slotoffset;
 					}
 				}
 			}
 		}
 
 		/*
 		 * Check for a name match.
 		 */
 		if (ep->inode) {
 			namlen = ep->name_len;
 			if (namlen == cnp->cn_namelen &&
 			    !bcmp(cnp->cn_nameptr, ep->name,
 				(unsigned)namlen)) {
 				/*
 				 * Save directory entry's inode number and
 				 * reclen in ndp->ni_ufs area, and release
 				 * directory buffer.
 				 */
 				dp->i_ino = ep->inode;
 				dp->i_reclen = ep->rec_len;
 				goto found;
 			}
 		}
 		prevoff = dp->i_offset;
 		dp->i_offset += ep->rec_len;
 		entryoffsetinblock += ep->rec_len;
 		if (ep->inode)
 			enduseful = dp->i_offset;
 	}
 /* notfound: */
 	/*
 	 * If we started in the middle of the directory and failed
 	 * to find our target, we must check the beginning as well.
 	 */
 	if (numdirpasses == 2) {
 		numdirpasses--;
 		dp->i_offset = 0;
 		endsearch = dp->i_diroff;
 		goto searchloop;
 	}
 	if (bp != NULL)
 		brelse(bp);
 	/*
 	 * If creating, and at end of pathname and current
 	 * directory has not been removed, then can consider
 	 * allowing file to be created.
 	 */
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN) && dp->i_nlink != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
 		 */
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Return an indication of where the new directory
 		 * entry should be put.  If we didn't find a slot,
 		 * then set dp->i_count to 0 indicating
 		 * that the new slot belongs at the end of the
 		 * directory. If we found a slot, then the new entry
 		 * can be put in the range from dp->i_offset to
 		 * dp->i_offset + dp->i_count.
 		 */
 		if (slotstatus == NONE) {
 			dp->i_offset = roundup(dp->i_size, DIRBLKSIZ);
 			dp->i_count = 0;
 			enduseful = dp->i_offset;
 		} else {
 			dp->i_offset = slotoffset;
 			dp->i_count = slotsize;
 			if (enduseful < slotoffset + slotsize)
 				enduseful = slotoffset + slotsize;
 		}
 		dp->i_endoff = roundup(enduseful, DIRBLKSIZ);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 		/*
 		 * We return with the directory locked, so that
 		 * the parameters we set up above will still be
 		 * valid if we actually decide to do a direnter().
 		 * We return ni_vp == NULL to indicate that the entry
 		 * does not currently exist; we leave a pointer to
 		 * the (locked) directory inode in ndp->ni_dvp.
 		 * The pathname buffer is saved so that the name
 		 * can be obtained later.
 		 *
 		 * NB - if the directory is unlocked, then this
 		 * information cannot be used.
 		 */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(vdp, *vpp, cnp);
 	return (ENOENT);
 
 found:
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
 	 * Check that directory length properly reflects presence
 	 * of this entry.
 	 */
 	if (entryoffsetinblock + EXT2_DIR_REC_LEN(ep->name_len)
 		> dp->i_size) {
 		ext2_dirbad(dp, dp->i_offset, "i_size too small");
 		dp->i_size = entryoffsetinblock+EXT2_DIR_REC_LEN(ep->name_len);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	brelse(bp);
 
 	/*
 	 * Found component in pathname.
 	 * If the final component of path name, save information
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
 		dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1);
 
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		/*
 		 * Write access to directory required to delete files.
 		 */
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Return pointer to current entry in dp->i_offset,
 		 * and distance past previous entry (if there
 		 * is a previous entry in this block) in dp->i_count.
 		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
 		 */
 		if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
 		if (dp->i_number == dp->i_ino) {
 			VREF(vdp);
 			*vpp = vdp;
 			return (0);
 		}
 		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
 		/*
 		 * If directory is "sticky", then user must own
 		 * the directory, or the file in it, else she
 		 * may not delete it (unless she's root). This
 		 * implements append-only directories.
 		 */
 		if ((dp->i_mode & ISVTX) &&
 		    cred->cr_uid != 0 &&
 		    cred->cr_uid != dp->i_uid &&
 		    VTOI(tdp)->i_uid != cred->cr_uid) {
 			vput(tdp);
 			return (EPERM);
 		}
 		*vpp = tdp;
 		return (0);
 	}
 
 	/*
 	 * If rewriting (RENAME), return the inode and the
 	 * information required to rewrite the present directory
 	 * Must get inode of directory entry to verify it's a
 	 * regular file, or empty directory.
 	 */
 	if (nameiop == RENAME && (flags & ISLASTCN)) {
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
 			return (error);
 		/*
 		 * Careful about locking second inode.
 		 * This can only occur if the target is ".".
 		 */
 		if (dp->i_number == dp->i_ino)
 			return (EISDIR);
 		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
 		*vpp = tdp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to insure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 * There is a potential race condition here if both the current
 	 * and parent directories are removed before the VFS_VGET for the
 	 * inode associated with ".." returns.  We hope that this occurs
 	 * infrequently since we cannot avoid this race condition without
 	 * implementing a sophisticated deadlock detection algorithm.
 	 * Note also that this simple deadlock detection scheme will not
 	 * work if the file system has any hard links other than ".."
 	 * that point backwards in the directory structure.
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
 		/* i_ino is copied first because the directory is unlocked. */
 		saved_ino = dp->i_ino;
 		VOP_UNLOCK(pdp, 0, td);	/* race to get the inode */
 		error = VFS_VGET(vdp->v_mount, saved_ino, LK_EXCLUSIVE, &tdp);
-		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0)
 			return (error);
 		*vpp = tdp;
 	} else if (dp->i_number == dp->i_ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
 		*vpp = vdp;
 	} else {
 		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
 		*vpp = tdp;
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 /*
  * Report a damaged directory: print the mount point, directory inode
  * number, offset and a short description.  On a mount that is not
  * read-only the report is followed by a panic.
  */
 void
 ext2_dirbad(ip, offset, how)
 	struct inode *ip;
 	doff_t offset;
 	char *how;
 {
 	struct mount *mnt = ITOV(ip)->v_mount;
 
 	printf("%s: bad dir ino %lu at offset %ld: %s\n",
 	    mnt->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
 
 	/* Read-only mounts only get the message. */
 	if (mnt->mnt_flag & MNT_RDONLY)
 		return;
 	panic("ext2_dirbad: bad dir");
 }
 
 /*
  * Do consistency checking on a directory entry:
  *	record length must be multiple of 4
  *	entry must fit in rest of its DIRBLKSIZ block
  *	record must be large enough to contain entry
  *	name is not longer than MAXNAMLEN
  *	name must be as long as advertised, and null terminated
  */
 /*
  *	changed so that it conforms to ext2_check_dir_entry
  */
 /*
  * Sanity-check one directory entry located at entryoffsetinblock within
  * its block.  The checks run in a fixed order and only the first failure
  * is reported: record length below the minimum, record length not a
  * multiple of 4, record too short for the stored name length, record
  * extending past the end of the block.
  *
  * Returns 1 (after printing a diagnostic) if the entry is bad, else 0.
  */
 static int
 ext2_dirbadentry(dp, de, entryoffsetinblock)
 	struct vnode *dp;
 	struct ext2_dir_entry_2 *de;
 	int entryoffsetinblock;
 {
 	int blksize = VTOI(dp)->i_e2fs->s_blocksize;
 	char *reason;
 
 	if (de->rec_len < EXT2_DIR_REC_LEN(1)) {
 		reason = "rec_len is smaller than minimal";
 	} else if (de->rec_len % 4 != 0) {
 		reason = "rec_len % 4 != 0";
 	} else if (de->rec_len < EXT2_DIR_REC_LEN(de->name_len)) {
 		reason = "reclen is too small for name_len";
 	} else if (entryoffsetinblock + de->rec_len > blksize) {
 		reason = "directory entry across blocks";
 	} else {
 		/* Every check passed. */
 		return 0;
 	}
 
 	printf("bad directory entry: %s\n", reason);
 	printf("offset=%d, inode=%lu, rec_len=%u, name_len=%u\n",
 		entryoffsetinblock, (unsigned long)de->inode,
 		de->rec_len, de->name_len);
 	return 1;
 }
 
 /*
  * Write a directory entry after a call to namei, using the parameters
  * that it left in nameidata.  The argument ip is the inode which the new
  * directory entry will refer to.  Dvp is a pointer to the directory to
  * be written, which was left locked by namei. Remaining parameters
  * (dp->i_offset, dp->i_count) indicate how the space for the new
  * entry is to be obtained.
  *
  * The name is taken from cnp (cn_nameptr/cn_namelen) and the write and
  * the optional truncation are done with cnp->cn_cred.
  *
  * Returns 0 on success, otherwise the error reported by VOP_WRITE,
  * ext2_blkatoff, bwrite or ext2_truncate.
  */
 int
 ext2_direnter(ip, dvp, cnp)
 	struct inode *ip;
 	struct vnode *dvp;
 	struct componentname *cnp;
 {
 	struct ext2_dir_entry_2 *ep, *nep;
 	struct inode *dp;
 	struct buf *bp;
 	struct ext2_dir_entry_2 newdir;
 	struct iovec aiov;
 	struct uio auio;
 	u_int dsize;
 	int error, loc, newentrysize, spacefree;
 	char *dirbuf;
 	int     DIRBLKSIZ = ip->i_e2fs->s_blocksize;
 
 
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & SAVENAME) == 0)
 		panic("direnter: missing name");
 #endif
 	dp = VTOI(dvp);
 	/* Build the new entry in a local template before placing it. */
 	newdir.inode = ip->i_number;
 	newdir.name_len = cnp->cn_namelen;
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es,
 	    EXT2_FEATURE_INCOMPAT_FILETYPE))
 		newdir.file_type = DTTOFT(IFTODT(ip->i_mode));
 	else
 		newdir.file_type = EXT2_FT_UNKNOWN;
 	/* The copy is cn_namelen + 1 bytes, i.e. one byte past the name. */
 	bcopy(cnp->cn_nameptr, newdir.name, (unsigned)cnp->cn_namelen + 1);
 	newentrysize = EXT2_DIR_REC_LEN(newdir.name_len);
 	if (dp->i_count == 0) {
 		/*
 		 * If dp->i_count is 0, then namei could find no
 		 * space in the directory. Here, dp->i_offset will
 		 * be on a directory block boundary and we will write the
 		 * new entry into a fresh block.
 		 */
 		if (dp->i_offset & (DIRBLKSIZ - 1))
 			panic("ext2_direnter: newblk");
 		auio.uio_offset = dp->i_offset;
 		/* The single entry claims the whole new block. */
 		newdir.rec_len = DIRBLKSIZ;
 		auio.uio_resid = newentrysize;
 		aiov.iov_len = newentrysize;
 		aiov.iov_base = (caddr_t)&newdir;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_rw = UIO_WRITE;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = (struct thread *)0;
 		error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred);
 		if (DIRBLKSIZ >
 		    VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
 			/* XXX should grow with balloc() */
 			panic("ext2_direnter: frag size");
 		else if (!error) {
 			/* Only newentrysize bytes were written; round up. */
 			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
 			dp->i_flag |= IN_CHANGE;
 		}
 		return (error);
 	}
 
 	/*
 	 * If dp->i_count is non-zero, then namei found space
 	 * for the new entry in the range dp->i_offset to
 	 * dp->i_offset + dp->i_count in the directory.
 	 * To use this space, we may have to compact the entries located
 	 * there, by copying them together towards the beginning of the
 	 * block, leaving the free space in one usable chunk at the end.
 	 */
 
 	/*
 	 * Increase size of directory if entry eats into new space.
 	 * This should never push the size past a new multiple of
 	 * DIRBLKSIZE.
 	 *
 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
 	 */
 	if (dp->i_offset + dp->i_count > dp->i_size)
 		dp->i_size = dp->i_offset + dp->i_count;
 	/*
 	 * Get the block containing the space for the new directory entry.
 	 */
 	if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf,
 	    &bp)) != 0)
 		return (error);
 	/*
 	 * Find space for the new entry. In the simple case, the entry at
 	 * offset base will have the space. If it does not, then namei
 	 * arranged that compacting the region dp->i_offset to
 	 * dp->i_offset + dp->i_count would yield the
 	 * space.
 	 */
 	ep = (struct ext2_dir_entry_2 *)dirbuf;
 	dsize = EXT2_DIR_REC_LEN(ep->name_len);
 	spacefree = ep->rec_len - dsize;
 	/*
 	 * ep is the slot currently being filled, nep the next entry to be
 	 * slid down onto it; dsize always holds the packed size of the
 	 * entry last copied to ep and spacefree the slack gathered so far.
 	 */
 	for (loc = ep->rec_len; loc < dp->i_count; ) {
 		nep = (struct ext2_dir_entry_2 *)(dirbuf + loc);
 		if (ep->inode) {
 			/* trim the existing slot */
 			ep->rec_len = dsize;
 			ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize);
 		} else {
 			/* overwrite; nothing there; header is ours */
 			spacefree += dsize;
 		}
 		dsize = EXT2_DIR_REC_LEN(nep->name_len);
 		spacefree += nep->rec_len - dsize;
 		/* Advance using nep->rec_len before the copy below. */
 		loc += nep->rec_len;
 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
 	}
 	/*
 	 * Update the pointer fields in the previous entry (if any),
 	 * copy in the new entry, and write out the block.
 	 */
 	if (ep->inode == 0) {
 		/* The last slot is free: the new entry takes it over. */
 		if (spacefree + dsize < newentrysize)
 			panic("ext2_direnter: compact1");
 		newdir.rec_len = spacefree + dsize;
 	} else {
 		/* The last slot is in use: append right behind it. */
 		if (spacefree < newentrysize)
 			panic("ext2_direnter: compact2");
 		newdir.rec_len = spacefree;
 		ep->rec_len = dsize;
 		ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize);
 	}
 	bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize);
 	error = bwrite(bp);
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	/* Trim the directory back to i_endoff when that is set and smaller. */
 	if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
 		error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC,
 		    cnp->cn_cred, cnp->cn_thread);
 	return (error);
 }
 
 /*
  * Remove a directory entry after a call to namei, using
  * the parameters which it left in nameidata. The entry
  * dp->i_offset contains the offset into the directory of the
  * entry to be eliminated.  The dp->i_count field contains the
  * size of the previous record in the directory.  If this
  * is 0, the first entry is being deleted, so we need only
  * zero the inode number to mark the entry as free.  If the
  * entry is not the first in the directory, we must reclaim
  * the space of the now empty record by adding the record size
  * to the size of the previous entry.
  */
 int
 ext2_dirremove(dvp, cnp)
 	struct vnode *dvp;
 	struct componentname *cnp;
 {
 	struct inode *dp;
 	struct ext2_dir_entry_2 *ep;
 	struct buf *bp;
 	int error;
 
 	dp = VTOI(dvp);
 	if (dp->i_count == 0) {
 		/*
 		 * First entry in block: set d_ino to zero.
 		 */
 		if ((error =
 		    ext2_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep,
 		    &bp)) != 0)
 			return (error);
 		ep->inode = 0;
 		error = bwrite(bp);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 		return (error);
 	}
 	/*
 	 * Collapse new free space into previous entry.
 	 */
 	if ((error = ext2_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count),
 	    (char **)&ep, &bp)) != 0)
 		return (error);
 	ep->rec_len += dp->i_reclen;
 	error = bwrite(bp);
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
 }
 
 /*
  * Rewrite an existing directory entry to point at the inode
  * supplied.  The parameters describing the directory entry are
  * set up by a call to namei.
  */
 int
 ext2_dirrewrite(dp, ip, cnp)
 	struct inode *dp, *ip;
 	struct componentname *cnp;
 {
 	struct buf *bp;
 	struct ext2_dir_entry_2 *ep;
 	struct vnode *vdp = ITOV(dp);
 	int error;
 
 	if ((error = ext2_blkatoff(vdp, (off_t)dp->i_offset, (char **)&ep,
 	    &bp)) != 0)
 		return (error);
 	ep->inode = ip->i_number;
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es,
 	    EXT2_FEATURE_INCOMPAT_FILETYPE))
 		ep->file_type = DTTOFT(IFTODT(ip->i_mode));
 	else
 		ep->file_type = EXT2_FT_UNKNOWN;
 	error = bwrite(bp);
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
 }
 
 /*
  * Check if a directory is empty or not.
  * Inode supplied must be locked.
  *
  * Using a struct dirtemplate here is not precisely
  * what we want, but better than using a struct direct.
  *
  * NB: does not handle corrupted directories.
  */
 int
 ext2_dirempty(ip, parentino, cred)
 	struct inode *ip;
 	ino_t parentino;
 	struct ucred *cred;
 {
 	off_t off;
 	struct dirtemplate dbuf;
 	struct ext2_dir_entry_2 *dp = (struct ext2_dir_entry_2 *)&dbuf;
 	int error, count, namlen;
 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
 
 	for (off = 0; off < ip->i_size; off += dp->rec_len) {
 		error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ,
 		    off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred,
 		    NOCRED, &count, (struct thread *)0);
 		/*
 		 * Since we read MINDIRSIZ, residual must
 		 * be 0 unless we're at end of file.
 		 */
 		if (error || count != 0)
 			return (0);
 		/* avoid infinite loops */
 		if (dp->rec_len == 0)
 			return (0);
 		/* skip empty entries */
 		if (dp->inode == 0)
 			continue;
 		/* accept only "." and ".." */
 		namlen = dp->name_len;
 		if (namlen > 2)
 			return (0);
 		if (dp->name[0] != '.')
 			return (0);
 		/*
 		 * At this point namlen must be 1 or 2.
 		 * 1 implies ".", 2 implies ".." if second
 		 * char is also "."
 		 */
 		if (namlen == 1)
 			continue;
 		if (dp->name[1] == '.' && dp->inode == parentino)
 			continue;
 		return (0);
 	}
 	return (1);
 }
 
 /*
  * Check if source directory is in the path of the target directory.
  * Target is supplied locked, source is unlocked.
  * The target is always vput before returning.
  */
 int
 ext2_checkpath(source, target, cred)
 	struct inode *source, *target;
 	struct ucred *cred;
 {
 	struct vnode *vp;
 	int error, rootino, namlen;
 	struct dirtemplate dirbuf;
 
 	vp = ITOV(target);
 	if (target->i_number == source->i_number) {
 		error = EEXIST;
 		goto out;
 	}
 	rootino = ROOTINO;
 	error = 0;
 	if (target->i_number == rootino)
 		goto out;
 
 	for (;;) {
 		if (vp->v_type != VDIR) {
 			error = ENOTDIR;
 			break;
 		}
 		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf,
 			sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
 			IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0,
 			(struct thread *)0);
 		if (error != 0)
 			break;
 		namlen = dirbuf.dotdot_type;	/* like ufs little-endian */
 		if (namlen != 2 ||
 		    dirbuf.dotdot_name[0] != '.' ||
 		    dirbuf.dotdot_name[1] != '.') {
 			error = ENOTDIR;
 			break;
 		}
 		if (dirbuf.dotdot_ino == source->i_number) {
 			error = EINVAL;
 			break;
 		}
 		if (dirbuf.dotdot_ino == rootino)
 			break;
 		vput(vp);
 		if ((error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino,
 		    LK_EXCLUSIVE, &vp)) != 0) {
 			vp = NULL;
 			break;
 		}
 	}
 
 out:
 	if (error == ENOTDIR)
 		printf("checkpath: .. not a directory\n");
 	if (vp != NULL)
 		vput(vp);
 	return (error);
 }
Index: head/sys/gnu/fs/ext2fs/ext2_vfsops.c
===================================================================
--- head/sys/gnu/fs/ext2fs/ext2_vfsops.c	(revision 175201)
+++ head/sys/gnu/fs/ext2fs/ext2_vfsops.c	(revision 175202)
@@ -1,1157 +1,1157 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * Copyright (c) 1989, 1991, 1993, 1994	
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.8 (Berkeley) 4/18/94
  * $FreeBSD$
  */
 
 /*-
  * COPYRIGHT.INFO says this has some GPL'd code from ext2_super.c in it
  *
  *      This program is free software; you can redistribute it and/or modify
  *      it under the terms of the GNU General Public License as published by
  *      the Free Software Foundation; either version 2 of the License.
  *
  *      This program is distributed in the hope that it will be useful,
  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *      GNU General Public License for more details.
  *
  *      You should have received a copy of the GNU General Public License
  *      along with this program; if not, write to the Free Software
  *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/mutex.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <gnu/fs/ext2fs/ext2_mount.h>
 #include <gnu/fs/ext2fs/inode.h>
 
 #include <gnu/fs/ext2fs/fs.h>
 #include <gnu/fs/ext2fs/ext2_extern.h>
 #include <gnu/fs/ext2fs/ext2_fs.h>
 #include <gnu/fs/ext2fs/ext2_fs_sb.h>
 
 static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td);
 static int ext2_mountfs(struct vnode *, struct mount *, struct thread *);
 static int ext2_reload(struct mount *mp, struct thread *td);
 static int ext2_sbupdate(struct ext2mount *, int);
 
 static vfs_unmount_t		ext2_unmount;
 static vfs_root_t		ext2_root;
 static vfs_statfs_t		ext2_statfs;
 static vfs_sync_t		ext2_sync;
 static vfs_vget_t		ext2_vget;
 static vfs_fhtovp_t		ext2_fhtovp;
 static vfs_mount_t		ext2_mount;
 
 MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part");
 static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure");
 
 static struct vfsops ext2fs_vfsops = {
 	.vfs_fhtovp =		ext2_fhtovp,
 	.vfs_mount =		ext2_mount,
 	.vfs_root =		ext2_root,	/* root inode via vget */
 	.vfs_statfs =		ext2_statfs,
 	.vfs_sync =		ext2_sync,
 	.vfs_unmount =		ext2_unmount,
 	.vfs_vget =		ext2_vget,
 };
 
 VFS_SET(ext2fs_vfsops, ext2fs, 0);
 
 #define bsd_malloc malloc
 #define bsd_free free
 
 static int	ext2_check_sb_compat(struct ext2_super_block *es, struct cdev *dev,
 		    int ronly);
 static int	compute_sb_data(struct vnode * devvp,
 		    struct ext2_super_block * es, struct ext2_sb_info * fs);
 
 static const char *ext2_opts[] = { "from", "export", "acls", "exec",
     "noatime", "union", "suiddir", "multilabel", "nosymfollow",
     "noclusterr", "noclusterw", "force", NULL };
  
 /*
  * VFS Operations.
  *
  * mount system call
  *
  * Handles both a fresh mount and an update of an existing mount
  * (MNT_UPDATE).  An update may downgrade to read-only, reload the
  * in-core data (MNT_RELOAD) or upgrade to read-write; a fresh mount
  * looks up the device named by the "from" option and hands it to
  * ext2_mountfs().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 ext2_mount(mp, td)
 	struct mount *mp;
 	struct thread *td;
 {
 	struct vfsoptlist *opts;
 	struct vnode *devvp;
 	struct ext2mount *ump = 0;
 	struct ext2_sb_info *fs;
 	char *path, *fspec;
 	int error, flags, len;
 	mode_t accessmode;
 	struct nameidata nd, *ndp = &nd;
 
 	opts = mp->mnt_optnew;
 
 	/* Refuse the request if vfs_filteropt() objects to the options. */
 	if (vfs_filteropt(opts, ext2_opts))
 		return (EINVAL);
 
 	/*
 	 * NOTE(review): the result of this vfs_getopt() is not checked, so
 	 * path is assumed to always be supplied - confirm against caller.
 	 */
 	vfs_getopt(opts, "fspath", (void **)&path, NULL);
 	/* Double-check the length of path.. */
 	if (strlen(path) >= MAXMNTLEN - 1)
 		return (ENAMETOOLONG);
 
 	fspec = NULL;
 	error = vfs_getopt(opts, "from", (void **)&fspec, &len);
 	/* The device name, when given, must be NUL terminated. */
 	if (!error && fspec[len - 1] != '\0')
 		return (EINVAL);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOEXT2(mp);
 		fs = ump->um_e2fs;
 		error = 0;
 		/* Downgrade: currently read-write and "ro" was requested. */
 		if (fs->s_rd_only == 0 &&
 		    vfs_flagopt(opts, "ro", NULL, 0)) {
 			error = VFS_SYNC(mp, MNT_WAIT, td);
 			if (error)
 				return (error);
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			if (vfs_busy(mp, LK_NOWAIT, 0, td))
 				return (EBUSY);
 			error = ext2_flushfiles(mp, flags, td);
 			vfs_unbusy(mp, td);
 			/* Mark the on-disk state valid again if it was. */
 			if (!error && fs->s_wasvalid) {
 				fs->s_es->s_state |= EXT2_VALID_FS;
 				ext2_sbupdate(ump, MNT_WAIT);
 			}
 			/*
 			 * NOTE(review): the steps below run even when
 			 * ext2_flushfiles() failed; the error is only
 			 * returned further down - confirm this is intended.
 			 */
 			fs->s_rd_only = 1;
 			vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY);
 			DROP_GIANT();
 			g_topology_lock();
 			g_access(ump->um_cp, 0, -1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 		}
 		if (!error && (mp->mnt_flag & MNT_RELOAD))
 			error = ext2_reload(mp, td);
 		if (error)
 			return (error);
 		devvp = ump->um_devvp;
 		/* Upgrade: currently read-only and "ro" was not requested. */
 		if (fs->s_rd_only && !vfs_flagopt(opts, "ro", NULL, 0)) {
 			if (ext2_check_sb_compat(fs->s_es, devvp->v_rdev, 0))
 				return (EPERM);
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp, 0, td);
 				return (error);
 			}
 			VOP_UNLOCK(devvp, 0, td);
 			DROP_GIANT();
 			g_topology_lock();
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error)
 				return (error);
 
 			/* A filesystem not marked clean needs MNT_FORCE. */
 			if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 ||
 			    (fs->s_es->s_state & EXT2_ERROR_FS)) {
 				if (mp->mnt_flag & MNT_FORCE) {
 					printf(
 "WARNING: %s was not properly dismounted\n",
 					    fs->fs_fsmnt);
 				} else {
 					printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 					    fs->fs_fsmnt);
 					return (EPERM);
 				}
 			}
 			/* Clear the valid bit on disk while mounted r/w. */
 			fs->s_es->s_state &= ~EXT2_VALID_FS;
 			ext2_sbupdate(ump, MNT_WAIT);
 			fs->s_rd_only = 0;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		}
 		if (vfs_flagopt(opts, "export", NULL, 0)) {
 			/* Process export requests in vfs_mount.c. */
 			return (error);
 		}
 	}
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	if (fspec == NULL)
 		return (EINVAL);
 	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
 	if ((error = namei(ndp)) != 0)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 *
 	 * XXXRW: VOP_ACCESS() enough?
 	 */
 	accessmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accessmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		error = ext2_mountfs(devvp, mp, td);
 	} else {
 		/* An update must name the device that is already mounted. */
 		if (devvp != ump->um_devvp) {
 			vput(devvp);
 			return (EINVAL);	/* needs translation */
 		} else
 			vput(devvp);
 	}
 	/* ext2_mountfs() unlocked devvp, so only the reference is dropped. */
 	if (error) {
 		vrele(devvp);
 		return (error);
 	}
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	/*
 	 * Note that this strncpy() is ok because of a check at the start
 	 * of ext2_mount().
 	 */
 	strncpy(fs->fs_fsmnt, path, MAXMNTLEN);
 	fs->fs_fsmnt[MAXMNTLEN - 1] = '\0';
 	vfs_mountedfrom(mp, fspec);
 	return (0);
 }
 
 /*
  * Check that the data in the descriptor blocks make sense (this is
  * taken from ext2/super.c).  For every group the block bitmap, the
  * inode bitmap and the complete inode table have to lie inside the
  * range of blocks that belongs to the group.
  *
  * Returns 1 when every descriptor passes and 0 after reporting the
  * first one that does not.
  */
 static int
 ext2_check_descriptors(struct ext2_sb_info *sb)
 {
 	struct ext2_group_desc *gdp = NULL;
 	unsigned long first = sb->s_es->s_first_data_block;
 	int group, next_desc_block = 0;
 
 	/* ext2_debug ("Checking group descriptors"); */
 
 	group = 0;
 	while (group < sb->s_groups_count) {
 		/* time to move on to the next descriptor block? */
 		if ((group % EXT2_DESC_PER_BLOCK(sb)) == 0) {
 			gdp = (struct ext2_group_desc *)
 			    sb->s_group_desc[next_desc_block]->b_data;
 			next_desc_block++;
 		}
 
 		if (gdp->bg_block_bitmap < first ||
 		    gdp->bg_block_bitmap >=
 		    first + EXT2_BLOCKS_PER_GROUP(sb)) {
 			printf("ext2_check_descriptors: "
 			    "Block bitmap for group %d"
 			    " not in group (block %lu)!\n",
 			    group, (unsigned long)gdp->bg_block_bitmap);
 			return (0);
 		}
 
 		if (gdp->bg_inode_bitmap < first ||
 		    gdp->bg_inode_bitmap >=
 		    first + EXT2_BLOCKS_PER_GROUP(sb)) {
 			printf("ext2_check_descriptors: "
 			    "Inode bitmap for group %d"
 			    " not in group (block %lu)!\n",
 			    group, (unsigned long)gdp->bg_inode_bitmap);
 			return (0);
 		}
 
 		/* The table occupies s_itb_per_group consecutive blocks. */
 		if (gdp->bg_inode_table < first ||
 		    gdp->bg_inode_table + sb->s_itb_per_group >=
 		    first + EXT2_BLOCKS_PER_GROUP(sb)) {
 			printf("ext2_check_descriptors: "
 			    "Inode table for group %d"
 			    " not in group (block %lu)!\n",
 			    group, (unsigned long)gdp->bg_inode_table);
 			return (0);
 		}
 
 		first += EXT2_BLOCKS_PER_GROUP(sb);
 		gdp++;
 		group++;
 	}
 	return (1);
 }
 
 /*
  * Decide whether the superblock es can be handled by this code.  dev is
  * only used to name the device in the diagnostics; ronly says whether a
  * read-only mount is intended.
  *
  * Returns 1 (after printing the reason) for a wrong magic number, for
  * unsupported incompat features, or for unsupported ro_compat features
  * on a read-write mount.  Returns 0 when the superblock is acceptable.
  */
 static int
 ext2_check_sb_compat(es, dev, ronly)
 	struct ext2_super_block *es;
 	struct cdev *dev;
 	int ronly;
 {
 
 	if (es->s_magic != EXT2_SUPER_MAGIC) {
 		printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n",
 		    devtoname(dev), es->s_magic, EXT2_SUPER_MAGIC);
 		return (1);
 	}
 
 	/* The feature masks are only examined on newer revisions. */
 	if (es->s_rev_level <= EXT2_GOOD_OLD_REV)
 		return (0);
 
 	if (es->s_feature_incompat & ~EXT2_FEATURE_INCOMPAT_SUPP) {
 		printf(
 "WARNING: mount of %s denied due to unsupported optional features\n",
 		    devtoname(dev));
 		return (1);
 	}
 
 	if (!ronly &&
 	    (es->s_feature_ro_compat & ~EXT2_FEATURE_RO_COMPAT_SUPP)) {
 		printf(
 "WARNING: R/W mount of %s denied due to unsupported optional features\n",
 		    devtoname(dev));
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * this computes the fields of the  ext2_sb_info structure from the
  * data in the ext2_super_block structure read in
  *
  * It also allocates fs->s_group_desc, reads every group descriptor
  * block from devvp into it and validates the descriptors with
  * ext2_check_descriptors().
  *
  * Returns 0 on success and EIO when a descriptor block cannot be read
  * or the descriptors fail validation.
  */
 static int compute_sb_data(devvp, es, fs)
 	struct vnode * devvp;
 	struct ext2_super_block * es;
 	struct ext2_sb_info * fs;
 {
     int db_count, error;
     int i, j;
     int logic_sb_block = 1;	/* XXX for now */
 
 /* V() is a debugging aid; with "#if 1" it expands to nothing. */
 #if 1
 #define V(v)  
 #else
 #define V(v)  printf(#v"= %d\n", fs->v);
 #endif
 
     fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; 
     V(s_blocksize)
     fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size;
     V(s_bshift)
     fs->s_fsbtodb = es->s_log_block_size + 1;
     V(s_fsbtodb)
     fs->s_qbmask = fs->s_blocksize - 1;
     V(s_bmask)
     fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es);
     V(s_blocksize_bits)
     fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size;
     V(s_frag_size)
     /* s_frags_per_block is left untouched when s_frag_size is 0. */
     if (fs->s_frag_size)
 	fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size;
     V(s_frags_per_block)
     fs->s_blocks_per_group = es->s_blocks_per_group;
     V(s_blocks_per_group)
     fs->s_frags_per_group = es->s_frags_per_group;
     V(s_frags_per_group)
     fs->s_inodes_per_group = es->s_inodes_per_group;
     V(s_inodes_per_group)
     fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE;
     V(s_inodes_per_block)
     fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block;
     V(s_itb_per_group)
     fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc);
     V(s_desc_per_block)
     /* s_resuid / s_resgid ? */
     /* Number of block groups, rounded up. */
     fs->s_groups_count = (es->s_blocks_count -
 			  es->s_first_data_block +
 			  EXT2_BLOCKS_PER_GROUP(fs) - 1) /
 			 EXT2_BLOCKS_PER_GROUP(fs);
     V(s_groups_count)
     /* Number of blocks needed to hold all group descriptors, rounded up. */
     db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) /
 	EXT2_DESC_PER_BLOCK(fs);
     fs->s_db_per_group = db_count;
     V(s_db_per_group)
 
     /* One buffer pointer per descriptor block. */
     fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *),
 		M_EXT2MNT, M_WAITOK);
 
     /* adjust logic_sb_block */
     if(fs->s_blocksize > SBSIZE) 
 	/* Godmar thinks: if the blocksize is greater than 1024, then
 	   the superblock is logically part of block zero. 
 	 */
         logic_sb_block = 0;
     
     /* The descriptor blocks start right after the superblock's block. */
     for (i = 0; i < db_count; i++) {
 	error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), 
 		fs->s_blocksize, NOCRED, &fs->s_group_desc[i]);
 	if(error) {
 	    /*
 	     * NOTE(review): only the buffers read before the failing one
 	     * are released here, and with brelse() rather than ULCK_BUF -
 	     * confirm against the LCK_BUF/ULCK_BUF definitions.
 	     */
 	    for (j = 0; j < i; j++)
 		brelse(fs->s_group_desc[j]);
 	    bsd_free(fs->s_group_desc, M_EXT2MNT);
 	    printf("EXT2-fs: unable to read group descriptors (%d)\n", error);
 	    return EIO;
 	}
 	LCK_BUF(fs->s_group_desc[i])
     }
     if(!ext2_check_descriptors(fs)) {
 	    for (j = 0; j < db_count; j++)
 		    ULCK_BUF(fs->s_group_desc[j])
 	    bsd_free(fs->s_group_desc, M_EXT2MNT);
 	    printf("EXT2-fs: (ext2_check_descriptors failure) "
 		   "unable to read group descriptors\n");
 	    return EIO;
     }
 
     /* Start out with no inode or block bitmaps loaded. */
     for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) {
 	    fs->s_inode_bitmap_number[i] = 0;
 	    fs->s_inode_bitmap[i] = NULL;
 	    fs->s_block_bitmap_number[i] = 0;
 	    fs->s_block_bitmap[i] = NULL;
     }
     fs->s_loaded_inode_bitmaps = 0;
     fs->s_loaded_block_bitmaps = 0;
     /* Without the LARGE_FILE feature, file sizes are capped at 2^31 - 1. */
     if (es->s_rev_level == EXT2_GOOD_OLD_REV || (es->s_feature_ro_compat &
         EXT2_FEATURE_RO_COMPAT_LARGE_FILE) == 0)
 	fs->fs_maxfilesize = 0x7fffffff;
     else
 	fs->fs_maxfilesize = 0x7fffffffffffffff;
     return 0;
 }
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
  * the root filesystem and finding things to fix). The filesystem must
  * be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
  *	2) re-read superblock from disk.
  *	3) re-read summary information from disk.
  *	4) invalidate all inactive vnodes.
  *	5) invalidate all cached file data.
  *	6) re-read inode data for all active vnodes.
  *
  * NOTE(review): the "Step N" comments in the body do not line up with
  * this list from step 4 onwards.
  *
  * Returns EINVAL when the mount is not read-only, EIO when the
  * superblock read back is not acceptable, the error from bread or
  * compute_sb_data, or 0 on success.
  */
 static int
 ext2_reload(struct mount *mp, struct thread *td)
 {
 	struct vnode *vp, *mvp, *devvp;
 	struct inode *ip;
 	struct buf *bp;
 	struct ext2_super_block * es;
 	struct ext2_sb_info *fs;
 	int error;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		return (EINVAL);
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOEXT2(mp)->um_devvp;
-	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	if (vinvalbuf(devvp, 0, td, 0, 0) != 0)
 		panic("ext2_reload: dirty1");
 	VOP_UNLOCK(devvp, 0, td);
 
 	/*
 	 * Step 2: re-read superblock from disk.
 	 * constants have been adjusted for ext2
 	 */
 	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
 		return (error);
 	es = (struct ext2_super_block *)bp->b_data;
 	if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) {
 		brelse(bp);
 		return (EIO);		/* XXX needs translation */
 	}
 	fs = VFSTOEXT2(mp)->um_e2fs;
 	bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block));
 
 	/*
 	 * NOTE(review): compute_sb_data() allocates a new s_group_desc
 	 * array; the previous array does not appear to be released on
 	 * this path - confirm.
 	 */
 	if((error = compute_sb_data(devvp, es, fs)) != 0) {
 		brelse(bp);
 		return error;
 	}
 #ifdef UNKLAR
 	if (fs->fs_sbsize < SBSIZE)
 		bp->b_flags |= B_INVAL;
 #endif
 	brelse(bp);
 
 	/* Visit every vnode of the mount; restart if a vget() fails. */
 loop:
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		/* Skip vnodes flagged VI_DOOMED. */
 		if (vp->v_iflag & VI_DOOMED) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		MNT_IUNLOCK(mp);
 		/*
 		 * Step 4: invalidate all cached file data.
 		 */
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			goto loop;
 		}
 		if (vinvalbuf(vp, 0, td, 0, 0))
 			panic("ext2_reload: dirty2");
 		/*
 		 * Step 5: re-read inode data for all active vnodes.
 		 */
 		ip = VTOI(vp);
 		error =
 		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		    (int)fs->s_blocksize, NOCRED, &bp);
 		if (error) {
 			VOP_UNLOCK(vp, 0, td);
 			vrele(vp);
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			return (error);
 		}
 		/* Convert the on-disk inode within the block into ip. */
 		ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data +
 		    EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), ip);
 		brelse(bp);
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		/* The iteration continues with the mount interlock held. */
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Common code for mount and mountroot
  *
  * Opens devvp through GEOM, reads and checks the superblock, allocates
  * the ext2mount / ext2_sb_info / ext2_super_block structures and wires
  * them into mp.  devvp is unlocked here, whether or not the open
  * succeeds.
  *
  * Returns 0 on success.  On failure an errno value is returned after
  * the GEOM consumer has been closed and any structures allocated here
  * have been freed.
  */
 static int
 ext2_mountfs(devvp, mp, td)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct thread *td;
 {
 	struct ext2mount *ump;
 	struct buf *bp;
 	struct ext2_sb_info *fs;
 	struct ext2_super_block * es;
 	struct cdev *dev = devvp->v_rdev;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 	int error;
 	int ronly;
 
 	ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0);
 	/* XXX: use VOP_ACCESS to check FS perms */
 	DROP_GIANT();
 	g_topology_lock();
 	/* The last argument requests write access unless mounting r/o. */
 	error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 
 	/* XXX: should we check for some sectorsize or 512 instead? */
 	/* SBSIZE must be a non-zero multiple of the provider's sector size. */
 	if (((SBSIZE % cp->provider->sectorsize) != 0) ||
 	    (SBSIZE < cp->provider->sectorsize)) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
 		PICKUP_GIANT();
 		return (EINVAL);
 	}
 
 	bo = &devvp->v_bufobj;
 	bo->bo_private = cp;
 	bo->bo_ops = g_vfs_bufops;
 	/* Take the device's I/O size limit if it has one, capped at MAXPHYS. */
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	/* bp and ump are cleared so the "out" path knows what to undo. */
 	bp = NULL;
 	ump = NULL;
 	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
 		goto out;
 	es = (struct ext2_super_block *)bp->b_data;
 	if (ext2_check_sb_compat(es, dev, ronly) != 0) {
 		error = EINVAL;		/* XXX needs translation */
 		goto out;
 	}
 	/* An unclean filesystem is only accepted read-only or with MNT_FORCE. */
 	if ((es->s_state & EXT2_VALID_FS) == 0 ||
 	    (es->s_state & EXT2_ERROR_FS)) {
 		if (ronly || (mp->mnt_flag & MNT_FORCE)) {
 			printf(
 "WARNING: Filesystem was not properly dismounted\n");
 		} else {
 			printf(
 "WARNING: R/W mount denied.  Filesystem is not clean - run fsck\n");
 			error = EPERM;
 			goto out;
 		}
 	}
 	ump = bsd_malloc(sizeof *ump, M_EXT2MNT, M_WAITOK);
 	bzero((caddr_t)ump, sizeof *ump);
 	/* I don't know whether this is the right strategy. Note that
 	   we dynamically allocate both an ext2_sb_info and an ext2_super_block
 	   while Linux keeps the super block in a locked buffer
 	 */
 	ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), 
 		M_EXT2MNT, M_WAITOK);
 	ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), 
 		M_EXT2MNT, M_WAITOK);
 	/* Keep a private copy of the superblock; bp is released below. */
 	bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block));
 	if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs)))
 		goto out;
 	/*
 	 * We don't free the group descriptors allocated by compute_sb_data()
 	 * until ext2_unmount().  This is OK since the mount will succeed.
 	 */
 	brelse(bp);
 	bp = NULL;
 	fs = ump->um_e2fs;
 	fs->s_rd_only = ronly;	/* ronly is set according to mnt_flags */
 	/* if the fs is not mounted read-only, make sure the super block is 
 	   always written back on a sync()
 	 */
 	/* Remember whether the filesystem was marked valid when mounted. */
 	fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 1 : 0;
 	if (ronly == 0) {
 		fs->s_dirt = 1;		/* mark it modified */
 		fs->s_es->s_state &= ~EXT2_VALID_FS;	/* set fs invalid */
 	}
 	mp->mnt_data = ump;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
 	ump->um_bo = &devvp->v_bufobj;
 	ump->um_cp = cp;
 	/* setting those two parameters allowed us to use
 	   ufs_bmap w/o changes !
 	*/
 	ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs);
 	ump->um_bptrtodb = fs->s_es->s_log_block_size + 1;
 	ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs);
 	/* On a read-write mount, push the cleared valid bit out to disk. */
 	if (ronly == 0) 
 		ext2_sbupdate(ump, MNT_WAIT);
 	return (0);
 out:
 	/* Failure: undo whatever had been set up, in reverse order. */
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
 		PICKUP_GIANT();
 	}
 	if (ump) {
 		bsd_free(ump->um_e2fs->s_es, M_EXT2MNT);
 		bsd_free(ump->um_e2fs, M_EXT2MNT);
 		bsd_free(ump, M_EXT2MNT);
 		mp->mnt_data = NULL;
 	}
 	return (error);
 }
 
 /*
  * unmount system call
  */
 static int
 ext2_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	struct ext2mount *ump;
 	struct ext2_sb_info *fs;
 	int error, flags, ronly, i;
 
 	flags = 0;
 	if (mntflags & MNT_FORCE) {
 		if (mp->mnt_flag & MNT_ROOTFS)
 			return (EINVAL);
 		flags |= FORCECLOSE;
 	}
 	if ((error = ext2_flushfiles(mp, flags, td)) != 0)
 		return (error);
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	ronly = fs->s_rd_only;
 	if (ronly == 0) {
 		if (fs->s_wasvalid)
 			fs->s_es->s_state |= EXT2_VALID_FS;
 		ext2_sbupdate(ump, MNT_WAIT);
 	}
 
 	/* release buffers containing group descriptors */
 	for(i = 0; i < fs->s_db_per_group; i++) 
 		ULCK_BUF(fs->s_group_desc[i])
 	bsd_free(fs->s_group_desc, M_EXT2MNT);
 
 	/* release cached inode/block bitmaps */
         for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++)
                 if (fs->s_inode_bitmap[i])
 			ULCK_BUF(fs->s_inode_bitmap[i])
 
         for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++)
                 if (fs->s_block_bitmap[i])
 			ULCK_BUF(fs->s_block_bitmap[i])
 
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(ump->um_cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	vrele(ump->um_devvp);
 	bsd_free(fs->s_es, M_EXT2MNT);
 	bsd_free(fs, M_EXT2MNT);
 	bsd_free(ump, M_EXT2MNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Flush out all the files in a filesystem.
  */
 static int
 ext2_flushfiles(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	int error;
 
 	error = vflush(mp, 0, flags, td);
 	return (error);
 }
 
 /*
  * Get file system statistics.
  * taken from ext2/super.c ext2_statfs
  */
 static int
 ext2_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
         unsigned long overhead;
 	struct ext2mount *ump;
 	struct ext2_sb_info *fs;
 	struct ext2_super_block *es;
 	int i, nsb;
 
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	es = fs->s_es;
 
 	if (es->s_magic != EXT2_SUPER_MAGIC)
 		panic("ext2_statfs - magic number spoiled");
 
 	/*
 	 * Compute the overhead (FS structures)
 	 */
 	if (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER) {
 		nsb = 0;
 		for (i = 0 ; i < fs->s_groups_count; i++)
 			if (ext2_group_sparse(i))
 				nsb++;
 	} else
 		nsb = fs->s_groups_count;
 	overhead = es->s_first_data_block + 
 	    /* Superblocks and block group descriptors: */
 	    nsb * (1 + fs->s_db_per_group) +
 	    /* Inode bitmap, block bitmap, and inode table: */
 	    fs->s_groups_count * (1 + 1 + fs->s_itb_per_group);
 
 	sbp->f_bsize = EXT2_FRAG_SIZE(fs);	
 	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
 	sbp->f_blocks = es->s_blocks_count - overhead;
 	sbp->f_bfree = es->s_free_blocks_count; 
 	sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count; 
 	sbp->f_files = es->s_inodes_count; 
 	sbp->f_ffree = es->s_free_inodes_count; 
 	return (0);
 }
 
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
  * initiate the writing of the super block if it has been modified.
  *
  * Note: we are always called with the filesystem marked `MPBUSY'.
  */
 static int
 ext2_sync(mp, waitfor, td)
 	struct mount *mp;
 	int waitfor;
 	struct thread *td;
 {
 	struct vnode *mvp, *vp;
 	struct inode *ip;
 	struct ext2mount *ump = VFSTOEXT2(mp);
 	struct ext2_sb_info *fs;
 	int error, allerror = 0;
 
 	fs = ump->um_e2fs;
 	if (fs->s_dirt != 0 && fs->s_rd_only != 0) {		/* XXX */
 		printf("fs = %s\n", fs->fs_fsmnt);
 		panic("ext2_sync: rofs mod");
 	}
 	/*
 	 * Write back each (modified) inode.
 	 */
 	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		MNT_IUNLOCK(mp);
 		ip = VTOI(vp);
 		if ((ip->i_flag &
 		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
 		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY)) {
 			VI_UNLOCK(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
 		if (error) {
 			MNT_ILOCK(mp);
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	/*
 	 * Force stale file system control information to be flushed.
 	 */
 	if (waitfor != MNT_LAZY) {
-		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(ump->um_devvp, 0, td);
 	}
 	/*
 	 * Write back modified superblock.
 	 */
 	if (fs->s_dirt != 0) {
 		fs->s_dirt = 0;
 		fs->s_es->s_wtime = time_second;
 		if ((error = ext2_sbupdate(ump, waitfor)) != 0)
 			allerror = error;
 	}
 	return (allerror);
 }
 
 /*
  * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
  * in from disk.  If it is in core, wait for the lock bit to clear, then
  * return the inode locked.  Detection and handling of mount points must be
  * done by the calling routine.
  */
 static int
 ext2_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 	struct ext2_sb_info *fs;
 	struct inode *ip;
 	struct ext2mount *ump;
 	struct buf *bp;
 	struct vnode *vp;
 	struct cdev *dev;
 	int i, error;
 	int used_blocks;
 	struct thread *td;
 
 	td = curthread;
 	error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	ump = VFSTOEXT2(mp);
 	dev = ump->um_dev;
 
 	/*
 	 * If this MALLOC() is performed after the getnewvnode()
 	 * it might block, leaving a vnode with a NULL v_data to be
 	 * found by ext2_sync() if a sync happens to fire right then,
 	 * which will cause a panic because ext2_sync() blindly
 	 * dereferences vp->v_data (as well it should).
 	 */
 	ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO);
 
 	/* Allocate a new vnode/inode. */
 	if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) {
 		*vpp = NULL;
 		free(ip, M_EXT2NODE);
 		return (error);
 	}
 	vp->v_data = ip;
 	ip->i_vnode = vp;
 	ip->i_e2fs = fs = ump->um_e2fs;
 	ip->i_number = ino;
 
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		free(ip, M_EXT2NODE);
 		*vpp = NULL;
 		return (error);
 	}
 	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	/* Read in the disk contents for the inode, copy into the inode. */
 #if 0
 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino)));
 #endif
 	if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
 	    (int)fs->s_blocksize, NOCRED, &bp)) != 0) {
 		/*
 		 * The inode does not contain anything useful, so it would
 		 * be misleading to leave it on its hash chain. With mode
 		 * still zero, it will be unlinked and returned to the free
 		 * list by vput().
 		 */
 		vput(vp);
 		brelse(bp);
 		*vpp = NULL;
 		return (error);
 	}
 	/* convert ext2 inode to dinode */
 	ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE *
 			ino_to_fsbo(fs, ino)), ip);
 	ip->i_block_group = ino_to_cg(fs, ino);
 	ip->i_next_alloc_block = 0;
 	ip->i_next_alloc_goal = 0;
 	ip->i_prealloc_count = 0;
 	ip->i_prealloc_block = 0;
         /* now we want to make sure that block pointers for unused
            blocks are zeroed out - ext2_balloc depends on this 
 	   although for regular files and directories only
 	*/
 	if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) {
 		used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize;
 		for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++)
 			ip->i_db[i] = 0;
 	}
 /*
 	ext2_print_inode(ip);
 */
 	brelse(bp);
 
 	/*
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
 	if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) {
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	/*
 	 * Finish inode initialization now that aliasing has been resolved.
 	 */
 	ip->i_devvp = ump->um_devvp;
 	/*
 	 * Set up a generation number for this inode if it does not
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
 		ip->i_gen = random() / 2 + 1;
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
 			ip->i_flag |= IN_MODIFIED;
 	}
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
  * - call ext2_vget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the given client host has export rights and return
  *   those rights via. exflagsp and credanonp
  */
 static int
 ext2_fhtovp(mp, fhp, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	struct vnode **vpp;
 {
 	struct inode *ip;
 	struct ufid *ufhp;
 	struct vnode *nvp;
 	struct ext2_sb_info *fs;
 	int error;
 
 	ufhp = (struct ufid *)fhp;
 	fs = VFSTOEXT2(mp)->um_e2fs;
 	if (ufhp->ufid_ino < ROOTINO ||
 	    ufhp->ufid_ino > fs->s_groups_count * fs->s_es->s_inodes_per_group)
 		return (ESTALE);
 
 	error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
 	if (error) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	ip = VTOI(nvp);
 	if (ip->i_mode == 0 ||
 	    ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) {
 		vput(nvp);
 		*vpp = NULLVP;
 		return (ESTALE);
 	}
 	*vpp = nvp;
 	vnode_create_vobject(*vpp, 0, curthread);
 	return (0);
 }
 
 /*
  * Write a superblock and associated information back to disk.
  */
 static int
 ext2_sbupdate(mp, waitfor)
 	struct ext2mount *mp;
 	int waitfor;
 {
 	struct ext2_sb_info *fs = mp->um_e2fs;
 	struct ext2_super_block *es = fs->s_es;
 	struct buf *bp;
 	int error = 0;
 /*
 printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no");
 */
 	bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0);
 	bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block));
 	if (waitfor == MNT_WAIT)
 		error = bwrite(bp);
 	else
 		bawrite(bp);
 
 	/*
 	 * The buffers for group descriptors, inode bitmaps and block bitmaps
 	 * are not busy at this point and are (hopefully) written by the
 	 * usual sync mechanism. No need to write them here
 		 */
 
 	return (error);
 }
 
 /*
  * Return the root of a filesystem.
  */
 static int
 ext2_root(mp, flags, vpp, td)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 	struct thread *td;
 {
 	struct vnode *nvp;
 	int error;
 
 	error = VFS_VGET(mp, (ino_t)ROOTINO, LK_EXCLUSIVE, &nvp);
 	if (error)
 		return (error);
 	*vpp = nvp;
 	return (0);
 }
Index: head/sys/gnu/fs/ext2fs/ext2_vnops.c
===================================================================
--- head/sys/gnu/fs/ext2fs/ext2_vnops.c	(revision 175201)
+++ head/sys/gnu/fs/ext2fs/ext2_vnops.c	(revision 175202)
@@ -1,1775 +1,1775 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
  *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
  * $FreeBSD$
  */
 
 #include "opt_suiddir.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/lockf.h>
 #include <sys/event.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/fifofs/fifo.h>
 
 #include <sys/signalvar.h>
 #include <ufs/ufs/dir.h>
 
 #include <gnu/fs/ext2fs/inode.h>
 #include <gnu/fs/ext2fs/ext2_mount.h>
 #include <gnu/fs/ext2fs/ext2_fs_sb.h>
 #include <gnu/fs/ext2fs/fs.h>
 #include <gnu/fs/ext2fs/ext2_extern.h>
 #include <gnu/fs/ext2fs/ext2_fs.h>
 
 static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *);
 
 static vop_access_t	ext2_access;
 static vop_advlock_t	ext2_advlock;
 static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *,
     struct thread *);
 static vop_close_t	ext2_close;
 static vop_create_t	ext2_create;
 static vop_fsync_t	ext2_fsync;
 static vop_getattr_t	ext2_getattr;
 static vop_kqfilter_t	ext2_kqfilter;
 static vop_link_t	ext2_link;
 static vop_mkdir_t	ext2_mkdir;
 static vop_mknod_t	ext2_mknod;
 static vop_open_t	ext2_open;
 static vop_pathconf_t	ext2_pathconf;
 static vop_print_t	ext2_print;
 static vop_read_t	ext2_read;
 static vop_readlink_t	ext2_readlink;
 static vop_remove_t	ext2_remove;
 static vop_rename_t	ext2_rename;
 static vop_rmdir_t	ext2_rmdir;
 static vop_setattr_t	ext2_setattr;
 static vop_strategy_t	ext2_strategy;
 static vop_symlink_t	ext2_symlink;
 static vop_write_t	ext2_write;
 static vop_vptofh_t	ext2_vptofh;
 static vop_close_t	ext2fifo_close;
 static vop_kqfilter_t	ext2fifo_kqfilter;
 static int filt_ext2read(struct knote *kn, long hint);
 static int filt_ext2write(struct knote *kn, long hint);
 static int filt_ext2vnode(struct knote *kn, long hint);
 static void filt_ext2detach(struct knote *kn);
 
 /* Global vfs data structures for ext2. */
 struct vop_vector ext2_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		ext2_access,
 	.vop_advlock =		ext2_advlock,
 	.vop_bmap =		ext2_bmap,
 	.vop_cachedlookup =	ext2_lookup,
 	.vop_close =		ext2_close,
 	.vop_create =		ext2_create,
 	.vop_fsync =		ext2_fsync,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_link =		ext2_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		ext2_mkdir,
 	.vop_mknod =		ext2_mknod,
 	.vop_open =		ext2_open,
 	.vop_pathconf =		ext2_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_kqfilter =		ext2_kqfilter,
 	.vop_print =		ext2_print,
 	.vop_read =		ext2_read,
 	.vop_readdir =		ext2_readdir,
 	.vop_readlink =		ext2_readlink,
 	.vop_reallocblks =	ext2_reallocblks,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_remove =		ext2_remove,
 	.vop_rename =		ext2_rename,
 	.vop_rmdir =		ext2_rmdir,
 	.vop_setattr =		ext2_setattr,
 	.vop_strategy =		ext2_strategy,
 	.vop_symlink =		ext2_symlink,
 	.vop_write =		ext2_write,
 	.vop_vptofh =		ext2_vptofh,
 };
 
 struct vop_vector ext2_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		ext2_access,
 	.vop_close =		ext2fifo_close,
 	.vop_fsync =		ext2_fsync,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_kqfilter =		ext2fifo_kqfilter,
 	.vop_print =		ext2_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_setattr =		ext2_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_vptofh =		ext2_vptofh,
 };
 
 #include <gnu/fs/ext2fs/ext2_readwrite.c>
 
 /*
  * A virgin directory (no blushing please).
  * Note that the type and namlen fields are reversed relative to ext2.
  * Also, we don't use `struct odirtemplate', since it would just cause
  * endianness problems.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, 1, EXT2_FT_DIR, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".."
 };
 static struct dirtemplate omastertemplate = {
 	0, 12, 1, EXT2_FT_UNKNOWN, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".."
 };
 
 void
 ext2_itimes(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ip = VTOI(vp);
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 	if ((vp->v_type == VBLK || vp->v_type == VCHR))
 		ip->i_flag |= IN_LAZYMOD;
 	else
 		ip->i_flag |= IN_MODIFIED;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 		vfs_timestamp(&ts);
 		if (ip->i_flag & IN_ACCESS) {
 			ip->i_atime = ts.tv_sec;
 			ip->i_atimensec = ts.tv_nsec;
 		}
 		if (ip->i_flag & IN_UPDATE) {
 			ip->i_mtime = ts.tv_sec;
 			ip->i_mtimensec = ts.tv_nsec;
 			ip->i_modrev++;
 		}
 		if (ip->i_flag & IN_CHANGE) {
 			ip->i_ctime = ts.tv_sec;
 			ip->i_ctimensec = ts.tv_nsec;
 		}
 	}
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Create a regular file
  */
 static int
 ext2_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	error =
 	    ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	return (0);
 }
 
 static int
 ext2_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Update the times on the inode.
  */
 static int
 ext2_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ext2_itimes(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 static int
 ext2_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	mode_t mode = ap->a_mode;
 	int error;
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts on read-only file systems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the file system.
 	 */
 	if (mode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* If immutable bit set, nobody gets to write it. */
 	if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 	error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 	    ap->a_mode, ap->a_cred, NULL);
 	return (error);
 }
 
 static int
 ext2_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	ext2_itimes(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ip->i_devvp->v_rdev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_nlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	vap->va_rdev = ip->i_rdev;
 	vap->va_size = ip->i_size;
 	vap->va_atime.tv_sec = ip->i_atime;
 	vap->va_atime.tv_nsec = ip->i_atimensec;
 	vap->va_mtime.tv_sec = ip->i_mtime;
 	vap->va_mtime.tv_nsec = ip->i_mtimensec;
 	vap->va_ctime.tv_sec = ip->i_ctime;
 	vap->va_ctime.tv_nsec = ip->i_ctimensec;
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_bytes = dbtob((u_quad_t)ip->i_blocks);
 	vap->va_type = IFTOVT(ip->i_mode);
 	vap->va_filerev = ip->i_modrev;
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  */
 static int
 ext2_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = ap->a_td;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes and privileged processes in
 		 * jail() are not permitted to unset system flags, or
 		 * modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 			if (ip->i_flags
 			    & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			ip->i_flags = vap->va_flags;
 		} else {
 			if (ip->i_flags
 			    & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    (vap->va_flags & UF_SETTABLE) != vap->va_flags)
 				return (EPERM);
 			ip->i_flags &= SF_SETTABLE;
 			ip->i_flags |= (vap->va_flags & UF_SETTABLE);
 		}
 		ip->i_flag |= IN_CHANGE;
 		if (vap->va_flags & (IMMUTABLE | APPEND))
 			return (0);
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * Disallow write attempts on read-only file systems;
 		 * unless the file is a socket, fifo, or a block or
 		 * character device resident on the file system.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 		if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * From utimes(2):
 		 * If times is NULL, ... The caller must be the owner of
 		 * the file, have permission to write the file, or be the
 		 * super-user.
 		 * If times is non-NULL, ... The caller must be the owner of
 		 * the file or be the super-user.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, cred, td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL)
 			ip->i_flag |= IN_ACCESS;
 		if (vap->va_mtime.tv_sec != VNOVAL)
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		ext2_itimes(vp);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_atime = vap->va_atime.tv_sec;
 			ip->i_atimensec = vap->va_atime.tv_nsec;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_mtime = vap->va_mtime.tv_sec;
 			ip->i_mtimensec = vap->va_mtime.tv_nsec;
 		}
 		error = ext2_update(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ext2_chmod(vp, mode, cred, td)
 	struct vnode *vp;
 	int mode;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		error = priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0);
 		if (error)
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (error)
 			return (error);
 	}
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	ip->i_flag |= IN_CHANGE;
 	return (0);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  */
 static int
 ext2_chown(vp, uid, gid, cred, td)
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file
 	 * to a group of which we are not a member, the caller must
 	 * have privilege.
 	 */
 	if (uid != ip->i_uid || (gid != ip->i_gid &&
 	    !groupmember(gid, cred))) {
 		error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
 		if (error)
 			return (error);
 	}
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 	ip->i_gid = gid;
 	ip->i_uid = uid;
 	ip->i_flag |= IN_CHANGE;
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0) != 0)
 			ip->i_mode &= ~(ISUID | ISGID);
 	}
 	return (0);
 }
 
 /*
  * Synch an open file.
  */
 /* ARGSUSED */
 static int
 ext2_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 	ext2_discard_prealloc(VTOI(ap->a_vp));
 
 	vop_stdfsync(ap);
 
 	return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT));
 }
 
 /*
  * Mknod vnode call
  */
 /* ARGSUSED */
 static int
 ext2_mknod(ap)
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp = ap->a_vpp;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	if (vap->va_rdev != VNOVAL) {
 		/*
 		 * Want to be able to use this to make badblock
 		 * inodes, so don't truncate the dev number.
 		 */
 		ip->i_rdev = vap->va_rdev;
 	}
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.	 XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		*vpp = NULL;
 		return (error);
 	}
 	return (0);
 }
 
 static int
 ext2_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct inode *ip;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	int error;
 
 	ip = VTOI(vp);
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(dvp)->i_flags & APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	error = ext2_dirremove(dvp, ap->a_cnp);
 	if (error == 0) {
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
 out:
 	return (error);
 }
 
 /*
  * link vnode call
  */
 static int
 ext2_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip;
 	int error;
 
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_link: no name");
 #endif
 	if (tdvp->v_mount != vp->v_mount) {
 		error = EXDEV;
 		goto out;
 	}
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	error = ext2_update(vp, 1);
 	if (!error)
 		error = ext2_direnter(ip, tdvp, cnp);
 	if (error) {
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
 out:
 	return (error);
 }
 
 /*
  * Rename system call.
  *   See comments in sys/ufs/ufs/ufs_vnops.c
  */
 static int
 ext2_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct inode *ip, *xp, *dp;
 	struct dirtemplate dirbuf;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0;
 	u_char namlen;
 
 #ifdef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ext2_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto abortit;
 	}
 
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  Temporarily just warn if they do.
 	 */
 	if (fvp == tvp) {
 		printf("ext2_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto abortit;
 	}
 
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0)
+	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto abortit;
 	dp = VTOI(fdvp);
 	ip = VTOI(fvp);
  	if (ip->i_nlink >= LINK_MAX) {
  		VOP_UNLOCK(fvp, 0, td);
  		error = EMLINK;
  		goto abortit;
  	}
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (dp->i_flags & APPEND)) {
 		VOP_UNLOCK(fvp, 0, td);
 		error = EPERM;
 		goto abortit;
 	}
 	if ((ip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
 		    (ip->i_flag & IN_RENAME)) {
 			VOP_UNLOCK(fvp, 0, td);
 			error = EINVAL;
 			goto abortit;
 		}
 		ip->i_flag |= IN_RENAME;
 		oldparent = dp->i_number;
 		doingdirectory++;
 	}
 	vrele(fdvp);
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTOI(tdvp);
 	xp = NULL;
 	if (tvp)
 		xp = VTOI(tvp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	if ((error = ext2_update(fvp, 1)) != 0) {
 		VOP_UNLOCK(fvp, 0, td);
 		goto bad;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory heirarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to checkpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0, td);
 	if (oldparent != dp->i_number)
 		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		error = ext2_checkpath(ip, dp, tcnp->cn_cred);
 		if (error)
 			goto out;
 		VREF(tdvp);
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		vrele(tdvp);
 		dp = VTOI(tdvp);
 		xp = NULL;
 		if (tvp)
 			xp = VTOI(tvp);
 	}
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (xp == NULL) {
 		if (dp->i_devvp != ip->i_devvp)
 			panic("ext2_rename: EXDEV");
 		/*
 		 * Account for ".." in new directory.
 		 * When source and destination have the same
 		 * parent we don't fool with the link count.
 		 */
 		if (doingdirectory && newparent) {
 			if ((nlink_t)dp->i_nlink >= LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
 			dp->i_nlink++;
 			dp->i_flag |= IN_CHANGE;
 			error = ext2_update(tdvp, 1);
 			if (error)
 				goto bad;
 		}
 		error = ext2_direnter(ip, tdvp, tcnp);
 		if (error) {
 			if (doingdirectory && newparent) {
 				dp->i_nlink--;
 				dp->i_flag |= IN_CHANGE;
 				(void)ext2_update(tdvp, 1);
 			}
 			goto bad;
 		}
 		vput(tdvp);
 	} else {
 		if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp)
 		       panic("ext2_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (xp->i_number == ip->i_number)
 			panic("ext2_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the user must
 		 * own the parent directory, or the destination of the rename,
 		 * otherwise the destination may not be changed (except by
 		 * root). This implements append-only directories.
 		 */
 		if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
 		    tcnp->cn_cred->cr_uid != dp->i_uid &&
 		    xp->i_uid != tcnp->cn_cred->cr_uid) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((xp->i_mode&IFMT) == IFDIR) {
 			if (! ext2_dirempty(xp, dp->i_number, tcnp->cn_cred) || 
 			    xp->i_nlink > 2) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = ext2_dirrewrite(dp, ip, tcnp);
 		if (error)
 			goto bad;
 		/*
 		 * If the target directory is in the same
 		 * directory as the source directory,
 		 * decrement the link count on the parent
 		 * of the target directory.
 		 */
 		if (doingdirectory && !newparent) {
 		       dp->i_nlink--;
 		       dp->i_flag |= IN_CHANGE;
 		}
 		vput(tdvp);
 		/*
 		 * Adjust the link count of the target to
 		 * reflect the dirrewrite above.  If this is
 		 * a directory it is empty and there are
 		 * no links to it, so we can squash the inode and
 		 * any space associated with it.  We disallowed
 		 * renaming over top of a directory with links to
 		 * it above, as the remaining link would point to
 		 * a directory without "." or ".." entries.
 		 */
 		xp->i_nlink--;
 		if (doingdirectory) {
 			if (--xp->i_nlink != 0)
 				panic("ext2_rename: linked directory");
 			error = ext2_truncate(tvp, (off_t)0, IO_SYNC,
 			    tcnp->cn_cred, tcnp->cn_thread);
 		}
 		xp->i_flag |= IN_CHANGE;
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * 3) Unlink the source.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	VREF(fdvp);
 	error = relookup(fdvp, &fvp, fcnp);
 	if (error == 0)
 		vrele(fdvp);
 	if (fvp != NULL) {
 		xp = VTOI(fvp);
 		dp = VTOI(fdvp);
 	} else {
 		/*
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
 			panic("ext2_rename: lost dir entry");
 		vrele(ap->a_fvp);
 		return (0);
 	}
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
 	 * is a directory then it cannot have been rmdir'ed; its link
 	 * count of three would cause a rmdir to fail with ENOTEMPTY.
 	 * The IN_RENAME flag ensures that it cannot be moved by another
 	 * rename.
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
 			panic("ext2_rename: lost dir entry");
 	} else {
 		/*
 		 * If the source is a directory with a
 		 * new parent, the link count of the old
 		 * parent directory must be decremented
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
 			dp->i_nlink--;
 			dp->i_flag |= IN_CHANGE;
 			error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf,
 				sizeof (struct dirtemplate), (off_t)0,
 				UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 				tcnp->cn_cred, NOCRED, (int *)0,
 				(struct thread *)0);
 			if (error == 0) {
 				/* Like ufs little-endian: */
 				namlen = dirbuf.dotdot_type;
 				if (namlen != 2 ||
 				    dirbuf.dotdot_name[0] != '.' ||
 				    dirbuf.dotdot_name[1] != '.') {
 					ext2_dirbad(xp, (doff_t)12,
 					    "rename: mangled dir");
 				} else {
 					dirbuf.dotdot_ino = newparent;
 					(void) vn_rdwr(UIO_WRITE, fvp,
 					    (caddr_t)&dirbuf,
 					    sizeof (struct dirtemplate),
 					    (off_t)0, UIO_SYSSPACE,
 					    IO_NODELOCKED | IO_SYNC |
 					    IO_NOMACCHECK, tcnp->cn_cred,
 					    NOCRED, (int *)0,
 					    (struct thread *)0);
 					cache_purge(fdvp);
 				}
 			}
 		}
 		error = ext2_dirremove(fdvp, fcnp);
 		if (!error) {
 			xp->i_nlink--;
 			xp->i_flag |= IN_CHANGE;
 		}
 		xp->i_flag &= ~IN_RENAME;
 	}
 	if (dp)
 		vput(fdvp);
 	if (xp)
 		vput(fvp);
 	vrele(ap->a_fvp);
 	return (error);
 
 bad:
 	if (xp)
 		vput(ITOV(xp));
 	vput(ITOV(dp));
 out:
 	if (doingdirectory)
 		ip->i_flag &= ~IN_RENAME;
-	if (vn_lock(fvp, LK_EXCLUSIVE, td) == 0) {
+	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 		ip->i_flag &= ~IN_RENAME;
 		vput(fvp);
 	} else
 		vrele(fvp);
 	return (error);
 }
 
 /*
  * Mkdir system call
  */
 static int
 ext2_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct dirtemplate dirtemplate, *dtp;
 	int error, dmode;
 
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	if ((nlink_t)dp->i_nlink >= LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ext2_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	ip->i_gid = dp->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit. 
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ( (dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		   (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 2;
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 	error = ext2_update(tvp, 1);
 
 	/*
 	 * Bump link count in parent directory
 	 * to reflect work done below.  Should
 	 * be done before reference is created
 	 * so reparation is possible if we crash.
 	 */
 	dp->i_nlink++;
 	dp->i_flag |= IN_CHANGE;
 	error = ext2_update(dvp, 1);
 	if (error)
 		goto bad;
 
 	/* Initialize directory with "." and ".." from static template. */
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es,
 	    EXT2_FEATURE_INCOMPAT_FILETYPE))
 		dtp = &mastertemplate;
 	else
 		dtp = &omastertemplate;
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	/* note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE 
 	 * so let's just redefine it - for this function only
 	 */
 #undef  DIRBLKSIZ 
 #define DIRBLKSIZ  VTOI(dvp)->i_e2fs->s_blocksize
 	dirtemplate.dotdot_reclen = DIRBLKSIZ - 12;
 	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate,
 	    sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE,
 	    IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED,
 	    (int *)0, (struct thread *)0);
 	if (error) {
 		dp->i_nlink--;
 		dp->i_flag |= IN_CHANGE;
 		goto bad;
 	}
 	if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
 		/* XXX should grow with balloc() */
 		panic("ext2_mkdir: blksize");
 	else {
 		ip->i_size = DIRBLKSIZ;
 		ip->i_flag |= IN_CHANGE;
 	}
 
 	/* Directory set up, now install its entry in the parent directory. */
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error) {
 		dp->i_nlink--;
 		dp->i_flag |= IN_CHANGE;
 	}
 bad:
 	/*
 	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
 	 * for us because we set the link count to 0.
 	 */
 	if (error) {
 		ip->i_nlink = 0;
 		ip->i_flag |= IN_CHANGE;
 		vput(tvp);
 	} else
 		*ap->a_vpp = tvp;
 out:
 	return (error);
 #undef  DIRBLKSIZ
 #define DIRBLKSIZ  DEV_BSIZE
 }
 
 /*
  * Rmdir system call.
  */
 static int
 ext2_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	error = 0;
 	if (ip->i_nlink != 2 || !ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	error = ext2_dirremove(dvp, cnp);
 	if (error)
 		goto out;
 	dp->i_nlink--;
 	dp->i_flag |= IN_CHANGE;
 	cache_purge(dvp);
 	VOP_UNLOCK(dvp, 0, td);
 	/*
 	 * Truncate inode.  The only stuff left
 	 * in the directory is "." and "..".  The
 	 * "." reference is inconsequential since
 	 * we're quashing it.  The ".." reference
 	 * has already been adjusted above.  We've
 	 * removed the "." reference and the reference
 	 * in the parent directory, but there may be
 	 * other hard links so decrement by 2 and
 	 * worry about them later.
 	 */
 	ip->i_nlink -= 2;
 	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, td);
 	cache_purge(ITOV(ip));
-	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  */
 static int
 ext2_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct vnode *vp, **vpp = ap->a_vpp;
 	struct inode *ip;
 	int len, error;
 
 	error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
 		ip->i_size = len;
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
 		    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0);
 	if (error)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link
  */
 static int
 ext2_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	int isize;
 
 	isize = ip->i_size;
 	if (isize < vp->v_mount->mnt_maxsymlinklen) {
 		uiomove((char *)ip->i_shortlink, isize, ap->a_uio);
 		return (0);
 	}
 	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ext2_bmaparray() operation may not
  * deadlock on memory.  See ext2_bmap() for details.
  */
 static int
 ext2_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip;
 	struct bufobj *bo;
 	int32_t blkno;
 	int error;
 
 	ip = VTOI(vp);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("ext2_strategy: spec");
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (error);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = VFSTOEXT2(vp->v_mount)->um_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  */
 static int
 ext2_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	vn_printf(ip->i_devvp, "\tino %lu", (u_long)ip->i_number);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  */
 static int
 ext2fifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ext2_itimes(vp);
 	VI_UNLOCK(vp);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Fall through to ext2 kqfilter routines if needed 
  */
 static int
 ext2fifo_kqfilter(ap)
 	struct vop_kqfilter_args *ap;
 {
 	int error;
 
 	error = fifo_specops.vop_kqfilter(ap);
 	if (error)
 		error = ext2_kqfilter(ap);
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to ext2 filesystems.
  */
 static int
 ext2_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = LINK_MAX;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_PATH_MAX:
 		*ap->a_retval = PATH_MAX;
 		return (0);
 	case _PC_PIPE_BUF:
 		*ap->a_retval = PIPE_BUF;
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Advisory record locking support
  */
 static int
 ext2_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 	struct inode *ip = VTOI(ap->a_vp);
 
 	return (lf_advlock(ap, &(ip->i_lockf), ip->i_size));
 }
 
 /*
  * Vnode pointer to File handle
  */
 /* ARGSUSED */
 static int
 ext2_vptofh(ap)
 	struct vop_vptofh_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fhp;
 	} */ *ap;
 {
 	struct inode *ip;
 	struct ufid *ufhp;
 
 	ip = VTOI(ap->a_vp);
 	ufhp = (struct ufid *)ap->a_fhp;
 	ufhp->ufid_len = sizeof(struct ufid);
 	ufhp->ufid_ino = ip->i_number;
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
 
 /*
  * Initialize the vnode associated with a new inode, handle aliased
  * vnodes.
  */
 int
 ext2_vinit(mntp, fifoops, vpp)
 	struct mount *mntp;
 	struct vop_vector *fifoops;
 	struct vnode **vpp;
 {
 	struct inode *ip;
 	struct vnode *vp;
 
 	vp = *vpp;
 	ip = VTOI(vp);
 	vp->v_type = IFTOVT(ip->i_mode);
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 
 	if (ip->i_number == ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 	ip->i_modrev = init_va_filerev();
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Allocate a new inode.
  */
 static int
 ext2_makeinode(mode, dvp, vpp, cnp)
 	int mode;
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 {
 	struct inode *ip, *pdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_makeinode: no name");
 #endif
 	*vpp = NULL;
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp);
 	if (error) {
 		return (error);
 	}
 	ip = VTOI(tvp);
 	ip->i_gid = pdir->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are
 		 * not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ( (dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		     (pdir->i_mode & ISUID) &&
 		     (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			mode &= ~07111;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 1;
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) {
 		if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID, 0))
 			ip->i_mode &= ~ISGID;
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = ext2_update(tvp, 1);
 	if (error)
 		goto bad;
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error)
 		goto bad;
 
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 */
 	ip->i_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
 	vput(tvp);
 	return (error);
 }
 
 static struct filterops ext2read_filtops = 
 	{ 1, NULL, filt_ext2detach, filt_ext2read };
 static struct filterops ext2write_filtops = 
 	{ 1, NULL, filt_ext2detach, filt_ext2write };
 static struct filterops ext2vnode_filtops = 
 	{ 1, NULL, filt_ext2detach, filt_ext2vnode };
 
 static int
 ext2_kqfilter(ap)
 	struct vop_kqfilter_args /* {
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &ext2read_filtops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &ext2write_filtops;
 		break;
 	case EVFILT_VNODE:
 		kn->kn_fop = &ext2vnode_filtops;
 		break;
 	default:
 		return (1);
 	}
 
 	kn->kn_hook = (caddr_t)vp;
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	if (vp->v_pollinfo == NULL)
 		return ENOMEM;
 	knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 
 	return (0);
 }
 
 static void
 filt_ext2detach(struct knote *kn)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 
 	KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 }
 
 /*ARGSUSED*/
 static int
 filt_ext2read(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	struct inode *ip = VTOI(vp);
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule 
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		return (1);
 	}
 
         kn->kn_data = ip->i_size - kn->kn_fp->f_offset;
         return (kn->kn_data != 0);
 }
 
 /*ARGSUSED*/
 static int
 filt_ext2write(struct knote *kn, long hint)
 {
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule 
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE)
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
         kn->kn_data = 0;
         return (1);
 }
 
 static int
 filt_ext2vnode(struct knote *kn, long hint)
 {
 
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 	return (kn->kn_fflags != 0);
 }
Index: head/sys/gnu/fs/ext2fs/fs.h
===================================================================
--- head/sys/gnu/fs/ext2fs/fs.h	(revision 175201)
+++ head/sys/gnu/fs/ext2fs/fs.h	(revision 175202)
@@ -1,170 +1,170 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fs.h	8.7 (Berkeley) 4/19/94
  * $FreeBSD$
  */
 
 /*
  * Each disk drive contains some number of file systems.
  * A file system consists of a number of cylinder groups.
  * Each cylinder group has inodes and data.
  *
  * A file system is described by its super-block, which in turn
  * describes the cylinder groups.  The super-block is critical
  * data and is replicated in each cylinder group to protect against
  * catastrophic loss.  This is done at `newfs' time and the critical
  * super-block data does not change, so the copies need not be
  * referenced further unless disaster strikes.
  *
  * The first boot and super blocks are given in absolute disk addresses.
  * The byte-offset forms are preferred, as they don't imply a sector size.
  */
 #define SBSIZE		1024
 #define SBLOCK		2
 
 /*
  * The path name on which the file system is mounted is maintained
  * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in 
  * the super block for this name.
  */
 #define MAXMNTLEN 512
 
 /*
  * Macros for access to superblock array structures
  */
 
 /*
  * Convert cylinder group to base address of its global summary info.
  */
 #define fs_cs(fs, cgindx)      (((struct ext2_group_desc *) \
         (fs->s_group_desc[cgindx / EXT2_DESC_PER_BLOCK(fs)]->b_data)) \
 		[cgindx % EXT2_DESC_PER_BLOCK(fs)])
 
 /*
  * Turn file system block numbers into disk block addresses.
  * This maps file system blocks to device size blocks.
  */
 #define fsbtodb(fs, b)	((b) << ((fs)->s_fsbtodb))
 #define	dbtofsb(fs, b)	((b) >> ((fs)->s_fsbtodb))
 
 /* get group containing inode */
 #define ino_to_cg(fs, x)	(((x) - 1) / EXT2_INODES_PER_GROUP(fs))
 
 /* get block containing inode from its number x */
 #define	ino_to_fsba(fs, x)	fs_cs(fs, ino_to_cg(fs, x)).bg_inode_table + \
 	(((x)-1) % EXT2_INODES_PER_GROUP(fs))/EXT2_INODES_PER_BLOCK(fs)
 
 /* get offset for inode in block */
 #define	ino_to_fsbo(fs, x)	((x-1) % EXT2_INODES_PER_BLOCK(fs))
 
 /*
  * Give cylinder group number for a file system block.
  * Give cylinder group block number for a file system block.
  */
 #define	dtog(fs, d)	(((d) - fs->s_es->s_first_data_block) / \
 			EXT2_BLOCKS_PER_GROUP(fs))
 #define	dtogd(fs, d)	(((d) - fs->s_es->s_first_data_block) % \
 			EXT2_BLOCKS_PER_GROUP(fs))
 
 /*
  * The following macros optimize certain frequently calculated
  * quantities by using shifts and masks in place of divisions
  * modulos and multiplications.
  */
 #define blkoff(fs, loc)		/* calculates (loc % fs->fs_bsize) */ \
 	((loc) & (fs)->s_qbmask)
 
 #define lblktosize(fs, blk)	/* calculates (blk * fs->fs_bsize) */ \
 	((blk) << (fs->s_bshift))
 
 #define lblkno(fs, loc)		/* calculates (loc / fs->fs_bsize) */ \
 	((loc) >> (fs->s_bshift))
 
 /* no fragments -> logical block number equal # of frags */
 #define numfrags(fs, loc)	/* calculates (loc / fs->fs_fsize) */ \
 	((loc) >> (fs->s_bshift))
 
 #define fragroundup(fs, size)	/* calculates roundup(size, fs->fs_fsize) */ \
 	roundup(size, fs->s_frag_size)
 	/* was (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) */
 
 /*
  * Determining the size of a file block in the file system.
  * easy w/o fragments
  */
 #define blksize(fs, ip, lbn) ((fs)->s_frag_size)
 
 /*
  * INOPB is the number of inodes in a secondary storage block.
  */
 #define	INOPB(fs)	EXT2_INODES_PER_BLOCK(fs)
 
 /*
  * NINDIR is the number of indirects in a file system block.
  */
 #define	NINDIR(fs)	(EXT2_ADDR_PER_BLOCK(fs))
 
 extern int inside[], around[];
 extern u_char *fragtbl[];
 
 /* a few remarks about superblock locking/unlocking
  * Linux provides special routines for doing so
  * I haven't figured out yet what BSD does
  * I think I'll try a VOP_LOCK/VOP_UNLOCK on the device vnode
  */
 #define  DEVVP(inode)		(VFSTOEXT2(ITOV(inode)->v_mount)->um_devvp)
-#define  lock_super(devvp)   	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, curthread)
+#define  lock_super(devvp)   	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY)
 #define  unlock_super(devvp) 	VOP_UNLOCK(devvp, 0, curthread)
 
 /*
  * Historically, ext2fs kept it's metadata buffers on the LOCKED queue.  Now,
  * we change the lock owner to kern so that we may use it from contexts other
  * than the one that originally locked it.  When we are finished with the
  * buffer, we release it, writing it first if it was dirty.
  */
 #define LCK_BUF(bp) { \
 	(bp)->b_flags |= B_PERSISTENT; \
 	BUF_KERNPROC(bp); \
 }
 
 #define ULCK_BUF(bp) { \
 	long flags; \
 	flags = (bp)->b_flags; \
 	(bp)->b_flags &= ~(B_DIRTY | B_PERSISTENT); \
 	if (flags & B_DIRTY) \
 		bwrite(bp); \
 	else \
 		brelse(bp); \
 }
Index: head/sys/gnu/fs/reiserfs/reiserfs_namei.c
===================================================================
--- head/sys/gnu/fs/reiserfs/reiserfs_namei.c	(revision 175201)
+++ head/sys/gnu/fs/reiserfs/reiserfs_namei.c	(revision 175202)
@@ -1,701 +1,701 @@
 /*-
  * Copyright 2000 Hans Reiser
  * See README for licensing and copyright details
  * 
  * Ported to FreeBSD by Jean-Sébastien Pédron <jspedron@club-internet.fr>
  * 
  * $FreeBSD$
  */
 
 #include <gnu/fs/reiserfs/reiserfs_fs.h>
 
 static int	reiserfs_find_entry(struct reiserfs_node *dp,
     const char *name, int namelen,
     struct path * path_to_entry, struct reiserfs_dir_entry *de);
 
 MALLOC_DEFINE(M_REISERFSCOOKIES, "reiserfs_cookies",
     "ReiserFS VOP_READDIR cookies");
 
 /* -------------------------------------------------------------------
  * Lookup functions
  * -------------------------------------------------------------------*/
 
 /*
  * Look up the name described by ap->a_cnp in the directory ap->a_dvp.
  *
  * The entry is searched with reiserfs_find_entry().  When it is found,
  * the matching vnode is obtained (reiserfs_iget(), or an extra reference
  * on the directory itself for ".") and stored in *ap->a_vpp; the name is
  * entered into the name cache when MAKEENTRY is set.
  *
  * Returns 0 on success, ENAMETOOLONG when the name is longer than
  * REISERFS_MAX_NAME() allows, EIO when the search reports IO_ERROR,
  * ENOENT when the name is not found, or the error of reiserfs_iget().
  */
 int
 reiserfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	int error, retval;
 	struct vnode *vdp         = ap->a_dvp;
 	struct vnode **vpp        = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 
 	int flags         = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 	struct cpu_key *saved_ino;
 
 	struct vnode *vp;
 	struct vnode *pdp;  /* Saved dp during symlink work */
 	struct reiserfs_node *dp;
 	struct reiserfs_dir_entry de;
 	INITIALIZE_PATH(path_to_entry);
 
 	/*
 	 * Temporarily NUL-terminate the component inside the path buffer so
 	 * it can be logged with %s, then restore the saved character.
 	 */
 	char c = cnp->cn_nameptr[cnp->cn_namelen];
 	cnp->cn_nameptr[cnp->cn_namelen] = '\0';
 	reiserfs_log(LOG_DEBUG, "looking for `%s', %ld (%s)\n",
 	    cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_pnbuf);
 	cnp->cn_nameptr[cnp->cn_namelen] = c;
 
 	vp = NULL;
 	dp = VTOI(vdp);
 
 	if (REISERFS_MAX_NAME(dp->i_reiserfs->s_blocksize) < cnp->cn_namelen)
 		return (ENAMETOOLONG);
 
 	reiserfs_log(LOG_DEBUG, "searching entry\n");
 	de.de_gen_number_bit_string = 0;
 	retval = reiserfs_find_entry(dp, cnp->cn_nameptr, cnp->cn_namelen,
 	    &path_to_entry, &de);
 	/* Only the contents of `de' are needed from here on. */
 	pathrelse(&path_to_entry);
 
 	if (retval == NAME_FOUND) {
 		reiserfs_log(LOG_DEBUG, "found\n");
 	} else {
 		reiserfs_log(LOG_DEBUG, "not found\n");
 	}
 
 	if (retval == NAME_FOUND) {
 #if 0
 		/* Hide the .reiserfs_priv directory */
 		if (reiserfs_xattrs(dp->i_reiserfs) &&
 		    !old_format_only(dp->i_reiserfs) &&
 		    REISERFS_SB(dp->i_reiserfs)->priv_root &&
 		    REISERFS_SB(dp->i_reiserfs)->priv_root->d_inode &&
 		    de.de_objectid == le32toh(INODE_PKEY(REISERFS_SB(
 		    dp->i_reiserfs)->priv_root->d_inode)->k_objectid)) {
 			return (EACCES);
 		}
 #endif
 
 		reiserfs_log(LOG_DEBUG, "reading vnode\n");
 		pdp = vdp;
 		if (flags & ISDOTDOT) {
 			/*
 			 * "..": the directory is unlocked while the parent
 			 * vnode is fetched and relocked afterwards, whether
 			 * or not reiserfs_iget() succeeded.
 			 */
 			saved_ino = (struct cpu_key *)&(de.de_dir_id);
 			VOP_UNLOCK(pdp, 0, td);
 			error = reiserfs_iget(vdp->v_mount,
 			    saved_ino, &vp, td);
-			vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
 			if (error != 0)
 				return (error);
 			*vpp = vp;
 		} else if (de.de_objectid == dp->i_number &&
 		    de.de_dir_id == dp->i_ino) {
 			VREF(vdp); /* We want ourself, ie "." */
 			*vpp = vdp;
 		} else {
 			/*
 			 * Regular entry: the key handed to reiserfs_iget()
 			 * is the memory starting at de.de_dir_id.
 			 */
 			if ((error = reiserfs_iget(vdp->v_mount,
 			    (struct cpu_key *)&(de.de_dir_id), &vp, td)) != 0)
 				return (error);
 			*vpp = vp;
 		}
 
 		/*
 		 * Propagate the priv_object flag so we know we're in the
 		 * priv tree
 		 */
 		/*if (is_reiserfs_priv_object(dir))
 			REISERFS_I(inode)->i_flags |= i_priv_object;*/
 	} else {
 		if (retval == IO_ERROR) {
 			reiserfs_log(LOG_DEBUG, "IO error\n");
 			return (EIO);
 		}
 
 		return (ENOENT);
 	}
 
 	/* Insert name into cache if appropriate. */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 
 	reiserfs_log(LOG_DEBUG, "done\n");
 	return (0);
 }
 
 extern struct key MIN_KEY;
 
 /*
  * Read directory entries of ap->a_vp into ap->a_uio.
  *
  * The search starts at uio->uio_offset (DOT_OFFSET when that offset is 0).
  * Each visible entry is converted to a struct dirent and copied out with
  * uiomove() as long as it fits in uio->uio_resid.  When both a_ncookies
  * and a_cookies are supplied, the offset of every copied entry is also
  * recorded in a cookie array allocated here from M_REISERFSCOOKIES and
  * handed to the caller on success.
  *
  * *ap->a_eofflag, when supplied, is set to 1 once the end of the
  * directory is detected; this function never stores 0 into it.
  *
  * Returns 0 or an errno value (EIO when the tree search reports IO_ERROR,
  * or the error of uiomove()).
  */
 int
 reiserfs_readdir(struct vop_readdir_args  /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */*ap)
 {
 	int error = 0;
 	struct dirent dstdp;
 	struct uio *uio = ap->a_uio;
 
 	off_t next_pos;
 	struct buf *bp;
 	struct item_head *ih;
 	struct cpu_key pos_key;
 	const struct key *rkey;
 	struct reiserfs_node *ip;
 	struct reiserfs_dir_entry de;
 	INITIALIZE_PATH(path_to_entry);
 	int entry_num, item_num, search_res;
 
 	/* The NFS part */
 	int ncookies = 0;
 	u_long *cookies = NULL;
 
 	/*
 	 * Form key for search the next directory entry using f_pos field of
 	 * file structure
 	 */
 	ip = VTOI(ap->a_vp);
 	make_cpu_key(&pos_key,
 	    ip, uio->uio_offset ? uio->uio_offset : DOT_OFFSET,
 	    TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
 	reiserfs_log(LOG_DEBUG, "listing entries for "
 	    "(objectid=%d, dirid=%d)\n",
 	    pos_key.on_disk_key.k_objectid, pos_key.on_disk_key.k_dir_id);
 	reiserfs_log(LOG_DEBUG, "uio_offset = %jd, uio_resid = %d\n",
 	    (intmax_t)uio->uio_offset, uio->uio_resid);
 
 	/*
 	 * NOTE(review): the cookie array is sized for uio_resid / 16 entries,
 	 * i.e. it assumes every copied dirent consumes at least 16 bytes of
 	 * uio_resid -- confirm against the minimum GENERIC_DIRSIZ().
 	 */
 	if (ap->a_ncookies && ap->a_cookies) {
 		cookies = (u_long *)malloc(
 		    uio->uio_resid / 16 * sizeof(u_long),
 		    M_REISERFSCOOKIES, M_WAITOK);
 	}
 
 	while (1) {
 		//research:
 		/*
 		 * Search the directory item, containing entry with
 		 * specified key
 		 */
 		reiserfs_log(LOG_DEBUG, "search directory to read\n");
 		search_res = search_by_entry_key(ip->i_reiserfs, &pos_key,
 		    &path_to_entry, &de);
 		if (search_res == IO_ERROR) {
 			/* Skips the `end' label: uio_offset is left as is. */
 			error = EIO;
 			goto out;
 		}
 
 		entry_num = de.de_entry_num;
 		item_num  = de.de_item_num;
 		bp = de.de_bp;
 		ih = de.de_ih;
 
 		if (search_res == POSITION_FOUND ||
 		    entry_num < I_ENTRY_COUNT(ih)) {
 			/*
 			 * Go through all entries in the directory item
 			 * beginning from the entry, that has been found.
 			 */
 			struct reiserfs_de_head *deh = B_I_DEH(bp, ih) +
 			    entry_num;
 
 			if (ap->a_ncookies == NULL) {
 				cookies = NULL;
 			} else {
 				//ncookies = 
 			}
 
 			reiserfs_log(LOG_DEBUG,
 			    "walking through directory entries\n");
 			for (; entry_num < I_ENTRY_COUNT(ih);
 			    entry_num++, deh++) {
 				int d_namlen;
 				char *d_name;
 				off_t d_off;
 				ino_t d_ino;
 
 				if (!de_visible(deh)) {
 					/* It is hidden entry */
 					continue;
 				}
 
 				/*
 				 * If the last byte of the stored name is NUL,
 				 * use the string length as the name length.
 				 * NOTE(review): d_name[d_namlen - 1] assumes
 				 * d_namlen > 0 -- confirm entry_length()
 				 * cannot return 0 here.
 				 */
 				d_namlen = entry_length(bp, ih, entry_num);
 				d_name   = B_I_DEH_ENTRY_FILE_NAME(bp, ih, deh);
 				if (!d_name[d_namlen - 1])
 					d_namlen = strlen(d_name);
 				reiserfs_log(LOG_DEBUG, "  - `%s' (len=%d)\n",
 				    d_name, d_namlen);
 
 				if (d_namlen > REISERFS_MAX_NAME(
 				    ip->i_reiserfs->s_blocksize)) {
 					/* Too big to send back to VFS */
 					continue;
 				}
 
 #if 0
 				/* Ignore the .reiserfs_priv entry */
 				if (reiserfs_xattrs(ip->i_reiserfs) &&
 				    !old_format_only(ip->i_reiserfs) &&
 				    filp->f_dentry == ip->i_reiserfs->s_root &&
 				    REISERFS_SB(ip->i_reiserfs)->priv_root &&
 				    REISERFS_SB(ip->i_reiserfs)->priv_root->d_inode &&
 				    deh_objectid(deh) ==
 				    le32toh(INODE_PKEY(REISERFS_SB(
 				    ip->i_reiserfs)->priv_root->d_inode)->k_objectid)) {
 					continue;
 				}
 #endif
 
 				d_off = deh_offset(deh);
 				d_ino = deh_objectid(deh);
 				uio->uio_offset = d_off;
 
 				/* Copy to user land */
 				dstdp.d_fileno = d_ino;
 				dstdp.d_type   = DT_UNKNOWN;
 				dstdp.d_namlen = d_namlen;
 				dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 				bcopy(d_name, dstdp.d_name, dstdp.d_namlen);
 				/* Zero the rest of the record after the name. */
 				bzero(dstdp.d_name + dstdp.d_namlen,
 				    dstdp.d_reclen -
 				    offsetof(struct dirent, d_name) -
 				    dstdp.d_namlen);
 
 				if (d_namlen > 0) {
 					if (dstdp.d_reclen <= uio->uio_resid) {
 						reiserfs_log(LOG_DEBUG, "     copying to user land\n");
 						error = uiomove(&dstdp,
 						    dstdp.d_reclen, uio);
 						if (error)
 							goto end;
 						if (cookies != NULL) {
 							cookies[ncookies] =
 							    d_off;
 							ncookies++;
 						}
 					} else
 						/*
 						 * No room left for this entry.
 						 * NOTE(review): this only
 						 * leaves the for loop; the
 						 * end-of-directory checks
 						 * below still run -- confirm
 						 * that is intended.
 						 */
 						break;
 				} else {
 					/*
 					 * NOTE(review): as above, only the
 					 * for loop is left; the outer loop
 					 * goes on with error == EIO.
 					 */
 					error = EIO;
 					break;
 				}
 
 				next_pos = deh_offset(deh) + 1;
 			}
 			reiserfs_log(LOG_DEBUG, "...done\n");
 		}
 
 		reiserfs_log(LOG_DEBUG, "checking item num (%d == %d ?)\n",
 		    item_num, B_NR_ITEMS(bp) - 1);
 		if (item_num != B_NR_ITEMS(bp) - 1) {
 			/* End of directory has been reached */
 			reiserfs_log(LOG_DEBUG, "end reached\n");
 			if (ap->a_eofflag)
 				*ap->a_eofflag = 1;
 			goto end;
 		}
 
 		/*
 		 * Item we went through is last item of node. Using right
 		 * delimiting key check is it directory end
 		 */
 		reiserfs_log(LOG_DEBUG, "get right key\n");
 		rkey = get_rkey(&path_to_entry, ip->i_reiserfs);
 		reiserfs_log(LOG_DEBUG, "right key = (objectid=%d, dirid=%d)\n",
 		    rkey->k_objectid, rkey->k_dir_id);
 
 		reiserfs_log(LOG_DEBUG, "compare it to MIN_KEY\n");
 		reiserfs_log(LOG_DEBUG, "MIN KEY = (objectid=%d, dirid=%d)\n",
 		    MIN_KEY.k_objectid, MIN_KEY.k_dir_id);
 		if (comp_le_keys(rkey, &MIN_KEY) == 0) {
 			/* Set pos_key to key, that is the smallest and greater
 			 * that key of the last entry in the item */
 			reiserfs_log(LOG_DEBUG, "continuing on the right\n");
 			set_cpu_key_k_offset(&pos_key, next_pos);
 			continue;
 		}
 
 		reiserfs_log(LOG_DEBUG, "compare it to pos_key\n");
 		reiserfs_log(LOG_DEBUG, "pos key = (objectid=%d, dirid=%d)\n",
 		    pos_key.on_disk_key.k_objectid,
 		    pos_key.on_disk_key.k_dir_id);
 		if (COMP_SHORT_KEYS(rkey, &pos_key)) {
 			/* End of directory has been reached */
 			reiserfs_log(LOG_DEBUG, "end reached (right)\n");
 			if (ap->a_eofflag)
 				*ap->a_eofflag = 1;
 			goto end;
 		}
 
 		/* Directory continues in the right neighboring block */
 		reiserfs_log(LOG_DEBUG, "continuing with a new offset\n");
 		set_cpu_key_k_offset(&pos_key,
 		    le_key_k_offset(KEY_FORMAT_3_5, rkey));
 		reiserfs_log(LOG_DEBUG,
 		    "new pos key = (objectid=%d, dirid=%d)\n",
 		    pos_key.on_disk_key.k_objectid,
 		    pos_key.on_disk_key.k_dir_id);
 	}
 
 end:
 	/* Resume position for the next call, then drop the search path. */
 	uio->uio_offset = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
 out:
 	/*
 	 * On error the cookie array is freed; otherwise it is handed to the
 	 * caller, which then owns it.
 	 */
 	if (error && cookies != NULL) {
 		free(cookies, M_REISERFSCOOKIES);
 	} else if (ap->a_ncookies != NULL && ap->a_cookies != NULL) {
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies  = cookies;
 	}
 	return (error);
 }
 
 /* -------------------------------------------------------------------
  * Functions from linux/fs/reiserfs/namei.c
  * -------------------------------------------------------------------*/
 
 
 /*
  * A directory item holds an array of entry headers.  Binary-search that
  * array for the entry whose offset equals `off'.
  *
  * On a hit de->de_entry_num is the index of the matching header and
  * NAME_FOUND is returned (only the third key component matched; the name
  * itself has not been compared).  On a miss de->de_entry_num is the index
  * at which the search ended and NAME_NOT_FOUND is returned.
  */
 static int
 bin_search_in_dir_item(struct reiserfs_dir_entry *de, off_t off)
 {
 	struct reiserfs_de_head *heads = de->de_deh;
 	int low = 0;
 	int high = I_ENTRY_COUNT(de->de_ih) - 1;
 
 	while (low <= high) {
 		int mid = (high + low) / 2;
 
 		if (off < deh_offset(heads + mid)) {
 			high = mid - 1;
 		} else if (off > deh_offset(heads + mid)) {
 			low = mid + 1;
 		} else {
 			/* Third key component matched. */
 			de->de_entry_num = mid;
 			return (NAME_FOUND);
 		}
 	}
 
 	de->de_entry_num = low;
 	return (NAME_NOT_FOUND);
 }
 
 /*
  * Point `de' at the item the search path ends on: record the last buffer
  * of the path, its item header, the start of the entry-header array of
  * that item, and the item position within the buffer.
  */
 static inline void
 set_de_item_location(struct reiserfs_dir_entry *de, struct path *path)
 {
 	struct buf *leaf = get_last_bp(path);
 	struct item_head *item = get_ih(path);
 
 	de->de_bp       = leaf;
 	de->de_ih       = item;
 	de->de_deh      = B_I_DEH(leaf, item);
 	de->de_item_num = PATH_LAST_POSITION(path);
 }
 
 /*
  * Fill in de_entrylen, de_namelen and de_name for the entry selected by
  * de->de_entry_num.  de_bp, de_ih, de_deh (pointing to the first element
  * of the header array) and de_item_num must already be set.
  *
  * If de_entry_num is out of range for the item, "BUG" is logged and `de'
  * is left untouched.  When the last byte of the stored name is NUL, the
  * name length is shortened to the length of the string.
  */
 void
 set_de_name_and_namelen(struct reiserfs_dir_entry *de)
 {
 	struct reiserfs_de_head *deh;
 
 	if (de->de_entry_num >= ih_entry_count(de->de_ih)) {
 		reiserfs_log(LOG_DEBUG, "BUG\n");
 		return;
 	}
 
 	/* Form the header pointer only once the index is known to be valid */
 	deh = de->de_deh + de->de_entry_num;
 
 	de->de_entrylen = entry_length(de->de_bp, de->de_ih, de->de_entry_num);
 	de->de_namelen  = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
 	de->de_name     = B_I_PITEM(de->de_bp, de->de_ih) + deh_location(deh);
 
 	/*
 	 * Guard against a zero or negative length so that the byte before
 	 * the name is never inspected.
 	 */
 	if (de->de_namelen > 0 && de->de_name[de->de_namelen - 1] == 0)
 		de->de_namelen = strlen(de->de_name);
 }
 
 /*
  * Record what the selected entry points to: copy the dir id and object id
  * stored in the entry header de_deh[de_entry_num] into `de'.  An
  * out-of-range de_entry_num is only logged as "BUG".
  */
 static inline void
 set_de_object_key(struct reiserfs_dir_entry *de)
 {
 	struct reiserfs_de_head *entry;
 
 	if (de->de_entry_num < ih_entry_count(de->de_ih)) {
 		entry = &(de->de_deh[de->de_entry_num]);
 		de->de_dir_id   = deh_dir_id(entry);
 		de->de_objectid = deh_objectid(entry);
 		return;
 	}
 
 	reiserfs_log(LOG_DEBUG, "BUG\n");
 }
 
 static inline void
 store_de_entry_key(struct reiserfs_dir_entry *de)
 {
 	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
 
 	if (de->de_entry_num >= ih_entry_count(de->de_ih)) {
 		reiserfs_log(LOG_DEBUG, "BUG\n"); 
 		return;
 	}
 
 	/* Store key of the found entry */
 	de->de_entry_key.version = KEY_FORMAT_3_5;
 	de->de_entry_key.on_disk_key.k_dir_id =
 	    le32toh(de->de_ih->ih_key.k_dir_id);
 	de->de_entry_key.on_disk_key.k_objectid =
 	    le32toh(de->de_ih->ih_key.k_objectid);
 	set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh));
 	set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY);
 }
 
 /*
  * Each directory item has a key equal to the key of its first entry, and
  * holds several entries.
  *
  * Locate the item for `key' with search_item(); when no item matches
  * exactly, step back to the preceding item.  Then binary-search the
  * entry headers of that item by the key offset.  `path' ends on the item
  * and path->pos_in_item / de->de_entry_num give the entry position.
  *
  * Returns the result of bin_search_in_dir_item(), or IO_ERROR.  Except
  * when search_item() itself returns IO_ERROR, the path is released here
  * before IO_ERROR is returned.
  */
 int
 search_by_entry_key(struct reiserfs_sb_info *sbi,
     const struct cpu_key *key, struct path *path,
     struct reiserfs_dir_entry *de)
 {
 	int status;
 
 	reiserfs_log(LOG_DEBUG, "searching in (objectid=%d,dirid=%d)\n",
 	    key->on_disk_key.k_objectid, key->on_disk_key.k_dir_id);
 
 	status = search_item(sbi, key, path);
 	if (status == IO_ERROR)
 		return (status);
 
 	if (status == ITEM_NOT_FOUND) {
 		if (!PATH_LAST_POSITION(path)) {
 			reiserfs_log(LOG_DEBUG,
 			    "search_by_key returned item position == 0");
 			pathrelse(path);
 			return (IO_ERROR);
 		}
 		/* Use the item just before the insertion point. */
 		PATH_LAST_POSITION(path)--;
 		reiserfs_log(LOG_DEBUG, "search_by_key did not found it\n");
 	} else if (status == ITEM_FOUND) {
 		reiserfs_log(LOG_DEBUG, "search_by_key found it\n");
 	} else {
 		pathrelse(path);
 		reiserfs_log(LOG_DEBUG, "no path to here");
 		return (IO_ERROR);
 	}
 
 	reiserfs_log(LOG_DEBUG, "set item location\n");
 	set_de_item_location(de, path);
 
 	/*
 	 * Binary search inside the item on the third key component; this
 	 * sets de->de_entry_num.
 	 */
 	reiserfs_log(LOG_DEBUG, "bin_search_in_dir_item\n");
 	status = bin_search_in_dir_item(de, cpu_key_k_offset(key));
 	path->pos_in_item = de->de_entry_num;
 	if (status == NAME_NOT_FOUND)
 		return (status);
 
 	/*
 	 * Ugly, but rename needs de_bp, de_deh, de_name, de_namelen and
 	 * de_objectid set.
 	 */
 	set_de_name_and_namelen(de);
 	set_de_object_key(de);
 	reiserfs_log(LOG_DEBUG, "set (objectid=%d,dirid=%d)\n",
 	    de->de_objectid, de->de_dir_id);
 
 	return (status);
 }
 
 /*
  * Compute the third key component (the directory offset) for a name.
  * The empty name and "." map to DOT_OFFSET, ".." maps to DOT_DOT_OFFSET;
  * any other name is hashed with the hash function of the file system.
  */
 static uint32_t
 get_third_component(struct reiserfs_sb_info *sbi, const char *name, int len)
 {
 	uint32_t hash;
 
 	switch (len) {
 	case 0:
 		return (DOT_OFFSET);
 	case 1:
 		if (name[0] == '.')
 			return (DOT_OFFSET);
 		break;
 	case 2:
 		if (name[0] == '.' && name[1] == '.')
 			return (DOT_DOT_OFFSET);
 		break;
 	default:
 		break;
 	}
 
 	hash = REISERFS_SB(sbi)->s_hash_function(name, len);
 
 	/* Take bits from 7-th to 30-th including both bounds */
 	hash = GET_HASH_VALUE(hash);
 
 	/*
 	 * Needed to have no names before "." and ".." those have hash
 	 * value == 0 and generation counters 1 and 2 accordingly
 	 */
 	if (hash == 0)
 		hash = 128;
 
 	return (hash + MAX_GENERATION_NUMBER);
 }
 
 /*
  * Compare the name currently described by `de' with `name'.
  * Returns NAME_NOT_FOUND when length or bytes differ; otherwise
  * NAME_FOUND or NAME_FOUND_INVISIBLE depending on de_visible() for the
  * selected entry header.
  */
 static int
 reiserfs_match(struct reiserfs_dir_entry *de, const char *name, int namelen)
 {
 
 	if (namelen != de->de_namelen)
 		return (NAME_NOT_FOUND);
 	if (memcmp(de->de_name, name, de->de_namelen) != 0)
 		return (NAME_NOT_FOUND);
 
 	if (de_visible(de->de_deh + de->de_entry_num))
 		return (NAME_FOUND);
 	return (NAME_FOUND_INVISIBLE);
 }
 
 /*
  * Used when hash collisions exist.  de_bp, de_ih, de_deh, de_item_num and
  * de_entry_num of `de' are already set.
  *
  * Walk the entry headers backwards from de_entry_num while their hash
  * value equals the hash value of `key', comparing each name with `name'.
  *
  * Returns NAME_FOUND or NAME_FOUND_INVISIBLE on a match, NAME_NOT_FOUND
  * when no entry with that hash carries the name, or GOTO_PREVIOUS_ITEM
  * when the start of the item was reached and the caller has to continue
  * in the item to the left.
  */
 static int
 linear_search_in_dir_item(struct cpu_key *key, struct reiserfs_dir_entry *de,
     const char *name, int namelen)
 {
 	struct reiserfs_de_head *base = de->de_deh;
 	struct reiserfs_de_head *cur;
 	int idx;
 	int match;
 
 	/*
 	 * Start one entry earlier when the position is past the last entry
 	 * or the entry there has a different hash value.
 	 */
 	idx = de->de_entry_num;
 	if (idx == I_ENTRY_COUNT(de->de_ih) ||
 	    GET_HASH_VALUE(deh_offset(base + idx)) !=
 	    GET_HASH_VALUE(cpu_key_k_offset(key))) {
 		idx--;
 	}
 
 	/*RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih),
 	  "vs-7010: array of entry headers not found");*/
 
 	cur = base + idx;
 	while (idx >= 0) {
 		if (GET_HASH_VALUE(deh_offset(cur)) !=
 		    GET_HASH_VALUE(cpu_key_k_offset(key))) {
 			/* Different hash: the whole name need not be checked */
 			reiserfs_log(LOG_DEBUG, "name `%s' not found\n", name);
 			return (NAME_NOT_FOUND);
 		}
 
 		/* Mark that this generation number is used */
 		if (de->de_gen_number_bit_string)
 			set_bit(GET_GENERATION_NUMBER(deh_offset(cur)),
 			    (unsigned long *)de->de_gen_number_bit_string);
 
 		/* Calculate pointer to name and namelen */
 		de->de_entry_num = idx;
 		set_de_name_and_namelen(de);
 
 		match = reiserfs_match(de, name, namelen);
 		if (match != NAME_NOT_FOUND) {
 			/*
 			 * Name and length are set; add the key of the
 			 * pointed object and the key of the entry itself.
 			 */
 			set_de_object_key(de);
 			store_de_entry_key(de);
 
 			/* match is NAME_FOUND or NAME_FOUND_INVISIBLE */
 			reiserfs_log(LOG_DEBUG,
 			    "reiserfs_match answered `%d'\n",
 			    match);
 			return (match);
 		}
 
 		idx--;
 		cur--;
 	}
 
 	/*RFALSE(de->de_item_num,
 	    "vs-7015: two diritems of the same directory in one node?");*/
 
 	/*
 	 * We have reached left most entry in the node. In common we have
 	 * to go to the left neighbor, but if generation counter is 0
 	 * already, we know for sure, that there is no name with the same
 	 * hash value.
 	 *
 	 * FIXME: this work correctly only because hash value can not be 0.
 	 * Btw, in case of Yura's hash it is probably possible, so, this is
 	 * a bug
 	 */
 	if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) != 0)
 		return (GOTO_PREVIOUS_ITEM);
 
 	return (NAME_NOT_FOUND);
 }
 
 /*
  * May return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
  * FIXME: should add something like IOERROR
  */
 static int
 reiserfs_find_entry(struct reiserfs_node *dp, const char *name, int namelen,
     struct path * path_to_entry, struct reiserfs_dir_entry *de)
 {
 	struct cpu_key key_to_search;
 	int retval;
 
 	if (namelen > REISERFS_MAX_NAME(dp->i_reiserfs->s_blocksize))
 		return NAME_NOT_FOUND;
 
 	/* We will search for this key in the tree */
 	make_cpu_key(&key_to_search, dp,
 	    get_third_component(dp->i_reiserfs, name, namelen),
 	    TYPE_DIRENTRY, 3);
 
 	while (1) {
 		reiserfs_log(LOG_DEBUG, "search by entry key\n");
 		retval = search_by_entry_key(dp->i_reiserfs, &key_to_search,
 		    path_to_entry, de);
 		if (retval == IO_ERROR) {
 			reiserfs_log(LOG_DEBUG, "IO error in %s\n",
 			    __FUNCTION__);
 			return IO_ERROR;
 		}
 
 		/* Compare names for all entries having given hash value */
 		reiserfs_log(LOG_DEBUG, "linear search for `%s'\n", name);
 		retval = linear_search_in_dir_item(&key_to_search, de,
 		    name, namelen);
 		if (retval != GOTO_PREVIOUS_ITEM) {
 			/*
 			 * There is no need to scan directory anymore.
 			 * Given entry found or does not exist
 			 */
 			reiserfs_log(LOG_DEBUG, "linear search returned "
 			    "(objectid=%d,dirid=%d)\n",
 			    de->de_objectid, de->de_dir_id);
 			path_to_entry->pos_in_item = de->de_entry_num;
 			return retval;
 		}
 
 		/*
 		 * There is left neighboring item of this directory and
 		 * given entry can be there
 		 */
 		set_cpu_key_k_offset(&key_to_search,
 		    le_ih_k_offset(de->de_ih) - 1);
 		pathrelse(path_to_entry);  
 	} /* while (1) */
 }
Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c
===================================================================
--- head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c	(revision 175201)
+++ head/sys/gnu/fs/xfs/FreeBSD/xfs_freebsd_iget.c	(revision 175202)
@@ -1,419 +1,419 @@
 /*
  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
  * Copyright (c) 2006 Russell Cattelan Digital Elves, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
  * published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it would be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Further, this software is distributed without any warranty that it is
  * free of the rightful claim of any third person regarding infringement
  * or the like.  Any license provided herein, whether implied or
  * otherwise, applies only to this software file.  Patent licenses, if
  * any, provided herein do not apply to combinations of this program with
  * other software, or any other product whatsoever.
  *
  * You should have received a copy of the GNU General Public License along
  * with this program; if not, write the Free Software Foundation, Inc., 59
  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  *
  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  * Mountain View, CA  94043, or:
  *
  * http://www.sgi.com
  *
  * For further information regarding this notice, see:
  *
  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  */
 
 #include "xfs.h"
 
 #include "xfs_types.h"
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_vnode.h"
 
 static int xfs_vn_allocate(xfs_mount_t *, xfs_inode_t *, struct xfs_vnode **);
 
 /*
  * Look up an inode by number in the given file system.
  * The inode is looked up in the hash table for the file system
  * represented by the mount point parameter mp.  Each bucket of
  * the hash table is guarded by an individual semaphore.
  *
  * If the inode is found in the hash table, its corresponding vnode
  * is obtained with a call to vn_get().  This call takes care of
  * coordination with the reclamation of the inode and vnode.  Note
  * that the vmap structure is filled in while holding the hash lock.
  * This gives us the state of the inode/vnode when we found it and
  * is used for coordination in vn_get().
  *
  * If it is not in core, read it in from the file system's device and
  * add the inode into the hash table.
  *
  * The inode is locked according to the value of the lock_flags parameter.
  * This flag parameter indicates how and if the inode's IO lock and inode lock
  * should be taken.
  *
  * mp -- the mount point structure for the current file system.  It points
  *       to the inode hash table.
  * tp -- a pointer to the current transaction if there is one.  This is
  *       simply passed through to the xfs_iread() call.
  * ino -- the number of the inode desired.  This is the unique identifier
  *        within the file system for the inode being requested.
  * flags -- not referenced in the body of this function.
  * lock_flags -- flags indicating how to lock the inode.  See the comment
  *		 for xfs_ilock() for a list of valid values.
  * ipp -- on success, receives the pointer to the inode.
  * bno -- the block number starting the buffer containing the inode,
  *	  if known (as by bulkstat), else 0.
  *
  * Returns 0 on success, or the error of xfs_iread() / xfs_vn_allocate().
  */
 int
 xfs_iget(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
 	uint		flags,
 	uint		lock_flags,
 	xfs_inode_t	**ipp,
 	xfs_daddr_t	bno)
 {
 	xfs_ihash_t	*ih;
 	xfs_inode_t	*ip;
 	xfs_inode_t	*iq;
 	xfs_vnode_t	*vp;
 	ulong		version;
 	int		error;
 	/* REFERENCED */
 	int		newnode;
 	xfs_chash_t	*ch;
 	xfs_chashlist_t	*chl, *chlnew;
 	vmap_t		vmap;
 	SPLDECL(s);
 
 	XFS_STATS_INC(xs_ig_attempts);
 
 	ih = XFS_IHASH(mp, ino);
 
 again:
 	read_lock(&ih->ih_lock);
 
 	for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
 		if (ip->i_ino == ino) {
 			vp = XFS_ITOV(ip);
 			VMAP(vp, vmap);
 			/*
 			 * Inode cache hit: if ip is not at the front of
 			 * its hash chain, move it there now.
 			 * Do this with the lock held for update, but
 			 * do statistics after releasing the lock.
 			 */
 			if (ip->i_prevp != &ih->ih_next
 			    && rwlock_trypromote(&ih->ih_lock)) {
 
 				/* Unlink ip, then relink it at the head. */
 				if ((iq = ip->i_next)) {
 					iq->i_prevp = ip->i_prevp;
 				}
 				*ip->i_prevp = iq;
 				iq = ih->ih_next;
 				iq->i_prevp = &ip->i_next;
 				ip->i_next = iq;
 				ip->i_prevp = &ih->ih_next;
 				ih->ih_next = ip;
 				write_unlock(&ih->ih_lock);
 			} else {
 				read_unlock(&ih->ih_lock);
 			}
 
 			XFS_STATS_INC(xs_ig_found);
 
 			/*
 			 * Get a reference to the vnode/inode.
 			 * vn_get() takes care of coordination with
 			 * the file system inode release and reclaim
 			 * functions.  If it returns NULL, the inode
 			 * has been reclaimed so just start the search
 			 * over again.  We probably won't find it,
 			 * but we could be racing with another cpu
 			 * looking for the same inode so we have to at
 			 * least look.
 			 */
 			if (!(vp = vn_get(vp, &vmap))) {
 				XFS_STATS_INC(xs_ig_frecycle);
 				goto again;
 			}
 
 			if (lock_flags != 0) {
 				ip->i_flags &= ~XFS_IRECLAIM;
 				xfs_ilock(ip, lock_flags);
 			}
 
 			/* A zero mode marks the cached inode as a new node. */
 			newnode = (ip->i_d.di_mode == 0);
 			if (newnode) {
 				xfs_iocore_inode_reinit(ip);
 			}
 			ip->i_flags &= ~XFS_ISTALE;
 
 			vn_trace_exit(vp, "xfs_iget.found",
 						(inst_t *)__return_address);
 			goto return_ip;
 		}
 	}
 
 	/*
 	 * Inode cache miss: save the hash chain version stamp and unlock
 	 * the chain, so we don't deadlock in vn_alloc.
 	 */
 	XFS_STATS_INC(xs_ig_missed);
 
 	version = ih->ih_version;
 
 	read_unlock(&ih->ih_lock);
 
 	/*
 	 * Read the disk inode attributes into a new inode structure and get
 	 * a new vnode for it. This should also initialize i_ino and i_mount.
 	 */
 	error = xfs_iread(mp, tp, ino, &ip, bno);
 	if (error) {
 		return error;
 	}
 
 	/*
 	 * NOTE(review): when xfs_vn_allocate() fails, the inode obtained
 	 * from xfs_iread() above is not released on this path -- confirm
 	 * whether it leaks.
 	 */
 	error = xfs_vn_allocate(mp, ip, &vp);
 	if (error) {
 		return error;
 	}
 	vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
 
 	xfs_inode_lock_init(ip, vp);
 	xfs_iocore_inode_init(ip);
 
 	if (lock_flags != 0) {
 		xfs_ilock(ip, lock_flags);
 	}
 
 	/*
 	 * Put ip on its hash chain, unless someone else hashed a duplicate
 	 * after we released the hash lock.
 	 */
 	write_lock(&ih->ih_lock);
 
 	if (ih->ih_version != version) {
 		for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
 			if (iq->i_ino == ino) {
 				/*
 				 * NOTE(review): only ip is destroyed here;
 				 * the vnode from xfs_vn_allocate() and any
 				 * lock taken by xfs_ilock() above are not
 				 * visibly released -- confirm.
 				 */
 				write_unlock(&ih->ih_lock);
 				xfs_idestroy(ip);
 
 				XFS_STATS_INC(xs_ig_dup);
 				goto again;
 			}
 		}
 	}
 
 	/*
 	 * These values _must_ be set before releasing ihlock!
 	 */
 	ip->i_hash = ih;
 	if ((iq = ih->ih_next)) {
 		iq->i_prevp = &ip->i_next;
 	}
 	ip->i_next = iq;
 	ip->i_prevp = &ih->ih_next;
 	ih->ih_next = ip;
 	ip->i_udquot = ip->i_gdquot = NULL;
 	ih->ih_version++;
 
 	write_unlock(&ih->ih_lock);
 
 	/*
 	 * put ip on its cluster's hash chain
 	 */
 	ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
 	       ip->i_cnext == NULL);
 
 	chlnew = NULL;
 	ch = XFS_CHASH(mp, ip->i_blkno);
  chlredo:
 	s = mutex_spinlock(&ch->ch_lock);
 	for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
 		if (chl->chl_blkno == ip->i_blkno) {
 
 			/* insert this inode into the doubly-linked list
 			 * where chl points */
 			if ((iq = chl->chl_ip)) {
 				ip->i_cprev = iq->i_cprev;
 				iq->i_cprev->i_cnext = ip;
 				iq->i_cprev = ip;
 				ip->i_cnext = iq;
 			} else {
 				ip->i_cnext = ip;
 				ip->i_cprev = ip;
 			}
 			chl->chl_ip = ip;
 			ip->i_chash = chl;
 			break;
 		}
 	}
 
 	/* no hash list found for this block; add a new hash list */
 	if (chl == NULL)  {
 		if (chlnew == NULL) {
 			/*
 			 * Drop the spinlock around the KM_SLEEP allocation,
 			 * then rescan the list from chlredo.
 			 */
 			mutex_spinunlock(&ch->ch_lock, s);
 			ASSERT(xfs_chashlist_zone != NULL);
 			chlnew = (xfs_chashlist_t *)
 					kmem_zone_alloc(xfs_chashlist_zone,
 						KM_SLEEP);
 			ASSERT(chlnew != NULL);
 			goto chlredo;
 		} else {
 			ip->i_cnext = ip;
 			ip->i_cprev = ip;
 			ip->i_chash = chlnew;
 			chlnew->chl_ip = ip;
 			chlnew->chl_blkno = ip->i_blkno;
 			chlnew->chl_next = ch->ch_list;
 			ch->ch_list = chlnew;
 			chlnew = NULL;
 		}
 	} else {
 		/* A list showed up during the rescan; drop the spare one. */
 		if (chlnew != NULL) {
 			kmem_zone_free(xfs_chashlist_zone, chlnew);
 		}
 	}
 
 	mutex_spinunlock(&ch->ch_lock, s);
 
 	/*
 	 * Link ip to its mount and thread it on the mount's inode list.
 	 */
 	XFS_MOUNT_ILOCK(mp);
 	if ((iq = mp->m_inodes)) {
 		ASSERT(iq->i_mprev->i_mnext == iq);
 		ip->i_mprev = iq->i_mprev;
 		iq->i_mprev->i_mnext = ip;
 		iq->i_mprev = ip;
 		ip->i_mnext = iq;
 	} else {
 		ip->i_mnext = ip;
 		ip->i_mprev = ip;
 	}
 	mp->m_inodes = ip;
 
 	XFS_MOUNT_IUNLOCK(mp);
 
 	newnode = 1;
 
  return_ip:
 	ASSERT(ip->i_df.if_ext_max ==
 	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
 
 	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
 	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
 
 	*ipp = ip;
 
 	/*
 	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
 	XVFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
 
 	return 0;
 }
 
 /*
  * Special iput for brand-new inodes that are still locked.
  *
  * Releases the inode lock described by lock_flags (if any) and then
  * vput()s the vnode behind the inode.  An inode whose mode is still 0 is
  * only reported on the console.
  */
 void
 xfs_iput_new(xfs_inode_t *ip, uint lock_flags)
 {
 	xfs_vnode_t *vp;
 
 	vp = XFS_ITOV(ip);
 	vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
 
 	printf("xfs_iput_new: ip %p\n",ip);
 
 	if (!ip->i_d.di_mode) {
 		ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE));
 		/* vn_mark_bad(vp) is not called here; maybe vgone? RMC */
 		printf("xfs_iput_new: ip %p di_mode == 0\n",ip);
 	}
 
 	if (lock_flags != 0)
 		xfs_iunlock(ip, lock_flags);
 
 	/* The vnode must still be locked for vput(). */
 	ASSERT_VOP_LOCKED(vp->v_vnode, "xfs_iput_new");
 	vput(vp->v_vnode);
 }
 
 extern struct vop_vector xfs_vnops;
 
 /*
  * Allocate a vnode together with its xfs_vnode companion for inode `ip'
  * on mount `mp'.
  *
  * The new vnode is locked exclusively with a recursion-capable lock,
  * inserted into the mount's vnode list with insmntque(), and tied to the
  * freshly zero-allocated xfs_vnode through v_data / v_vnode.
  *
  * On success returns 0 and stores the xfs_vnode in *vpp.  On failure
  * returns the error of getnewvnode() or insmntque(); the xfs_vnode
  * allocation is freed and *vpp is not written.
  */
 static int
 xfs_vn_allocate(xfs_mount_t *mp, xfs_inode_t *ip, struct xfs_vnode **vpp)
 {
 	struct vnode *vp;
 	struct xfs_vnode *vdata;
 	int error;
 
 	/* Use zone allocator here? */
 	vdata = kmem_zalloc(sizeof(*vdata), KM_SLEEP);
 
 	error = getnewvnode("xfs", XVFSTOMNT(XFS_MTOVFS(mp)),
 			    &xfs_vnops, &vp);
 	if (error) {
 		kmem_free(vdata, sizeof(*vdata));
 		return (error);
 	}
 
 	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * NOTE(review): on insmntque() failure only vdata is freed here;
 	 * presumably insmntque() disposes of the vnode itself -- confirm.
 	 */
 	error = insmntque(vp, XVFSTOMNT(XFS_MTOVFS(mp)));
 	if (error != 0) {
 		kmem_free(vdata, sizeof(*vdata));
 		return (error);
 	}
 
 	/* Cross-link the vnode and its xfs_vnode. */
 	vp->v_data = (void *)vdata;
 	vdata->v_number= 0;
 	vdata->v_inode = ip;
 	vdata->v_vfsp  = XFS_MTOVFS(mp);
 	vdata->v_vnode = vp;
 
  	vn_bhv_head_init(VN_BHV_HEAD(vdata), "vnode");
 
 
 #ifdef  CONFIG_XFS_VNODE_TRACING
         vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
 #endif  /* CONFIG_XFS_VNODE_TRACING */
 
         vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
 
 	/* error is always 0 here; both failure paths returned above. */
 	if (error == 0)
 		*vpp = vdata;
 
 	return (error);
 }
Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c
===================================================================
--- head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c	(revision 175201)
+++ head/sys/gnu/fs/xfs/FreeBSD/xfs_super.c	(revision 175202)
@@ -1,279 +1,279 @@
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it would be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_clnt.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
 #include "xfs_acl.h"
 #include "xfs_cap.h"
 #include "xfs_mac.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_version.h"
 #include "xfs_buf.h"
 
 #include <sys/priv.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 extern struct vop_vector xfs_fifoops;
 extern struct xfs_vnodeops xfs_vnodeops;
 
 __uint64_t
 xfs_max_file_offset(
 	unsigned int		blockshift)
 {
 
 	return (OFF_MAX);
 }
 
 void
 xfs_initialize_vnode(
 	bhv_desc_t		*bdp,
 	xfs_vnode_t		*xvp,
 	bhv_desc_t		*inode_bhv,
 	int			unlock)
 {
 	xfs_inode_t		*ip = XFS_BHVTOI(inode_bhv);
 
 	if (!inode_bhv->bd_vobj) {
 		xvp->v_vfsp = bhvtovfs(bdp);
 		bhv_desc_init(inode_bhv, ip, xvp, &xfs_vnodeops);
 		bhv_insert(VN_BHV_HEAD(xvp), inode_bhv);
 	}
 
 	/*
 	 * XXX: Use VNON as an indication of freshly allocated vnode
 	 * which need to be initialized and unlocked.
 	 * This is _not_ like the same place in Linux version of
 	 * routine.
 	 */
 
 	if (xvp->v_vnode->v_type != VNON)
 	  return;
 
 	xvp->v_vnode->v_type =  IFTOVT(ip->i_d.di_mode);
 
 	if (xvp->v_vnode->v_type == VFIFO)
 		xvp->v_vnode->v_op = &xfs_fifoops;
 
 	ASSERT_VOP_LOCKED(xvp->v_vnode, "xfs_initialize_vnode");
 
 	/* For new inodes we need to set the ops vectors,
 	 * and unlock the inode.
 	 */
 	if (ip->i_d.di_mode != 0 && unlock)
 		VOP_UNLOCK(xvp->v_vnode, 0, curthread);
 }
 
 #if 0
 struct vnode *
 xfs_get_inode(
 	bhv_desc_t	*bdp,
 	xfs_ino_t	ino,
 	int		flags)
 {
 	return NULL;
 }
 #endif
 
 /*ARGSUSED*/
 int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
 	struct vnode		**bdevp)
 {
 	struct nameidata	nd;
 	struct nameidata	*ndp = &nd;
 	int			error, ronly;
 	struct thread		*td;
 	struct vnode		*devvp;
 	struct g_consumer	*cp;
 	struct g_provider	*pp;
 	mode_t			accessmode;
 
 	td = curthread;
 
 	NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, name, td);
 	if ((error = namei(ndp)) != 0)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 
 	if (!vn_isdisk(devvp, &error)) {
 		vrele(devvp);
 		return (error);
 	}
 
-	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 
 	ronly = ((XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) != 0);
 	accessmode = VREAD;
 	if (!ronly)
 		accessmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	DROP_GIANT();
 	g_topology_lock();
 
 	/*
 	 * XXX: Do not allow more than one consumer to open a device
 	 *      associated with a particular GEOM provider.
 	 *      This disables multiple read-only mounts of a device,
 	 *      but it gets rid of panics in bmemfree() when you try to
 	 *      mount the same device more than once.
 	 *      During mounting, XFS does a bread() of the superblock, but does
 	 *      not brelse() it.  A subsequent mount of the same device
 	 *      will try to bread() the superblock, resulting in a panic in 
 	 *      bremfree(), "buffer not on queue".
 	 */
 	pp = g_dev_getprovider(devvp->v_rdev);
  	if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) 
 		error = EPERM;
 	else 
 		error = g_vfs_open(devvp, &cp, "xfs", ronly ? 0 : 1);
 
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 	VOP_UNLOCK(devvp, 0, td);
 
 	devvp->v_bufobj.bo_private = cp;
 	devvp->v_bufobj.bo_ops = &xfs_bo_ops;
 
 	*bdevp = devvp;
 	return (0);
 }
 
 void
 xfs_blkdev_put(
 	struct vnode	*devvp)
 {
 	struct g_consumer	*cp;
 
 	if (devvp == NULL)
 		return;
 
 	vinvalbuf(devvp, V_SAVE, curthread, 0, 0);
 
 	cp = devvp->v_bufobj.bo_private;
 	DROP_GIANT();
 	g_topology_lock();
 	g_wither_geom_close(cp->geom, ENXIO);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
         vrele(devvp);
 }
 
 void
 xfs_mountfs_check_barriers(xfs_mount_t *mp)
 {
 	printf("xfs_mountfs_check_barriers NI\n");
 }
 
 void
 xfs_flush_inode(
 		xfs_inode_t	*ip)
 {
 	printf("xfs_flush_inode NI\n");
 }
 
 void
 xfs_flush_device(
 		 xfs_inode_t	*ip)
 {
 	printf("xfs_flush_device NI\n");
         xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
 
 void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
 	printf("xfs_blkdev_issue_flush NI\n");
 }
 
 int
 init_xfs_fs( void )
 {
 	static char		message[] =
 		XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
 
 	printf(message);
 
 	vn_init();
 	xfs_init();
 	uuid_init();
 #ifdef RMC
 	vfs_initdmapi();
 #endif
 	vfs_initquota();
 
 	return 0;
 }
 
 void
 exit_xfs_fs(void)
 {
 	xfs_cleanup();
 	vfs_exitquota();
 #ifdef RMC
 	vfs_exitdmapi();
 #endif
 }
 
Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c
===================================================================
--- head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c	(revision 175201)
+++ head/sys/gnu/fs/xfs/FreeBSD/xfs_vnode.c	(revision 175202)
@@ -1,267 +1,267 @@
 /*
  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
  * published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it would be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Further, this software is distributed without any warranty that it is
  * free of the rightful claim of any third person regarding infringement
  * or the like.  Any license provided herein, whether implied or
  * otherwise, applies only to this software file.  Patent licenses, if
  * any, provided herein do not apply to combinations of this program with
  * other software, or any other product whatsoever.
  *
  * You should have received a copy of the GNU General Public License along
  * with this program; if not, write the Free Software Foundation, Inc., 59
  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  *
  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  * Mountain View, CA  94043, or:
  *
  * http://www.sgi.com
  *
  * For further information regarding this notice, see:
  *
  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  */
 
 #include "xfs.h"
 #include "xfs_types.h"
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_btree.h"
 #include "xfs_imap.h"
 #include "xfs_alloc.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_ialloc.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 
 void
 vn_init(void)
 {
 }
 
 void
 vn_iowait(
 	  struct xfs_vnode *vp)
 {
 	printf("vn_iowait doing nothing on FreeBSD?\n");
 }
 
 struct xfs_vnode *
 vn_initialize(
 	xfs_vnode_t	*vp)
 {
 	XFS_STATS_INC(vn_active);
 	XFS_STATS_INC(vn_alloc);
 
 	/* Initialize the first behavior and the behavior chain head. */
 	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
 
 #ifdef	CONFIG_XFS_VNODE_TRACING
 	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
 #endif	/* CONFIG_XFS_VNODE_TRACING */
 
 	vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
 	return vp;
 }
 
 /*
  * Get a reference on a vnode. Need to drop vnode reference
  * to accomodate for vhold by VMAP regardless of whether or
  * not we were able to successfully grab the vnode.
  */
 xfs_vnode_t *
 vn_get(
 	struct xfs_vnode	*xfs_vp,
 	vmap_t			*vmap)
 {
 	struct vnode *vp;
 	int error;
 
 	XFS_STATS_INC(vn_get);
 
 	vp = vmap->v_vp;
 
 	error = vget(vp, 0, curthread);
 	if (error) {
 		vdrop(vp);
 		return (NULL);
 	}
 
 	vdrop(vp);
 	if (vp->v_data != xfs_vp) {
 		vput(vp);
 		return (NULL);
 	}
 
 	vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
 	return xfs_vp;
 }
 
 /*
  * purge a vnode from the cache
  * At this point the vnode is guaranteed to have no references (vn_count == 0)
  * The caller has to make sure that there are no ways someone could
  * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
  */
 void
 vn_purge(struct xfs_vnode        *xfs_vp)
 {
         struct vnode *vp;
 
         vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
 
         vp = xfs_vp->v_vnode;
 
-        vn_lock(vp, LK_EXCLUSIVE, curthread);
+        vn_lock(vp, LK_EXCLUSIVE);
 	if (vp->v_holdcnt == 0)
 		vhold(vp);
 	vgone(vp);
         VOP_UNLOCK(vp, 0, curthread);
 }
 
 void xfs_ichgtime(
 	xfs_inode_t	*ip,
 	int		flags)
 {
 	timespec_t  tv;
 	
 	vfs_timestamp(&tv);
 	if (flags & XFS_ICHGTIME_MOD) {
 		ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
 	}
 	if (flags & XFS_ICHGTIME_ACC) {
 		ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
 	}
 	if (flags & XFS_ICHGTIME_CHG) {
 		ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
 	}
 	
 //printf ("xfs_ichgtime NI\n");
 
 }
 
 
 /*
  * Bring the atime in the XFS inode uptodate.
  * Used before logging the inode to disk or when the Linux inode goes away.
  */
 
 /*
  * It's unclear if we need this since this is for syncing the linux inode's atime
  * to the xfs inode's atime.
  * Since FreeBSD doesn't have atime in the vnode is there anything to really
  * sync over?
  * For now just make this a update atime call
  */
 
 void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
 #if 0
 	xfs_vnode_t	*vp;
 #endif
 
 	timespec_t  tv;
 	
 /* vfs_timestamp looks at the system time accuracy variable */
 	vfs_timestamp(&tv);
 #if 0
 	printf("xfs_synchronize_atime old (%d,%d) new (%d,%ld)\n",
 	       ip->i_d.di_atime.t_sec,
 	       ip->i_d.di_atime.t_nsec,
 	       tv.tv_sec,
 	       tv.tv_nsec);
 #endif
 
 	ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
 	ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
 }
 
 #ifdef RMC
 /*
  * Extracting atime values in various formats
  */
 void vn_atime_to_bstime(struct xfs_vnode *vp, xfs_bstime_t *bs_atime)
 {
 	bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
 	bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
 	printf("vn_atime_to_bstime NI\n");
 }
 #endif
 
 
 #ifdef	CONFIG_XFS_VNODE_TRACING
 
 #define KTRACE_ENTER(vp, vk, s, line, ra)			\
 	ktrace_enter(	(vp)->v_trace,				\
 /*  0 */		(void *)(__psint_t)(vk),		\
 /*  1 */		(void *)(s),				\
 /*  2 */		(void *)(__psint_t) line,		\
 /*  3 */		(void *)(vn_count(vp)), \
 /*  4 */		(void *)(ra),				\
 /*  5 */		(void *)(__psunsigned_t)(vp)->v_flag,	\
 /*  6 */		(void *)(__psint_t)smp_processor_id(),	\
 /*  7 */		(void *)(__psint_t)(current->pid),	\
 /*  8 */		(void *)__return_address,		\
 /*  9 */		0, 0, 0, 0, 0, 0, 0)
 
 /*
  * Vnode tracing code.
  */
 void
 vn_trace_entry(xfs_vnode_t *vp, char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
 }
 
 void
 vn_trace_exit(xfs_vnode_t *vp, char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
 }
 
 void
 vn_trace_hold(xfs_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
 }
 
 void
 vn_trace_ref(xfs_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
 }
 
 void
 vn_trace_rele(xfs_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
 }
 #endif	/* CONFIG_XFS_VNODE_TRACING */
Index: head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c
===================================================================
--- head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c	(revision 175201)
+++ head/sys/gnu/fs/xfs/FreeBSD/xfs_vnops.c	(revision 175202)
@@ -1,1699 +1,1699 @@
 /*
  * Copyright (c) 2001, Alexander Kabaev
  * Copyright (c) 2006, Russell Cattelan Digital Elves Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/extattr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/fifofs/fifo.h>
 
 #define NO_VFS_MACROS
 #include "xfs.h"
 #include "xfs_types.h"
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_btree.h"
 #include "xfs_imap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_acl.h"
 #include "xfs_cap.h"
 #include "xfs_mac.h"
 #include "xfs_iomap.h"
 #include "xfs_clnt.h"
 #include "xfs_mountops.h"
 
 /*
  * Prototypes for XFS vnode operations.
  */
 static vop_access_t		_xfs_access;
 static vop_advlock_t		_xfs_advlock;
 static vop_bmap_t		_xfs_bmap;
 static vop_cachedlookup_t	_xfs_cachedlookup;
 static vop_close_t		_xfs_close;
 static vop_create_t		_xfs_create;
 static vop_deleteextattr_t	_xfs_deleteextattr;
 static vop_fsync_t		_xfs_fsync;
 static vop_getattr_t		_xfs_getattr;
 static vop_getextattr_t		_xfs_getextattr;
 static vop_inactive_t		_xfs_inactive;
 static vop_ioctl_t		_xfs_ioctl;
 static vop_link_t		_xfs_link;
 static vop_listextattr_t	_xfs_listextattr;
 static vop_mkdir_t		_xfs_mkdir;
 static vop_mknod_t		_xfs_mknod;
 static vop_open_t		_xfs_open;
 static vop_read_t		_xfs_read;
 static vop_readdir_t		_xfs_readdir;
 static vop_readlink_t		_xfs_readlink;
 static vop_reclaim_t		_xfs_reclaim;
 static vop_remove_t		_xfs_remove;
 static vop_rename_t		_xfs_rename;
 static vop_rmdir_t		_xfs_rmdir;
 static vop_setattr_t		_xfs_setattr;
 static vop_setextattr_t		_xfs_setextattr;
 static vop_strategy_t		_xfs_strategy;
 static vop_symlink_t		_xfs_symlink;
 static vop_write_t		_xfs_write;
 static vop_vptofh_t		_xfs_vptofh;
 
 struct vop_vector xfs_vnops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		_xfs_access,
 	.vop_advlock =		_xfs_advlock,
 	.vop_bmap =		_xfs_bmap,
 	.vop_cachedlookup =	_xfs_cachedlookup,
 	.vop_close =		_xfs_close,
 	.vop_create =		_xfs_create,
 	.vop_deleteextattr =	_xfs_deleteextattr,
 	.vop_fsync =		_xfs_fsync,
 	.vop_getattr =		_xfs_getattr,
 	.vop_getextattr =	_xfs_getextattr,
 	.vop_inactive =		_xfs_inactive,
 	.vop_ioctl =		_xfs_ioctl,
 	.vop_link =		_xfs_link,
 	.vop_listextattr =	_xfs_listextattr,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		_xfs_mkdir,
 	.vop_mknod =		_xfs_mknod,
 	.vop_open =		_xfs_open,
 	.vop_read =		_xfs_read,
 	.vop_readdir =		_xfs_readdir,
 	.vop_readlink =		_xfs_readlink,
 	.vop_reclaim =		_xfs_reclaim,
 	.vop_remove =		_xfs_remove,
 	.vop_rename =		_xfs_rename,
 	.vop_rmdir =		_xfs_rmdir,
 	.vop_setattr =		_xfs_setattr,
 	.vop_setextattr =	_xfs_setextattr,
 	.vop_strategy =		_xfs_strategy,
 	.vop_symlink =		_xfs_symlink,
 	.vop_write =		_xfs_write,
 	.vop_vptofh =		_xfs_vptofh,
 };
 
 /*
  *  FIFO's specific operations.
  */
 
 static vop_close_t	_xfsfifo_close;
 static vop_read_t	_xfsfifo_read;
 static vop_kqfilter_t	_xfsfifo_kqfilter;
 static vop_write_t	_xfsfifo_write;
 
 struct vop_vector xfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		_xfs_access,
 	.vop_close =		_xfsfifo_close,
 	.vop_fsync =		_xfs_fsync,
 	.vop_getattr =		_xfs_getattr,
 	.vop_inactive =		_xfs_inactive,
 	.vop_kqfilter =		_xfsfifo_kqfilter,
 	.vop_read =		_xfsfifo_read,
 	.vop_reclaim =		_xfs_reclaim,
 	.vop_setattr =		_xfs_setattr,
 	.vop_write =		_xfsfifo_write,
 	.vop_vptofh =		_xfs_vptofh,
 };
 
 static int
 _xfs_access(
     	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	int error;
 
 	XVOP_ACCESS(VPTOXFSVP(ap->a_vp), ap->a_mode, ap->a_cred, error);
 	return (error);
 }
 
 static int
 _xfs_open(
     	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		int  a_fdidx;
 	} */ *ap)
 {
 	int error;
 
 	XVOP_OPEN(VPTOXFSVP(ap->a_vp), ap->a_cred, error);
 	if (error == 0)
 		vnode_create_vobject(ap->a_vp, 0, ap->a_td);
 	return (error);
 }
 
 static int
 _xfs_close(
 	struct vop_close_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	int error = 0;
 	/* XVOP_CLOSE(VPTOXFSVP(ap->a_vp), NULL, error); */
 	return (error);
 }
 
 static int
 _xfs_getattr(
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	struct vnode	*vp = ap->a_vp;
 	struct vattr	*vap = ap->a_vap;
 	struct mount	*mp;
 	xfs_vattr_t	va;
 	int		error;
 	/* extract the xfs vnode from the private data */
 	//xfs_vnode_t	*xvp = (xfs_vnode_t *)vp->v_data;
 
 	VATTR_NULL(vap);
 	memset(&va,0,sizeof(xfs_vattr_t));
 	va.va_mask = XFS_AT_STAT|XFS_AT_GENCOUNT|XFS_AT_XFLAGS;
 
 	XVOP_GETATTR(VPTOXFSVP(vp), &va, 0, ap->a_cred, error);
 	if (error)
 		return (error);
 
 	mp  = vp->v_mount;
 
 	vap->va_type = IFTOVT(((xfs_vnode_t *)vp->v_data)->v_inode->i_d.di_mode);
 	vap->va_mode = va.va_mode;
 	vap->va_nlink = va.va_nlink;
 	vap->va_uid = va.va_uid;
 	vap->va_gid = va.va_gid;
 	vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = va.va_nodeid;
 	vap->va_size = va.va_size;
 	vap->va_blocksize = va.va_blocksize;
 	vap->va_atime = va.va_atime;
 	vap->va_mtime = va.va_mtime;
 	vap->va_ctime = va.va_ctime;
 	vap->va_gen = va.va_gen;
 	vap->va_rdev = va.va_rdev;
 	vap->va_bytes = (va.va_nblocks << BBSHIFT);
 
 	/* XFS now supports devices that have block sizes
 	 * other than 512 so BBSHIFT will work for now
 	 * but need to get this value from the super block
 	 */
 
 	/*
 	 * Fields with no direct equivalent in XFS
 	 * leave initialized by VATTR_NULL
 	 */
 #if 0
 	vap->va_filerev = 0;
 	vap->va_birthtime = va.va_ctime;
 	vap->va_vaflags = 0;
 	vap->va_flags = 0;
 	vap->va_spare = 0;
 #endif
 
 	return (0);
 }
 
 static int
 _xfs_setattr(
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	xfs_vattr_t   va;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 #ifdef RMC
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL))
 		return (EINVAL);
 #endif
 
 	memset(&va, 0, sizeof(va));
 
 	if (vap->va_uid != (uid_t)VNOVAL) {
 		va.va_mask |= XFS_AT_UID;
 		va.va_uid = vap->va_uid;
 	}
 	if (vap->va_gid != (gid_t)VNOVAL) {
 		va.va_mask |= XFS_AT_GID;
 		va.va_gid = vap->va_gid;
 	}
 	if (vap->va_size != VNOVAL) {
 		va.va_mask |= XFS_AT_SIZE;
 		va.va_size = vap->va_size;
 	}
 	if (vap->va_atime.tv_sec != VNOVAL) {
 		va.va_mask |= XFS_AT_ATIME;
 		va.va_atime = vap->va_atime;
 	}
 	if (vap->va_mtime.tv_sec != VNOVAL) {
 		va.va_mask |= XFS_AT_MTIME;
 		va.va_mtime = vap->va_mtime;
 	}
 	if (vap->va_ctime.tv_sec != VNOVAL) {
 		va.va_mask |= XFS_AT_CTIME;
 		va.va_ctime = vap->va_ctime;
 	}
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		va.va_mask |= XFS_AT_MODE;
 		va.va_mode = vap->va_mode;
 	}
 
 	XVOP_SETATTR(VPTOXFSVP(vp), &va, 0, ap->a_cred, error);
 	return (error);
 }
 
 static int
 _xfs_inactive(
 	struct vop_inactive_args  /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	int error;
 
 	XVOP_INACTIVE(VPTOXFSVP(vp), td->td_ucred, error);
 	return (error);
 }
 
 static int
 _xfs_read(
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
  	struct uio *uio = ap->a_uio;
 	int error;
 
 	switch (vp->v_type) {
 	case VREG:
 		break;
 	case VDIR:
 		return (EISDIR);
 	default:
 		return (EPERM);
 	};
 
 	XVOP_READ(VPTOXFSVP(vp), uio, ap->a_ioflag, ap->a_cred, error);
 	return error;
 }
 
 int
 xfs_read_file(xfs_mount_t *mp, xfs_inode_t *ip, struct uio *uio, int ioflag)
 {
 	xfs_fileoff_t lbn, nextlbn;
 	xfs_fsize_t bytesinfile;
 	long size, xfersize, blkoffset;
 	struct buf *bp;
 	struct vnode *vp;
 	int error, orig_resid;
 	int seqcount;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 
 	orig_resid = uio->uio_resid;
 	if (orig_resid <= 0)
 		return (0);
 
 	vp = XFS_ITOV(ip)->v_vnode;
 
 	/*
 	 * Ok so we couldn't do it all in one vm trick...
 	 * so cycle around trying smaller bites..
 	 */
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		if ((bytesinfile = ip->i_d.di_size - uio->uio_offset) <= 0)
 			break;
 
 		lbn = XFS_B_TO_FSBT(mp, uio->uio_offset);
 		nextlbn = lbn + 1;
 
 		/*
 		 * size of buffer.  The buffer representing the
 		 * end of the file is rounded up to the size of
 		 * the block type ( fragment or full block,
 		 * depending ).
 		 */
 		size = mp->m_sb.sb_blocksize;
 		blkoffset = XFS_B_FSB_OFFSET(mp, uio->uio_offset);
 
 		/*
 		 * The amount we want to transfer in this iteration is
 		 * one FS block less the amount of the data before
 		 * our startpoint (duh!)
 		 */
 		xfersize = mp->m_sb.sb_blocksize - blkoffset;
 
 		/*
 		 * But if we actually want less than the block,
 		 * or the file doesn't have a whole block more of data,
 		 * then use the lesser number.
 		 */
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (bytesinfile < xfersize)
 			xfersize = bytesinfile;
 
 		if (XFS_FSB_TO_B(mp, nextlbn) >= ip->i_d.di_size ) {
 			/*
 			 * Don't do readahead if this is the end of the file.
 			 */
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			/*
 			 * Otherwise if we are allowed to cluster,
 			 * grab as much as we can.
 			 *
 			 * XXX  This may not be a win if we are not
 			 * doing sequential access.
 			 */
 			error = cluster_read(vp, ip->i_d.di_size, lbn,
 				size, NOCRED, uio->uio_resid, seqcount, &bp);
 		} else if (seqcount > 1) {
 			/*
 			 * If we are NOT allowed to cluster, then
 			 * if we appear to be acting sequentially,
 			 * fire off a request for a readahead
 			 * as well as a read. Note that the 4th and 5th
 			 * arguments point to arrays of the size specified in
 			 * the 6th argument.
 			 */
 			int nextsize = mp->m_sb.sb_blocksize;
 			error = breadn(vp, lbn,
 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 		} else {
 			/*
 			 * Failing all of the above, just read what the
 			 * user asked for. Interestingly, the same as
 			 * the first option above.
 			 */
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error) {
 			brelse(bp);
 			bp = NULL;
 			break;
 		}
 
 		/*
 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
 		 * will cause us to attempt to release the buffer later on
 		 * and will cause the buffer cache to attempt to free the
 		 * underlying pages.
 		 */
 		if (ioflag & IO_DIRECT)
 			bp->b_flags |= B_DIRECT;
 
 		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
 		 * then we want to ensure that we do not uiomove bad
 		 * or uninitialized data.
 		 */
 		size -= bp->b_resid;
 		if (size < xfersize) {
 			if (size == 0)
 				break;
 			xfersize = size;
 		}
 
 		/*
 		 * otherwise use the general form
 		 */
 		error = uiomove((char *)bp->b_data + blkoffset,
 			    (int)xfersize, uio);
 
 		if (error)
 			break;
 
 		if (ioflag & (IO_VMIO|IO_DIRECT) ) {
 			/*
 			 * If there are no dependencies, and it's VMIO,
 			 * then we don't need the buf, mark it available
 			 * for freeing. The VM has the data.
 			 */
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else {
 			/*
 			 * Otherwise let whoever
 			 * made the request take care of
 			 * freeing it. We just queue
 			 * it onto another list.
 			 */
 			bqrelse(bp);
 		}
 	}
 
 	/*
 	 * This can only happen in the case of an error
 	 * because the loop above resets bp to NULL on each iteration
 	 * and on normal completion has not set a new value into it.
 	 * so it must have come from a 'break' statement
 	 */
 	if (bp != NULL) {
 		if (ioflag & (IO_VMIO|IO_DIRECT)) {
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else
 			bqrelse(bp);
 	}
 
 	return (error);
 }
 
 static int
 _xfs_write(struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int ioflag = ap->a_ioflag;
 	int error;
 
 	xfs_vnode_t *xvp = (xfs_vnode_t *)vp->v_data;
 
 	error = xfs_write(xvp->v_bh.bh_first, uio, ioflag, ap->a_cred);
 
 	if (error < 0) {
 		printf("Xfs_write got error %d\n",error);
 		return -error;
 	}
 	return 0;
 }
 
 
 int
 xfs_write_file(xfs_inode_t *xip, struct uio *uio, int ioflag)
 {
 	struct buf	*bp;
 	//struct thread	*td;
 	daddr_t		lbn;
 	off_t		osize = 0;
 	off_t		offset= 0;
 	int		blkoffset, error, resid, xfersize;
 	int		fsblocksize;
 	int		seqcount;
 	xfs_iomap_t	iomap;
 	int		maps = 1;
 
 	xfs_vnode_t	*xvp = XFS_ITOV(xip);
 	struct vnode	*vp = xvp->v_vnode;
 
 	xfs_mount_t	*mp = (&xip->i_iocore)->io_mount;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 
 	memset(&iomap,0,sizeof(xfs_iomap_t));
 
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, I don't think it matters.
 	 */
 #if 0
 	td = uio->uio_td;
 	if (vp->v_type == VREG && td != NULL) {
 		PROC_LOCK(td->td_proc);
 		if (uio->uio_offset + uio->uio_resid >
 		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 			psignal(td->td_proc, SIGXFSZ);
 			PROC_UNLOCK(td->td_proc);
 			return (EFBIG);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	resid = uio->uio_resid;
 	offset = uio->uio_offset;
 	osize = xip->i_d.di_size;
 
    /* xfs bmap wants bytes for both offset and size */
 	XVOP_BMAP(xvp,
 		  uio->uio_offset,
 		  uio->uio_resid,
 		  BMAPI_WRITE|BMAPI_DIRECT,
 		  &iomap, &maps, error);
 	if(error) {
 		printf("XVOP_BMAP failed\n");
 		goto error;
 	}
 
 	for (error = 0; uio->uio_resid > 0;) {
 
 		lbn = XFS_B_TO_FSBT(mp, offset);
 		blkoffset = XFS_B_FSB_OFFSET(mp, offset);
 		xfersize = mp->m_sb.sb_blocksize - blkoffset;
 		fsblocksize = mp->m_sb.sb_blocksize;
 
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 
 		/*
 		 * getblk sets buf by  blkno *  bo->bo_bsize
 		 * bo_bsize is set from the mnt point fsize
 		 * so we call getblk in the case using fsblocks
 		 * not basic blocks
 		 */
 
 		bp = getblk(vp, lbn, fsblocksize, 0, 0, 0);
 		if(!bp) {
 			printf("getblk failed\n");
 			error = EINVAL;
 			break;
 		}
 
 		if (!(bp->b_flags & B_CACHE)  && fsblocksize > xfersize)
 			vfs_bio_clrbuf(bp);
 
 		if (offset + xfersize >  xip->i_d.di_size) {
 			xip->i_d.di_size = offset + xfersize;
 			vnode_pager_setsize(vp, offset + fsblocksize);
 		}
 
 		/* move the offset for the next itteration of the loop */
 		offset += xfersize;
 
 		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
 
 		if ((ioflag & IO_VMIO) &&
 		   (LIST_FIRST(&bp->b_dep) == NULL)) /* in ext2fs? */
 			bp->b_flags |= B_RELBUF;
 
 		/* force to full direct for now */
 		bp->b_flags |= B_DIRECT;
 		/* and sync ... the delay path is not pushing data out */
 		ioflag |= IO_SYNC;
 
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (0 /* RMC xfersize + blkoffset == fs->s_frag_size */) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
 				cluster_write(vp, bp, osize, seqcount);
 			} else {
 				bawrite(bp);
 			}
 		} else {
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
 		if (error || xfersize == 0)
 			break;
 	}
 	/*
 	 * If we successfully wrote any data, and we are not the superuser
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
 #if 0
 	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
 		ip->i_mode &= ~(ISUID | ISGID);
 #endif
 	if (error) {
 		if (ioflag & IO_UNIT) {
 #if 0
 			(void)ext2_truncate(vp, osize,
 			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 #endif
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		}
 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
 		/* Update the vnode here? */
 	}
 
 error:
 	return error;
 }
 
 static int
 _xfs_create(
     	struct vop_create_args  /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
  	struct vattr *vap = ap->a_vap;
 	struct thread *td = curthread;
 	struct ucred *credp = td->td_ucred;
 	struct componentname *cnp = ap->a_cnp;
 	xfs_vnode_t *xvp;
 	xfs_vattr_t va;
 	int error;
 
 	memset(&va, 0, sizeof (va));
 	va.va_mask |= XFS_AT_MODE;
 	va.va_mode = vap->va_mode;
 	va.va_mask |= XFS_AT_TYPE;
 	va.va_mode |=  VTTOIF(vap->va_type);
 
 	xvp = NULL;
 	XVOP_CREATE(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error);
 
 	if (error == 0) {
 		*ap->a_vpp = xvp->v_vnode;
 		VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td);
 	}
 
 	return (error);
 }
 
 extern int xfs_remove(bhv_desc_t *, bhv_desc_t *, vname_t *, cred_t *);
 
 static int
 _xfs_remove(
 	struct vop_remove_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_dvp;
 		struct vnode * a_vp;
 		struct componentname * a_cnp;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;
 	struct ucred  *credp = td->td_ucred;
 	/*
 	struct vnode *dvp = ap->a_dvp; 
  	struct componentname *cnp = ap->a_cnp;
 	*/
 	int error;
 
 	if (vp->v_type == VDIR || vp->v_usecount != 1)
 		return (EPERM);
 
 	error = xfs_remove(VPTOXFSVP(ap->a_dvp)->v_bh.bh_first,
 			   VPTOXFSVP(ap->a_vp)->v_bh.bh_first,
 			   ap->a_cnp,credp);
 
 	cache_purge(vp);
 	return error;
 }
 
 /*
  * Rename vnode operation (stub).
  *
  * error starts out as EPERM and is tested immediately, so control always
  * jumps to "out": the cross-device, busy-target and name-cache code below
  * the first test is never reached.  The "out" path releases the four
  * vnodes with vrele()/vput(), calls vgone() on the source (and on the
  * target when one was supplied) and returns the error.
  */
 static int
 _xfs_rename(
 	struct vop_rename_args /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 /* 	struct componentname *tcnp = ap->a_tcnp; */
 /*	struct componentname *fcnp = ap->a_fcnp;*/
 	int error = EPERM;
 
 	/* Always true for now: rename is not implemented. */
 	if (error)
 		goto out;
 
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	/* Refuse to replace a target that has other users. */
 	if (tvp && tvp->v_usecount > 1) {
 		error = EBUSY;
 		goto out;
 	}
 
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 out:
 	/* tdvp == tvp is released with vrele() here and vput() below. */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * NOTE(review): vgone() is called on fvp/tvp after they have been
 	 * released above -- confirm the vnodes are still guaranteed valid
 	 * and in the lock state vgone() expects at this point.
 	 */
 	vgone(fvp);
 	if (tvp)
 		vgone(tvp);
 	return (error);
 }
 
 static int
 _xfs_link(
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap)
 {
 	xfs_vnode_t *tdvp, *vp;
 	int error;
 
 	tdvp = VPTOXFSVP(ap->a_tdvp);
 	vp = VPTOXFSVP(ap->a_vp);
 	XVOP_LINK(tdvp, vp, ap->a_cnp, NULL, error);
 	return (error);
 }
 
 static int
 _xfs_symlink(
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap)
 {
 	struct thread *td = curthread;
 	struct ucred  *credp = td->td_ucred;
 	xfs_vnode_t *xvp;
 	xfs_vattr_t va;
 	int error;
 
 	memset(&va, 0, sizeof (va));
 
 	va.va_mask |= XFS_AT_MODE;
 	va.va_mode = ap->a_vap->va_mode | S_IFLNK;
 	va.va_mask |= XFS_AT_TYPE;
 
 	XVOP_SYMLINK(VPTOXFSVP(ap->a_dvp), ap->a_cnp, &va, ap->a_target,
 	    &xvp, credp, error);
 
 	if (error == 0) {
 		*ap->a_vpp = xvp->v_vnode;
 		VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td);
 	}
 
 	return (error);
 }
 
 /*
  * Mknod vnode operation.
  *
  * Creates a special file named by a_cnp in directory a_dvp through
  * XVOP_CREATE, passing the mode, file type and device number from a_vap.
  * On success the new vnode is stored in *a_vpp and locked exclusively.
  * Returns the error set by XVOP_CREATE.
  */
 static int
 _xfs_mknod(
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
  	struct vattr *vap = ap->a_vap;
 	struct thread *td = curthread;
 	struct ucred *credp = td->td_ucred;
 	struct componentname *cnp = ap->a_cnp;
 	xfs_vnode_t *xvp;
 	xfs_vattr_t va;
 	int error;
 
 	memset(&va, 0, sizeof (va));
 	va.va_mask |= XFS_AT_MODE;
 	/*
 	 * Take the file-type bits from the requested vnode type, as
 	 * _xfs_create() does, instead of forcing every node to be a FIFO;
 	 * otherwise the va_rdev passed below would never be meaningful.
 	 */
 	va.va_mode = vap->va_mode | VTTOIF(vap->va_type);
 	va.va_mask |= XFS_AT_TYPE;
 	va.va_mask |= XFS_AT_RDEV;
 	va.va_rdev = vap->va_rdev;
 
 	xvp = NULL;
 	XVOP_CREATE(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error);
 
 	if (error == 0) {
 		*ap->a_vpp = xvp->v_vnode;
 		VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td);
 	}
 
 	return (error);
 }
 
 static int
 _xfs_mkdir(
 	struct vop_mkdir_args /* {
 		 struct vnode *a_dvp;
 		 struct vnode **a_vpp;
 		 struct componentname *a_cnp;
 		 struct vattr *a_vap;
 	} */ *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
  	struct vattr *vap = ap->a_vap;
 	struct thread *td = curthread;
 	struct ucred *credp = td->td_ucred;
 	struct componentname *cnp = ap->a_cnp;
 	xfs_vnode_t *xvp;
 	xfs_vattr_t va;
 	int error;
 
 	memset(&va, 0, sizeof (va));
 	va.va_mask |= XFS_AT_MODE;
 	va.va_mode = vap->va_mode | S_IFDIR;
 	va.va_mask |= XFS_AT_TYPE;
 
 	xvp = NULL;
 	XVOP_MKDIR(VPTOXFSVP(dvp), cnp, &va, &xvp, credp, error);
 
 	if (error == 0) {
 		*ap->a_vpp = xvp->v_vnode;
 		VOP_LOCK(xvp->v_vnode, LK_EXCLUSIVE, td);
 	}
 
 	return (error);
 }
 
 /*
  * Rmdir vnode operation (stub).
  *
  * Returns EINVAL when the directory and its parent are the same vnode,
  * and EPERM in every other case; nothing is removed.
  */
 static int
 _xfs_rmdir(
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap)
 {
 
 	if (ap->a_dvp == ap->a_vp)
 		return (EINVAL);
 	return (EPERM);
 }
 
 /*
  * Readdir vnode operation.
  *
  * Calls XVOP_READDIR repeatedly until it reports end of directory,
  * returns an error, or stops advancing the uio offset.  Requests for
  * directory cookies are not supported (EOPNOTSUPP) and non-directories
  * are rejected with EPERM.  When a_eofflag is supplied it is set to 1 if
  * end of directory was reached and to 0 otherwise.
  */
 static int
 _xfs_readdir(
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int error;
 	off_t	off;
 	int	eof = 0;
 
 	if (vp->v_type != VDIR)
 		return (EPERM);
 	if (ap->a_ncookies) {
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	while (!eof){
 		/*
 		 * Remember the full-width offset.  Truncating it to int
 		 * would make the no-progress test below fail for offsets
 		 * above INT_MAX and keep the loop spinning.
 		 */
 		off = uio->uio_offset;
 
 		XVOP_READDIR(VPTOXFSVP(vp), uio, NULL, &eof, error);
 		/* Stop on error or when nothing more was consumed. */
 		if ((uio->uio_offset == off) || error) {
 			break;
 		}
 	}
 
 	if (ap->a_eofflag)
 		*ap->a_eofflag = (eof != 0);
 
 	return (error);
 }
 
 
 static int
 _xfs_readlink(
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 	int error;
 
 	XVOP_READLINK(VPTOXFSVP(vp), uio, 0, cred, error);
 	return (error);
 }
 
 /*
  * Fsync vnode operation.
  *
  * Calls XVOP_FSYNC with FSYNC_DATA over the range 0 .. -1, adding
  * FSYNC_WAIT when the caller asked for MNT_WAIT.  Returns the error set
  * by XVOP_FSYNC.
  */
 static int
 _xfs_fsync(
 	struct vop_fsync_args /* {
 		struct vnode * a_vp;
 		int  a_waitfor;
 		struct thread * a_td;
 	} */ *ap)
 {
 	xfs_vnode_t *xvp = VPTOXFSVP(ap->a_vp);
 	int error;
 	int sync_flags;
 
 	sync_flags = (ap->a_waitfor == MNT_WAIT) ?
 	    (FSYNC_DATA | FSYNC_WAIT) : FSYNC_DATA;
 	XVOP_FSYNC(xvp, sync_flags, ap->a_td->td_ucred, (xfs_off_t)0,
 	    (xfs_off_t)-1, error);
 
 	return (error);
 }
 
 /*
  * Bmap vnode operation.
  *
  * When a_bop is non-NULL it receives the bufobj of the vnode behind the
  * mount's m_ddev_targp.  When a_bnp is non-NULL, logical block a_bn is
  * mapped with XVOP_BMAP (BMAPI_READ) and the result is stored in *a_bnp,
  * or -1 when no mapping was returned.  *a_runb / *a_runp, when supplied,
  * receive run lengths computed from the mapping, clipped to maxrun.
  * Returns 0, or the error set by XVOP_BMAP.
  */
 static int
 _xfs_bmap(
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap)
 {
 	xfs_iomap_t iomap;
 	xfs_off_t offset;
 	ssize_t   size;
 	struct mount *mp;
 	struct xfs_mount *xmp;
 	struct xfs_vnode *xvp;
 	int error, maxrun, retbm;
 
 	mp  = ap->a_vp->v_mount;
 	xmp = XFS_VFSTOM(MNTTOVFS(mp));
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &xmp->m_ddev_targp->specvp->v_bufobj;
 	/* Caller only wanted the bufobj. */
 	if (ap->a_bnp == NULL)
 		return (0);
 
 	xvp = VPTOXFSVP(ap->a_vp);
 	/* Ask for one mapping; XVOP_BMAP updates retbm. */
 	retbm = 1;
 
 	/* Byte offset of the block and the byte size of one block. */
 	offset = XFS_FSB_TO_B(xmp, ap->a_bn);
 	size = XFS_FSB_TO_B(xmp, 1);
 	XVOP_BMAP(xvp, offset, size, BMAPI_READ, &iomap, &retbm, error);
 	if (error)
 		return (error);
 	if (retbm == 0 || iomap.iomap_bn == IOMAP_DADDR_NULL) {
 		/* No mapping: report -1 and empty runs. */
 		*ap->a_bnp = (daddr_t)-1;
 		if (ap->a_runb)
 			*ap->a_runb = 0;
 		if (ap->a_runp)
 			*ap->a_runp = 0;
 	} else {
 		/* Mapping start plus the delta converted by btodb(). */
 		*ap->a_bnp = iomap.iomap_bn + btodb(iomap.iomap_delta);
 		/* NOTE(review): divides by f_iosize; presumably never 0. */
 		maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1;
 		if (ap->a_runb) {
 			*ap->a_runb = XFS_B_TO_FSB(xmp, iomap.iomap_delta);
 			if (*ap->a_runb > maxrun)
 				*ap->a_runb  = maxrun;
 		}
 		if (ap->a_runp) {
 			/* What is left of the mapping after this block. */
 			*ap->a_runp =
 			    XFS_B_TO_FSB(xmp, iomap.iomap_bsize
 				- iomap.iomap_delta - size);
 			if (*ap->a_runp > maxrun)
 				*ap->a_runp  = maxrun;
 		}
 	}
 	return (0);
 }
 
 /*
  * Strategy vnode operation.
  *
  * If the buffer has not been translated yet (b_blkno == b_lblkno), maps
  * its logical block with VOP_BMAP.  A mapping failure completes the
  * buffer with BIO_ERROR and returns the error.  A buffer whose block
  * number is -1 is cleared (when it was just mapped) and completed without
  * I/O.  Anything else is handed to the strategy routine of the bufobj
  * behind the mount's m_ddev_targp.
  */
 static int
 _xfs_strategy(
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap)
 {
 	daddr_t blkno;
 	struct buf *bp;
 	struct bufobj *bo;
 	struct vnode *vp;
 	struct xfs_mount *xmp;
 	int error;
 
 	bp = ap->a_bp;
 	vp = ap->a_vp;
 
 	KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)",
 	    __func__, ap->a_vp, ap->a_bp->b_vp));
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = VOP_BMAP(vp, bp->b_lblkno, NULL, &blkno, NULL, NULL);
 		/*
 		 * Check the result before touching blkno: it may not have
 		 * been filled in when the mapping failed.
 		 */
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (error);
 		}
 		bp->b_blkno = blkno;
 		bp->b_iooffset = (blkno << BBSHIFT);
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	/* Nothing on disk for this block: finish without I/O. */
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 
 	xmp = XFS_VFSTOM(MNTTOVFS(vp->v_mount));
 	bo = &xmp->m_ddev_targp->specvp->v_bufobj;
 	bo->bo_ops->bop_strategy(bo, bp);
 	return (0);
 }
 
 int
 _xfs_ioctl(
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int fflag;
 		struct ucred *cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 /* 	struct vnode *vp = ap->a_vp; */
 /* 	struct thread *p = ap->a_td; */
 /* 	struct file *fp; */
 	int error;
 
 	xfs_vnode_t *xvp = VPTOXFSVP(ap->a_vp);
 
 	printf("_xfs_ioctl cmd 0x%lx data %p\n",ap->a_command,ap->a_data);
 
 //	XVOP_IOCTL(xvp,(void *)NULL,(void *)NULL,ap->a_fflag,ap->a_command,ap->a_data,error);
 	error = xfs_ioctl(xvp->v_bh.bh_first,NULL,NULL,ap->a_fflag,ap->a_command,ap->a_data);
 
 	return error;
 }
 
 /*
  * Advisory-lock vnode operation.
  *
  * Currently always returns EOPNOTSUPP.  Everything after the early return
  * is unreachable: it computes the start/end of the requested range from
  * the flock (with a file size of 0) and, under "notyet", would dispatch
  * to lf_advlock().
  */
 int
 _xfs_advlock(
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap)
 {
 /* 	struct vnode *vp = ap->a_vp;*/
 	struct flock *fl = ap->a_fl;
 /* 	caddr_t id = (caddr_t)1 */ /* ap->a_id */;
 /* 	int flags = ap->a_flags; */
 	off_t start, end, size;
 	int error/* , lkop */;
 
 	/*KAN: temp */
 	return (EOPNOTSUPP);
 
 	size = 0;
 	error = 0;
 	switch (fl->l_whence) {
 	    case SEEK_SET:
 	    case SEEK_CUR:
 		start = fl->l_start;
 		break;
 	    case SEEK_END:
 		start = fl->l_start + size;
 		/* Without this break SEEK_END fell into EINVAL below. */
 		break;
 	    default:
 		return (EINVAL);
 	}
 	if (start < 0)
 		return (EINVAL);
 	/* A zero length means "to end of file", represented as -1. */
 	if (fl->l_len == 0)
 		end = -1;
 	else {
 		end = start + fl->l_len - 1;
 		if (end < start)
 			return (EINVAL);
 	}
 #ifdef notyet
 	switch (ap->a_op) {
 	    case F_SETLK:
 		error = lf_advlock(ap, &np->n_lockf, size);
 		break;
 	    case F_UNLCK:
 		lf_advlock(ap, &np->n_lockf, size);
 		break;
 	    case F_GETLK:
 		error = lf_advlock(ap, &np->n_lockf, size);
 		break;
 	    default:
 		return (EINVAL);
 	}
 #endif
 	return (error);
 }
 
 /*
  * Cached-lookup vnode operation.
  *
  * Looks up the component described by a_cnp in directory a_dvp with
  * XVOP_LOOKUP.
  *  - ENOENT on the last component of a CREATE/RENAME/DELETE: after a
  *    write-access check on the directory, sets SAVENAME and returns
  *    EJUSTRETURN.  Other ENOENT results may be entered in the name cache
  *    (with a NULL vnode) and are returned as-is.
  *  - DELETE/RENAME on the last component: the found vnode is locked
  *    exclusively and the directory must be writable.
  *  - Otherwise the found vnode is locked with cn_lkflags ("." is returned
  *    without extra locking) and entered in the name cache when MAKEENTRY
  *    is set.
  * On success *a_vpp holds the result; on failure the vnode obtained from
  * the lookup is released and the error returned.
  */
 static int
 _xfs_cachedlookup(
 	struct vop_cachedlookup_args /* {
 		struct vnode * a_dvp;
 		struct vnode ** a_vpp;
 		struct componentname * a_cnp;
 	} */ *ap)
 {
 	struct vnode *dvp, *tvp;
 	struct xfs_vnode *cvp;
 	int islastcn;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	struct thread *td = cnp->cn_thread;
 
 	char *pname = cnp->cn_nameptr;
 	int namelen = cnp->cn_namelen;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 	islastcn = flags & ISLASTCN;
 
 	XVOP_LOOKUP(VPTOXFSVP(dvp), cnp, &cvp, 0, NULL, cred, error);
 
 	if (error == ENOENT) {
 		if ((nameiop == CREATE || nameiop == RENAME ||
 		     nameiop == DELETE) && islastcn)
 		{
 			error = VOP_ACCESS(dvp, VWRITE, cred, td);
 			if (error)
 				return (error);
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		/* *vpp is still NULL here. */
 		if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	if (error)
 		return (error);
 
 	tvp = cvp->v_vnode;
 
 	if (nameiop == DELETE && islastcn) {
-		if ((error = vn_lock(tvp, LK_EXCLUSIVE, td))) {
+		if ((error = vn_lock(tvp, LK_EXCLUSIVE))) {
 			vrele(tvp);
 			goto err_out;
 		}
 		*vpp = tvp;
 
 		/* Directory should be writable for deletes. */
 	        error = VOP_ACCESS(dvp, VWRITE, cred, td);
          	if (error)
 		 	goto err_out;
 
 		/* XXXKAN: Permission checks for sticky dirs? */
 		return (0);
 	 }
 
 	if (nameiop == RENAME && islastcn) {
-		if ((error = vn_lock(tvp, LK_EXCLUSIVE, td))) {
+		if ((error = vn_lock(tvp, LK_EXCLUSIVE))) {
 			vrele(tvp);
 			goto err_out;
 		}
 		*vpp = tvp;
 
 		if ((error = VOP_ACCESS(dvp, VWRITE, cred, td)))
 			goto err_out;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/*
 		 * The directory is unlocked before the parent is locked.
 		 * NOTE(review): dvp is relocked only on the failure path;
 		 * confirm callers expect it unlocked on success.
 		 */
 		VOP_UNLOCK(dvp, 0, td);
-		error = vn_lock(tvp, cnp->cn_lkflags, td);
+		error = vn_lock(tvp, cnp->cn_lkflags);
 		if (error) {
-			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 			vrele(tvp);
 			goto err_out;
 		}
 		*vpp = tvp;
 	} else if (namelen == 1 && pname[0] == '.') {
 		*vpp = tvp;
 		KASSERT(tvp == dvp, ("not same directory"));
 	} else {
-		if ((error = vn_lock(tvp, cnp->cn_lkflags, td))) {
+		if ((error = vn_lock(tvp, cnp->cn_lkflags))) {
 			vrele(tvp);
 			goto err_out;
 		}
 		*vpp = tvp;
 	}
 
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, *vpp, cnp);
 	return (0);
 
 err_out:
 	/* Only set once the vnode was locked; vput() undoes both. */
 	if (*vpp != 0)
 		vput(*vpp);
 	return (error);
 }
 
 static int
 _xfs_reclaim(
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread  *a_td;
 	} */ *ap)
 {
 
 	struct vnode *vp = ap->a_vp;
 	struct xfs_vnode *xfs_vp = VPTOXFSVP(vp);
 	int error;
 
 	XVOP_RECLAIM(xfs_vp, error);
 	kmem_free(xfs_vp, sizeof(*xfs_vp));
 	vp->v_data = NULL;
 	return (error);
 }
 
 /*
  * Kqfilter vnode operation: does nothing with the knote and reports
  * success.
  */
 static int
 _xfs_kqfilter(
 	struct vop_kqfilter_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap)
 {
 
 	return (0);
 }
 
 /*
  * Return the XFS inode obtained by applying XFS_BHVTOI to the vnode's
  * v_fbhv behavior.
  */
 struct xfs_inode *
 xfs_vtoi(struct xfs_vnode *xvp)
 {
 	struct xfs_inode *ip;
 
 	ip = XFS_BHVTOI(xvp->v_fbhv);
 	return (ip);
 }
 
 /*
  * Read wrapper for fifos.
  *
  * Performs the read through fifo_specops, then marks the inode accessed
  * (XFS_ICHGTIME_ACC) unless the mount has MNT_NOATIME set or no inode is
  * attached.  The time is updated when the residual count changed, or when
  * a non-empty request completed without error.
  */
 static int
 _xfsfifo_read(
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct uio *uio = ap->a_uio;
 	struct xfs_inode *ip;
 	int error, resid_before;
 
 	resid_before = uio->uio_resid;
 	error = fifo_specops.vop_read(ap);
 	ip = xfs_vtoi(VPTOXFSVP(ap->a_vp));
 
 	if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) != 0)
 		return (error);
 	if (ip == NULL)
 		return (error);
 
 	if (uio->uio_resid != resid_before ||
 	    (error == 0 && resid_before != 0))
 		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 	return (error);
 }
 
 /*
  * Write wrapper for fifos.
  *
  * Performs the write through fifo_specops, then marks the inode modified
  * and changed when an inode is attached and either the residual count
  * changed or a non-empty request completed without error.
  */
 static int
 _xfsfifo_write(
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct uio *uio = ap->a_uio;
 	struct xfs_inode *ip;
 	int error, resid_before;
 
 	resid_before = uio->uio_resid;
 	error = fifo_specops.vop_write(ap);
 	ip = xfs_vtoi(VPTOXFSVP(ap->a_vp));
 
 	if (ip == NULL)
 		return (error);
 
 	if (uio->uio_resid != resid_before ||
 	    (error == 0 && resid_before != 0))
 		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	return (error);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  */
 static int
 _xfsfifo_close(
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Fall through to ufs kqfilter routines if needed
  */
 static int
 _xfsfifo_kqfilter(
 	struct vop_kqfilter_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap)
 {
 	int error;
 
 	error = fifo_specops.vop_kqfilter(ap);
 	if (error)
 		error = _xfs_kqfilter(ap);
 	return (error);
 }
 
 static int
 _xfs_getextattr(
 	struct vop_getextattr_args /* {
 		struct vnode *a_vp;
 		int a_attrnamespace;
 		const char *a_name;
 		struct uio *a_uio;
 		size_t *a_size;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	int error;
 	char *value;
 	int size;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
         if (error)
 		return (error);
 
 	size = ATTR_MAX_VALUELEN;
 	value = (char *)kmem_zalloc(size, KM_SLEEP);
 	if (value == NULL)
 		return (ENOMEM);
 
 	XVOP_ATTR_GET(VPTOXFSVP(ap->a_vp), ap->a_name, value, &size, 1,
 	    ap->a_cred, error);
 
 	if (ap->a_uio != NULL) {
 		if (ap->a_uio->uio_iov->iov_len < size)
 			error = ERANGE;
 		else
 			uiomove(value, size, ap->a_uio);
 	}
 
 	if (ap->a_size != NULL)
 		*ap->a_size = size;
 
 	kmem_free(value, ATTR_MAX_VALUELEN);
 	return (error);
 }		
 
 /*
  * Listextattr vnode operation.
  *
  * After a credential check, lists attribute names with XVOP_ATTR_LIST
  * directly into the first iovec of a_uio (or only sizes the list when no
  * buffer is supplied).  A negative result from XVOP_ATTR_LIST is taken as
  * the negated length of the name list.  The names are then rewritten in
  * place into length-prefixed form and the uio residual is reduced by the
  * list length.  When a_size is supplied it receives the list length.
  */
 static int
 _xfs_listextattr(
 	struct vop_listextattr_args /* {
 		struct vnode *a_vp;
 		int a_attrnamespace;
 		struct uio *a_uio;
 		size_t *a_size;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	int error;
 	char *buf = NULL;
 	int buf_len = 0;
 	attrlist_cursor_kern_t  cursor = { 0 };
 	int i;
 	char name_len;
 	int attrnames_len = 0;
 	int xfs_flags = ATTR_KERNAMELS;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	if (ap->a_attrnamespace & EXTATTR_NAMESPACE_USER)
 		xfs_flags |= ATTR_KERNORMALS;
 
 	if (ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM)
 		xfs_flags |= ATTR_KERNROOTLS;
 
 	if (ap->a_uio == NULL || ap->a_uio->uio_iov[0].iov_base == NULL) {
 		xfs_flags |= ATTR_KERNOVAL;
 		buf_len = 0;
 	} else {
 		/*
 		 * NOTE(review): iov_base is used as a directly addressable
 		 * buffer below -- confirm callers always pass a
 		 * kernel-space uio here.
 		 */
 		buf = ap->a_uio->uio_iov[0].iov_base;
 		buf_len = ap->a_uio->uio_iov[0].iov_len;
 	}
 
 	XVOP_ATTR_LIST(VPTOXFSVP(ap->a_vp), buf, buf_len, xfs_flags,
 		    &cursor, ap->a_cred, error);
 	if (error < 0) {
 		attrnames_len = -error;
 		error = 0;
 	}
 	/*
 	 * Nothing to convert when there is no buffer, the listing failed
 	 * or the list is empty; in particular do not store into a buffer
 	 * that may be zero-length.
 	 */
 	if (buf == NULL || error != 0 || attrnames_len == 0)
 		goto done;
 
 	/*
 	 * extattr_list expects a list of names.  Each list
 	 * entry consists of one byte for the name length, followed
 	 * by the name (not null terminated)
 	 */
 	name_len=0;
 	/* Walk backwards, shifting each byte up by one position. */
 	for(i=attrnames_len-1; i > 0 ; --i) {
 		buf[i] = buf[i-1];
 		if (buf[i])
 			++name_len;
 		else {
 			/* A terminator becomes the next name's length. */
 			buf[i] = name_len;
 			name_len = 0;
 		}
 	}
 	buf[0] = name_len;
 
 	if (ap->a_uio != NULL)
 		ap->a_uio->uio_resid -= attrnames_len;
 
 done:
 	if (ap->a_size != NULL)
 		*ap->a_size = attrnames_len;
 
 	return (error);
 }
 
 static int
 _xfs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	char *val;
 	size_t vallen;
 	int error, xfs_flags;
 
 	if (ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	if (ap->a_uio == NULL)
 		return (EINVAL);
 	vallen = ap->a_uio->uio_resid;
 	if (vallen > ATTR_MAX_VALUELEN)
 		return (EOVERFLOW);
 
 	if (ap->a_name[0] == '\0')
 		return (EINVAL);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error)
 		return (error);
 
 	xfs_flags = 0;
 	if (ap->a_attrnamespace & EXTATTR_NAMESPACE_USER)
 		xfs_flags |= ATTR_KERNORMALS;
 	if (ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM)
 		xfs_flags |= ATTR_KERNROOTLS;
 
 	val = (char *)kmem_zalloc(vallen, KM_SLEEP);
 	if (val == NULL)
 		return (ENOMEM);
 	error = uiomove(val, (int)vallen, ap->a_uio);
 	if (error)
 		goto err_out;
 
 	XVOP_ATTR_SET(VPTOXFSVP(ap->a_vp), ap->a_name, val, vallen, xfs_flags,
 	    ap->a_cred, error);
 err_out:
 	kmem_free(val, vallen);
 	return(error);
 }
 
 /*
  * Deleteextattr vnode operation.
  *
  * Rejects character devices (EOPNOTSUPP) and empty names (EINVAL).  After
  * a credential check the attribute is removed with XVOP_ATTR_REMOVE and
  * the error it sets is returned.
  */
 static int
 _xfs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	int error, attr_flags;
 
 	if (ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 	if (ap->a_name[0] == '\0')
 		return (EINVAL);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	/* Translate the namespace bits into XFS attribute flags. */
 	attr_flags =
 	    ((ap->a_attrnamespace & EXTATTR_NAMESPACE_USER) ?
 		ATTR_KERNORMALS : 0) |
 	    ((ap->a_attrnamespace & EXTATTR_NAMESPACE_SYSTEM) ?
 		ATTR_KERNROOTLS : 0);
 
 	XVOP_ATTR_REMOVE(VPTOXFSVP(ap->a_vp), ap->a_name, attr_flags,
 	    ap->a_cred, error);
 	return (error);
 }
 
 /*
  * Vptofh vnode operation (not implemented): logs its name and returns
  * ENOSYS.
  */
 static int
 _xfs_vptofh(struct vop_vptofh_args *ap)
 /*
 vop_vptofh {
 	IN struct vnode *a_vp;
 	IN struct fid *a_fhp;
 };
 */
 {
 
 	printf("xfs_vptofh");
 	return (ENOSYS);
 }
Index: head/sys/i386/ibcs2/ibcs2_misc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_misc.c	(revision 175201)
+++ head/sys/i386/ibcs2/ibcs2_misc.c	(revision 175202)
@@ -1,1246 +1,1246 @@
 /*-
  * Copyright (c) 1995 Steven Wallace
  * Copyright (c) 1994, 1995 Scott Bartram
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This software was developed by the Computer Systems Engineering group
  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
  * contributed to Berkeley.
  *
  * All advertising materials mentioning features or use of this software
  * must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Lawrence Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp 
  *
  *	@(#)sun_misc.c	8.1 (Berkeley) 6/18/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IBCS2 compatibility module.
  *
  * IBCS2 system calls that are implemented differently in BSD are
  * handled here.
  */
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>			/* Must come after sys/malloc.h */
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/time.h>
 #include <sys/times.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 
 #include <machine/cpu.h>
 
 #include <i386/ibcs2/ibcs2_dirent.h>
 #include <i386/ibcs2/ibcs2_signal.h>
 #include <i386/ibcs2/ibcs2_proto.h>
 #include <i386/ibcs2/ibcs2_unistd.h>
 #include <i386/ibcs2/ibcs2_util.h>
 #include <i386/ibcs2/ibcs2_utime.h>
 #include <i386/ibcs2/ibcs2_xenix.h>
 
 #include <security/mac/mac_framework.h>
 
 int
 ibcs2_ulimit(td, uap)
 	struct thread *td;
 	struct ibcs2_ulimit_args *uap;
 {
 	struct rlimit rl;
 	struct proc *p;
 	int error;
 #define IBCS2_GETFSIZE		1
 #define IBCS2_SETFSIZE		2
 #define IBCS2_GETPSIZE		3
 #define IBCS2_GETDTABLESIZE	4
 
 	p = td->td_proc;
 	switch (uap->cmd) {
 	case IBCS2_GETFSIZE:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
 		PROC_UNLOCK(p);
 		if (td->td_retval[0] == -1)
 			td->td_retval[0] = 0x7fffffff;
 		return 0;
 	case IBCS2_SETFSIZE:
 		PROC_LOCK(p);
 		rl.rlim_max = lim_max(p, RLIMIT_FSIZE);
 		PROC_UNLOCK(p);
 		rl.rlim_cur = uap->newlimit;
 		error = kern_setrlimit(td, RLIMIT_FSIZE, &rl);
 		if (!error) {
 			PROC_LOCK(p);
 			td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
 			PROC_UNLOCK(p);
 		} else {
 			DPRINTF(("failed "));
 		}
 		return error;
 	case IBCS2_GETPSIZE:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */
 		PROC_UNLOCK(p);
 		return 0;
 	case IBCS2_GETDTABLESIZE:
 		uap->cmd = IBCS2_SC_OPEN_MAX;
 		return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap);
 	default:
 		return ENOSYS;
 	}
 }
 
 #define IBCS2_WSTOPPED       0177
 #define IBCS2_STOPCODE(sig)  ((sig) << 8 | IBCS2_WSTOPPED)
 /*
  * iBCS2 wait(2)/waitpid(2).
  *
  * The two calls share one entry point: when the Z, PF, N and V flags are
  * all set in the saved eflags the call is treated as waitpid (pid, status
  * pointer, options), otherwise as wait on any child.  A stop or
  * termination signal in the status is translated through
  * bsd_to_ibcs2_sig before the status is stored in td_retval[1] and copied
  * out; a signal outside 1..IBCS2_SIGTBLSZ yields EINVAL.
  */
 int
 ibcs2_wait(td, uap)
 	struct thread *td;
 	struct ibcs2_wait_args *uap;
 {
 	struct trapframe *tf = td->td_frame;
 	int error, options, sig, status;
 	int *statusp;
 	pid_t pid;
 	int is_waitpid;
 
 	is_waitpid = ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V))
 	    == (PSL_Z|PSL_PF|PSL_N|PSL_V));
 	if (is_waitpid) {
 		pid = uap->a1;
 		statusp = (int *)uap->a2;
 		options = uap->a3;
 	} else {
 		pid = WAIT_ANY;
 		statusp = (int *)uap->a1;
 		options = 0;
 	}
 
 	error = kern_wait(td, pid, &status, options, NULL);
 	if (error)
 		return (error);
 	if (statusp == NULL)
 		return (0);
 
 	/* Convert a stop or termination signal; exit status is identical. */
 	if (WIFSTOPPED(status)) {
 		sig = WSTOPSIG(status);
 		if (sig <= 0 || sig > IBCS2_SIGTBLSZ)
 			return (EINVAL);
 		status = IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(sig)]);
 	} else if (WIFSIGNALED(status)) {
 		sig = WTERMSIG(status);
 		if (sig <= 0 || sig > IBCS2_SIGTBLSZ)
 			return (EINVAL);
 		status = bsd_to_ibcs2_sig[_SIG_IDX(sig)];
 	}
 
 	/* record result/status */
 	td->td_retval[1] = status;
 	return (copyout(&status, statusp, sizeof(status)));
 }
 
 /*
  * iBCS2 execv(2): resolves the path with CHECKALTEXIST, copies in the
  * argument vector (no environment) and executes the image.
  */
 int
 ibcs2_execv(td, uap)
 	struct thread *td;
 	struct ibcs2_execv_args *uap;
 {
 	struct image_args eargs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
 	/* The resolved path is no longer needed once the args are copied. */
 	free(path, M_TEMP);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &eargs, NULL));
 }
 
 /*
  * iBCS2 execve(2): resolves the path with CHECKALTEXIST, copies in the
  * argument and environment vectors and executes the image.
  */
 int
 ibcs2_execve(td, uap)
 	struct thread *td;
 	struct ibcs2_execve_args *uap;
 {
 	struct image_args eargs;
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
 	    uap->envp);
 	/* The resolved path is no longer needed once the args are copied. */
 	free(path, M_TEMP);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &eargs, NULL));
 }
 
 int
 ibcs2_umount(td, uap)
 	struct thread *td;
 	struct ibcs2_umount_args *uap;
 {
 	struct unmount_args um;
 
 	um.path = uap->name;
 	um.flags = 0;
 	return unmount(td, &um);
 }
 
 /*
  * iBCS2 mount(2).
  *
  * Unless "notyet" is defined this simply returns EINVAL.  The disabled
  * code translates the iBCS2 mount flags to MNT_* flags, maps the "4.2"
  * file system type to "ufs", rebuilds the argument block for "nfs", and
  * then calls the native mount().
  */
 int
 ibcs2_mount(td, uap)
 	struct thread *td;
 	struct ibcs2_mount_args *uap;
 {
 #ifdef notyet
 	int oflags = uap->flags, nflags, error;
 	char fsname[MFSNAMELEN];
 
 	/* Unsupported flag combinations. */
 	if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5))
 		return (EINVAL);
 	if ((oflags & IBCS2_MS_NEWTYPE) == 0)
 		return (EINVAL);
 	/* Translate iBCS2 flags to native mount flags. */
 	nflags = 0;
 	if (oflags & IBCS2_MS_RDONLY)
 		nflags |= MNT_RDONLY;
 	if (oflags & IBCS2_MS_NOSUID)
 		nflags |= MNT_NOSUID;
 	if (oflags & IBCS2_MS_REMOUNT)
 		nflags |= MNT_UPDATE;
 	uap->flags = nflags;
 
 	if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname,
 			      (u_int *)0))
 		return (error);
 
 	if (strcmp(fsname, "4.2") == 0) {
 		/* Replace the type name with "ufs" in user-visible space. */
 		uap->type = (caddr_t)STACK_ALLOC();
 		if (error = copyout("ufs", uap->type, sizeof("ufs")))
 			return (error);
 	} else if (strcmp(fsname, "nfs") == 0) {
 		struct ibcs2_nfs_args sna;
 		struct sockaddr_in sain;
 		struct nfs_args na;
 		struct sockaddr sa;
 
 		if (error = copyin(uap->data, &sna, sizeof sna))
 			return (error);
 		if (error = copyin(sna.addr, &sain, sizeof sain))
 			return (error);
 		bcopy(&sain, &sa, sizeof sa);
 		sa.sa_len = sizeof(sain);
 		/* The sockaddr is placed right after the nfs_args copy. */
 		uap->data = (caddr_t)STACK_ALLOC();
 		na.addr = (struct sockaddr *)((int)uap->data + sizeof na);
 		na.sotype = SOCK_DGRAM;
 		na.proto = IPPROTO_UDP;
 		na.fh = (nfsv2fh_t *)sna.fh;
 		na.flags = sna.flags;
 		na.wsize = sna.wsize;
 		na.rsize = sna.rsize;
 		na.timeo = sna.timeo;
 		na.retrans = sna.retrans;
 		na.hostname = sna.hostname;
 
 		if (error = copyout(&sa, na.addr, sizeof sa))
 			return (error);
 		if (error = copyout(&na, uap->data, sizeof na))
 			return (error);
 	}
 	return (mount(td, uap));
 #else
 	return EINVAL;
 #endif
 }
 
 /*
  * Read iBCS2-style directory entries.  We suck them into kernel space so
  * that they can be massaged before being copied out to user code.  Like
  * SunOS, we squish out `empty' entries.
  *
  * This is quite ugly, but what do you expect from compatibility code?
  *
  * The descriptor uap->fd must be open for reading (EBADF otherwise) and
  * refer to a directory (EINVAL otherwise).  Entries are read with
  * VOP_READDIR into a temporary buffer of between DIRBLKSIZ and MAXBSIZE
  * bytes, converted one at a time and copied to uap->buf.  On success
  * td_retval[0] is the number of bytes produced and the file offset is
  * advanced past the entries returned.
  */
 
 int
 ibcs2_getdents(td, uap)
 	struct thread *td;
 	register struct ibcs2_getdents_args *uap;
 {
 	register struct vnode *vp;
 	register caddr_t inp, buf;	/* BSD-format */
 	register int len, reclen;	/* BSD-format */
 	register caddr_t outp;		/* iBCS2-format */
 	register int resid;		/* iBCS2-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	struct ibcs2_dirent idb;
 	off_t off;			/* true file offset */
 	int buflen, error, eofflag, vfslocked;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies;
 #define	BSD_DIRENT(cp)		((struct dirent *)(cp))
 #define	IBCS2_RECLEN(reclen)	(reclen + sizeof(u_short))
 
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {	/* XXX  vnode readdir op should do this */
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	off = fp->f_offset;
 #define	DIRBLKSIZ	512		/* XXX we used to use ufs's DIRBLKSIZ */
 	/* Clamp the kernel buffer to [DIRBLKSIZ, MAXBSIZE]. */
 	buflen = max(DIRBLKSIZ, uap->nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 again:
 	/* (Re)build the uio describing the kernel buffer at offset off. */
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	/* Drop cookies left over from a previous pass. */
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	/*
 	 * First we read into the malloc'ed buffer, then
 	 * we massage it into user space, one record at a time.
 	 */
 	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0)
 		goto out;
 	inp = buf;
 	outp = uap->buf;
 	resid = uap->nbytes;
 	/* Nothing was read: report what has been produced (zero). */
 	if ((len = buflen - auio.uio_resid) <= 0)
 		goto eof;
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		/* Skip entries at or before the requested offset. */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			len -= BSD_DIRENT(inp)->d_reclen;
 			inp += BSD_DIRENT(inp)->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	for (; len > 0; len -= reclen) {
 		if (cookiep && ncookies == 0)
 			break;
 		reclen = BSD_DIRENT(inp)->d_reclen;
 		/* Record lengths must be a multiple of four. */
 		if (reclen & 3) {
 		        printf("ibcs2_getdents: reclen=%d\n", reclen);
 		        error = EFAULT;
 			goto out;
 		}
 		if (BSD_DIRENT(inp)->d_fileno == 0) {
 			inp += reclen;	/* it is a hole; squish it out */
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 			continue;
 		}
 		if (reclen > len || resid < IBCS2_RECLEN(reclen)) {
 			/* entry too big for buffer, so just stop */
 			/*
 			 * NOTE(review): bumping outp makes the
 			 * "outp == uap->buf" retry test below false --
 			 * presumably to avoid looping forever; confirm.
 			 */
 			outp++;
 			break;
 		}
 		/*
 		 * Massage in place to make an iBCS2-shaped dirent (otherwise
 		 * we have to worry about touching user memory outside of
 		 * the copyout() call).
 		 */
 		idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno;
 		idb.d_off = (ibcs2_off_t)off;
 		idb.d_reclen = (u_short)IBCS2_RECLEN(reclen);
 		/*
 		 * The first 10 bytes of idb are copied, then the name with
 		 * its terminating NUL (presumably 10 is the size of the
 		 * fixed iBCS2 dirent header -- TODO confirm).
 		 */
 		if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 ||
 		    (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10,
 				     BSD_DIRENT(inp)->d_namlen + 1)) != 0)
 			goto out;
 		/* advance past this real entry */
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 		inp += reclen;
 		/* advance output past iBCS2-shaped entry */
 		outp += IBCS2_RECLEN(reclen);
 		resid -= IBCS2_RECLEN(reclen);
 	}
 	/* if we squished out the whole block, try again */
 	if (outp == uap->buf)
 		goto again;
 	fp->f_offset = off;		/* update the vnode offset */
 eof:
 	td->td_retval[0] = uap->nbytes - resid;
 out:
 	/* Common exit: unlock, drop the file and free both buffers. */
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	if (cookies)
 		free(cookies, M_TEMP);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_read(td, uap)
 	struct thread *td;
 	struct ibcs2_read_args *uap;
 {
 	register struct vnode *vp;
 	register caddr_t inp, buf;	/* BSD-format */
 	register int len, reclen;	/* BSD-format */
 	register caddr_t outp;		/* iBCS2-format */
 	register int resid;		/* iBCS2-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	struct ibcs2_direct {
 		ibcs2_ino_t ino;
 		char name[14];
 	} idb;
 	off_t off;			/* true file offset */
 	int buflen, error, eofflag, size, vfslocked;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies;
 
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) {
 		if (error == EINVAL)
 			return read(td, (struct read_args *)uap);
 		else
 			return error;
 	}
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return read(td, (struct read_args *)uap);
 	}
 
 	off = fp->f_offset;
 
 	DPRINTF(("ibcs2_read: read directory\n"));
 
 	buflen = max(DIRBLKSIZ, uap->nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 again:
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	/*
 	 * First we read into the malloc'ed buffer, then
 	 * we massage it into user space, one record at a time.
 	 */
 	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) {
 		DPRINTF(("VOP_READDIR failed: %d\n", error));
 		goto out;
 	}
 	inp = buf;
 	outp = uap->buf;
 	resid = uap->nbytes;
 	if ((len = buflen - auio.uio_resid) <= 0)
 		goto eof;
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			len -= BSD_DIRENT(inp)->d_reclen;
 			inp += BSD_DIRENT(inp)->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	for (; len > 0 && resid > 0; len -= reclen) {
 		if (cookiep && ncookies == 0)
 			break;
 		reclen = BSD_DIRENT(inp)->d_reclen;
 		if (reclen & 3) {
 		        printf("ibcs2_read: reclen=%d\n", reclen);
 		        error = EFAULT;
 			goto out;
 		}
 		if (BSD_DIRENT(inp)->d_fileno == 0) {
 			inp += reclen;	/* it is a hole; squish it out */
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 			continue;
 		}
 		if (reclen > len || resid < sizeof(struct ibcs2_direct)) {
 			/* entry too big for buffer, so just stop */
 			outp++;
 			break;
 		}
 		/*
 		 * Massage in place to make an iBCS2-shaped dirent (otherwise
 		 * we have to worry about touching user memory outside of
 		 * the copyout() call).
 		 *
 		 * TODO: if length(filename) > 14, then break filename into
 		 * multiple entries and set inode = 0xffff except last
 		 */
 		idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 0xfffe :
 			BSD_DIRENT(inp)->d_fileno;
 		(void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size);
 		bzero(idb.name + size, 14 - size);
 		if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0)
 			goto out;
 		/* advance past this real entry */
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 		inp += reclen;
 		/* advance output past iBCS2-shaped entry */
 		outp += sizeof(struct ibcs2_direct);
 		resid -= sizeof(struct ibcs2_direct);
 	}
 	/* if we squished out the whole block, try again */
 	if (outp == uap->buf)
 		goto again;
 	fp->f_offset = off;		/* update the vnode offset */
 eof:
 	td->td_retval[0] = uap->nbytes - resid;
 out:
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	if (cookies)
 		free(cookies, M_TEMP);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_mknod(td, uap)
 	struct thread *td;
 	struct ibcs2_mknod_args *uap;
 {
 	char *path;
 	int error;
 
         CHECKALTCREAT(td, uap->path, &path);
 	if (S_ISFIFO(uap->mode))
 		error = kern_mkfifo(td, path, UIO_SYSSPACE, uap->mode);
 	else
 		error = kern_mknod(td, path, UIO_SYSSPACE, uap->mode, uap->dev);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_getgroups(td, uap)
 	struct thread *td;
 	struct ibcs2_getgroups_args *uap;
 {
 	ibcs2_gid_t iset[NGROUPS_MAX];
 	gid_t gp[NGROUPS_MAX];
 	u_int i, ngrp;
 	int error;
 
 	if (uap->gidsetsize < 0)
 		return (EINVAL);
 	ngrp = MIN(uap->gidsetsize, NGROUPS_MAX);
 	error = kern_getgroups(td, &ngrp, gp);
 	if (error)
 		return (error);
 	if (uap->gidsetsize > 0) {
 		for (i = 0; i < ngrp; i++)
 			iset[i] = (ibcs2_gid_t)gp[i];
 		error = copyout(iset, uap->gidset, ngrp * sizeof(ibcs2_gid_t));
 	}
 	if (error == 0)
 		td->td_retval[0] = ngrp;
 	return (error);
 }
 
 int
 ibcs2_setgroups(td, uap)
 	struct thread *td;
 	struct ibcs2_setgroups_args *uap;
 {
 	ibcs2_gid_t iset[NGROUPS_MAX];
 	gid_t gp[NGROUPS_MAX];
 	int error, i;
 
 	if (uap->gidsetsize < 0 || uap->gidsetsize > NGROUPS_MAX)
 		return (EINVAL);
 	if (uap->gidsetsize && uap->gidset) {
 		error = copyin(uap->gidset, iset, sizeof(ibcs2_gid_t) *
 		    uap->gidsetsize);
 		if (error)
 			return (error);
 		for (i = 0; i < uap->gidsetsize; i++)
 			gp[i] = (gid_t)iset[i];
 	}
 	return (kern_setgroups(td, uap->gidsetsize, gp));
 }
 
 int
 ibcs2_setuid(td, uap)
 	struct thread *td;
 	struct ibcs2_setuid_args *uap;
 {
 	struct setuid_args sa;
 
 	sa.uid = (uid_t)uap->uid;
 	return setuid(td, &sa);
 }
 
 int
 ibcs2_setgid(td, uap)
 	struct thread *td;
 	struct ibcs2_setgid_args *uap;
 {
 	struct setgid_args sa;
 
 	sa.gid = (gid_t)uap->gid;
 	return setgid(td, &sa);
 }
 
 int
 ibcs2_time(td, uap)
 	struct thread *td;
 	struct ibcs2_time_args *uap;
 {
 	struct timeval tv;
 
 	microtime(&tv);
 	td->td_retval[0] = tv.tv_sec;
 	if (uap->tp)
 		return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp,
 			       sizeof(ibcs2_time_t));
 	else
 		return 0;
 }
 
 int
 ibcs2_pathconf(td, uap)
 	struct thread *td;
 	struct ibcs2_pathconf_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	uap->name++;	/* iBCS2 _PC_* defines are offset by one */
 	error = kern_pathconf(td, path, UIO_SYSSPACE, uap->name);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_fpathconf(td, uap)
 	struct thread *td;
 	struct ibcs2_fpathconf_args *uap;
 {
 	uap->name++;	/* iBCS2 _PC_* defines are offset by one */
         return fpathconf(td, (struct fpathconf_args *)uap);
 }
 
 int
 ibcs2_sysconf(td, uap)
 	struct thread *td;
 	struct ibcs2_sysconf_args *uap;
 {
 	int mib[2], value, len, error;
 	struct proc *p;
 
 	p = td->td_proc;
 	switch(uap->name) {
 	case IBCS2_SC_ARG_MAX:
 		mib[1] = KERN_ARGMAX;
 		break;
 
 	case IBCS2_SC_CHILD_MAX:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC);
 		PROC_UNLOCK(p);
 		return 0;
 
 	case IBCS2_SC_CLK_TCK:
 		td->td_retval[0] = hz;
 		return 0;
 
 	case IBCS2_SC_NGROUPS_MAX:
 		mib[1] = KERN_NGROUPS;
 		break;
 
 	case IBCS2_SC_OPEN_MAX:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE);
 		PROC_UNLOCK(p);
 		return 0;
 		
 	case IBCS2_SC_JOB_CONTROL:
 		mib[1] = KERN_JOB_CONTROL;
 		break;
 		
 	case IBCS2_SC_SAVED_IDS:
 		mib[1] = KERN_SAVED_IDS;
 		break;
 		
 	case IBCS2_SC_VERSION:
 		mib[1] = KERN_POSIX1;
 		break;
 		
 	case IBCS2_SC_PASS_MAX:
 		td->td_retval[0] = 128;		/* XXX - should we create PASS_MAX ? */
 		return 0;
 
 	case IBCS2_SC_XOPEN_VERSION:
 		td->td_retval[0] = 2;		/* XXX: What should that be? */
 		return 0;
 		
 	default:
 		return EINVAL;
 	}
 
 	mib[0] = CTL_KERN;
 	len = sizeof(value);
 	error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL, 0);
 	if (error)
 		return error;
 	td->td_retval[0] = value;
 	return 0;
 }
 
 int
 ibcs2_alarm(td, uap)
 	struct thread *td;
 	struct ibcs2_alarm_args *uap;
 {
 	struct itimerval itv, oitv;
 	int error;
 
 	timevalclear(&itv.it_interval);
 	itv.it_value.tv_sec = uap->sec;
 	itv.it_value.tv_usec = 0;
 	error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
 	if (error)
 		return (error);
 	if (oitv.it_value.tv_usec != 0)
 		oitv.it_value.tv_sec++;
 	td->td_retval[0] = oitv.it_value.tv_sec;
 	return (0);
 }
 
 int
 ibcs2_times(td, uap)
 	struct thread *td;
 	struct ibcs2_times_args *uap;
 {
 	struct rusage ru;
 	struct timeval t;
 	struct tms tms;
 	int error;
 
 #define CONVTCK(r)      (r.tv_sec * hz + r.tv_usec / (1000000 / hz))
 
 	error = kern_getrusage(td, RUSAGE_SELF, &ru);
 	if (error)
 		return (error);
 	tms.tms_utime = CONVTCK(ru.ru_utime);
 	tms.tms_stime = CONVTCK(ru.ru_stime);
 
 	error = kern_getrusage(td, RUSAGE_CHILDREN, &ru);
 	if (error)
 		return (error);
 	tms.tms_cutime = CONVTCK(ru.ru_utime);
 	tms.tms_cstime = CONVTCK(ru.ru_stime);
 
 	microtime(&t);
 	td->td_retval[0] = CONVTCK(t);
 	
 	return (copyout(&tms, uap->tp, sizeof(struct tms)));
 }
 
 int
 ibcs2_stime(td, uap)
 	struct thread *td;
 	struct ibcs2_stime_args *uap;
 {
 	struct timeval tv;
 	long secs;
 	int error;
 
 	error = copyin(uap->timep, &secs, sizeof(long));
 	if (error)
 		return (error);
 	tv.tv_sec = secs;
 	tv.tv_usec = 0;
 	error = kern_settimeofday(td, &tv, NULL);
 	if (error)
 		error = EPERM;
 	return (error);
 }
 
 int
 ibcs2_utime(td, uap)
 	struct thread *td;
 	struct ibcs2_utime_args *uap;
 {
 	struct ibcs2_utimbuf ubuf;
 	struct timeval tbuf[2], *tp;
 	char *path;
 	int error;
 
 	if (uap->buf) {
 		error = copyin(uap->buf, &ubuf, sizeof(ubuf));
 		if (error)
 			return (error);
 		tbuf[0].tv_sec = ubuf.actime;
 		tbuf[0].tv_usec = 0;
 		tbuf[1].tv_sec = ubuf.modtime;
 		tbuf[1].tv_usec = 0;
 		tp = tbuf;
 	} else
 		tp = NULL;
 
         CHECKALTEXIST(td, uap->path, &path);
 	error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_nice(td, uap)
 	struct thread *td;
 	struct ibcs2_nice_args *uap;
 {
 	int error;
 	struct setpriority_args sa;
 
 	sa.which = PRIO_PROCESS;
 	sa.who = 0;
 	sa.prio = td->td_proc->p_nice + uap->incr;
 	if ((error = setpriority(td, &sa)) != 0)
 		return EPERM;
 	td->td_retval[0] = td->td_proc->p_nice;
 	return 0;
 }
 
 /*
  * iBCS2 getpgrp, setpgrp, setsid, and setpgid
  */
 
 int
 ibcs2_pgrpsys(td, uap)
 	struct thread *td;
 	struct ibcs2_pgrpsys_args *uap;
 {
 	struct proc *p = td->td_proc;
 	switch (uap->type) {
 	case 0:			/* getpgrp */
 		PROC_LOCK(p);
 		td->td_retval[0] = p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 
 	case 1:			/* setpgrp */
 	    {
 		struct setpgid_args sa;
 
 		sa.pid = 0;
 		sa.pgid = 0;
 		setpgid(td, &sa);
 		PROC_LOCK(p);
 		td->td_retval[0] = p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 	    }
 
 	case 2:			/* setpgid */
 	    {
 		struct setpgid_args sa;
 
 		sa.pid = uap->pid;
 		sa.pgid = uap->pgid;
 		return setpgid(td, &sa);
 	    }
 
 	case 3:			/* setsid */
 		return setsid(td, NULL);
 
 	default:
 		return EINVAL;
 	}
 }
 
 /*
  * XXX - need to check for nested calls
  */
 
 int
 ibcs2_plock(td, uap)
 	struct thread *td;
 	struct ibcs2_plock_args *uap;
 {
 	int error;
 #define IBCS2_UNLOCK	0
 #define IBCS2_PROCLOCK	1
 #define IBCS2_TEXTLOCK	2
 #define IBCS2_DATALOCK	4
 
 	
 	switch(uap->cmd) {
 	case IBCS2_UNLOCK:
         	error = priv_check(td, PRIV_VM_MUNLOCK);
 		if (error)
 			return (error);
 		/* XXX - TODO */
 		return (0);
 
 	case IBCS2_PROCLOCK:
 	case IBCS2_TEXTLOCK:
 	case IBCS2_DATALOCK:
         	error = priv_check(td, PRIV_VM_MLOCK);
 		if (error)
 			return (error);
 		/* XXX - TODO */
 		return 0;
 	}
 	return EINVAL;
 }
 
 int
 ibcs2_uadmin(td, uap)
 	struct thread *td;
 	struct ibcs2_uadmin_args *uap;
 {
 #define SCO_A_REBOOT        1
 #define SCO_A_SHUTDOWN      2
 #define SCO_A_REMOUNT       4
 #define SCO_A_CLOCK         8
 #define SCO_A_SETCONFIG     128
 #define SCO_A_GETDEV        130
 
 #define SCO_AD_HALT         0
 #define SCO_AD_BOOT         1
 #define SCO_AD_IBOOT        2
 #define SCO_AD_PWRDOWN      3
 #define SCO_AD_PWRNAP       4
 
 #define SCO_AD_PANICBOOT    1
 
 #define SCO_AD_GETBMAJ      0
 #define SCO_AD_GETCMAJ      1
 
 	switch(uap->cmd) {
 	case SCO_A_REBOOT:
 	case SCO_A_SHUTDOWN:
 		switch(uap->func) {
 			struct reboot_args r;
 		case SCO_AD_HALT:
 		case SCO_AD_PWRDOWN:
 		case SCO_AD_PWRNAP:
 			r.opt = RB_HALT;
 			return (reboot(td, &r));
 		case SCO_AD_BOOT:
 		case SCO_AD_IBOOT:
 			r.opt = RB_AUTOBOOT;
 			return (reboot(td, &r));
 		}
 		return EINVAL;
 	case SCO_A_REMOUNT:
 	case SCO_A_CLOCK:
 	case SCO_A_SETCONFIG:
 		return 0;
 	case SCO_A_GETDEV:
 		return EINVAL;	/* XXX - TODO */
 	}
 	return EINVAL;
 }
 
 int
 ibcs2_sysfs(td, uap)
 	struct thread *td;
 	struct ibcs2_sysfs_args *uap;
 {
 #define IBCS2_GETFSIND        1
 #define IBCS2_GETFSTYP        2
 #define IBCS2_GETNFSTYP       3
 
 	switch(uap->cmd) {
 	case IBCS2_GETFSIND:
 	case IBCS2_GETFSTYP:
 	case IBCS2_GETNFSTYP:
 		break;
 	}
 	return EINVAL;		/* XXX - TODO */
 }
 
 int
 ibcs2_unlink(td, uap)
 	struct thread *td;
 	struct ibcs2_unlink_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_unlink(td, path, UIO_SYSSPACE);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_chdir(td, uap)
 	struct thread *td;
 	struct ibcs2_chdir_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_chdir(td, path, UIO_SYSSPACE);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_chmod(td, uap)
 	struct thread *td;
 	struct ibcs2_chmod_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_chmod(td, path, UIO_SYSSPACE, uap->mode);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_chown(td, uap)
 	struct thread *td;
 	struct ibcs2_chown_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_chown(td, path, UIO_SYSSPACE, uap->uid, uap->gid);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_rmdir(td, uap)
 	struct thread *td;
 	struct ibcs2_rmdir_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_rmdir(td, path, UIO_SYSSPACE);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_mkdir(td, uap)
 	struct thread *td;
 	struct ibcs2_mkdir_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_mkdir(td, path, UIO_SYSSPACE, uap->mode);
 	free(path, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_symlink(td, uap)
 	struct thread *td;
 	struct ibcs2_symlink_args *uap;
 {
 	char *path, *link;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 
 	/*
 	 * Have to expand CHECKALTCREAT() so that 'path' can be freed on
 	 * errors.
 	 */
 	error = ibcs2_emul_find(td, uap->link, UIO_USERSPACE, &link, 1);
 	if (link == NULL) {
 		free(path, M_TEMP);
 		return (error);
 	}
 	error = kern_symlink(td, path, link, UIO_SYSSPACE);
 	free(path, M_TEMP);
 	free(link, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_rename(td, uap)
 	struct thread *td;
 	struct ibcs2_rename_args *uap;
 {
 	char *from, *to;
 	int error;
 
 	CHECKALTEXIST(td, uap->from, &from);
 
 	/*
 	 * Have to expand CHECKALTCREAT() so that 'from' can be freed on
 	 * errors.
 	 */
 	error = ibcs2_emul_find(td, uap->to, UIO_USERSPACE, &to, 1);
 	if (to == NULL) {
 		free(from, M_TEMP);
 		return (error);
 	}
 	error = kern_rename(td, from, to, UIO_SYSSPACE);
 	free(from, M_TEMP);
 	free(to, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_readlink(td, uap)
 	struct thread *td;
 	struct ibcs2_readlink_args *uap;
 {
 	char *path;
 	int error;
 
 	CHECKALTEXIST(td, uap->path, &path);
 	error = kern_readlink(td, path, UIO_SYSSPACE, uap->buf, UIO_USERSPACE,
 		uap->count);
 	free(path, M_TEMP);
 	return (error);
 }
Index: head/sys/i386/ibcs2/imgact_coff.c
===================================================================
--- head/sys/i386/ibcs2/imgact_coff.c	(revision 175201)
+++ head/sys/i386/ibcs2/imgact_coff.c	(revision 175202)
@@ -1,497 +1,497 @@
 /*-
  * Copyright (c) 1994 Sean Eric Fagan
  * Copyright (c) 1994 S�ren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <i386/ibcs2/coff.h>
 #include <i386/ibcs2/ibcs2_util.h>
 
 MODULE_DEPEND(coff, ibcs2, 1, 1, 1);
 
 extern struct sysentvec ibcs2_svr3_sysvec;
 
 static int coff_load_file(struct thread *td, char *name);
 static int exec_coff_imgact(struct image_params *imgp);
 
 static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot);
 
 static int
 load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset,
 		  caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
 {
 	size_t map_len;
 	vm_offset_t map_offset;
 	vm_offset_t map_addr;
 	int error;
 	unsigned char *data_buf = 0;
 	size_t copy_len;
 
 	map_offset = trunc_page(offset);
 	map_addr = trunc_page((vm_offset_t)vmaddr);
 
 	if (memsz > filsz) {
 		/*
 		 * We have the stupid situation that
 		 * the section is longer than it is on file,
 		 * which means it has zero-filled areas, and
 		 * we have to work for it.  Stupid iBCS!
 		 */
 		map_len = trunc_page(offset + filsz) - trunc_page(map_offset);
 	} else {
 		/*
 		 * The only stuff we care about is on disk, and we
 		 * don't care if we map in more than is really there.
 		 */
 		map_len = round_page(offset + filsz) - trunc_page(map_offset);
 	}
 
 	DPRINTF(("%s(%d):  vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, "
 		"VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, 0x%x)\n",
 		__FILE__, __LINE__, map_addr, map_len, prot, map_offset));
 
 	if ((error = vm_mmap(&vmspace->vm_map,
 			     &map_addr,
 			     map_len,
 			     prot,
 			     VM_PROT_ALL,
 			     MAP_PRIVATE | MAP_FIXED,
 			     OBJT_VNODE,
 			     vp,
 			     map_offset)) != 0)
 		return error;
 
 	if (memsz == filsz) {
 		/* We're done! */
 		return 0;
 	}
 
 	/*
 	 * Now we have screwball stuff, to accomodate stupid COFF.
 	 * We have to map the remaining bit of the file into the kernel's
 	 * memory map, allocate some anonymous memory, copy that last
 	 * bit into it, and then we're done. *sigh*
 	 * For clean-up reasons, we actally map in the file last.
 	 */
 
 	copy_len = (offset + filsz) - trunc_page(offset + filsz);
 	map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
 	map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
 
 	DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len));
 
 	if (map_len != 0) {
 		error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr,
 				    map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error)
 			return error;
 	}
 
 	if ((error = vm_mmap(kernel_map,
 			    (vm_offset_t *) &data_buf,
 			    PAGE_SIZE,
 			    VM_PROT_READ,
 			    VM_PROT_READ,
 			    0,
 			    OBJT_VNODE,
 			    vp,
 			    trunc_page(offset + filsz))) != 0)
 		return error;
 
 	error = copyout(data_buf, (caddr_t) map_addr, copy_len);
 
 	if (vm_map_remove(kernel_map,
 			  (vm_offset_t) data_buf,
 			  (vm_offset_t) data_buf + PAGE_SIZE))
 		panic("load_coff_section vm_map_remove failed");
 
 	return error;
 }
 
 static int
 coff_load_file(struct thread *td, char *name)
 {
 	struct proc *p = td->td_proc;
   	struct vmspace *vmspace = p->p_vmspace;
   	int error;
   	struct nameidata nd;
   	struct vnode *vp;
   	struct vattr attr;
   	struct filehdr *fhdr;
   	struct aouthdr *ahdr;
   	struct scnhdr *scns;
   	char *ptr = 0;
   	int nscns;
   	unsigned long text_offset = 0, text_address = 0, text_size = 0;
   	unsigned long data_offset = 0, data_address = 0, data_size = 0;
   	unsigned long bss_size = 0;
   	int i;
 
 	NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME,
 	    UIO_SYSSPACE, name, td);
 
   	error = namei(&nd);
   	if (error)
     		return error;
 
   	vp = nd.ni_vp;
   	if (vp == NULL)
     		return ENOEXEC;
 
   	if (vp->v_writecount) {
     		error = ETXTBSY;
     		goto fail;
   	}
 
   	if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0)
     		goto fail;
 
   	if ((vp->v_mount->mnt_flag & MNT_NOEXEC)
 	    || ((attr.va_mode & 0111) == 0)
 	    || (attr.va_type != VREG))
     		goto fail;
 
   	if (attr.va_size == 0) {
     		error = ENOEXEC;
     		goto fail;
   	}
 
   	if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0)
     		goto fail;
 
   	if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL)) != 0)
     		goto fail;
 
 	/*
 	 * Lose the lock on the vnode. It's no longer needed, and must not
 	 * exist for the pagefault paging to work below.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 
   	if ((error = vm_mmap(kernel_map,
 			    (vm_offset_t *) &ptr,
 			    PAGE_SIZE,
 			    VM_PROT_READ,
 		       	    VM_PROT_READ,
 			    0,
 			    OBJT_VNODE,
 			    vp,
 			    0)) != 0)
 		goto unlocked_fail;
 
   	fhdr = (struct filehdr *)ptr;
 
   	if (fhdr->f_magic != I386_COFF) {
     		error = ENOEXEC;
     		goto dealloc_and_fail;
   	}
 
   	nscns = fhdr->f_nscns;
 
   	if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
     		/*
      		 * XXX -- just fail.  I'm so lazy.
      		 */
     		error = ENOEXEC;
     		goto dealloc_and_fail;
   	}
 
   	ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr));
 
   	scns = (struct scnhdr*)(ptr + sizeof(struct filehdr)
 			  + sizeof(struct aouthdr));
 
   	for (i = 0; i < nscns; i++) {
     		if (scns[i].s_flags & STYP_NOLOAD)
       			continue;
     		else if (scns[i].s_flags & STYP_TEXT) {
       			text_address = scns[i].s_vaddr;
       			text_size = scns[i].s_size;
       			text_offset = scns[i].s_scnptr;
     		}
 		else if (scns[i].s_flags & STYP_DATA) {
       			data_address = scns[i].s_vaddr;
       			data_size = scns[i].s_size;
       			data_offset = scns[i].s_scnptr;
     		} else if (scns[i].s_flags & STYP_BSS) {
       			bss_size = scns[i].s_size;
     		}
   	}
 
   	if ((error = load_coff_section(vmspace, vp, text_offset,
 				      (caddr_t)(void *)(uintptr_t)text_address,
 				      text_size, text_size,
 				      VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
     		goto dealloc_and_fail;
   	}
   	if ((error = load_coff_section(vmspace, vp, data_offset,
 				      (caddr_t)(void *)(uintptr_t)data_address,
 				      data_size + bss_size, data_size,
 				      VM_PROT_ALL)) != 0) {
     		goto dealloc_and_fail;
   	}
 
   	error = 0;
 
  dealloc_and_fail:
 	if (vm_map_remove(kernel_map,
 			  (vm_offset_t) ptr,
 			  (vm_offset_t) ptr + PAGE_SIZE))
     		panic("%s vm_map_remove failed", __func__);
 
  fail:
 	VOP_UNLOCK(vp, 0, td);
  unlocked_fail:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
   	return error;
 }
 
 static int
 exec_coff_imgact(imgp)
 	struct image_params *imgp;
 {
 	const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header;
 	const struct aouthdr *ahdr;
 	const struct scnhdr *scns;
 	int i;
 	struct vmspace *vmspace;
 	int nscns;
 	int error;
 	unsigned long text_offset = 0, text_address = 0, text_size = 0;
 	unsigned long data_offset = 0, data_address = 0, data_size = 0;
 	unsigned long bss_size = 0;
 	caddr_t hole;
 	struct thread *td = curthread;
 
 	if (fhdr->f_magic != I386_COFF ||
 	    !(fhdr->f_flags & F_EXEC)) {
 
 		 DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
 		 return -1;
 	}
 
 	nscns = fhdr->f_nscns;
 	if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
 	  	/*
 	   	 * For now, return an error -- need to be able to
 	   	 * read in all of the section structures.
 	   	 */
 
 		DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
 		return -1;
 	}
 
 	ahdr = (const struct aouthdr*)
 	       ((const char*)(imgp->image_header) + sizeof(struct filehdr));
 	imgp->entry_addr = ahdr->entry;
 
 	scns = (const struct scnhdr*)
 	       ((const char*)(imgp->image_header) + sizeof(struct filehdr) +
 		sizeof(struct aouthdr));
 
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	error = exec_new_vmspace(imgp, &ibcs2_svr3_sysvec);
 	if (error)
 		goto fail;
 	vmspace = imgp->proc->p_vmspace;
 
 	for (i = 0; i < nscns; i++) {
 
 	  DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, "
 		   "scns[i].s_scnptr = %d\n", i, scns[i].s_name,
 		   scns[i].s_vaddr, scns[i].s_scnptr));
 	  if (scns[i].s_flags & STYP_NOLOAD) {
 	    	/*
 	     	 * A section that is not loaded, for whatever
 	     	 * reason.  It takes precedance over other flag
 	     	 * bits...
 	     	 */
 	    	continue;
 	  } else if (scns[i].s_flags & STYP_TEXT) {
 	    	text_address = scns[i].s_vaddr;
 	    	text_size = scns[i].s_size;
 	    	text_offset = scns[i].s_scnptr;
 	  } else if (scns[i].s_flags & STYP_DATA) {
 	    	/* .data section */
 	    	data_address = scns[i].s_vaddr;
 	    	data_size = scns[i].s_size;
 	    	data_offset = scns[i].s_scnptr;
 	  } else if (scns[i].s_flags & STYP_BSS) {
 	    	/* .bss section */
 	    	bss_size = scns[i].s_size;
 	  } else if (scns[i].s_flags & STYP_LIB) {
 	    	char *buf = 0;
 	    	int foff = trunc_page(scns[i].s_scnptr);
 	    	int off = scns[i].s_scnptr - foff;
 	    	int len = round_page(scns[i].s_size + PAGE_SIZE);
 	    	int j;
 
 	    	if ((error = vm_mmap(kernel_map,
 				    (vm_offset_t *) &buf,
 				    len,
 				    VM_PROT_READ,
 				    VM_PROT_READ,
 				    0,
 				    OBJT_VNODE,
 				    imgp->vp,
 				    foff)) != 0) {
 	      		error = ENOEXEC;
 			goto fail;
 	    	}
 		if(scns[i].s_size) {
 			char *libbuf;
 			int emul_path_len = strlen(ibcs2_emul_path);
 
 			libbuf = malloc(MAXPATHLEN + emul_path_len,
 					M_TEMP, M_WAITOK);
 			strcpy(libbuf, ibcs2_emul_path);
 
 		    	for (j = off; j < scns[i].s_size + off;) {
 				long stroff, nextoff;
 	      			char *libname;
 
 				nextoff = 4 * *(long *)(buf + j);
 				stroff = 4 * *(long *)(buf + j + sizeof(long));
 
 		      		libname = buf + j + stroff;
 		      		j += nextoff;
 
 				DPRINTF(("%s(%d):  shared library %s\n",
 					 __FILE__, __LINE__, libname));
 				strlcpy(&libbuf[emul_path_len], libname, MAXPATHLEN);
 /* XXXKSE only 1:1 in coff */  	error = coff_load_file(
 				    FIRST_THREAD_IN_PROC(imgp->proc), libbuf);
 		      		if (error)
 	      				error = coff_load_file(
 					    FIRST_THREAD_IN_PROC(imgp->proc),
 					    libname);
 		      		if (error)
 					break;
 		    	}
 			free(libbuf, M_TEMP);
 		}
 		if (vm_map_remove(kernel_map,
 				  (vm_offset_t) buf,
 				  (vm_offset_t) buf + len))
 	      		panic("exec_coff_imgact vm_map_remove failed");
 	    	if (error)
 	      		goto fail;
 	  	}
 	}
 	/*
 	 * Map in .text now
 	 */
 
 	DPRINTF(("%s(%d):  load_coff_section(vmspace, "
 		"imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n",
 		__FILE__, __LINE__, text_offset, text_address,
 		text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE));
 	if ((error = load_coff_section(vmspace, imgp->vp,
 				      text_offset,
 				      (caddr_t)(void *)(uintptr_t)text_address,
 				      text_size, text_size,
 				      VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
 		DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
 		goto fail;
        	}
 	/*
 	 * Map in .data and .bss now
 	 */
 
 
 	DPRINTF(("%s(%d): load_coff_section(vmspace, "
 		"imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n",
 		__FILE__, __LINE__, data_offset, data_address,
 		data_size + bss_size, data_size, VM_PROT_ALL));
 	if ((error = load_coff_section(vmspace, imgp->vp,
 				      data_offset,
 				      (caddr_t)(void *)(uintptr_t)data_address,
 				      data_size + bss_size, data_size,
 				      VM_PROT_ALL)) != 0) {
 
 		DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
 		goto fail;
 	}
 
 	imgp->interpreted = 0;
 	imgp->proc->p_sysent = &ibcs2_svr3_sysvec;
 
 	vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT;
 	vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address;
 	vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address;
 
 	hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize);
 
 
 	DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n",
 		__FILE__, __LINE__, hole));
         DPRINTF(("imgact: error = %d\n", error));
 
 	error = vm_map_find(&vmspace->vm_map, NULL, 0,
 			    (vm_offset_t *) &hole, PAGE_SIZE, FALSE,
 				VM_PROT_ALL, VM_PROT_ALL, 0);
 
 	DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n",
 		ctob(vmspace->vm_dsize), vmspace->vm_daddr,
 		ctob(vmspace->vm_dsize) + vmspace->vm_daddr ));
 	DPRINTF(("%s(%d):  returning successfully!\n", __FILE__, __LINE__));
 
 fail:
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 
 	return error;
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw coff_execsw = { exec_coff_imgact, "coff" };
 EXEC_SET(coff, coff_execsw);
Index: head/sys/i386/linux/imgact_linux.c
===================================================================
--- head/sys/i386/linux/imgact_linux.c	(revision 175201)
+++ head/sys/i386/linux/imgact_linux.c	(revision 175202)
@@ -1,247 +1,247 @@
 /*-
  * Copyright (c) 1994-1996 Søren Schmidt
  * All rights reserved.
  *
  * Based heavily on /sys/kern/imgact_aout.c which is:
  * Copyright (c) 1993, David Greenman
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <i386/linux/linux.h>
 
 static int	exec_linux_imgact(struct image_params *iparams);
 
 static int
 exec_linux_imgact(struct image_params *imgp)
 {
     const struct exec *a_out = (const struct exec *) imgp->image_header;
     struct vmspace *vmspace;
     vm_offset_t vmaddr;
     unsigned long virtual_offset, file_offset;
     vm_offset_t buffer;
     unsigned long bss_size;
     struct thread *td = curthread;
     int error;
 
     if (((a_out->a_magic >> 16) & 0xff) != 0x64)
 	return -1;
 
     /*
      * Set file/virtual offset based on a.out variant.
      */
     switch ((int)(a_out->a_magic & 0xffff)) {
     case 0413:
 	virtual_offset = 0;
 	file_offset = 1024;
 	break;
     case 0314:
 	virtual_offset = 4096;
 	file_offset = 0;
 	break;
     default:
 	return (-1);
     }
     bss_size = round_page(a_out->a_bss);
 #ifdef DEBUG
     printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n",
 	(u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
 #endif
 
     /*
      * Check various fields in header for validity/bounds.
      */
     if (a_out->a_entry < virtual_offset ||
 	a_out->a_entry >= virtual_offset + a_out->a_text ||
 	a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 	return (-1);
 
     /* text + data can't exceed file size */
     if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 	return (EFAULT);
     /*
      * text/data/bss must not exceed limits
      */
     PROC_LOCK(imgp->proc);
     if (a_out->a_text > maxtsiz ||
 	a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
 	PROC_UNLOCK(imgp->proc);
 	return (ENOMEM);
     }
     PROC_UNLOCK(imgp->proc);
 
     VOP_UNLOCK(imgp->vp, 0, td);
 
     /*
      * Destroy old process VM and create a new one (with a new stack)
      */
     error = exec_new_vmspace(imgp, &linux_sysvec);
     if (error)
 	    goto fail;
     vmspace = imgp->proc->p_vmspace;
 
     /*
      * Check if file_offset page aligned,.
      * Currently we cannot handle misaligned file offsets,
      * and so we read in the entire image (what a waste).
      */
     if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 	printf("imgact: Non page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data+bss read/write/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 			    a_out->a_text + a_out->a_data + bss_size, FALSE,
 			    VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error)
 	    goto fail;
 
 	error = vm_mmap(kernel_map, &buffer,
 			round_page(a_out->a_text + a_out->a_data + file_offset),
 			VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE,
 			imgp->vp, trunc_page(file_offset));
 	if (error)
 	    goto fail;
 
 	error = copyout((void *)(uintptr_t)(buffer + file_offset),
 			(void *)vmaddr, a_out->a_text + a_out->a_data);
 
 	vm_map_remove(kernel_map, buffer,
 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
 
 	if (error)
 	    goto fail;
 
 	/*
 	 * remove write enable on the 'text' part
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr,
 			       vmaddr + a_out->a_text,
 			       VM_PROT_EXECUTE|VM_PROT_READ,
 			       TRUE);
 	if (error)
 	    goto fail;
     }
     else {
 #ifdef DEBUG
 	printf("imgact: Page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data read/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_mmap(&vmspace->vm_map, &vmaddr,
 			a_out->a_text + a_out->a_data,
 			VM_PROT_READ | VM_PROT_EXECUTE,
 			VM_PROT_ALL,
 			MAP_PRIVATE | MAP_FIXED,
 			OBJT_VNODE,
 			imgp->vp, file_offset);
 	if (error)
 	    goto fail;
 
 #ifdef DEBUG
 	printf("imgact: startaddr=%08lx, length=%08lx\n",
 	    (u_long)vmaddr, (u_long)a_out->a_text + (u_long)a_out->a_data);
 #endif
 	/*
 	 * allow read/write of data
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr + a_out->a_text,
 			       vmaddr + a_out->a_text + a_out->a_data,
 			       VM_PROT_ALL,
 			       FALSE);
 	if (error)
 	    goto fail;
 
 	/*
 	 * Allocate anon demand-zeroed area for uninitialized data
 	 */
 	if (bss_size != 0) {
 	    vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
 	    error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 				bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 	    if (error)
 		goto fail;
 #ifdef DEBUG
 	    printf("imgact: bssaddr=%08lx, length=%08lx\n",
 		(u_long)vmaddr, bss_size);
 #endif
 
 	}
 	/* Indicate that this file should not be modified */
 	mp_fixme("Unlocked v_flag access");
 	imgp->vp->v_vflag |= VV_TEXT;
     }
     /* Fill in process VM information */
     vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
     vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
     vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset;
     vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)
 	(virtual_offset + a_out->a_text);
 
     /* Fill in image_params */
     imgp->interpreted = 0;
     imgp->entry_addr = a_out->a_entry;
 
     imgp->proc->p_sysent = &linux_sysvec;
 
 fail:
-    vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+    vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
     return (error);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" };
 EXEC_SET(linuxaout, linux_execsw);
Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c	(revision 175201)
+++ head/sys/kern/imgact_aout.c	(revision 175202)
@@ -1,274 +1,273 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <machine/frame.h>
 #include <machine/md_var.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 
 static int	exec_aout_imgact(struct image_params *imgp);
 static int	aout_fixup(register_t **stack_base, struct image_params *imgp);
 
 struct sysentvec aout_sysvec = {
 	SYS_MAXSYSCALL,
 	sysent,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	aout_fixup,
 	sendsig,
 	sigcode,
 	&szsigcode,
 	NULL,
 	"FreeBSD a.out",
 	NULL,
 	NULL,
 	MINSIGSTKSZ,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	VM_MAXUSER_ADDRESS,
 	USRSTACK,
 	PS_STRINGS,
 	VM_PROT_ALL,
 	exec_copyout_strings,
 	exec_setregs,
 	NULL
 };
 
 static int
 aout_fixup(stack_base, imgp)
 	register_t **stack_base;
 	struct image_params *imgp;
 {
 
 	return (suword(--(*stack_base), imgp->args->argc));
 }
 
 static int
 exec_aout_imgact(imgp)
 	struct image_params *imgp;
 {
 	const struct exec *a_out = (const struct exec *) imgp->image_header;
-	struct thread *td = curthread;
 	struct vmspace *vmspace;
 	vm_map_t map;
 	vm_object_t object;
 	vm_offset_t text_end, data_end;
 	unsigned long virtual_offset;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	int error;
 
 	/*
 	 * Linux and *BSD binaries look very much alike,
 	 * only the machine id is different:
 	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
 	 * NetBSD is in network byte order.. ugh.
 	 */
 	if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
 	    ((a_out->a_magic >> 16) & 0xff) != 0 &&
 	    ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
                 return -1;
 
 	/*
 	 * Set file/virtual offset based on a.out variant.
 	 *	We do two cases: host byte order and network byte order
 	 *	(for NetBSD compatibility)
 	 */
 	switch ((int)(a_out->a_magic & 0xffff)) {
 	case ZMAGIC:
 		virtual_offset = 0;
 		if (a_out->a_text) {
 			file_offset = PAGE_SIZE;
 		} else {
 			/* Bill's "screwball mode" */
 			file_offset = 0;
 		}
 		break;
 	case QMAGIC:
 		virtual_offset = PAGE_SIZE;
 		file_offset = 0;
 		/* Pass PS_STRINGS for BSD/OS binaries only. */
 		if (N_GETMID(*a_out) == MID_ZERO)
 			imgp->ps_strings = aout_sysvec.sv_psstrings;
 		break;
 	default:
 		/* NetBSD compatibility */
 		switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			virtual_offset = PAGE_SIZE;
 			file_offset = 0;
 			break;
 		default:
 			return (-1);
 		}
 	}
 
 	bss_size = roundup(a_out->a_bss, PAGE_SIZE);
 
 	/*
 	 * Check various fields in header for validity/bounds.
 	 */
 	if (/* entry point must lay with text region */
 	    a_out->a_entry < virtual_offset ||
 	    a_out->a_entry >= virtual_offset + a_out->a_text ||
 
 	    /* text and data size must each be page rounded */
 	    a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 		return (-1);
 
 	/* text + data can't exceed file size */
 	if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 		return (EFAULT);
 
 	/*
 	 * text/data/bss must not exceed limits
 	 */
 	PROC_LOCK(imgp->proc);
 	if (/* text can't exceed maximum text size */
 	    a_out->a_text > maxtsiz ||
 
 	    /* data + bss can't exceed rlimit */
 	    a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
 			PROC_UNLOCK(imgp->proc);
 			return (ENOMEM);
 	}
 	PROC_UNLOCK(imgp->proc);
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
-	VOP_UNLOCK(imgp->vp, 0, td);
+	VOP_UNLOCK(imgp->vp, 0, curthread);
 
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
 	error = exec_new_vmspace(imgp, &aout_sysvec);
 
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error)
 		return (error);
 
 	/*
 	 * The vm space can be changed by exec_new_vmspace
 	 */
 	vmspace = imgp->proc->p_vmspace;
 
 	object = imgp->object;
 	map = &vmspace->vm_map;
 	vm_map_lock(map);
 	vm_object_reference(object);
 
 	text_end = virtual_offset + a_out->a_text;
 	error = vm_map_insert(map, object,
 		file_offset,
 		virtual_offset, text_end,
 		VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
 		MAP_COPY_ON_WRITE | MAP_PREFAULT);
 	if (error) {
 		vm_map_unlock(map);
 		vm_object_deallocate(object);
 		return (error);
 	}
 	data_end = text_end + a_out->a_data;
 	if (a_out->a_data) {
 		vm_object_reference(object);
 		error = vm_map_insert(map, object,
 			file_offset + a_out->a_text,
 			text_end, data_end,
 			VM_PROT_ALL, VM_PROT_ALL,
 			MAP_COPY_ON_WRITE | MAP_PREFAULT);
 		if (error) {
 			vm_map_unlock(map);
 			vm_object_deallocate(object);
 			return (error);
 		}
 	}
 
 	if (bss_size) {
 		error = vm_map_insert(map, NULL, 0,
 			data_end, data_end + bss_size,
 			VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error) {
 			vm_map_unlock(map);
 			return (error);
 		}
 	}
 	vm_map_unlock(map);
 
 	/* Fill in process VM information */
 	vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
 	vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t)
 			    (virtual_offset + a_out->a_text);
 
 	/* Fill in image_params */
 	imgp->interpreted = 0;
 	imgp->entry_addr = a_out->a_entry;
 
 	imgp->proc->p_sysent = &aout_sysvec;
 
 	return (0);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
 EXEC_SET(aout, aout_execsw);
Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c	(revision 175201)
+++ head/sys/kern/imgact_elf.c	(revision 175202)
@@ -1,1356 +1,1356 @@
 /*-
  * Copyright (c) 2000 David O'Brien
  * Copyright (c) 1995-1996 Søren Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/elf.h>
 #include <machine/md_var.h>
 
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 #include <machine/fpu.h>
 #include <compat/ia32/ia32_reg.h>
 #endif
 
 #define OLD_EI_BRAND	8
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
     const char *interp);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry, size_t pagesize);
 static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object,
     vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
     vm_prot_t prot, size_t pagesize);
 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
 
 SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
     "");
 
 int __elfN(fallback_brand) = -1;
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
     fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
 TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
     &__elfN(fallback_brand));
 
 static int elf_trace = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, "");
 
 static int elf_legacy_coredump = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, 
     &elf_legacy_coredump, 0, "");
 
 static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
 
 #define	trunc_page_ps(va, ps)	((va) & ~(ps - 1))
 #define	round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
 #define	aligned(a, t)	(trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))
 
 int
 __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
 {
 	int i;
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		if (elf_brand_list[i] == NULL) {
 			elf_brand_list[i] = entry;
 			break;
 		}
 	}
 	if (i == MAX_BRANDS)
 		return (-1);
 	return (0);
 }
 
 int
 __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
 {
 	int i;
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		if (elf_brand_list[i] == entry) {
 			elf_brand_list[i] = NULL;
 			break;
 		}
 	}
 	if (i == MAX_BRANDS)
 		return (-1);
 	return (0);
 }
 
 int
 __elfN(brand_inuse)(Elf_Brandinfo *entry)
 {
 	struct proc *p;
 	int rval = FALSE;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_sysent == entry->sysvec) {
 			rval = TRUE;
 			break;
 		}
 	}
 	sx_sunlock(&allproc_lock);
 
 	return (rval);
 }
 
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
 {
 	Elf_Brandinfo *bi;
 	int i;
 
 	/*
 	 * We support three types of branding -- (1) the ELF EI_OSABI field
 	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
 	 * branding w/in the ELF header, and (3) path of the `interp_path'
 	 * field.  We should also look for an ".note.ABI-tag" ELF section now
 	 * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
 	 */
 
 	/* If the executable has a brand, search for it in the brand list. */
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi != NULL && hdr->e_machine == bi->machine &&
 		    (hdr->e_ident[EI_OSABI] == bi->brand ||
 		    strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
 		    bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
 			return (bi);
 	}
 
 	/* Lacking a known brand, search for a recognized interpreter. */
 	if (interp != NULL) {
 		for (i = 0; i < MAX_BRANDS; i++) {
 			bi = elf_brand_list[i];
 			if (bi != NULL && hdr->e_machine == bi->machine &&
 			    strcmp(interp, bi->interp_path) == 0)
 				return (bi);
 		}
 	}
 
 	/* Lacking a recognized interpreter, try the default brand */
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi != NULL && hdr->e_machine == bi->machine &&
 		    __elfN(fallback_brand) == bi->brand)
 			return (bi);
 	}
 	return (NULL);
 }
 
 static int
 __elfN(check_header)(const Elf_Ehdr *hdr)
 {
 	Elf_Brandinfo *bi;
 	int i;
 
 	if (!IS_ELF(*hdr) ||
 	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
 	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
 	    hdr->e_version != ELF_TARG_VER)
 		return (ENOEXEC);
 
 	/*
 	 * Make sure we have at least one brand for this machine.
 	 */
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi != NULL && bi->machine == hdr->e_machine)
 			break;
 	}
 	if (i == MAX_BRANDS)
 		return (ENOEXEC);
 
 	return (0);
 }
 
 static int
 __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot)
 {
 	struct sf_buf *sf;
 	int error;
 	vm_offset_t off;
 
 	/*
 	 * Create the page if it doesn't exist yet. Ignore errors.
 	 */
 	vm_map_lock(map);
 	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
 	    VM_PROT_ALL, VM_PROT_ALL, 0);
 	vm_map_unlock(map);
 
 	/*
 	 * Find the page from the underlying object.
 	 */
 	if (object) {
 		sf = vm_imgact_map_page(object, offset);
 		if (sf == NULL)
 			return (KERN_FAILURE);
 		off = offset - trunc_page(offset);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
 		    end - start);
 		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (KERN_FAILURE);
 		}
 	}
 
 	return (KERN_SUCCESS);
 }
 
 static int
 __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
 {
 	struct sf_buf *sf;
 	vm_offset_t off;
 	vm_size_t sz;
 	int error, rv;
 
 	if (start != trunc_page(start)) {
 		rv = __elfN(map_partial)(map, object, offset, start,
 		    round_page(start), prot);
 		if (rv)
 			return (rv);
 		offset += round_page(start) - start;
 		start = round_page(start);
 	}
 	if (end != round_page(end)) {
 		rv = __elfN(map_partial)(map, object, offset +
 		    trunc_page(end) - start, trunc_page(end), end, prot);
 		if (rv)
 			return (rv);
 		end = trunc_page(end);
 	}
 	if (end > start) {
 		if (offset & PAGE_MASK) {
 			/*
 			 * The mapping is not page aligned. This means we have
 			 * to copy the data. Sigh.
 			 */
 			rv = vm_map_find(map, NULL, 0, &start, end - start,
 			    FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
 			if (rv)
 				return (rv);
 			if (object == NULL)
 				return (KERN_SUCCESS);
 			for (; start < end; start += sz) {
 				sf = vm_imgact_map_page(object, offset);
 				if (sf == NULL)
 					return (KERN_FAILURE);
 				off = offset - trunc_page(offset);
 				sz = end - start;
 				if (sz > PAGE_SIZE - off)
 					sz = PAGE_SIZE - off;
 				error = copyout((caddr_t)sf_buf_kva(sf) + off,
 				    (caddr_t)start, sz);
 				vm_imgact_unmap_page(sf);
 				if (error) {
 					return (KERN_FAILURE);
 				}
 				offset += sz;
 			}
 			rv = KERN_SUCCESS;
 		} else {
 			vm_object_reference(object);
 			vm_map_lock(map);
 			rv = vm_map_insert(map, object, offset, start, end,
 			    prot, VM_PROT_ALL, cow);
 			vm_map_unlock(map);
 			if (rv != KERN_SUCCESS)
 				vm_object_deallocate(object);
 		}
 		return (rv);
 	} else {
 		return (KERN_SUCCESS);
 	}
 }
 
 static int
 __elfN(load_section)(struct vmspace *vmspace,
 	vm_object_t object, vm_offset_t offset,
 	caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
 	size_t pagesize)
 {
 	struct sf_buf *sf;
 	size_t map_len;
 	vm_offset_t map_addr;
 	int error, rv, cow;
 	size_t copy_len;
 	vm_offset_t file_addr;
 
 	/*
 	 * It's necessary to fail if the filsz + offset taken from the
 	 * header is greater than the actual file pager object's size.
 	 * If we were to allow this, then the vm_map_find() below would
 	 * walk right off the end of the file object and into the ether.
 	 *
 	 * While I'm here, might as well check for something else that
 	 * is invalid: filsz cannot be greater than memsz.
 	 */
 	if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
 	    filsz > memsz) {
 		uprintf("elf_load_section: truncated ELF file\n");
 		return (ENOEXEC);
 	}
 
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
 	file_addr = trunc_page_ps(offset, pagesize);
 
 	/*
 	 * We have two choices.  We can either clear the data in the last page
 	 * of an oversized mapping, or we can start the anon mapping a page
 	 * early and copy the initialized data into that first page.  We
 	 * choose the second..
 	 */
 	if (memsz > filsz)
 		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
 	else
 		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
 
 	if (map_len != 0) {
 		/* cow flags: don't dump readonly sections in core */
 		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
 		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
 
 		rv = __elfN(map_insert)(&vmspace->vm_map,
 				      object,
 				      file_addr,	/* file offset */
 				      map_addr,		/* virtual start */
 				      map_addr + map_len,/* virtual end */
 				      prot,
 				      cow);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 
 		/* we can stop now if we've covered it all */
 		if (memsz == filsz) {
 			return (0);
 		}
 	}
 
 
 	/*
 	 * We have to get the remaining bit of the file into the first part
 	 * of the oversized map segment.  This is normally because the .data
 	 * segment in the file is extended to provide bss.  It's a neat idea
 	 * to try and save a page, but it's a pain in the behind to implement.
 	 */
 	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
 	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
 	    map_addr;
 
 	/* This had damn well better be true! */
 	if (map_len != 0) {
 		rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
 		    map_addr + map_len, VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 			return (EINVAL);
 		}
 	}
 
 	if (copy_len != 0) {
 		vm_offset_t off;
 
 		sf = vm_imgact_map_page(object, offset + filsz);
 		if (sf == NULL)
 			return (EIO);
 
 		/* send the page fragment to user space */
 		off = trunc_page_ps(offset + filsz, pagesize) -
 		    trunc_page(offset + filsz);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off,
 		    (caddr_t)map_addr, copy_len);
 		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (error);
 		}
 	}
 
 	/*
 	 * set it to the specified protection.
 	 * XXX had better undo the damage from pasting over the cracks here!
 	 */
 	vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
 	    round_page(map_addr + map_len),  prot, FALSE);
 
 	return (0);
 }
 
 /*
  * Load the file "file" into memory.  It may be either a shared object
  * or an executable.
  *
  * The "addr" reference parameter is in/out.  On entry, it specifies
  * the address where a shared object should be loaded.  If the file is
  * an executable, this value is ignored.  On exit, "addr" specifies
  * where the file was actually loaded.
  *
  * The "entry" reference parameter is out only.  On exit, it specifies
  * the entry point for the loaded file.
  */
 static int
 __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
 	u_long *entry, size_t pagesize)
 {
 	struct {
 		struct nameidata nd;
 		struct vattr attr;
 		struct image_params image_params;
 	} *tempdata;
 	const Elf_Ehdr *hdr = NULL;
 	const Elf_Phdr *phdr = NULL;
 	struct nameidata *nd;
 	struct vmspace *vmspace = p->p_vmspace;
 	struct vattr *attr;
 	struct image_params *imgp;
 	vm_prot_t prot;
 	u_long rbase;
 	u_long base_addr = 0;
 	int vfslocked, error, i, numsegs;
 
 	if (curthread->td_proc != p)
 		panic("elf_load_file - thread");	/* XXXKSE DIAGNOSTIC */
 
 	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
 	nd = &tempdata->nd;
 	attr = &tempdata->attr;
 	imgp = &tempdata->image_params;
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->attr = attr;
 	imgp->firstpage = NULL;
 	imgp->image_header = NULL;
 	imgp->object = NULL;
 	imgp->execlabel = NULL;
 
 	/* XXXKSE */
 	NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file,
 	    curthread);
 	vfslocked = 0;
 	if ((error = namei(nd)) != 0) {
 		nd->ni_vp = NULL;
 		goto fail;
 	}
 	vfslocked = NDHASGIANT(nd);
 	NDFREE(nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd->ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto fail;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto fail;
 
 	/*
 	 * Also make certain that the interpreter stays the same, so set
 	 * its VV_TEXT flag, too.
 	 */
 	nd->ni_vp->v_vflag |= VV_TEXT;
 
 	imgp->object = nd->ni_vp->v_object;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 	if ((error = __elfN(check_header)(hdr)) != 0)
 		goto fail;
 	if (hdr->e_type == ET_DYN)
 		rbase = *addr;
 	else if (hdr->e_type == ET_EXEC)
 		rbase = 0;
 	else {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	/* Only support headers that fit within first page for now      */
 	/*    (multiplication of two Elf_Half fields will not overflow) */
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	if (!aligned(phdr, Elf_Addr)) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
   				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
   				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
   				prot |= VM_PROT_READ;
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    pagesize)) != 0)
 				goto fail;
 			/*
 			 * Establish the base address if this is the
 			 * first segment.
 			 */
 			if (numsegs == 0)
   				base_addr = trunc_page(phdr[i].p_vaddr +
 				    rbase);
 			numsegs++;
 		}
 	}
 	*addr = base_addr;
 	*entry = (unsigned long)hdr->e_entry + rbase;
 
 fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
 
 	if (nd->ni_vp)
 		vput(nd->ni_vp);
 
 	VFS_UNLOCK_GIANT(vfslocked);
 	free(tempdata, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Vendor name that the ELF image activator compares against the name
  * field of a PT_NOTE entry before reading the osreldate value from it.
  * sizeof() of this array (terminating NUL included) is what n_namesz is
  * checked against.
  */
 static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
 
 /*
  * ELF image activator.
  *
  * Validates the ELF header found in imgp->image_header, selects a brand,
  * replaces the process address space, maps every PT_LOAD segment, loads
  * the PT_INTERP interpreter when one is named, and leaves a freshly
  * allocated Elf_Auxargs table in imgp->auxargs.
  *
  * Returns -1 when the header is not a valid ET_EXEC/ET_DYN ELF header,
  * 0 on success, and an errno value for every failure detected after the
  * image has been recognised as ELF.
  */
 static int
 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	const Elf_Phdr *phdr, *pnote = NULL;
 	Elf_Auxargs *elf_auxargs;
 	struct vmspace *vmspace;
 	vm_prot_t prot;
 	u_long text_size = 0, data_size = 0, total_size = 0;
 	u_long text_addr = 0, data_addr = 0;
 	u_long seg_size, seg_addr;
 	u_long addr, entry = 0, proghdr = 0;
 	int error = 0, i;
 	const char *interp = NULL, *newinterp = NULL;
 	Elf_Brandinfo *brand_info;
 	const Elf_Note *note, *note_end;
 	char *path;
 	const char *note_name;
 	struct thread *td = curthread;
 	struct sysentvec *sv;
 
 	/*
 	 * Do we have a valid ELF header ?
 	 *
 	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
 	 * if particular brand doesn't support it.
 	 */
 	if (__elfN(check_header)(hdr) != 0 ||
 	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
 		return (-1);
 
 	/*
 	 * From here on down, we return an errno, not -1, as we've
 	 * detected an ELF file.
 	 */
 
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
 		/* Only support headers in first page for now */
 		return (ENOEXEC);
 	}
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	if (!aligned(phdr, Elf_Addr))
 		return (ENOEXEC);
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_INTERP) {
 			/*
 			 * Path to interpreter.  It must lie entirely inside
 			 * the header page.  The bound is tested by
 			 * subtraction so that a huge p_offset cannot wrap
 			 * the sum around and slip past the check.
 			 *
 			 * NOTE(review): nothing here verifies that the path
 			 * is NUL-terminated within p_filesz bytes, although
 			 * it is later used as a C string -- confirm whether
 			 * that needs to be enforced.
 			 */
 			if (phdr[i].p_filesz > MAXPATHLEN ||
 			    phdr[i].p_offset > PAGE_SIZE ||
 			    phdr[i].p_filesz > PAGE_SIZE - phdr[i].p_offset)
 				return (ENOEXEC);
 			interp = imgp->image_header + phdr[i].p_offset;
 			break;
 		}
 	}
 
 	brand_info = __elfN(get_brandinfo)(hdr, interp);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
 		return (ENOEXEC);
 	}
 	if (hdr->e_type == ET_DYN &&
 	    (brand_info->flags & BI_CAN_EXEC_DYN) == 0)
 		return (ENOEXEC);
 	sv = brand_info->sysvec;
 	if (interp != NULL && brand_info->interp_newpath != NULL)
 		newinterp = brand_info->interp_newpath;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	error = exec_new_vmspace(imgp, sv);
 	imgp->proc->p_sysent = sv;
 
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error)
 		return (error);
 
 	vmspace = imgp->proc->p_vmspace;
 
 	/* Walk the program headers, mapping segments and noting metadata. */
 	for (i = 0; i < hdr->e_phnum; i++) {
 		switch (phdr[i].p_type) {
 		case PT_LOAD:	/* Loadable segment */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
 				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
 				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
 				prot |= VM_PROT_READ;
 
 #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
 			/*
 			 * Some x86 binaries assume read == executable,
 			 * notably the M3 runtime and therefore cvsup
 			 */
 			if (prot & VM_PROT_READ)
 				prot |= VM_PROT_EXECUTE;
 #endif
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    sv->sv_pagesize)) != 0)
 				return (error);
 
 			/*
 			 * If this segment contains the program headers,
 			 * remember their virtual address for the AT_PHDR
 			 * aux entry. Static binaries don't usually include
 			 * a PT_PHDR entry.
 			 */
 			if (phdr[i].p_offset == 0 &&
 			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
 				<= phdr[i].p_filesz)
 				proghdr = phdr[i].p_vaddr + hdr->e_phoff;
 
 			seg_addr = trunc_page(phdr[i].p_vaddr);
 			seg_size = round_page(phdr[i].p_memsz +
 			    phdr[i].p_vaddr - seg_addr);
 
 			/*
 			 * Is this .text or .data?  We can't use
 			 * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
 			 * alpha terribly and possibly does other bad
 			 * things so we stick to the old way of figuring
 			 * it out:  If the segment contains the program
 			 * entry point, it's a text segment, otherwise it
 			 * is a data segment.
 			 *
 			 * Note that obreak() assumes that data_addr +
 			 * data_size == end of data load area, and the ELF
 			 * file format expects segments to be sorted by
 			 * address.  If multiple data segments exist, the
 			 * last one will be used.
 			 */
 			if (hdr->e_entry >= phdr[i].p_vaddr &&
 			    hdr->e_entry < (phdr[i].p_vaddr +
 			    phdr[i].p_memsz)) {
 				text_size = seg_size;
 				text_addr = seg_addr;
 				entry = (u_long)hdr->e_entry;
 			} else {
 				data_size = seg_size;
 				data_addr = seg_addr;
 			}
 			total_size += seg_size;
 			break;
 		case PT_PHDR: 	/* Program header table info */
 			proghdr = phdr[i].p_vaddr;
 			break;
 		case PT_NOTE:
 			pnote = &phdr[i];
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* No separate data segment was seen: let data alias text. */
 	if (data_addr == 0 && data_size == 0) {
 		data_addr = text_addr;
 		data_size = text_size;
 	}
 
 	/*
 	 * Check limits.  It should be safe to check the
 	 * limits after loading the segments since we do
 	 * not actually fault in all the segments pages.
 	 */
 	PROC_LOCK(imgp->proc);
 	if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
 	    text_size > maxtsiz ||
 	    total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) {
 		PROC_UNLOCK(imgp->proc);
 		return (ENOMEM);
 	}
 
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
 
 	/*
 	 * We load the dynamic linker where a userland call
 	 * to mmap(0, ...) would put it.  The rationale behind this
 	 * calculation is that it leaves room for the heap to grow to
 	 * its maximum allowed size.
 	 */
 	addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr +
 	    lim_max(imgp->proc, RLIMIT_DATA));
 	PROC_UNLOCK(imgp->proc);
 
 	imgp->entry_addr = entry;
 
 	if (interp != NULL) {
 		int have_interp = FALSE;
 		VOP_UNLOCK(imgp->vp, 0, td);
 		/*
 		 * Try, in order: the interpreter under the brand's
 		 * emulation path, the brand's replacement interpreter,
 		 * and finally the path named by the binary itself.
 		 */
 		if (brand_info->emul_path != NULL &&
 		    brand_info->emul_path[0] != '\0') {
 			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 			snprintf(path, MAXPATHLEN, "%s%s",
 			    brand_info->emul_path, interp);
 			error = __elfN(load_file)(imgp->proc, path, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 			free(path, M_TEMP);
 			if (error == 0)
 				have_interp = TRUE;
 		}
 		if (!have_interp && newinterp != NULL) {
 			error = __elfN(load_file)(imgp->proc, newinterp, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 			have_interp = TRUE;
 		}
 		if (!have_interp) {
 			error = __elfN(load_file)(imgp->proc, interp, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 		}
-		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0) {
 			uprintf("ELF interpreter %s not found\n", interp);
 			return (error);
 		}
 	}
 
 	/*
 	 * Construct auxargs table (used by the fixup routine)
 	 */
 	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
 	elf_auxargs->execfd = -1;
 	elf_auxargs->phdr = proghdr;
 	elf_auxargs->phent = hdr->e_phentsize;
 	elf_auxargs->phnum = hdr->e_phnum;
 	elf_auxargs->pagesz = PAGE_SIZE;
 	elf_auxargs->base = addr;
 	elf_auxargs->flags = 0;
 	elf_auxargs->entry = entry;
 	elf_auxargs->trace = elf_trace;
 
 	imgp->auxargs = elf_auxargs;
 	imgp->interpreted = 0;
 
 	/*
 	 * Try to fetch the osreldate for FreeBSD binary from the ELF
 	 * OSABI-note. Only the first page of the image is searched,
 	 * the same as for headers.
 	 */
 	if (pnote != NULL && pnote->p_offset < PAGE_SIZE &&
 	    pnote->p_offset + pnote->p_filesz < PAGE_SIZE ) {
 		note = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
 		if (!aligned(note, Elf32_Addr)) {
 			free(imgp->auxargs, M_TEMP);
 			imgp->auxargs = NULL;
 			return (ENOEXEC);
 		}
 		note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset +
 		    pnote->p_filesz);
 		while (note < note_end) {
 			if (note->n_namesz == sizeof(FREEBSD_ABI_VENDOR) &&
 			    note->n_descsz == sizeof(int32_t) &&
 			    note->n_type == 1 /* ABI_NOTETYPE */) {
 				note_name = (const char *)(note + 1);
 				if (strncmp(FREEBSD_ABI_VENDOR, note_name,
 				    sizeof(FREEBSD_ABI_VENDOR)) == 0) {
 					/* Descriptor follows the padded name. */
 					imgp->proc->p_osrel = *(const int32_t *)
 					    (note_name +
 					    round_page_ps(sizeof(FREEBSD_ABI_VENDOR),
 						sizeof(Elf32_Addr)));
 					break;
 				}
 			}
 			/* Step over this note's header, name and descriptor. */
 			note = (const Elf_Note *)((const char *)(note + 1) +
 			    round_page_ps(note->n_namesz, sizeof(Elf32_Addr)) +
 			    round_page_ps(note->n_descsz, sizeof(Elf32_Addr)));
 		}
 	}
 
 	return (error);
 }
 
 /* Select the suword variant whose suffix matches the ELF word size. */
 #define	suword __CONCAT(suword, __ELF_WORD_SIZE)
 
 /*
  * Finish the initial stack of a freshly activated ELF image.
  *
  * Emits the auxiliary vector entries recorded in imgp->auxargs at the
  * position computed from argc and envc, releases the auxargs table, and
  * then stores argc one word below the incoming *stack_base, which is
  * updated to point at that word.
  *
  * Always returns 0.
  */
 int
 __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
 {
 	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
 	Elf_Addr *base;
 	Elf_Addr *pos;
 
 	base = (Elf_Addr *)*stack_base;
 	/*
 	 * Presumably skips the argv and envp pointer vectors together
 	 * with their two NULL terminators -- confirm against the code
 	 * that lays out the strings.
 	 */
 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
 
 	/* Optional entries are emitted only when they carry information. */
 	if (args->trace) {
 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
 	}
 	if (args->execfd != -1) {
 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
 	}
 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
 	AUXARGS_ENTRY(pos, AT_NULL, 0);
 
 	/* The table was only needed to build the vector above. */
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 
 	/* Make room for, and store, argc just below the old stack base. */
 	base--;
 	suword(base, (long)imgp->args->argc);
 	*stack_base = (register_t *)base;
 	return (0);
 }
 
 /*
  * Code for generating ELF core dumps.
  */
 
 /*
  * Callback invoked by each_writable_segment() for every map entry that
  * is selected for dumping; the second argument is the caller's closure.
  */
 typedef void (*segment_callback)(vm_map_entry_t, void *);
 
 /* Closure for cb_put_phdr(). */
 struct phdr_closure {
 	Elf_Phdr *phdr;		/* Program header to fill in */
 	Elf_Off offset;		/* Offset of segment in core file */
 };
 
 /* Closure for cb_size_segment(). */
 struct sseg_closure {
 	int count;		/* Count of writable segments. */
 	size_t size;		/* Total size of all writable segments. */
 };
 
 /* Forward declarations for the core dump helpers defined below. */
 static void cb_put_phdr(vm_map_entry_t, void *);
 static void cb_size_segment(vm_map_entry_t, void *);
 static void each_writable_segment(struct thread *, segment_callback, void *);
 static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
     int, void *, size_t);
 static void __elfN(puthdr)(struct thread *, void *, size_t *, int);
 static void __elfN(putnote)(void *, size_t *, const char *, int,
     const void *, size_t);
 
 /*
  * Write an ELF core file for the process owning td to vp.
  *
  * The dump consists of the header area produced by __elfN(puthdr)()
  * followed by the contents of every segment selected by
  * each_writable_segment().
  *
  * Returns 0 on success, EFAULT when header plus segments would reach or
  * exceed limit, or the error reported by the underlying vnode writes.
  */
 int
 __elfN(coredump)(td, vp, limit)
 	struct thread *td;
 	struct vnode *vp;
 	off_t limit;
 {
 	struct ucred *cred = td->td_ucred;
 	int error = 0;
 	struct sseg_closure seginfo;
 	void *hdr;
 	size_t hdrsize;
 
 	/* Size the program segments. */
 	seginfo.count = 0;
 	seginfo.size = 0;
 	each_writable_segment(td, cb_size_segment, &seginfo);
 
 	/*
 	 * Calculate the size of the core file header area by making
 	 * a dry run of generating it.  Nothing is written, but the
 	 * size is calculated.
 	 */
 	hdrsize = 0;
 	__elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count);
 
 	if (hdrsize + seginfo.size >= limit)
 		return (EFAULT);
 
 	/*
 	 * Allocate memory for building the header, fill it up,
 	 * and write it out.  An M_WAITOK allocation sleeps until it
 	 * succeeds, so no NULL check is needed.
 	 */
 	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
 	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);
 
 	/* Write the contents of all of the writable segments. */
 	if (error == 0) {
 		Elf_Phdr *php;
 		off_t offset;
 		int i;
 
 		/*
 		 * Skip the ELF header and the first program header (the
 		 * note segment); the rest describe the memory segments.
 		 */
 		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
 		offset = hdrsize;
 		for (i = 0; i < seginfo.count; i++) {
 			error = vn_rdwr_inchunks(UIO_WRITE, vp,
 			    (caddr_t)(uintptr_t)php->p_vaddr,
 			    php->p_filesz, offset, UIO_USERSPACE,
 			    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 			    curthread); /* XXXKSE */
 			if (error != 0)
 				break;
 			offset += php->p_filesz;
 			php++;
 		}
 	}
 	free(hdr, M_TEMP);
 
 	return (error);
 }
 
 /*
  * A callback for each_writable_segment() to write out the segment's
  * program header entry.
  */
 static void
 cb_put_phdr(vm_map_entry_t entry, void *closure)
 {
 	struct phdr_closure *state = closure;
 	Elf_Phdr *ph;
 
 	/* Take the next free slot and leave the closure on its successor. */
 	ph = state->phdr++;
 
 	/* Segment data always starts on a page boundary in the core file. */
 	state->offset = round_page(state->offset);
 
 	ph->p_type = PT_LOAD;
 	ph->p_flags =
 	    ((entry->protection & VM_PROT_READ) ? PF_R : 0) |
 	    ((entry->protection & VM_PROT_WRITE) ? PF_W : 0) |
 	    ((entry->protection & VM_PROT_EXECUTE) ? PF_X : 0);
 	ph->p_align = PAGE_SIZE;
 	ph->p_paddr = 0;
 	ph->p_vaddr = entry->start;
 	ph->p_offset = state->offset;
 
 	/* The whole mapping is dumped, so file and memory sizes agree. */
 	ph->p_memsz = entry->end - entry->start;
 	ph->p_filesz = ph->p_memsz;
 
 	/* The next segment's data follows this one's. */
 	state->offset += ph->p_filesz;
 }
 
 /*
  * A callback for each_writable_segment() to gather information about
  * the number of segments and their total size.
  */
 static void
 cb_size_segment(vm_map_entry_t entry, void *closure)
 {
 	struct sseg_closure *totals = closure;
 
 	/* Account for one more segment spanning [start, end). */
 	totals->size += entry->end - entry->start;
 	totals->count++;
 }
 
 /*
  * For each writable segment in the process's memory map, call the given
  * function with a pointer to the map entry and some arbitrary
  * caller-supplied data.
  */
 /*
  * td      - a thread of the process whose map is walked
  * func    - callback invoked once per selected map entry
  * closure - opaque pointer handed through to func
  *
  * The map is read-locked for the duration of the walk.  Despite the
  * name, entries need only be writable in legacy coredump mode; otherwise
  * any entry with some access permission is eligible.
  */
 static void
 each_writable_segment(td, func, closure)
 	struct thread *td;
 	segment_callback func;
 	void *closure;
 {
 	struct proc *p = td->td_proc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, object;
 	boolean_t ignore_entry;
 
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		/*
 		 * Don't dump inaccessible mappings, deal with legacy
 		 * coredump mode.
 		 *
 		 * Note that read-only segments related to the elf binary
 		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
 		 * need to arbitrarily ignore such segments.
 		 */
 		if (elf_legacy_coredump) {
 			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
 				continue;
 		} else {
 			if ((entry->protection & VM_PROT_ALL) == 0)
 				continue;
 		}
 
 		/*
 		 * Don't include memory segment in the coredump if
 		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
 		 * madvise(2).  Do not dump submaps (i.e. parts of the
 		 * kernel map).
 		 */
 		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
 			continue;
 
 		/* Nothing is backing this entry, so there is nothing to dump. */
 		if ((object = entry->object.vm_object) == NULL)
 			continue;
 
 		/* Ignore memory-mapped devices and such things. */
 		VM_OBJECT_LOCK(object);
 		/*
 		 * Descend to the bottom of the backing chain, locking the
 		 * next object before releasing the current one.
 		 */
 		while ((backing_object = object->backing_object) != NULL) {
 			VM_OBJECT_LOCK(backing_object);
 			VM_OBJECT_UNLOCK(object);
 			object = backing_object;
 		}
 		/* Only default, swap and vnode objects are dumped. */
 		ignore_entry = object->type != OBJT_DEFAULT &&
 		    object->type != OBJT_SWAP && object->type != OBJT_VNODE;
 		VM_OBJECT_UNLOCK(object);
 		if (ignore_entry)
 			continue;
 
 		(*func)(entry, closure);
 	}
 	vm_map_unlock_read(map);
 }
 
 /*
  * Write the core file header to the file, including padding up to
  * the page boundary.
  */
 static int
 __elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred,
     int numsegs, void *hdr, size_t hdrsize)
 {
 	size_t filled = 0;
 	int rv;
 
 	/*
 	 * Start from an all-zero buffer so that the padding up to hdrsize
 	 * is well defined, then let puthdr lay out the real contents.
 	 */
 	bzero(hdr, hdrsize);
 	__elfN(puthdr)(td, hdr, &filled, numsegs);
 
 	/* The header area goes at the very beginning of the core file. */
 	rv = vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
 	    UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 	    td); /* XXXKSE */
 	return (rv);
 }
 
 /*
  * Note payload types used by __elfN(puthdr)().  The 32-bit structure
  * variants are selected when this file is built for 32-bit ELF with
  * COMPAT_IA32 defined; otherwise the native types are used.
  */
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 typedef struct prstatus32 elf_prstatus_t;
 typedef struct prpsinfo32 elf_prpsinfo_t;
 typedef struct fpreg32 elf_prfpregset_t;
 typedef struct fpreg32 elf_fpregset_t;
 typedef struct reg32 elf_gregset_t;
 #else
 typedef prstatus_t elf_prstatus_t;
 typedef prpsinfo_t elf_prpsinfo_t;
 typedef prfpregset_t elf_prfpregset_t;
 typedef prfpregset_t elf_fpregset_t;
 typedef gregset_t elf_gregset_t;
 #endif
 
 /*
  * Lay out the core file header area: the ELF header, numsegs + 1
  * program headers (one PT_NOTE entry plus one per memory segment) and
  * the notes themselves.
  *
  * td      - thread that triggered the dump; its notes are emitted first
  * dst     - buffer to fill, or NULL to only compute the required size
  * off     - in/out running offset; on return it has been advanced past
  *           the header area and rounded up to a page boundary
  * numsegs - number of memory segments that will be described
  */
 static void
 __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs)
 {
 	struct {
 		elf_prstatus_t status;
 		elf_prfpregset_t fpregset;
 		elf_prpsinfo_t psinfo;
 	} *tempdata;
 	elf_prstatus_t *status;
 	elf_prfpregset_t *fpregset;
 	elf_prpsinfo_t *psinfo;
 	struct proc *p;
 	struct thread *thr;
 	size_t ehoff, noteoff, notesz, phoff;
 
 	p = td->td_proc;
 
 	/* Reserve room for the ELF header ... */
 	ehoff = *off;
 	*off += sizeof(Elf_Ehdr);
 
 	/* ... and for the program headers (note segment + numsegs). */
 	phoff = *off;
 	*off += (numsegs + 1) * sizeof(Elf_Phdr);
 
 	noteoff = *off;
 	/*
 	 * Don't allocate space for the notes if we're just calculating
 	 * the size of the header. We also don't collect the data.
 	 */
 	if (dst != NULL) {
 		tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK);
 		status = &tempdata->status;
 		fpregset = &tempdata->fpregset;
 		psinfo = &tempdata->psinfo;
 	} else {
 		tempdata = NULL;
 		status = NULL;
 		fpregset = NULL;
 		psinfo = NULL;
 	}
 
 	if (dst != NULL) {
 		psinfo->pr_version = PRPSINFO_VERSION;
 		psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
 		strlcpy(psinfo->pr_fname, td->td_name, sizeof(psinfo->pr_fname));
 		/*
 		 * XXX - We don't fill in the command line arguments properly
 		 * yet.
 		 */
 		strlcpy(psinfo->pr_psargs, td->td_name,
 		    sizeof(psinfo->pr_psargs));
 	}
 	/* With dst == NULL putnote only advances *off by the note's size. */
 	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
 	    sizeof *psinfo);
 
 	/*
 	 * To have the debugger select the right thread (LWP) as the initial
 	 * thread, we dump the state of the thread passed to us in td first.
 	 * This is the thread that causes the core dump and thus likely to
 	 * be the right thread one wants to have selected in the debugger.
 	 */
 	thr = td;
 	while (thr != NULL) {
 		if (dst != NULL) {
 			status->pr_version = PRSTATUS_VERSION;
 			status->pr_statussz = sizeof(elf_prstatus_t);
 			status->pr_gregsetsz = sizeof(elf_gregset_t);
 			status->pr_fpregsetsz = sizeof(elf_fpregset_t);
 			status->pr_osreldate = osreldate;
 			status->pr_cursig = p->p_sig;
 			status->pr_pid = thr->td_tid;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 			fill_regs32(thr, &status->pr_reg);
 			fill_fpregs32(thr, fpregset);
 #else
 			fill_regs(thr, &status->pr_reg);
 			fill_fpregs(thr, fpregset);
 #endif
 		}
 		__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
 		    sizeof *status);
 		__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
 		    sizeof *fpregset);
 		/*
 		 * Allow for MD specific notes, as well as any MD
 		 * specific preparations for writing MI notes.
 		 */
 		__elfN(dump_thread)(thr, dst, off);
 
 		/*
 		 * After td itself, walk the process thread list from its
 		 * head, stepping over td when it comes up again.
 		 */
 		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
 		    TAILQ_NEXT(thr, td_plist);
 		if (thr == td)
 			thr = TAILQ_NEXT(thr, td_plist);
 	}
 
 	notesz = *off - noteoff;
 
 	if (dst != NULL)
 		free(tempdata, M_TEMP);
 
 	/* Align up to a page boundary for the program segments. */
 	*off = round_page(*off);
 
 	if (dst != NULL) {
 		Elf_Ehdr *ehdr;
 		Elf_Phdr *phdr;
 		struct phdr_closure phc;
 
 		/*
 		 * Fill in the ELF header.
 		 */
 		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
 		ehdr->e_ident[EI_MAG0] = ELFMAG0;
 		ehdr->e_ident[EI_MAG1] = ELFMAG1;
 		ehdr->e_ident[EI_MAG2] = ELFMAG2;
 		ehdr->e_ident[EI_MAG3] = ELFMAG3;
 		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
 		ehdr->e_ident[EI_DATA] = ELF_DATA;
 		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
 		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
 		ehdr->e_ident[EI_ABIVERSION] = 0;
 		ehdr->e_ident[EI_PAD] = 0;
 		ehdr->e_type = ET_CORE;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 		ehdr->e_machine = EM_386;
 #else
 		ehdr->e_machine = ELF_ARCH;
 #endif
 		ehdr->e_version = EV_CURRENT;
 		ehdr->e_entry = 0;
 		ehdr->e_phoff = phoff;
 		ehdr->e_flags = 0;
 		ehdr->e_ehsize = sizeof(Elf_Ehdr);
 		ehdr->e_phentsize = sizeof(Elf_Phdr);
 		ehdr->e_phnum = numsegs + 1;
 		ehdr->e_shentsize = sizeof(Elf_Shdr);
 		ehdr->e_shnum = 0;
 		ehdr->e_shstrndx = SHN_UNDEF;
 
 		/*
 		 * Fill in the program header entries.
 		 */
 		phdr = (Elf_Phdr *)((char *)dst + phoff);
 
 		/* The note segment. */
 		phdr->p_type = PT_NOTE;
 		phdr->p_offset = noteoff;
 		phdr->p_vaddr = 0;
 		phdr->p_paddr = 0;
 		phdr->p_filesz = notesz;
 		phdr->p_memsz = 0;
 		phdr->p_flags = 0;
 		phdr->p_align = 0;
 		phdr++;
 
 		/* All the writable segments from the program. */
 		phc.phdr = phdr;
 		phc.offset = *off;
 		each_writable_segment(td, cb_put_phdr, &phc);
 	}
 }
 
 /*
  * Append one ELF note (header, name, descriptor) at offset *off of dst
  * and advance *off past it.  Name and descriptor are each padded to a
  * multiple of sizeof(Elf_Size).  With dst == NULL nothing is copied and
  * only *off is advanced, which lets callers measure the space needed.
  */
 static void
 __elfN(putnote)(void *dst, size_t *off, const char *name, int type,
     const void *desc, size_t descsz)
 {
 	Elf_Note hdr;
 	char *out = dst;
 	size_t pos = *off;
 
 	hdr.n_namesz = strlen(name) + 1;	/* the NUL is part of the name */
 	hdr.n_descsz = descsz;
 	hdr.n_type = type;
 
 	/* Note header. */
 	if (out != NULL)
 		bcopy(&hdr, out + pos, sizeof hdr);
 	pos += sizeof hdr;
 
 	/* Name, followed by padding. */
 	if (out != NULL)
 		bcopy(name, out + pos, hdr.n_namesz);
 	pos += roundup2(hdr.n_namesz, sizeof(Elf_Size));
 
 	/* Descriptor, followed by padding. */
 	if (out != NULL)
 		bcopy(desc, out + pos, hdr.n_descsz);
 	pos += roundup2(hdr.n_descsz, sizeof(Elf_Size));
 
 	*off = pos;
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 /*
  * Activator descriptor: the image activator entry point and a name
  * string built from "ELF" and the word size this file is compiled for.
  */
 static struct execsw __elfN(execsw) = {
 	__CONCAT(exec_, __elfN(imgact)),
 	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
 };
 EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
Index: head/sys/kern/imgact_gzip.c
===================================================================
--- head/sys/kern/imgact_gzip.c	(revision 175201)
+++ head/sys/kern/imgact_gzip.c	(revision 175202)
@@ -1,403 +1,403 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  */
 
 /*
  * This module handles execution of a.out files which have been run through
  * "gzip".  This saves diskspace, but wastes cpu-cycles and VM.
  *
  * TODO:
  *	text-segments should be made R/O after being filled
  *	is the vm-stuff safe ?
  * 	should handle the entire header of gzip'ed stuff.
  *	inflate isn't quite reentrant yet...
  *	error-handling is a mess...
  *	so is the rest...
  *	tidy up unnecessary includes
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/inflate.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 /*
  * State shared between the gzip image activator and the input/output
  * callbacks it hands to inflate().
  */
 struct imgact_gzip {
 	struct image_params *ip;	/* image being activated */
 	struct exec     a_out;		/* a.out header gathered from the output */
 	int             error;		/* error recorded by a callback, 0 if none */
 	int		gotheader;	/* set once a_out has been filled completely */
 	int             where;		/* __LINE__ of the failure site, for diagnostics */
 	u_char         *inbuf;		/* currently mapped page of the input file */
 	u_long          offset;		/* file offset of the page mapped at inbuf */
 	u_long          output;		/* count of decompressed bytes seen so far */
 	u_long          len;		/* size of the compressed file */
 	int             idx;		/* read position within the compressed file */
 	/* Layout values derived from the a.out header by do_aout_hdr(). */
 	u_long          virtual_offset, file_offset, file_end, bss_size;
 };
 
 /* Forward declarations; NextByte and Flush are the inflate() callbacks. */
 static int exec_gzip_imgact(struct image_params *imgp);
 static int NextByte(void *vp);
 static int do_aout_hdr(struct imgact_gzip *);
 static int Flush(void *vp, u_char *, u_long siz);
 
 /*
  * Image activator for gzip-compressed a.out executables.
  *
  * Checks the gzip member header held in imgp->image_header, skips the
  * optional file name and comment fields, and runs inflate() with
  * NextByte() as the input source and Flush() as the output sink.
  *
  * Returns -1 when the image does not carry the expected gzip magic,
  * 0 on success, and an errno value otherwise.
  */
 static int
 exec_gzip_imgact(imgp)
 	struct image_params *imgp;
 {
 	int             error, error2 = 0;
 	const u_char   *p = (const u_char *) imgp->image_header;
 	struct imgact_gzip igz;
 	struct inflate  infl;
 	struct vmspace *vmspace;
 
 	/* If these four are not OK, it isn't a gzip file */
 	if (p[0] != 0x1f)
 		return -1;	/* 0    Simply magic	 */
 	if (p[1] != 0x8b)
 		return -1;	/* 1    Simply magic	 */
 	if (p[2] != 0x08)
 		return -1;	/* 2    Compression method	 */
 	if (p[9] != 0x03)
 		return -1;	/* 9    OS compressed on	 */
 
 	/*
 	 * If this one contains anything but a comment or a filename marker,
 	 * we don't want to chew on it
 	 */
 	if (p[3] & ~(0x18))
 		return ENOEXEC;	/* 3    Flags		 */
 
 	/* These are of no use to us */
 	/* 4-7  Timestamp		 */
 	/* 8    Extra flags		 */
 
 	bzero(&igz, sizeof igz);
 	bzero(&infl, sizeof infl);
 	infl.gz_private = (void *) &igz;
 	infl.gz_input = NextByte;
 	infl.gz_output = Flush;
 
 	igz.ip = imgp;
 	igz.idx = 10;
 
 	/*
 	 * The optional strings are scanned in the header page only.  The
 	 * bound is tested before every read, so a file name that ends on
 	 * the last byte of the page cannot make the comment scan read
 	 * past the page.
 	 */
 	if (p[3] & 0x08) {	/* skip a filename */
 		do {
 			if (igz.idx >= PAGE_SIZE)
 				return ENOEXEC;
 		} while (p[igz.idx++]);
 	}
 	if (p[3] & 0x10) {	/* skip a comment */
 		do {
 			if (igz.idx >= PAGE_SIZE)
 				return ENOEXEC;
 		} while (p[igz.idx++]);
 	}
 	igz.len = imgp->attr->va_size;
 
 	error = inflate(&infl);
 
 	/*
 	 * Release the input page mapped by NextByte() before any return
 	 * below, so that the early exit does not leak the kernel mapping.
 	 */
 	if (igz.inbuf) {
 		error2 =
 			vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
 			    (vm_offset_t) igz.inbuf + PAGE_SIZE);
 	}
 
 	/*
 	 * The unzipped file may not even have been long enough to contain
 	 * a header giving Flush() a chance to return error.  Check for this.
 	 */
 	if ( !igz.gotheader )
 		return ENOEXEC;
 
 	/* Text was mapped writable so it could be filled; lock it down. */
 	if ( !error ) {
 		vmspace = imgp->proc->p_vmspace;
 		error = vm_map_protect(&vmspace->vm_map,
 			(vm_offset_t) vmspace->vm_taddr,
 			(vm_offset_t) (vmspace->vm_taddr +
 				      (vmspace->vm_tsize << PAGE_SHIFT)) ,
 			VM_PROT_READ|VM_PROT_EXECUTE,0);
 	}
 
 	if (igz.error || error || error2) {
 		printf("Output=%lu ", igz.output);
 		printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
 		       error, igz.error, error2, igz.where);
 	}
 	/* Errors recorded by the callbacks take precedence. */
 	if (igz.error)
 		return igz.error;
 	if (error)
 		return ENOEXEC;
 	if (error2)
 		return error2;
 	return 0;
 }
 
 /*
  * Act on the a.out header collected in gz->a_out: validate it, replace
  * the process address space, map the text+data and bss regions, and
  * fill in the vmspace and image_params fields.
  *
  * Returns 0 on success, -1 when the header is not acceptable, or an
  * errno value.  On every failure gz->where is set to the source line
  * that rejected the image.
  */
 static int
 do_aout_hdr(struct imgact_gzip * gz)
 {
 	int             error;
 	struct thread  *td = curthread;
 	struct vmspace *vmspace;
 	vm_offset_t     vmaddr;
 
 	/*
 	 * Set file/virtual offset based on a.out variant. We do two cases:
 	 * host byte order and network byte order (for NetBSD compatibility)
 	 */
 	switch ((int) (gz->a_out.a_magic & 0xffff)) {
 	case ZMAGIC:
 		gz->virtual_offset = 0;
 		if (gz->a_out.a_text) {
 			gz->file_offset = PAGE_SIZE;
 		} else {
 			/* Bill's "screwball mode" */
 			gz->file_offset = 0;
 		}
 		break;
 	case QMAGIC:
 		gz->virtual_offset = PAGE_SIZE;
 		gz->file_offset = 0;
 		break;
 	default:
 		/* NetBSD compatibility */
 		switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			gz->virtual_offset = PAGE_SIZE;
 			gz->file_offset = 0;
 			break;
 		default:
 			gz->where = __LINE__;
 			return (-1);
 		}
 	}
 
 	gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
 
 	/*
 	 * Check various fields in header for validity/bounds.
 	 */
 	if (			/* entry point must lay with text region */
 	    gz->a_out.a_entry < gz->virtual_offset ||
 	    gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
 
 	/* text and data size must each be page rounded */
 	    gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
 		gz->where = __LINE__;
 		return (-1);
 	}
 	/*
 	 * text/data/bss must not exceed limits
 	 */
 	PROC_LOCK(gz->ip->proc);
 	if (			/* text can't exceed maximum text size */
 	    gz->a_out.a_text > maxtsiz ||
 
 	/* data + bss can't exceed rlimit */
 	    gz->a_out.a_data + gz->bss_size >
 	    lim_cur(gz->ip->proc, RLIMIT_DATA)) {
 		PROC_UNLOCK(gz->ip->proc);
 		gz->where = __LINE__;
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(gz->ip->proc);
 	/* Find out how far we should go */
 	gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(gz->ip->vp, 0, td);
 
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
 	error = exec_new_vmspace(gz->ip, &aout_sysvec);
 
-	vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error) {
 		gz->where = __LINE__;
 		return (error);
 	}
 
 	vmspace = gz->ip->proc->p_vmspace;
 
 	vmaddr = gz->virtual_offset;
 
 	/*
 	 * Map text and data as one anonymous region with all permissions
 	 * at the fixed address chosen above.
 	 */
 	error = vm_mmap(&vmspace->vm_map,
 			&vmaddr,
 			gz->a_out.a_text + gz->a_out.a_data,
 			VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
 			OBJT_DEFAULT,
 			NULL,
 			0);
 
 	if (error) {
 		gz->where = __LINE__;
 		return (error);
 	}
 
 	if (gz->bss_size != 0) {
 		/*
 		 * Allocate demand-zeroed area for uninitialized data.
 		 * "bss" = 'block started by symbol' - named after the
 		 * IBM 7090 instruction of the same name.
 		 */
 		vmaddr = gz->virtual_offset + gz->a_out.a_text +
 			gz->a_out.a_data;
 		error = vm_map_find(&vmspace->vm_map,
 				NULL,
 				0,
 				&vmaddr,
 				gz->bss_size,
 				FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error) {
 			gz->where = __LINE__;
 			return (error);
 		}
 	}
 	/* Fill in process VM information */
 	vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
 	vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t)
 			    (gz->virtual_offset + gz->a_out.a_text);
 
 	/* Fill in image_params */
 	gz->ip->interpreted = 0;
 	gz->ip->entry_addr = gz->a_out.a_entry;
 
 	gz->ip->proc->p_sysent = &aout_sysvec;
 
 	return 0;
 }
 
 /*
  * inflate() input callback: return the next byte of the compressed
  * file, or GZ_EOF at end of input or on a mapping failure (in which
  * case igz->error and igz->where record the cause).
  *
  * The file is read through a one-page window mapped into kernel_map;
  * the window is remapped whenever the read position leaves it.
  */
 static int
 NextByte(void *vp)
 {
 	int             error;
 	struct imgact_gzip *igz = (struct imgact_gzip *) vp;
 
 	if (igz->idx >= igz->len) {
 		igz->where = __LINE__;
 		return GZ_EOF;
 	}
 	/* Fast path: the byte lies inside the page already mapped. */
 	if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
 		return igz->inbuf[(igz->idx++) - igz->offset];
 	}
 	/* Drop the previous window before mapping the next one. */
 	if (igz->inbuf) {
 		error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
 			    (vm_offset_t) igz->inbuf + PAGE_SIZE);
 		if (error) {
 			igz->where = __LINE__;
 			igz->error = error;
 			return GZ_EOF;
 		}
 	}
 	/* Page-aligned file offset that contains the read position. */
 	igz->offset = igz->idx & ~PAGE_MASK;
 
 	error = vm_mmap(kernel_map,	/* map */
 			(vm_offset_t *) & igz->inbuf,	/* address */
 			PAGE_SIZE,	/* size */
 			VM_PROT_READ,	/* protection */
 			VM_PROT_READ,	/* max protection */
 			0,	/* flags */
 			OBJT_VNODE,	/* handle type */
 			igz->ip->vp,	/* vnode */
 			igz->offset);	/* offset */
 	if (error) {
 		igz->where = __LINE__;
 		igz->error = error;
 		return GZ_EOF;
 	}
 	return igz->inbuf[(igz->idx++) - igz->offset];
 }
 
 /*
  * inflate() output callback: consume siz decompressed bytes at ptr.
  *
  * The first sizeof(struct exec) bytes of output are gathered into
  * gz->a_out; once complete, do_aout_hdr() sets up the address space.
  * Output lying between gz->file_offset and gz->file_end is then copied
  * to the matching user address; everything else is only counted.
  *
  * Returns 0 to continue, or ENOEXEC after recording the cause in
  * gz->error and gz->where.
  *
  * NOTE(review): the results of both copyout() calls are discarded, so
  * a failed copy to user space goes unreported -- confirm whether that
  * is intended.
  */
 static int
 Flush(void *vp, u_char * ptr, u_long siz)
 {
 	struct imgact_gzip *gz = (struct imgact_gzip *) vp;
 	u_char         *p = ptr, *q;
 	int             i;
 
 	/* First, find an a.out-header. */
 	if (gz->output < sizeof gz->a_out) {
 		q = (u_char *) & gz->a_out;
 		i = min(siz, sizeof gz->a_out - gz->output);
 		bcopy(p, q + gz->output, i);
 		gz->output += i;
 		p += i;
 		siz -= i;
 		if (gz->output == sizeof gz->a_out) {
 			gz->gotheader = 1;
 			i = do_aout_hdr(gz);
 			if (i == -1) {
 				/* Keep the line recorded by do_aout_hdr(). */
 				if (!gz->where)
 					gz->where = __LINE__;
 				gz->error = ENOEXEC;
 				return ENOEXEC;
 			} else if (i) {
 				gz->where = __LINE__;
 				gz->error = i;
 				return ENOEXEC;
 			}
 			/*
 			 * When the image starts at file offset 0 the
 			 * header is part of it, so copy it out as well.
 			 */
 			if (gz->file_offset == 0) {
 				q = (u_char *) (uintptr_t) gz->virtual_offset;
 				copyout(&gz->a_out, q, sizeof gz->a_out);
 			}
 		}
 	}
 	/* Skip over zero-padded first PAGE if needed */
 	if (gz->output < gz->file_offset &&
 	    gz->output + siz > gz->file_offset) {
 		i = min(siz, gz->file_offset - gz->output);
 		gz->output += i;
 		p += i;
 		siz -= i;
 	}
 	/* Copy the part of this chunk that belongs to text or data. */
 	if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
 		i = min(siz, gz->file_end - gz->output);
 		q = (u_char *) (uintptr_t)
 		    (gz->virtual_offset + gz->output - gz->file_offset);
 		copyout(p, q, i);
 		gz->output += i;
 		p += i;
 		siz -= i;
 	}
 	/* Whatever remains is counted but not stored. */
 	gz->output += siz;
 	return 0;
 }
 
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 /* Activator descriptor: entry point plus the name "gzip". */
 static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
 EXEC_SET(execgzip, gzip_execsw);
Index: head/sys/kern/kern_alq.c
===================================================================
--- head/sys/kern/kern_alq.c	(revision 175201)
+++ head/sys/kern/kern_alq.c	(revision 175202)
@@ -1,519 +1,519 @@
 /*-
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/alq.h>
 #include <sys/malloc.h>
 #include <sys/unistd.h>
 #include <sys/fcntl.h>
 #include <sys/eventhandler.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Async. Logging Queue
  *
  * One queue is a ring of aq_entmax entries of aq_entlen bytes each, backed
  * by the single buffer aq_entbuf, together with the vnode the entries are
  * written to.  alq_open() builds the ring so that the last entry's ae_next
  * points back at aq_first.
  */
 struct alq {
 	int	aq_entmax;		/* Max entries */
 	int	aq_entlen;		/* Entry length */
 	char	*aq_entbuf;		/* Buffer for stored entries */
 	int	aq_flags;		/* Queue flags */
 	struct mtx	aq_mtx;		/* Queue lock */
 	struct vnode	*aq_vp;		/* Open vnode handle */
 	struct ucred	*aq_cred;	/* Credentials of the opening thread */
 	struct ale	*aq_first;	/* First ent */
 	struct ale	*aq_entfree;	/* First free ent */
 	struct ale	*aq_entvalid;	/* First ent valid for writing */
 	LIST_ENTRY(alq)	aq_act;		/* List of active queues */
 	LIST_ENTRY(alq)	aq_link;	/* List of all queues */
 };
 
 /* Values for aq_flags. */
 #define	AQ_WANTED	0x0001		/* Wakeup sleeper when io is done */
 #define	AQ_ACTIVE	0x0002		/* on the active list */
 #define	AQ_FLUSHING	0x0004		/* doing IO */
 #define	AQ_SHUTDOWN	0x0008		/* Queue no longer valid */
 
 /* Per-queue lock; aq_mtx is initialised as a spin mutex in alq_open(). */
 #define	ALQ_LOCK(alq)	mtx_lock_spin(&(alq)->aq_mtx)
 #define	ALQ_UNLOCK(alq)	mtx_unlock_spin(&(alq)->aq_mtx)
 
 static MALLOC_DEFINE(M_ALD, "ALD", "ALD");
 
 /*
  * The ald_mtx protects the ald_queues list and the ald_active list.
  */
 static struct mtx ald_mtx;
 static LIST_HEAD(, alq) ald_queues;
 static LIST_HEAD(, alq) ald_active;
 /* Set by ald_shutdown(); makes ald_add() and ald_rem() fail with EBUSY. */
 static int ald_shutingdown = 0;
 /* First thread of ald_proc; assigned when ald_daemon() starts running. */
 struct thread *ald_thread;
 static struct proc *ald_proc;
 
 /* Global lock; ald_mtx is initialised as a default mutex in ald_startup(). */
 #define	ALD_LOCK()	mtx_lock(&ald_mtx)
 #define	ALD_UNLOCK()	mtx_unlock(&ald_mtx)
 
 /* Daemon functions */
 static int ald_add(struct alq *);
 static int ald_rem(struct alq *);
 static void ald_startup(void *);
 static void ald_daemon(void);
 static void ald_shutdown(void *, int);
 static void ald_activate(struct alq *);
 static void ald_deactivate(struct alq *);
 
 /* Internal queue functions */
 static void alq_shutdown(struct alq *);
 static int alq_doio(struct alq *);
 
 
 /*
  * Register a queue on the global ald_queues list.
  *
  * Returns 0 on success, or EBUSY (leaving the list untouched) once
  * ald_shutingdown has been set.
  */
 static int
 ald_add(struct alq *alq)
 {
 	int rv;
 
 	ALD_LOCK();
 	if (ald_shutingdown == 0) {
 		LIST_INSERT_HEAD(&ald_queues, alq, aq_link);
 		rv = 0;
 	} else
 		rv = EBUSY;
 	ALD_UNLOCK();
 
 	return (rv);
 }
 
 /*
  * Take a queue off the global ald_queues list.
  *
  * Returns 0 on success.  Once ald_shutingdown is set the list is left
  * alone and EBUSY is returned; the ald will take care of cleaning up the
  * queue's resources in that case.
  */
 static int
 ald_rem(struct alq *alq)
 {
 	int rv;
 
 	ALD_LOCK();
 	if (ald_shutingdown == 0) {
 		LIST_REMOVE(alq, aq_link);
 		rv = 0;
 	} else
 		rv = EBUSY;
 	ALD_UNLOCK();
 
 	return (rv);
 }
 
 /*
  * Put a queue on the active list.  This will schedule it for writing.
  *
  * The queue is linked onto ald_active and any thread sleeping on
  * &ald_active (ald_daemon() sleeps there when the list is empty) is woken.
  * The only caller in this file, alq_post(), holds ALD_LOCK around the call
  * and sets AQ_ACTIVE itself beforehand.
  */
 static void
 ald_activate(struct alq *alq)
 {
 	LIST_INSERT_HEAD(&ald_active, alq, aq_act);
 	wakeup(&ald_active);
 }
 
 /*
  * Undo ald_activate(): clear AQ_ACTIVE and unlink the queue from the
  * active list.  Both callers in this file hold ALD_LOCK and the queue lock.
  */
 static void
 ald_deactivate(struct alq *alq)
 {
 	alq->aq_flags &= ~AQ_ACTIVE;
 	LIST_REMOVE(alq, aq_act);
 }
 
 /*
  * One-time setup of the global state: both queue lists start out empty
  * and ald_mtx is initialised as a default (sleepable-context) mutex.
  */
 static void
 ald_startup(void *unused)
 {
 	LIST_INIT(&ald_active);
 	LIST_INIT(&ald_queues);
 	mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET);
 }
 
 /*
  * Body of the ALQ daemon.  Records its own thread in ald_thread, hooks
  * ald_shutdown() into shutdown_pre_sync, then loops forever: sleep until a
  * queue appears on ald_active, take it off the list and flush it with
  * alq_doio(), waking any waiter that asked for it.
  */
 static void
 ald_daemon(void)
 {
 	struct alq *q;
 	int wake;
 
 	ald_thread = FIRST_THREAD_IN_PROC(ald_proc);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL,
 	    SHUTDOWN_PRI_FIRST);
 
 	ALD_LOCK();
 
 	for (;;) {
 		q = LIST_FIRST(&ald_active);
 		if (q == NULL) {
 			/* Nothing scheduled; ald_activate() will wake us. */
 			msleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0);
 			continue;
 		}
 
 		ALQ_LOCK(q);
 		ald_deactivate(q);
 		ALD_UNLOCK();
 		wake = alq_doio(q);
 		ALQ_UNLOCK(q);
 		if (wake != 0)
 			wakeup(q);
 		ALD_LOCK();
 	}
 }
 
 /*
  * shutdown_pre_sync handler.  Marks the subsystem as shutting down, then
  * detaches every queue from ald_queues and drains/closes it.  The global
  * lock is dropped around alq_shutdown() for each queue.
  */
 static void
 ald_shutdown(void *arg, int howto)
 {
 	struct alq *q;
 
 	ALD_LOCK();
 	ald_shutingdown = 1;
 
 	for (;;) {
 		q = LIST_FIRST(&ald_queues);
 		if (q == NULL)
 			break;
 		LIST_REMOVE(q, aq_link);
 		ALD_UNLOCK();
 		alq_shutdown(q);
 		ALD_LOCK();
 	}
 	ALD_UNLOCK();
 }
 
 /*
  * Quiesce one queue: flag it AQ_SHUTDOWN, wait until it is neither on the
  * active list nor being flushed, then close its vnode and drop the
  * credential reference.  The queue memory itself is not freed here.
  */
 static void
 alq_shutdown(struct alq *alq)
 {
 	ALQ_LOCK(alq);
 
 	/* alq_get() stops handing out entries once this is set. */
 	alq->aq_flags |= AQ_SHUTDOWN;
 
 	/* Wait for scheduled or in-progress IO to finish. */
 	for (;;) {
 		if ((alq->aq_flags & (AQ_FLUSHING|AQ_ACTIVE)) == 0)
 			break;
 		alq->aq_flags |= AQ_WANTED;
 		ALQ_UNLOCK(alq);
 		tsleep(alq, PWAIT, "aldclose", 0);
 		ALQ_LOCK(alq);
 	}
 	ALQ_UNLOCK(alq);
 
 	vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread);
 	crfree(alq->aq_cred);
 }
 
 /*
  * Flush all pending data to disk.  This operation will block.
  *
  * Entered with the queue lock held (every caller in this file takes
  * ALQ_LOCK first); the lock is dropped around the vnode write and re-taken
  * before returning, so it is held again on return.
  *
  * Returns 1 if AQ_WANTED was set -- the flag is cleared and the caller is
  * expected to wakeup(alq) -- and 0 otherwise.
  */
 static int
 alq_doio(struct alq *alq)
 {
 	struct thread *td;
 	struct mount *mp;
 	struct vnode *vp;
 	struct uio auio;
 	struct iovec aiov[2];
 	struct ale *ale;
 	struct ale *alstart;
 	int totlen;
 	int iov;
 	int vfslocked;
 
 	vp = alq->aq_vp;
 	td = curthread;
 	totlen = 0;
 	iov = 0;
 
 	/* Claim the run of valid entries; new posts start a fresh run. */
 	alstart = ale = alq->aq_entvalid;
 	alq->aq_entvalid = NULL;
 
 	bzero(&aiov, sizeof(aiov));
 	bzero(&auio, sizeof(auio));
 
 	/*
 	 * Walk the AE_VALID entries, clearing the flag as we go.  Entries
 	 * whose data is contiguous extend the current iovec; a jump in
 	 * ae_data moves on to the next iovec slot.
 	 */
 	do {
 		if (aiov[iov].iov_base == NULL)
 			aiov[iov].iov_base = ale->ae_data;
 		aiov[iov].iov_len += alq->aq_entlen;
 		totlen += alq->aq_entlen;
 		/* Check to see if we're wrapping the buffer */
 		if (ale->ae_data + alq->aq_entlen != ale->ae_next->ae_data)
 			iov++;
 		ale->ae_flags &= ~AE_VALID;
 		ale = ale->ae_next;
 	} while (ale->ae_flags & AE_VALID);
 
 	alq->aq_flags |= AQ_FLUSHING;
 	ALQ_UNLOCK(alq);
 
 	/* Step back if iov was advanced past the last slot actually filled. */
 	if (iov == 2 || aiov[iov].iov_base == NULL)
 		iov--;
 
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_iovcnt = iov + 1;
 	auio.uio_resid = totlen;
 	auio.uio_td = td;
 
 	/*
 	 * Do all of the junk required to write now.
 	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_start_write(vp, &mp, V_WAIT);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_LEASE(vp, td, alq->aq_cred, LEASE_WRITE);
 	/*
 	 * XXX: VOP_WRITE error checks are ignored.
 	 */
 #ifdef MAC
 	if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0)
 #endif
 		VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	ALQ_LOCK(alq);
 	alq->aq_flags &= ~AQ_FLUSHING;
 
 	/* If no free entry was left, the run just written becomes free. */
 	if (alq->aq_entfree == NULL)
 		alq->aq_entfree = alstart;
 
 	if (alq->aq_flags & AQ_WANTED) {
 		alq->aq_flags &= ~AQ_WANTED;
 		return (1);
 	}
 
 	return(0);
 }
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: the string
  * "ALQ Daemon", the ald_daemon entry point and the address of ald_proc.
  * NOTE(review): field meanings of struct kproc_desc are not visible in
  * this file -- presumably name, function and proc-pointer slot.
  */
 static struct kproc_desc ald_kp = {
         "ALQ Daemon",
         ald_daemon,
         &ald_proc
 };
 
 /* The daemon is started at SI_SUB_KTHREAD_IDLE; ald_startup runs at SI_SUB_LOCK. */
 SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp)
 SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL)
 
 
 /* User visible queue functions */
 
 /*
  * Create the queue data structure, allocate the buffer, and open the file.
  */
 int
 alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
     int size, int count)
 {
 	struct thread *td;
 	struct nameidata nd;
 	struct ale *ale;
 	struct ale *alp;
 	struct alq *alq;
 	char *bufp;
 	int flags;
 	int error;
 	int i, vfslocked;
 
 	*alqp = NULL;
 	td = curthread;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td);
 	flags = FWRITE | O_NOFOLLOW | O_CREAT;
 
 	error = vn_open_cred(&nd, &flags, cmode, cred, NULL);
 	if (error)
 		return (error);
 
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/* We just unlock so we hold a reference */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
 	alq->aq_entbuf = malloc(count * size, M_ALD, M_WAITOK|M_ZERO);
 	alq->aq_first = malloc(sizeof(*ale) * count, M_ALD, M_WAITOK|M_ZERO);
 	alq->aq_vp = nd.ni_vp;
 	alq->aq_cred = crhold(cred);
 	alq->aq_entmax = count;
 	alq->aq_entlen = size;
 	alq->aq_entfree = alq->aq_first;
 
 	mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET);
 
 	bufp = alq->aq_entbuf;
 	ale = alq->aq_first;
 	alp = NULL;
 
 	/* Match up entries with buffers */
 	for (i = 0; i < count; i++) {
 		if (alp)
 			alp->ae_next = ale;
 		ale->ae_data = bufp;
 		alp = ale;
 		ale++;
 		bufp += size;
 	}
 
 	alp->ae_next = alq->aq_first;
 
 	if ((error = ald_add(alq)) != 0)
 		return (error);
 	*alqp = alq;
 
 	return (0);
 }
 
 /*
  * Append one entry to the queue by copying aq_entlen bytes from `data'.
  *
  * `waitok' is passed through to alq_get().  Returns 0 once the entry has
  * been posted, or EWOULDBLOCK when alq_get() could not supply an entry.
  */
 int
 alq_write(struct alq *alq, void *data, int waitok)
 {
 	struct ale *slot;
 
 	slot = alq_get(alq, waitok);
 	if (slot == NULL)
 		return (EWOULDBLOCK);
 
 	bcopy(data, slot->ae_data, alq->aq_entlen);
 	alq_post(alq, slot);
 
 	return (0);
 }
 
 /*
  * Reserve the next free entry of the queue.
  *
  * When no entry is free and ALQ_WAITOK is set in `waitok', sleep until
  * woken and retry.  Returns NULL -- with the queue lock released -- if the
  * queue is shut down or nothing is free and waiting is not allowed.
  * Otherwise returns the entry with the queue lock STILL HELD; alq_post()
  * is what releases it.
  */
 struct ale *
 alq_get(struct alq *alq, int waitok)
 {
 	struct ale *ent;
 	struct ale *next;
 
 	ent = NULL;
 
 	ALQ_LOCK(alq);
 
 	/* Loop until we get an entry or we're shutting down */
 	for (;;) {
 		if ((alq->aq_flags & AQ_SHUTDOWN) != 0)
 			break;
 		ent = alq->aq_entfree;
 		if (ent != NULL)
 			break;
 		if ((waitok & ALQ_WAITOK) == 0)
 			break;
 		alq->aq_flags |= AQ_WANTED;
 		ALQ_UNLOCK(alq);
 		tsleep(alq, PWAIT, "alqget", 0);
 		ALQ_LOCK(alq);
 	}
 
 	if (ent == NULL) {
 		ALQ_UNLOCK(alq);
 		return (NULL);
 	}
 
 	/* Advance the free pointer unless the successor is still pending. */
 	next = ent->ae_next;
 	if ((next->ae_flags & AE_VALID) != 0)
 		alq->aq_entfree = NULL;
 	else
 		alq->aq_entfree = next;
 
 	return (ent);
 }
 
 /*
  * Publish an entry obtained from alq_get().
  *
  * Entered with the queue lock held (alq_get() returns that way) and
  * releases it.  The entry is marked AE_VALID and, if the queue was not
  * already flagged AQ_ACTIVE, it is flagged and placed on the active list.
  */
 void
 alq_post(struct alq *alq, struct ale *ale)
 {
 	int schedule;
 
 	ale->ae_flags |= AE_VALID;
 
 	if (alq->aq_entvalid == NULL)
 		alq->aq_entvalid = ale;
 
 	schedule = ((alq->aq_flags & AQ_ACTIVE) == 0);
 	if (schedule)
 		alq->aq_flags |= AQ_ACTIVE;
 
 	ALQ_UNLOCK(alq);
 
 	if (!schedule)
 		return;
 
 	ALD_LOCK();
 	ald_activate(alq);
 	ALD_UNLOCK();
 }
 
 /*
  * Write out a queue's pending entries from the calling thread.
  *
  * If the queue is currently flagged AQ_ACTIVE it is taken off the active
  * list and flushed with alq_doio(); otherwise nothing is written.  A
  * waiter is woken when alq_doio() asks for it.
  */
 void
 alq_flush(struct alq *alq)
 {
 	int was_active;
 	int wake;
 
 	wake = 0;
 
 	ALD_LOCK();
 	ALQ_LOCK(alq);
 	/* Sample the flag first: ald_deactivate() clears it. */
 	was_active = ((alq->aq_flags & AQ_ACTIVE) != 0);
 	if (was_active)
 		ald_deactivate(alq);
 	ALD_UNLOCK();
 	if (was_active)
 		wake = alq_doio(alq);
 	ALQ_UNLOCK(alq);
 
 	if (wake)
 		wakeup(alq);
 }
 
 /*
  * Flush remaining data, close the file and free all resources.
  */
 void
 alq_close(struct alq *alq)
 {
 	/*
 	 * ald_rem() fails only while the global shutdown is in progress.
 	 * In that case someone else will flush and close the vnode, so the
 	 * queue is left alone here.
 	 */
 	if (ald_rem(alq) == 0) {
 		/* Drain all pending IO and close the vnode. */
 		alq_shutdown(alq);
 
 		mtx_destroy(&alq->aq_mtx);
 		free(alq->aq_first, M_ALD);
 		free(alq->aq_entbuf, M_ALD);
 		free(alq, M_ALD);
 	}
 }
Index: head/sys/kern/kern_descrip.c
===================================================================
--- head/sys/kern/kern_descrip.c	(revision 175201)
+++ head/sys/kern/kern_descrip.c	(revision 175202)
@@ -1,2868 +1,2868 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/conf.h>
 #include <sys/domain.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
 		     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
 static uma_zone_t file_zone;
 
 
 /* How to treat 'new' parameter when allocating a fd for do_dup(). */
 enum dup_type { DUP_VARIABLE, DUP_FIXED };
 
 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval);
 static int	fd_first_free(struct filedesc *, int, int);
 static int	fd_last_used(struct filedesc *, int, int);
 static void	fdgrowtable(struct filedesc *, int);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 
 /*
  * A process is initially started out with NDFILE descriptors stored within
  * this structure, selected to be enough for typical applications based on
  * the historical limit of 20 open files (and the usage of descriptors by
  * shells).  If these descriptors are exhausted, a larger descriptor table
  * may be allocated, up to a process' resource limit; the internal arrays
  * are then unused.
  */
 /*
  * NDFILE      number of descriptors stored inline (see struct filedesc0).
  * NDSLOTSIZE  size in bytes of one word of the in-use bitmap.
  * NDENTRIES   number of descriptors tracked by one bitmap word.
  * NDSLOT(x)   index of the bitmap word holding descriptor x.
  * NDBIT(x)    mask selecting descriptor x inside its bitmap word.
  * NDSLOTS(x)  number of bitmap words needed to track x descriptors.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * Storage required per open file descriptor.
  */
 #define OFILESIZE (sizeof(struct file *) + sizeof(char))
 
 /*
  * Basic allocation of descriptors:
  * one of the above, plus arrays for NDFILE descriptors.
  */
 struct filedesc0 {
 	struct	filedesc fd_fd;
 	/*
 	 * These arrays are used when the number of open files is
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
 	struct	file *fd_dfiles[NDFILE];
 	char	fd_dfileflags[NDFILE];
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx	fdesc_mtx;
 
 /*
  * Locate the lowest clear bit in fdp->fd_map at position `low' or above.
  *
  * Returns `low' unchanged when low >= size, and `size' when every bitmap
  * word examined (up to NDSLOTS(size) words) is completely set.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *bits = fdp->fd_map;
 	NDSLOTTYPE avail;
 	int slot, nslots;
 
 	if (low >= size)
 		return (low);
 
 	slot = NDSLOT(low);
 	if (low % NDENTRIES) {
 		/* Partial first word: only bits at or above `low' count. */
 		avail = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		avail &= ~bits[slot];
 		if (avail != 0UL)
 			return (slot * NDENTRIES + ffsl(avail) - 1);
 		++slot;
 	}
 
 	/* Remaining words are examined whole. */
 	nslots = NDSLOTS(size);
 	while (slot < nslots) {
 		if (bits[slot] != ~0UL)
 			return (slot * NDENTRIES + ffsl(~bits[slot]) - 1);
 		++slot;
 	}
 	return (size);
 }
 
 /*
  * Find the highest non-zero bit in the given bitmap, starting at low and
  * not exceeding size - 1.
  *
  * Returns -1 when low >= size, and low - 1 when no bit is set in the
  * words examined.
  */
 static int
 fd_last_used(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
 	if (low >= size)
 		return (-1);
 
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		/* Partial top word: only bits below `size' count. */
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		if ((mask &= map[off]) != 0)
 			return (off * NDENTRIES + flsl(mask) - 1);
 	}
 	/*
 	 * Word `off' has either just been examined under the mask, or
 	 * (when size is a multiple of NDENTRIES) holds only bits at or
 	 * above `size' and may lie past the end of the map.  Either way
 	 * the whole-word scan starts one word lower.
 	 */
 	--off;
 	for (minoff = NDSLOT(low); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (low - 1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd already used"));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 	KASSERT(fdisused(fdp, fd),
 	    ("fd is already unused"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("fd is still in use"));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /*
  * getdtablesize system call: report the smaller of the process's current
  * RLIMIT_NOFILE and maxfilesperproc in td_retval[0].  Always returns 0.
  */
 /* ARGSUSED */
 int
 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 	struct proc *proc;
 
 	proc = td->td_proc;
 
 	/* lim_cur() is read with the process lock held. */
 	PROC_LOCK(proc);
 	td->td_retval[0] =
 	    min((int)lim_cur(proc, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(proc);
 
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /*
  * dup2 system call: hand both descriptor numbers to do_dup() with
  * DUP_FIXED; the result goes to td->td_retval.
  */
 /* ARGSUSED */
 int
 dup2(struct thread *td, struct dup2_args *uap)
 {
 	int from, to;
 
 	from = (int)uap->from;
 	to = (int)uap->to;
 
 	return (do_dup(td, DUP_FIXED, from, to, td->td_retval));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /*
  * dup system call: hand the descriptor to do_dup() with DUP_VARIABLE and
  * a `new' argument of 0; the result goes to td->td_retval.
  */
 /* ARGSUSED */
 int
 dup(struct thread *td, struct dup_args *uap)
 {
 	int from;
 
 	from = (int)uap->fd;
 
 	return (do_dup(td, DUP_VARIABLE, from, 0, td->td_retval));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	struct flock fl;
 	intptr_t arg;
 	int error;
 
 	error = 0;
 	switch (uap->cmd) {
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
 		arg = (intptr_t)&fl;
 		break;
 	default:
 		arg = uap->arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
 	if (error)
 		return (error);
 	if (uap->cmd == F_GETLK)
 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
 	return (error);
 }
 
 /*
  * Translate a descriptor number into its struct file pointer.  Returns
  * NULL when `fd' is outside [0, fd_nfiles) or the slot is empty.  The
  * filedesc lock must be held.
  */
 static inline struct file *
 fdtofp(int fd, struct filedesc *fdp)
 {
 
 	FILEDESC_LOCK_ASSERT(fdp);
 	/* The unsigned compare also rejects negative descriptors. */
 	if ((unsigned)fd >= fdp->fd_nfiles)
 		return (NULL);
 	return (fdp->fd_ofiles[fd]);
 }
 
 /*
  * Kernel-side implementation of fcntl(2) for descriptor `fd' of the
  * calling thread's process.
  *
  * `arg' is the command argument.  For F_GETLK, F_SETLK and F_SETLKW it is
  * used as a pointer to a struct flock and dereferenced directly, so it
  * must already be a kernel address (the fcntl() wrapper copies the
  * structure in and, for F_GETLK, back out).
  *
  * Returns 0 or an errno value.  Results of F_DUPFD, F_GETFD, F_GETFL and
  * F_GETOWN are delivered through td->td_retval[0].  Unknown commands
  * yield EINVAL.
  */
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp;
 	struct proc *p;
 	char *pop;
 	struct vnode *vp;
 	u_int newmin;
 	int error, flg, tmp;
 	int vfslocked;
 
 	vfslocked = 0;
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	switch (cmd) {
 	case F_DUPFD:
 		/* Validate fd and the requested minimum, then let do_dup() work. */
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		newmin = arg;
 		PROC_LOCK(p);
 		if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
 		    newmin >= maxfilesperproc) {
 			PROC_UNLOCK(p);
 			error = EINVAL;
 			break;
 		}
 		PROC_UNLOCK(p);
 		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
 		break;
 
 	case F_GETFD:
 		/* Report FD_CLOEXEC according to the slot's UF_EXCLOSE flag. */
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		pop = &fdp->fd_ofileflags[fd];
 		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		/* Only UF_EXCLOSE is changed; other per-slot flags are kept. */
 		FILEDESC_XLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		pop = &fdp->fd_ofileflags[fd];
 		*pop = (*pop &~ UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFL:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		/* Hold the file so it survives once the table lock is dropped. */
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 		/* Replace the FCNTLFLAGS bits of f_flag with a cmpset retry loop. */
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		/* Push the new FNONBLOCK and FASYNC state down to the object. */
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/* FIOASYNC failed: turn non-blocking mode back off. */
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		/* Only vnode-backed descriptors are accepted here. */
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			/* Make l_start absolute, refusing offsets that overflow. */
 			if (fp->f_offset < 0 ||
 			    (flp->l_start > 0 &&
 			     fp->f_offset > OFF_MAX - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		switch (flp->l_type) {
 		case F_RDLCK:
 			/* A read lock requires the file to be open for reading. */
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			/* A write lock requires the file to be open for writing. */
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, F_POSIX);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		/* Check for race with close */
 		FILEDESC_SLOCK(fdp);
 		if ((unsigned) fd >= fdp->fd_nfiles ||
 		    fp != fdp->fd_ofiles[fd]) {
 			/*
 			 * The slot no longer refers to this file: issue an
 			 * unlock covering the whole file (start 0, len 0).
 			 */
 			FILEDESC_SUNLOCK(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 					   F_UNLCK, flp, F_POSIX);
 			VFS_UNLOCK_GIANT(vfslocked);
 			vfslocked = 0;
 		} else
 			FILEDESC_SUNLOCK(fdp);
 		fdrop(fp, td);
 		break;
 
 	case F_GETLK:
 		FILEDESC_SLOCK(fdp);
 		if ((fp = fdtofp(fd, fdp)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EINVAL;
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			/* Make l_start absolute, checking both overflow directions. */
 			if ((flp->l_start > 0 &&
 			    fp->f_offset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			     fp->f_offset < OFF_MIN - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		fdrop(fp, td);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* vfslocked is 0 here on every path that already released Giant. */
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Common code for dup, dup2, and fcntl(F_DUPFD).
  *
  * type    DUP_FIXED: duplicate onto exactly `new', disposing of any file
  *         already there.  DUP_VARIABLE: obtain a descriptor from
  *         fdalloc(td, new, &new) -- presumably treating `new' as the
  *         lowest acceptable number; confirm against fdalloc().
  * old     descriptor to duplicate.
  * new     target descriptor (DUP_FIXED) or fdalloc() hint (DUP_VARIABLE).
  * retval  receives the resulting descriptor number on success.
  *
  * Returns 0 on success; EBADF when either number is negative, `old' is
  * not open, or `old' changed while the lock was dropped; EMFILE when
  * `new' is at or above min(RLIMIT_NOFILE, maxfilesperproc); or the error
  * from fdalloc().  The new descriptor never inherits UF_EXCLOSE.
  */
 static int
 do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
 	int error, holdleaders, maxfd;
 
 	KASSERT((type == DUP_VARIABLE || type == DUP_FIXED),
 	    ("invalid dup type %d", type));
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to.
 	 */
 	if (old < 0 || new < 0)
 		return (EBADF);
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	if (new >= maxfd)
 		return (EMFILE);
 
 	FILEDESC_XLOCK(fdp);
 	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	/* dup2(fd, fd) succeeds without touching the table. */
 	if (type == DUP_FIXED && old == new) {
 		*retval = new;
 		FILEDESC_XUNLOCK(fdp);
 		return (0);
 	}
 	fp = fdp->fd_ofiles[old];
 	fhold(fp);
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.  Since the filedesc
 	 * lock may be temporarily dropped in the process, we have to look
 	 * out for a race.
 	 */
 	if (type == DUP_FIXED) {
 		if (new >= fdp->fd_nfiles)
 			fdgrowtable(fdp, new + 1);
 		if (fdp->fd_ofiles[new] == NULL)
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	/*
 	 * If the old file changed out from under us then treat it as a
 	 * bad file descriptor.  Userland should do its own locking to
 	 * avoid this case.
 	 */
 	if (fdp->fd_ofiles[old] != fp) {
 		/* we've allocated a descriptor which we won't use */
 		if (fdp->fd_ofiles[new] == NULL)
 			fdunused(fdp, new);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	KASSERT(old != new,
 	    ("new fd is same as old"));
 
 	/*
 	 * Save info on the descriptor being overwritten.  We cannot close
 	 * it without introducing an ownership race for the slot, since we
 	 * need to drop the filedesc lock to call closef().
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	delfp = fdp->fd_ofiles[new];
 	holdleaders = 0;
 	if (delfp != NULL) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 			holdleaders = 1;
 		}
 	}
 
 	/*
 	 * Duplicate the source descriptor
 	 */
 	/* The reference taken by fhold() above now belongs to slot `new'. */
 	fdp->fd_ofiles[new] = fp;
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
 
 	/*
 	 * If we dup'd over a valid file, we now own the reference to it
 	 * and must dispose of it using closef() semantics (as if a
 	 * close() were performed on it).
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	if (delfp != NULL) {
 		knote_fdclose(td, new);
 		if (delfp->f_type == DTYPE_MQUEUE)
 			mq_fdclose(td, new, delfp);
 		FILEDESC_XUNLOCK(fdp);
 		(void) closef(delfp, td);
 		if (holdleaders) {
 			/* Release the hold taken above and wake a waiter. */
 			FILEDESC_XLOCK(fdp);
 			fdp->fd_holdleaderscount--;
 			if (fdp->fd_holdleaderscount == 0 &&
 			    fdp->fd_holdleaderswakeup != 0) {
 				fdp->fd_holdleaderswakeup = 0;
 				wakeup(&fdp->fd_holdleaderscount);
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	} else {
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (0);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 }
 
 /*
  * Free every sigio structure on the given list.
  *
  * The list is expected to belong to exactly one process or one process
  * group; which of the two is decided from the first entry (a negative
  * sio_pgid means process group) and asserted for every later entry.
  * An empty list is a no-op.
  *
  * The historical note on this function says that only SIGIO_LOCK is
  * needed, because the caller has made itself inaccessible to callers of
  * fsetown and therefore does not need to lock the proc or pgrp struct for
  * the list manipulation.
  * NOTE(review): the loop below nevertheless takes PGRP_LOCK/PROC_LOCK
  * around each SLIST_REMOVE -- confirm which of the two is current.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	/* Always work on the current list head; each pass removes it. */
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		/* Clear the owner's back-pointer before unlinking. */
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		/*
 		 * SIGIO_LOCK is dropped while the credential reference and
 		 * the memory are released, and retaken before the list head
 		 * is examined again.
 		 */
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		FREE(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  *
  * pgid > 0 names a process, pgid < 0 names process group -pgid, and
  * pgid == 0 simply removes any existing owner via funsetown().
  *
  * Returns 0 on success, ESRCH if the target cannot be found (or the
  * target process has P_WEXIT set), and EPERM if the target is in a
  * different session than the calling process.  On failure the newly
  * allocated sigio is released; note that on the P_WEXIT path the previous
  * owner has already been removed by funsetown().
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	/* proctree_lock is held shared from the lookup until insertion. */
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		/* presumably pfind() returned proc locked -- TODO confirm */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		/* presumably pgfind() returned pgrp locked -- TODO confirm */
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	/* Drop whatever owner was registered before installing the new one. */
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	/* Publish the new sigio to the owner under SIGIO_LOCK. */
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	/* Common error exit: undo the allocation made above. */
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  *
  * Returns the sio_pgid recorded in *sigiop, or 0 when no sigio is
  * installed.  The value is sampled under SIGIO_LOCK.
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	pid_t owner;
 
 	owner = 0;
 	SIGIO_LOCK();
 	if (*sigiop != NULL)
 		owner = (*sigiop)->sio_pgid;
 	SIGIO_UNLOCK();
 	return (owner);
 }
 
 /*
  * Close a file descriptor.
  *
  * System call entry point; the work is done by kern_close() and its
  * result is returned unchanged.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 close(struct thread *td, struct close_args *uap)
 {
 	int error;
 
 	error = kern_close(td, uap->fd);
 	return (error);
 }
 
 /*
  * Close descriptor fd in the calling thread's process.
  *
  * Returns EBADF if fd is out of range or not open; otherwise the slot is
  * cleared and the result of closef() on the file is returned.
  */
 int
 kern_close(td, fd)
 	struct thread *td;
 	int fd;
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 	int holdleaders;
 
 	error = 0;
 	holdleaders = 0;
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
 	/* The unsigned cast makes a negative fd fail the range check too. */
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdp->fd_ofiles[fd] = NULL;
 	fdp->fd_ofileflags[fd] = 0;
 	fdunused(fdp, fd);
 	if (td->td_proc->p_fdtol != NULL) {
 		/*
 		 * Ask fdfree() to sleep to ensure that all relevant
 		 * process leaders can be traversed in closef().
 		 */
 		fdp->fd_holdleaderscount++;
 		holdleaders = 1;
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleting a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		/*
 		 * Undo the hold taken above and wake a waiter that asked
 		 * to be notified once the count drops to zero.
 		 */
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor, in the old
  * "struct ostat" layout.
  *
  * The status is obtained with kern_fstat(), converted with cvtstat() and
  * copied out to uap->sb.  Returns the first error encountered, else 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct stat ub;
 	struct ostat oub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error != 0)
 		return (error);
 	cvtstat(&ub, &oub);
 	return (copyout(&oub, uap->sb, sizeof(oub)));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  *
  * The status is obtained with kern_fstat() and copied out to uap->sb.
  * Returns the first error encountered, else 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error != 0)
 		return (error);
 	return (copyout(&ub, uap->sb, sizeof(ub)));
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
 
 	AUDIT_ARG(file, td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor, in the
  * "struct nstat" layout.
  *
  * The status is obtained with kern_fstat(), converted with cvtnstat() and
  * copied out to uap->sb.  Returns the first error encountered, else 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct stat ub;
 	struct nstat nub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error != 0)
 		return (error);
 	cvtnstat(&ub, &nub);
 	return (copyout(&nub, uap->sb, sizeof(nub)));
 }
 
 /*
  * Return pathconf information about a file descriptor.
  *
  * _PC_ASYNC_IO is answered for every descriptor from async_io_version.
  * Descriptors backed by a vnode are forwarded to VOP_PATHCONF() with the
  * vnode locked exclusively.  Pipes and sockets only answer _PC_PIPE_BUF
  * (PIPE_BUF) and return EINVAL for any other name.  Everything else
  * returns EOPNOTSUPP.  The result value is stored in td->td_retval[0].
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	/* error is 0 past this point unless a branch below sets it. */
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 
 	/* If asynchronous I/O is available, it works for all descriptors. */
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = async_io_version;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		int vfslocked;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 		error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	/* Release the reference taken by fget() on every exit path. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.  This may
  * block and drop the filedesc lock, but it will reacquire it before
  * returning.
  *
  * The caller must hold the filedesc lock exclusively.  If the table is
  * already large enough, or another thread grows it while the lock is
  * dropped, the function returns without changing anything.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct file **ntable;
 	char *nfileflags;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0,
 	    ("zero-length file table"));
 
 	/* compute the size of the new table */
 	onfiles = fdp->fd_nfiles;
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
 	FILEDESC_XUNLOCK(fdp);
 	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* The per-descriptor flag bytes live right after the pointer array. */
 	nfileflags = (char *)&ntable[nnfiles];
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
 		MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
 	FILEDESC_XLOCK(fdp);
 
 	/*
 	 * We now have new tables ready to go.  Since we dropped the
 	 * filedesc lock to call malloc(), watch out for a race.
 	 */
 	onfiles = fdp->fd_nfiles;
 	if (onfiles >= nnfiles) {
 		/* we lost the race, but that's OK */
 		free(ntable, M_FILEDESC);
 		if (nmap != NULL)
 			free(nmap, M_FILEDESC);
 		return;
 	}
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
 	/*
 	 * A table of at most NDFILE entries is not freed; presumably it is
 	 * the initial table embedded in the filedesc allocation (see
 	 * fdinit()) -- TODO confirm.
 	 */
 	if (onfiles > NDFILE)
 		free(fdp->fd_ofiles, M_FILEDESC);
 	fdp->fd_ofiles = ntable;
 	fdp->fd_ofileflags = nfileflags;
 	/*
 	 * NOTE(review): nmap was sized against the pre-unlock onfiles, while
 	 * this test uses the re-read value; if the table grew in between,
 	 * nmap could be NULL here -- confirm whether that can happen.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 			free(fdp->fd_map, M_FILEDESC);
 		fdp->fd_map = nmap;
 	}
 	fdp->fd_nfiles = nnfiles;
 }
 
 /*
  * Allocate a file descriptor for the process.
  *
  * The lowest free descriptor that is not below minfd (and not below the
  * table's fd_freefile hint) is marked used and stored in *result.  The
  * caller must hold the filedesc lock exclusively.  Returns 0 on success
  * or EMFILE when the search reaches the smaller of the process's
  * RLIMIT_NOFILE and maxfilesperproc.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int fd, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/* Never start the search below the first-free hint. */
 	if (minfd < fdp->fd_freefile)
 		minfd = fdp->fd_freefile;
 
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Search the bitmap for a free descriptor.  While the search runs
 	 * off the end of the table without hitting the limit, grow the
 	 * table and search again; fdgrowtable() may drop the filedesc
 	 * lock, so the search has to be redone from scratch each time.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	while (fd < maxfd && fd >= fdp->fd_nfiles) {
 		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
 		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	}
 	if (fd >= maxfd)
 		return (EMFILE);
 
 	/*
 	 * Sanity-check the slot, then claim it and hand it back.
 	 */
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("free descriptor isn't"));
 	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Check to see whether n user file descriptors are available to the process
  * p.
  *
  * Returns 1 if at least n descriptors could be allocated, 0 otherwise.
  * The limit used is the smaller of the process's RLIMIT_NOFILE and
  * maxfilesperproc.  The filedesc lock must be held by the caller.
  */
 int
 fdavail(struct thread *td, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file **fpp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	PROC_LOCK(p);
 	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	/*
 	 * Slots between the current table size and the limit count as
 	 * available.  If they cover the request we are done; otherwise n
 	 * has been reduced by that amount for the scan below.
 	 */
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	/* Count empty slots from the first-free hint up to the limit. */
 	last = min(fdp->fd_nfiles, lim);
 	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
 	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
 		if (*fpp == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  *
  * On success returns 0 and stores the file in *resultfp and the
  * descriptor number in *resultfd; either pointer may be NULL.  Returns
  * ENFILE when the system-wide open file limit is hit, or the error from
  * fdalloc() when no descriptor can be allocated.
  */
 int
 falloc(struct thread *td, struct file **resultfp, int *resultfd)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp;
 	int error, i;
 	/* Threshold above which PRIV_MAXFILES is required: 95% of maxfiles. */
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	static struct timeval lastfail;
 	static int curfail;
 
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	if ((openfiles >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles >= maxfiles) {
 		/* ppsratecheck() rate-limits the console message. */
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
 				td->td_ucred->cr_ruid);
 		}
 		uma_zfree(file_zone, fp);
 		return (ENFILE);
 	}
 	atomic_add_int(&openfiles, 1);
 
 	/*
 	 * Initialize the new file: one reference for the descriptor table
 	 * plus, when the caller asked for the file pointer, one reference
 	 * for the caller.  The ops start out as badfileops.
 	 */
 	fp->f_count = 1;
 	if (resultfp)
 		fp->f_count++;
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
 	FILEDESC_XLOCK(p->p_fd);
 	if ((error = fdalloc(td, 0, &i))) {
 		FILEDESC_XUNLOCK(p->p_fd);
 		/* Drop every reference taken above so the file is released. */
 		fdrop(fp, td);
 		if (resultfp)
 			fdrop(fp, td);
 		return (error);
 	}
 	p->p_fd->fd_ofiles[i] = fp;
 	FILEDESC_XUNLOCK(p->p_fd);
 	if (resultfp)
 		*resultfp = fp;
 	if (resultfd)
 		*resultfd = i;
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * fdp may be NULL, in which case the new structure starts without any
  * directory vnodes.  The returned table is empty, uses the storage
  * embedded in the allocation (NDFILE slots) and carries one reference
  * and one hold.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp)
 {
 	struct filedesc0 *fd0;
 	struct filedesc *nfdp;
 
 	fd0 = malloc(sizeof(*fd0), M_FILEDESC, M_WAITOK | M_ZERO);
 	nfdp = &fd0->fd_fd;
 	FILEDESC_LOCK_INIT(nfdp);
 
 	if (fdp != NULL) {
 		/* Inherit the directory vnodes, taking a reference on each. */
 		FILEDESC_XLOCK(fdp);
 		nfdp->fd_cdir = fdp->fd_cdir;
 		if (nfdp->fd_cdir != NULL)
 			VREF(nfdp->fd_cdir);
 		nfdp->fd_rdir = fdp->fd_rdir;
 		if (nfdp->fd_rdir != NULL)
 			VREF(nfdp->fd_rdir);
 		nfdp->fd_jdir = fdp->fd_jdir;
 		if (nfdp->fd_jdir != NULL)
 			VREF(nfdp->fd_jdir);
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	/* Create the file descriptor table. */
 	nfdp->fd_refcnt = 1;
 	nfdp->fd_holdcnt = 1;
 	nfdp->fd_cmask = CMASK;
 	nfdp->fd_ofiles = fd0->fd_dfiles;
 	nfdp->fd_ofileflags = fd0->fd_dfileflags;
 	nfdp->fd_nfiles = NDFILE;
 	nfdp->fd_map = fd0->fd_dmap;
 	nfdp->fd_lastfile = -1;
 
 	return (nfdp);
 }
 
 /*
  * Take a hold on the filedesc structure of process p.
  *
  * Returns p->p_fd with fd_holdcnt incremented, or NULL if the process has
  * no filedesc.  The pointer is read and the count bumped under fdesc_mtx.
  */
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *held;
 
 	mtx_lock(&fdesc_mtx);
 	held = p->p_fd;
 	if (held != NULL)
 		held->fd_holdcnt++;
 	mtx_unlock(&fdesc_mtx);
 
 	return (held);
 }
 
 /*
  * Release a hold on a filedesc structure.
  *
  * fd_holdcnt is decremented under fdesc_mtx; when no holds remain the
  * structure's lock is destroyed and its memory is freed.
  */
 static void
 fddrop(struct filedesc *fdp)
 {
 	int remaining;
 
 	mtx_lock(&fdesc_mtx);
 	fdp->fd_holdcnt--;
 	remaining = fdp->fd_holdcnt;
 	mtx_unlock(&fdesc_mtx);
 
 	if (remaining <= 0) {
 		FILEDESC_LOCK_DESTROY(fdp);
 		FREE(fdp, M_FILEDESC);
 	}
 }
 
 /*
  * Share a filedesc structure.
  *
  * Adds one reference to fdp under its exclusive lock and returns the same
  * pointer for the caller's convenience.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	FILEDESC_XLOCK(fdp);
 	fdp->fd_refcnt += 1;
 	FILEDESC_XUNLOCK(fdp);
 
 	return (fdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  *
  * When p's descriptor table has more than one reference, a private copy
  * is made with fdcopy(), the thread's reference on the shared table is
  * released with fdfree(), and the copy is installed in p.  A table that
  * is not shared is left alone.
  */
 void
 fdunshare(struct proc *p, struct thread *td)
 {
 	struct filedesc *copy;
 	int shared;
 
 	FILEDESC_XLOCK(p->p_fd);
 	shared = (p->p_fd->fd_refcnt > 1);
 	FILEDESC_XUNLOCK(p->p_fd);
 	if (!shared)
 		return;
 
 	copy = fdcopy(p->p_fd);
 	fdfree(td);
 	p->p_fd = copy;
 }
 
 /*
  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
  * this is to ease callers, not catch errors.
  *
  * The copy gets its own table (via fdinit()) holding an extra reference
  * to every open file of the source except kqueue descriptors, together
  * with the per-descriptor flags and the source's fd_cmask.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	if (fdp == NULL)
 		return (NULL);
 
 	newfdp = fdinit(fdp);
 	FILEDESC_SLOCK(fdp);
 	/*
 	 * Grow the new table until it can hold the source's highest
 	 * descriptor.  The source lock is dropped around the growth, so
 	 * fd_lastfile is re-checked on every pass.
 	 */
 	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 		FILEDESC_SUNLOCK(fdp);
 		FILEDESC_XLOCK(newfdp);
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 		FILEDESC_XUNLOCK(newfdp);
 		FILEDESC_SLOCK(fdp);
 	}
 	/* copy everything except kqueue descriptors */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		if (fdisused(fdp, i) &&
 		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) {
 			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
 			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
 			fhold(newfdp->fd_ofiles[i]);
 			newfdp->fd_lastfile = i;
 		} else {
 			/* Remember the first slot left empty in the copy. */
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	/* Mark every copied slot as used in the new table's bitmap. */
 	FILEDESC_XLOCK(newfdp);
 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
 		if (newfdp->fd_ofiles[i] != NULL)
 			fdused(newfdp, i);
 	FILEDESC_XUNLOCK(newfdp);
 	FILEDESC_SLOCK(fdp);
 	/*
 	 * No hole was found while copying: i is now one past the copy's
 	 * last used descriptor, so use that as the first-free hint.
 	 */
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Release a filedesc structure.
  *
  * Drops the calling thread's process reference on its descriptor table.
  * The function works in three stages:
  *  1. If the process has a filedesc_to_leader record, POSIX advisory
  *     locks are released when this is the record's last reference, the
  *     function waits for outstanding holds, and the record is
  *     unlinked and freed when nothing references it any more.
  *  2. fd_refcnt is decremented; if other references remain the function
  *     returns.
  *  3. Otherwise every open file is closed, the table and bitmap are
  *     freed if they were grown beyond the initial size, the directory
  *     vnodes are released and the structure's hold is dropped.
  *
  * A process without a descriptor table is a no-op.
  */
 void
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
 	struct vnode *cdir, *jdir, *rdir, *vp;
 	struct flock lf;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/* Check for special need to clear POSIX style locks */
 	fdtol = td->td_proc->p_fdtol;
 	if (fdtol != NULL) {
 		FILEDESC_XLOCK(fdp);
 		KASSERT(fdtol->fdl_refcount > 0,
 			("filedesc_to_refcount botch: fdl_refcount=%d",
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			for (i = 0, fpp = fdp->fd_ofiles;
 			     i <= fdp->fd_lastfile;
 			     i++, fpp++) {
 				if (*fpp == NULL ||
 				    (*fpp)->f_type != DTYPE_VNODE)
 					continue;
 				fp = *fpp;
 				/*
 				 * Keep the file alive while the filedesc
 				 * lock is dropped for the unlock request.
 				 */
 				fhold(fp);
 				FILEDESC_XUNLOCK(fdp);
 				/* Whole-file unlock request. */
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)td->td_proc->
 						   p_leader,
 						   F_UNLCK,
 						   &lf,
 						   F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
 				/*
 				 * Recompute the cursor from the current
 				 * table pointer since the lock was dropped.
 				 */
 				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
 		if (fdtol->fdl_refcount == 1) {
 			if (fdp->fd_holdleaderscount > 0 &&
 			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 				/*
 				 * close() or do_dup() has cleared a reference
 				 * in a shared file descriptor table.
 				 */
 				fdp->fd_holdleaderswakeup = 1;
 				sx_sleep(&fdp->fd_holdleaderscount,
 				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 			if (fdtol->fdl_holdcount > 0) {
 				/*
 				 * Ensure that fdtol->fdl_leader remains
 				 * valid in closef().
 				 */
 				fdtol->fdl_wakeup = 1;
 				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 				    "fdlhold", 0);
 				goto retry;
 			}
 		}
 		fdtol->fdl_refcount--;
 		if (fdtol->fdl_refcount == 0 &&
 		    fdtol->fdl_holdcount == 0) {
 			/* Unlink the record from its doubly linked ring. */
 			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 		} else
 			fdtol = NULL;
 		td->td_proc->p_fdtol = NULL;
 		FILEDESC_XUNLOCK(fdp);
 		/* fdtol is non-NULL only if it was unlinked just above. */
 		if (fdtol != NULL)
 			FREE(fdtol, M_FILEDESC_TO_LEADER);
 	}
 	FILEDESC_XLOCK(fdp);
 	i = --fdp->fd_refcnt;
 	FILEDESC_XUNLOCK(fdp);
 	if (i > 0)
 		return;
 	/*
 	 * We are the last reference to the structure, so we can
 	 * safely assume it will not change out from under us.
 	 */
 	fpp = fdp->fd_ofiles;
 	/* Visits slots 0 through fd_lastfile (fd_lastfile + 1 passes). */
 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
 		if (*fpp)
 			(void) closef(*fpp, td);
 	}
 	FILEDESC_XLOCK(fdp);
 
 	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
 	/* Only tables grown beyond the initial size are freed here. */
 	if (fdp->fd_nfiles > NDFILE)
 		FREE(fdp->fd_ofiles, M_FILEDESC);
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		FREE(fdp->fd_map, M_FILEDESC);
 
 	fdp->fd_nfiles = 0;
 
 	/* Detach the directory vnodes under the lock; release them after. */
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
 	if (cdir) {
 		locked = VFS_LOCK_GIANT(cdir->v_mount);
 		vrele(cdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (rdir) {
 		locked = VFS_LOCK_GIANT(rdir->v_mount);
 		vrele(rdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (jdir) {
 		locked = VFS_LOCK_GIANT(jdir->v_mount);
 		vrele(jdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 
 	fddrop(fdp);
 }
 
 /*
  * For setugid programs, we don't want people to use that setugidness
  * to generate error messages which write to a file which would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  *
  * Returns 1 if fp is a vnode-backed file whose vnode has VV_PROCDEP set,
  * 0 otherwise.
  */
 static int
 is_unsafe(struct file *fp)
 {
 	struct vnode *vp;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (0);
 	vp = fp->f_vnode;
 	return ((vp->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setugid thing safe, if at all possible.
  *
  * Closes each of descriptors 0, 1 and 2 that is open and that is_unsafe()
  * flags.  A process without a descriptor table is a no-op.
  */
 void
 setugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/*
 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
 	 * we are blocked in a close.  Be careful!
 	 */
 	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		/* Only stdin, stdout and stderr are examined. */
 		if (i > 2)
 			break;
 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
 			struct file *fp;
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
 			/* closef() is called without the filedesc lock. */
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 			FILEDESC_XLOCK(fdp);
 		}
 	}
 	FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  *
  * Nothing happens when slot idx no longer refers to fp.
  */
 void
 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 	int ours;
 
 	FILEDESC_XLOCK(fdp);
 	ours = (fdp->fd_ofiles[idx] == fp);
 	if (ours) {
 		fdp->fd_ofiles[idx] = NULL;
 		fdunused(fdp, idx);
 	}
 	FILEDESC_XUNLOCK(fdp);
 
 	/* The table's reference is dropped after the lock is released. */
 	if (ours)
 		fdrop(fp, td);
 }
 
 /*
  * Close any files on exec?
  *
  * Closes every open descriptor that either has UF_EXCLOSE set in its
  * per-descriptor flags or refers to a DTYPE_MQUEUE file.  A process
  * without a descriptor table is a no-op.
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	FILEDESC_XLOCK(fdp);
 
 	/*
 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
 	 * may block and rip them out from under us.
 	 */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (fdp->fd_ofiles[i] != NULL &&
 		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
 			struct file *fp;
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
 			if (fp->f_type == DTYPE_MQUEUE)
 				mq_fdclose(td, i, fp);
 			/* closef() is called without the filedesc lock. */
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 			FILEDESC_XLOCK(fdp);
 		}
 	}
 	FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  *
  * The first missing descriptor is filled by opening /dev/null; any
  * further missing ones are filled by duplicating that descriptor onto
  * them.  Returns 0 on success (or when the process has no descriptor
  * table), otherwise the error from kern_open() or do_dup().
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t retval, save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return (0);
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	devnull = -1;
 	error = 0;
 	for (i = 0; i < 3; i++) {
 		if (fdp->fd_ofiles[i] != NULL)
 			continue;
 		if (devnull < 0) {
 			/*
 			 * kern_open() reports the new descriptor through
 			 * td_retval[0]; preserve the caller's value.
 			 */
 			save = td->td_retval[0];
 			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
 			    O_RDWR, 0);
 			devnull = td->td_retval[0];
 			td->td_retval[0] = save;
 			if (error)
 				break;
 			/*
 			 * Only meaningful once the open has succeeded: on
 			 * failure td_retval[0] does not hold a descriptor,
 			 * and asserting on it would turn an ordinary error
 			 * return into a panic.
 			 */
 			KASSERT(devnull == i, ("oof, we didn't get our fd"));
 		} else {
 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * For vnode-backed files closed on behalf of a thread, POSIX advisory
  * locks are released first (see below).  Returns the result of fdrop().
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		int vfslocked;
 
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		/* Release locks owned by this process's leader. */
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 					   F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			/*
 			 * Walk the ring of leader records, starting after
 			 * our own and stopping when we come back to it.
 			 */
 			for (fdtol = fdtol->fdl_next;
 			     fdtol != td->td_proc->p_fdtol;
 			     fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				     P_ADVLOCK) == 0)
 					continue;
 				/*
 				 * Hold the record while the filedesc lock
 				 * is dropped for the unlock request.
 				 */
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)fdtol->fdl_leader,
 						   F_UNLCK, &lf, F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				/* Wake a sleeper waiting for the hold to go. */
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The flag, type and data fields are filled in with plain stores; the ops
  * pointer is written last with release semantics so that those fields are
  * visible by the time ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 
 	fp->f_flag = flag;
 	fp->f_type = type;
 	fp->f_data = data;
 
 	/* Publish the ops only after everything above has been stored. */
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist, EBADF is returned.  EBADF is also
  * returned when td is NULL, when the process has no descriptor table, and
  * when the file's ops are still badfileops.
  *
  * If the descriptor exists but doesn't match 'flags' (FREAD or FWRITE)
  * then EBADF is returned for both read and write attempts.
  *
  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
  * It should be dropped with fdrop().  If it is not set, then the refcount
  * will not be bumped however the thread's filedesc struct will be returned
  * locked (for fgetsock).
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is set and zero is returned.
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
 	FILEDESC_SLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
 		FILEDESC_SUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
 		FILEDESC_SUNLOCK(fdp);
 		return (EBADF);
 	}
 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
 		FILEDESC_SUNLOCK(fdp);
 		return (EBADF);
 	}
 	/*
 	 * With hold == 0 the shared filedesc lock is deliberately left
 	 * held on the success path; the caller must release it.
 	 */
 	if (hold) {
 		fhold(fp);
 		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * Look up descriptor fd of thread td without an access-mode requirement
  * and return the file, with a reference held, in *fpp.  Returns 0 or the
  * error from _fget().
  */
 int
 fget(struct thread *td, int fd, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, 0, 1);
 	return (error);
 }
 
 int
 fget_read(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FREAD, 1));
 }
 
 int
 fget_write(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FWRITE, 1));
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * Returns 0 with *vpp set, the error from _fget(), or EINVAL when the
  * file has no vnode; *vpp is NULL on any error.
  *
  * XXX: what about the unused flags ?
  * The 'flags' argument is currently ignored: _fget() is always called
  * with flags 0, so no FREAD/FWRITE check is applied here.
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	/* hold == 0: on success the filedesc lock is still held shared. */
 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
 	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
  * Return in *vpp a referenced vnode for descriptor fd of thread td.
  * Returns 0 or the error from _fgetvp().
  */
 int
 fgetvp(struct thread *td, int fd, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, vpp, 0);
 	return (error);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, vpp, FREAD));
 }
 
 #ifdef notyet
 /*
  * Like fgetvp(), passing FWRITE as the requested access mode to
  * _fgetvp().  Only compiled when "notyet" is defined.
  */
 int
 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, vpp, FWRITE);
 	return (error);
 }
 #endif
 
 /*
  * Like fget() but loads the underlying socket, or returns an error if the
  * descriptor does not represent a socket.
  *
  * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
  * in the future.
  *
  * On success *spp is the socket and, if fflagp is not NULL, *fflagp is
  * the file's f_flag.  On failure *spp is NULL and *fflagp (if supplied)
  * is 0; the error is the one from _fget() or ENOTSOCK.
  *
  * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely
  * on their file descriptor reference to prevent the socket from being free'd
  * during use.
  */
 int
 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
 {
 	struct file *fp;
 	int error;
 
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 	/* hold == 0: on success the filedesc lock is still held shared. */
 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
 	} else {
 		*spp = fp->f_data;
 		if (fflagp)
 			*fflagp = fp->f_flag;
 		SOCK_LOCK(*spp);
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
 	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in the
  * future.  The last reference closes the socket.
  *
  * XXXRW: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
 	/*
 	 * Both the accept lock and the socket lock are acquired here and are
 	 * not released by this function; presumably sorele() drops them --
 	 * confirm against its definition.
 	 */
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sorele(so);
 }
 
 /*
  * Handle the last reference to a file being closed.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 
 	error = 0;
 	if (fp->f_count != 0)
 		panic("fdrop: count %d", fp->f_count);
 	if (fp->f_ops != &badfileops)
 		error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	int vfslocked;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  *
  * 'dfd' is the descriptor to duplicate and 'indx' the slot that receives
  * it; 'mode' is the requested open mode and 'error' selects the operation
  * (ENODEV: dup, ENXIO: move, anything else: returned unchanged).  Returns 0
  * on success, EBADF if 'dfd' is out of range or not open, EACCES if 'mode'
  * asks for more access than 'dfd' has (ENODEV case only).
  *
  * NOTE(review): 'indx' is not range-checked here; presumably the caller
  * guarantees it is a valid slot -- confirm.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
 {
 	struct file *wfp;
 	struct file *fp;
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 *
 	 * Any other error code is just returned.
 	 */
 	switch (error) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		/* Remember the previous occupant of (indx), if any. */
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = wfp;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		/* The slot only needs marking as used if it was empty. */
 		if (fp == NULL)
 			fdused(fdp, indx);
 		/* The table now references wfp twice. */
 		fhold(wfp);
 		FILEDESC_XUNLOCK(fdp);
 		if (fp != NULL)
 			/*
 			 * We now own the reference to fp that the ofiles[]
 			 * array used to own.  Release it.
 			 */
 			fdrop(fp, td);
 		return (0);
 
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
 		fdp->fd_ofiles[dfd] = NULL;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		fdp->fd_ofileflags[dfd] = 0;
 		fdunused(fdp, dfd);
 		if (fp == NULL)
 			fdused(fdp, indx);
 		FILEDESC_XUNLOCK(fdp);
 
 		/*
 		 * We now own the reference to fp that the ofiles[] array
 		 * used to own.  Release it.
 		 */
 		if (fp != NULL)
 			fdrop(fp, td);
 		return (0);
 
 	default:
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Walk every process and redirect any current or root directory that
  * points at `olddp' to `newdp'.  The global rootvnode gets the same
  * treatment.  Each replaced pointer gains a reference on `newdp' and gives
  * up one on `olddp'.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int nrele;
 
 	/* Nothing to do when olddp has exactly one reference. */
 	if (vrefcnt(olddp) == 1)
 		return;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if ((fdp = fdhold(p)) == NULL)
 			continue;
 		nrele = 0;
 		FILEDESC_XLOCK(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vref(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 		/* Release the old references after the table is unlocked. */
 		for (; nrele > 0; nrele--)
 			vrele(olddp);
 	}
 	sx_sunlock(&allproc_lock);
 
 	if (rootvnode == olddp) {
 		vrele(rootvnode);
 		vref(newdp);
 		rootvnode = newdp;
 	}
 }
 
 /*
  * Allocate and initialise a filedesc_to_leader record for `leader'.
  *
  * With `old' == NULL the new record forms a ring containing only itself.
  * Otherwise it is linked into old's ring right after `old', with the
  * filedesc lock of `fdp' held exclusively while the links are updated.
  */
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol, *succ;
 
 	MALLOC(fdtol, struct filedesc_to_leader *,
 	       sizeof(struct filedesc_to_leader),
 	       M_FILEDESC_TO_LEADER,
 	       M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 
 	if (old == NULL) {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 		return (fdtol);
 	}
 
 	FILEDESC_XLOCK(fdp);
 	succ = old->fdl_next;
 	fdtol->fdl_next = succ;
 	fdtol->fdl_prev = old;
 	old->fdl_next = fdtol;
 	succ->fdl_prev = fdtol;
 	FILEDESC_XUNLOCK(fdp);
 	return (fdtol);
 }
 
 /*
  * Get file structures globally.
  *
  * sysctl handler that emits one struct xfile per open descriptor of every
  * process the requesting thread may see.  When no output buffer is supplied
  * (req->oldptr == NULL) only a size estimate is reported.  Returns 0 or the
  * first error from sysctl_wire_old_buffer()/SYSCTL_OUT().
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	/* Size probe: sum fd_lastfile over all processes. */
 	if (req->oldptr == NULL) {
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			if (p->p_state == PRS_NEW)
 				continue;
 			fdp = fdhold(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			if (fdp->fd_lastfile > 0)
 				n += fdp->fd_lastfile;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	/* xf is reused for every record; fields not set below stay zero. */
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		PROC_LOCK(p);
 		/* Skip processes the requester is not allowed to see. */
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		PROC_UNLOCK(p);
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		/* The scan stops early once fd_refcnt drops to zero. */
 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n]) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
 			xf.xf_offset = fp->f_offset;
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		/* Propagate an output error out of the process loop too. */
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	char *fullpath, *freepath;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	int error, i, *name;
 	struct socket *so;
 	struct vnode *vp;
 	struct file *fp;
 	struct proc *p;
 	int vfslocked;
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	if ((error = p_candebug(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < fdp->fd_nfiles; i++) {
 		if ((fp = fdp->fd_ofiles[i]) == NULL)
 			continue;
 		bzero(kif, sizeof(*kif));
 		kif->kf_structsize = sizeof(*kif);
 		vp = NULL;
 		so = NULL;
 		kif->kf_fd = i;
 		switch (fp->f_type) {
 		case DTYPE_VNODE:
 			kif->kf_type = KF_TYPE_VNODE;
 			vp = fp->f_vnode;
 			break;
 
 		case DTYPE_SOCKET:
 			kif->kf_type = KF_TYPE_SOCKET;
 			so = fp->f_data;
 			break;
 
 		case DTYPE_PIPE:
 			kif->kf_type = KF_TYPE_PIPE;
 			break;
 
 		case DTYPE_FIFO:
 			kif->kf_type = KF_TYPE_FIFO;
 			vp = fp->f_vnode;
 			vref(vp);
 			break;
 
 		case DTYPE_KQUEUE:
 			kif->kf_type = KF_TYPE_KQUEUE;
 			break;
 
 		case DTYPE_CRYPTO:
 			kif->kf_type = KF_TYPE_CRYPTO;
 			break;
 
 		case DTYPE_MQUEUE:
 			kif->kf_type = KF_TYPE_MQUEUE;
 			break;
 
 		default:
 			kif->kf_type = KF_TYPE_UNKNOWN;
 			break;
 		}
 		kif->kf_ref_count = fp->f_count;
 		if (fp->f_flag & FREAD)
 			kif->kf_flags |= KF_FLAG_READ;
 		if (fp->f_flag & FWRITE)
 			kif->kf_flags |= KF_FLAG_WRITE;
 		if (fp->f_flag & FAPPEND)
 			kif->kf_flags |= KF_FLAG_APPEND;
 		if (fp->f_flag & FASYNC)
 			kif->kf_flags |= KF_FLAG_ASYNC;
 		if (fp->f_flag & FFSYNC)
 			kif->kf_flags |= KF_FLAG_FSYNC;
 		if (fp->f_flag & FNONBLOCK)
 			kif->kf_flags |= KF_FLAG_NONBLOCK;
 		if (fp->f_flag & O_DIRECT)
 			kif->kf_flags |= KF_FLAG_DIRECT;
 		if (fp->f_flag & FHASLOCK)
 			kif->kf_flags |= KF_FLAG_HASLOCK;
 		kif->kf_offset = fp->f_offset;
 		if (vp != NULL) {
 			vref(vp);
 			switch (vp->v_type) {
 			case VNON:
 				kif->kf_vnode_type = KF_VTYPE_VNON;
 				break;
 			case VREG:
 				kif->kf_vnode_type = KF_VTYPE_VREG;
 				break;
 			case VDIR:
 				kif->kf_vnode_type = KF_VTYPE_VDIR;
 				break;
 			case VBLK:
 				kif->kf_vnode_type = KF_VTYPE_VBLK;
 				break;
 			case VCHR:
 				kif->kf_vnode_type = KF_VTYPE_VCHR;
 				break;
 			case VLNK:
 				kif->kf_vnode_type = KF_VTYPE_VLNK;
 				break;
 			case VSOCK:
 				kif->kf_vnode_type = KF_VTYPE_VSOCK;
 				break;
 			case VFIFO:
 				kif->kf_vnode_type = KF_VTYPE_VFIFO;
 				break;
 			case VBAD:
 				kif->kf_vnode_type = KF_VTYPE_VBAD;
 				break;
 			default:
 				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
 				break;
 			}
 			/*
 			 * It is OK to drop the filedesc lock here as we will
 			 * re-validate and re-evaluate its properties when
 			 * the loop continues.
 			 */
 			freepath = NULL;
 			fullpath = "-";
 			FILEDESC_SUNLOCK(fdp);
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			vn_fullpath(curthread, vp, &fullpath, &freepath);
 			vput(vp);
 			VFS_UNLOCK_GIANT(vfslocked);
 			strlcpy(kif->kf_path, fullpath,
 			    sizeof(kif->kf_path));
 			if (freepath != NULL)
 				free(freepath, M_TEMP);
 			FILEDESC_SLOCK(fdp);
 		}
 		if (so != NULL) {
 			struct sockaddr *sa;
 
 			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
 			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
 				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
 				free(sa, M_SONAME);
 			}
 			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
 			    == 00 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
 				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
 				free(sa, M_SONAME);
 			}
 			kif->kf_sock_domain =
 			    so->so_proto->pr_domain->dom_family;
 			kif->kf_sock_type = so->so_type;
 			kif->kf_sock_protocol = so->so_proto->pr_protocol;
 		}
 		error = SYSCTL_OUT(req, kif, sizeof(*kif));
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(kif, M_TEMP);
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
     sysctl_kern_proc_filedesc, "Process filedesc entries");
 
 #ifdef DDB
 /*
  * Debugger helper: map a file type code to a short printable tag.  Codes
  * without an entry here yield "unkn".
  */
 static const char *
 file_type_to_name(short type)
 {
 	const char *tag;
 
 	switch (type) {
 	case 0:
 		tag = "zero";
 		break;
 	case DTYPE_VNODE:
 		tag = "vnod";
 		break;
 	case DTYPE_SOCKET:
 		tag = "sock";
 		break;
 	case DTYPE_PIPE:
 		tag = "pipe";
 		break;
 	case DTYPE_FIFO:
 		tag = "fifo";
 		break;
 	case DTYPE_KQUEUE:
 		tag = "kque";
 		break;
 	case DTYPE_CRYPTO:
 		tag = "crpt";
 		break;
 	case DTYPE_MQUEUE:
 		tag = "mque";
 		break;
 	case DTYPE_SHM:
 		tag = "shm";
 		break;
 	default:
 		tag = "unkn";
 		break;
 	}
 	return (tag);
 }
 
 /*
  * Debugger helper: find a process whose descriptor array contains `fp'.
  * Several processes may qualify; the first one encountered is returned.
  * NULL means no process references the file.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW || (fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++)
 			if (fdp->fd_ofiles[n] == fp)
 				return (p);
 	}
 	return (NULL);
 }
 
 /*
  * Print one line describing `fp' in the debugger, preceded by a column
  * header line when `header' is non-zero.  The pid and command name shown
  * belong to the process returned by file_to_first_proc(), or are -1 and "-"
  * when there is none.
  */
 static void
 db_print_file(struct file *fp, int header)
 {
 	const char *cmd;
 	struct proc *p;
 	int pid;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 	p = file_to_first_proc(fp);
 	if (p != NULL) {
 		pid = p->p_pid;
 		cmd = p->p_comm;
 	} else {
 		pid = -1;
 		cmd = "-";
 	}
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    0, fp->f_count, 0, fp->f_vnode, pid, cmd);
 }
 
 /*
  * "show file <addr>": print the file structure at the given address,
  * including the column header.
  */
 DB_SHOW_COMMAND(file, db_show_file)
 {
 
 	if (have_addr)
 		db_print_file((struct file *)addr, 1);
 	else
 		db_printf("usage: show file <addr>\n");
 }
 
 /*
  * "show files": print every open file of every process.  The column header
  * is printed once, in front of the first file.
  */
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header, n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++) {
 			fp = fdp->fd_ofiles[n];
 			if (fp != NULL) {
 				db_print_file(fp, header);
 				header = 0;
 			}
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 
 /*
  * One-time initialisation, registered via SYSINIT below: creates the UMA
  * zone that struct file objects are allocated from and initialises the
  * sigio and fdesc mutexes.  The argument is unused.
  */
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
 
 /*-------------------------------------------------------------------*/
 
 /* Read/write method of badfileops: always fails with EBADF. */
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /* Truncate method of badfileops: always fails with EINVAL. */
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /* Ioctl method of badfileops: always fails with EBADF. */
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /* Poll method of badfileops: always returns 0. */
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td)
 {
 
 	return (0);
 }
 
 /* Kqueue filter method of badfileops: always fails with EBADF. */
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 /* Stat method of badfileops: always fails with EBADF. */
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /* Close method of badfileops: always fails with EBADF. */
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * File operations vector whose methods are all the badfo_*() stubs above.
  * _fdrop() compares a file's f_ops against this vector and skips the close
  * method when they match.
  */
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 };
 
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /*
  * Open routine of the /dev/fd/N devices.  Records the device unit number
  * in td->td_dupfd and always returns ENODEV; 'mode' and 'type' are unused.
  */
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen(). Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 /*
  * Character device switch for the file descriptor devices.  Only an open
  * routine is provided, and the driver is flagged D_NEEDGIANT.
  */
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 /*
  * Create the device nodes fd/0, fd/1 and fd/2 (units 0-2, owned by
  * root:wheel, mode 0666) and give them the aliases stdin, stdout and
  * stderr respectively.  The argument is unused.
  */
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL)
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 175201)
+++ head/sys/kern/kern_exec.c	(revision 175202)
@@ -1,1310 +1,1310 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/acct.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 static void exec_free_args(struct image_args *);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 /*
  * Export the current process's sv_psstrings value.  When SCTL_MASK32 is
  * defined and set in the request flags, the value is narrowed to an
  * unsigned int before being copied out.
  */
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;
 #ifdef SCTL_MASK32
 	unsigned int val;
 
 	if (req->flags & SCTL_MASK32) {
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		return (SYSCTL_OUT(req, &val, sizeof(val)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 	    sizeof(p->p_sysent->sv_psstrings)));
 }
 
 /*
  * Export the current process's sv_usrstack value.  When SCTL_MASK32 is
  * defined and set in the request flags, the value is narrowed to an
  * unsigned int before being copied out.
  */
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;
 #ifdef SCTL_MASK32
 	unsigned int val;
 
 	if (req->flags & SCTL_MASK32) {
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		return (SYSCTL_OUT(req, &val, sizeof(val)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 	    sizeof(p->p_sysent->sv_usrstack)));
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 /*
  * execve() system call: copy the path, argument and environment vectors
  * in from user space and pass them to kern_execve() with no MAC label.
  */
 int
 execve(td, uap)
 	struct thread *td;
 	struct execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 	} */ *uap;
 {
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &args, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve() system call: like execve(), but also hands the caller's
  * MAC label pointer to kern_execve().  Without MAC support compiled in, the
  * call fails with ENOSYS.
  */
 int
 __mac_execve(td, uap)
 	struct thread *td;
 	struct __mac_execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 		struct mac *mac_p;
 	} */ *uap;
 {
 #ifdef MAC
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &args, uap->mac_p));
 #else
 	return (ENOSYS);
 #endif
 }
 
 /*
  * XXX: kern_execve has the astonishing property of not always returning to
  * the caller.  If sufficiently bad things happen during the call to
  * do_execve(), it can end up calling exit1(); as a result, callers must
  * avoid doing anything which they might need to undo (e.g., allocating
  * memory).
  */
 int
 kern_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p;
 	int busy, error;
 
 	p = td->td_proc;
 	AUDIT_ARG(argv, args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG(envv, args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 
 	/*
 	 * A process that has had threads must enter single-threading first.
 	 * If that fails, the arguments are released and the call is
 	 * restarted.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		busy = thread_single(SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 		if (busy) {
 			exec_free_args(args);
 			return (ERESTART);	/* Try again later. */
 		}
 	}
 
 	error = do_execve(td, args, mac_p);
 
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		/*
 		 * A failed exec just leaves single-threading mode.  After
 		 * a successful one we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error != 0)
 			thread_single_end();
 		else
 			thread_single(SINGLE_EXIT);
 		PROC_UNLOCK(p);
 	}
 
 	return (error);
 }
 
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
  */
 static int
 do_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd, *ndp;
 	struct ucred *newcred = NULL, *oldcred;
 	struct uidinfo *euip;
 	register_t *stack_base;
 	int error, len, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts, *newsigacts;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *textvp = NULL;
 	int credential_changing;
 	int vfslocked;
 	int textset;
 #ifdef MAC
 	struct label *interplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 
 	vfslocked = 0;
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->entry_addr = 0;
 	imgp->vmspace_destroyed = 0;
 	imgp->interpreted = 0;
 	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
 	imgp->auxargs = NULL;
 	imgp->vp = NULL;
 	imgp->object = NULL;
 	imgp->firstpage = NULL;
 	imgp->ps_strings = 0;
 	imgp->auxarg_size = 0;
 	imgp->args = args;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	imgp->image_header = NULL;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	ndp = &nd;
 	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
 	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 
 interpret:
 	error = namei(ndp);
 	if (error)
 		goto exec_fail;
 
 	vfslocked = NDHASGIANT(ndp);
 	imgp->vp = ndp->ni_vp;
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = imgp->vp->v_vflag & VV_TEXT;
 	imgp->vp->v_vflag |= VV_TEXT;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1) {
 			if (textset == 0)
 				imgp->vp->v_vflag &= ~VV_TEXT;
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		imgp->vp->v_vflag &= ~VV_TEXT;
 		/* free name buffer and old vnode */
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 #ifdef MAC
 		interplabel = mac_vnode_label_alloc();
 		mac_vnode_copy_label(ndp->ni_vp->v_label, interplabel);
 #endif
 		vput(ndp->ni_vp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		/* set new name to that of the interpreter */
 		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		goto interpret;
 	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	/*
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
 	fdunshare(p, td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	newcred = crget();
 	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/* close files on exec */
 	VOP_UNLOCK(imgp->vp, 0, td);
 	fdcloseexec(td);
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(ndp->ni_vp);
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	PROC_LOCK(p);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 		PROC_LOCK(p);
 		p->p_sigacts = newsigacts;
 	} else
 		oldsigacts = NULL;
 
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
 	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
 	p->p_comm[len] = 0;
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
 		p->p_flag &= ~P_PPWAIT;
 		wakeup(p->p_pptr);
 	}
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	oldcred = p->p_ucred;
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
 	    attr.va_uid;
 	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
 	    attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracevp != NULL &&
 		    priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
 			mtx_lock(&ktrace_mtx);
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 		}
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * setugidsafety() may call closef() and then pfind()
 		 * which may grab the process lock.
 		 * fdcheckstd() may call falloc() which may block to
 		 * allocate memory, so temporarily drop the process lock.
 		 */
 		PROC_UNLOCK(p);
 		setugidsafety(td);
 		VOP_UNLOCK(imgp->vp, 0, td);
 		error = fdcheckstd(td);
-		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
 		/*
 		 * Set the new credentials.
 		 */
 		crcopy(newcred, oldcred);
 		if (attr.va_mode & VSUID)
 			change_euid(newcred, euip);
 		if (attr.va_mode & VSGID)
 			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
 			    interplabel, imgp);
 		}
 #endif
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(newcred, newcred->cr_uid);
 		change_svgid(newcred, newcred->cr_gid);
 		p->p_ucred = newcred;
 		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			crcopy(newcred, oldcred);
 			change_svuid(newcred, newcred->cr_uid);
 			change_svgid(newcred, newcred->cr_gid);
 			p->p_ucred = newcred;
 			newcred = NULL;
 		}
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced prior
 	 * to locking the proc lock.
 	 */
 	textvp = p->p_textvp;
 	p->p_textvp = ndp->ni_vp;
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/*
 	 * If tracing the process, trap to debugger so breakpoints
 	 * can be set before the program executes.
 	 * Use tdsignal to deliver signal to current thread, use
 	 * psignal may cause the signal to be delivered to wrong thread
 	 * because that thread will exit, remember we are going to enter
 	 * single thread mode.
 	 */
 	if (p->p_flag & P_TRACED)
 		tdsignal(p, td, SIGTRAP, NULL);
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 *
 	 * The proc lock needs to be released before taking the PMC
 	 * SX.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		PROC_UNLOCK(p);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 	} else
 		PROC_UNLOCK(p);
 #else  /* !HWPMC_HOOKS */
 	PROC_UNLOCK(p);
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 	else
 		exec_setregs(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 
 	vfs_mark_atime(imgp->vp, td);
 
 done1:
 	/*
 	 * Free any resources malloc'd earlier that we didn't use.
 	 */
 	uifree(euip);
 	if (newcred == NULL)
 		crfree(oldcred);
 	else
 		crfree(newcred);
 	VOP_UNLOCK(imgp->vp, 0, td);
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (textvp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
 		vrele(textvp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	if (ndp->ni_vp && error != 0)
 		vrele(ndp->ni_vp);
 #ifdef KTRACE
 	if (tracevp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	if (oldargs != NULL)
 		pargs_drop(oldargs);
 	if (newargs != NULL)
 		pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 
 exec_fail_dealloc:
 
 	/*
 	 * free various allocated resources
 	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 		vput(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	if (error == 0) {
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 		goto done2;
 	}
 
 exec_fail:
 	/* we're done here, clear P_INEXEC */
 	PROC_LOCK(p);
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	if (interplabel != NULL)
 		mac_vnode_label_free(interplabel);
 #endif
 	VFS_UNLOCK_GIANT(vfslocked);
 	exec_free_args(args);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, W_EXITCODE(0, SIGABRT));
 		/* NOT REACHED */
 	}
 	return (error);
 }
 
 /*
  * Bring the first page of the executable's VM object into memory, hold it,
  * and map it through an sf_buf so that imgp->image_header points at its
  * contents.
  *
  * A first page that is already mapped on this imgp is released first.
  * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
  * page 0 could not be made valid by the pager.
  */
 int
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
 	int rv, i;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	/* Drop a mapping left over from a previous use of this imgp. */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_LOCK(object);
 #if VM_NRESERVLEVEL > 0
 	/* Give an uncoloured object colour 0 before any page is allocated. */
 	if ((object->flags & OBJ_COLORED) == 0) {
 		object->flags |= OBJ_COLORED;
 		object->pg_color = 0;
 	}
 #endif
 	/*
 	 * NOTE(review): the vm_page_wakeup() below suggests vm_page_grab()
 	 * returns page 0 busied -- confirm against vm_page_grab().
 	 */
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
 		/*
 		 * Page 0 is not fully valid: gather a run of following pages
 		 * (at most VM_INITIAL_PAGEIN, bounded by the object size) so
 		 * they can be paged in together with it.  The run ends at the
 		 * first page that already has valid data, is busy, or cannot
 		 * be allocated.
 		 */
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
 			initial_pagein = object->size;
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
 				if (ma[i]->valid)
 					break;
 				if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
 					break;
 				vm_page_busy(ma[i]);
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* i is the number of pages actually gathered in ma[]. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
 		/*
 		 * NOTE(review): page 0 is looked up again rather than reusing
 		 * ma[0]; presumably the pager may have replaced or freed it --
 		 * confirm.
 		 */
 		ma[0] = vm_page_lookup(object, 0);
 		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
 		    (ma[0]->valid == 0)) {
 			/* Page-in failed: discard page 0 if it still exists. */
 			if (ma[0]) {
 				vm_page_lock_queues();
 				vm_page_free(ma[0]);
 				vm_page_unlock_queues();
 			}
 			VM_OBJECT_UNLOCK(object);
 			return (EIO);
 		}
 	}
 	/* Hold the page so it stays around while it is mapped. */
 	vm_page_lock_queues();
 	vm_page_hold(ma[0]);
 	vm_page_unlock_queues();
 	vm_page_wakeup(ma[0]);
 	VM_OBJECT_UNLOCK(object);
 
 	/* Map the held page and expose it as the image header. */
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Undo exec_map_first_page(): release the sf_buf mapping of the image's
  * first page and drop the hold that was taken on the page.  Does nothing
  * when no first page is currently mapped.
  */
 void
 exec_unmap_first_page(imgp)
 	struct image_params *imgp;
 {
 	vm_page_t held_page;
 
 	if (imgp->firstpage == NULL)
 		return;
 
 	/* Remember the backing page before the sf_buf goes away. */
 	held_page = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 
 	vm_page_lock_queues();
 	vm_page_unhold(held_page);
 	vm_page_unlock_queues();
 }
 
 /*
  * Destroy old address space, and allocate a new stack
  *	The new stack is only SGROWSIZ large because it is grown
  *	automatically in trap.c.
  *
  *	imgp: image being executed; imgp->proc supplies the process, and
  *	      imgp->vmspace_destroyed / imgp->sysent are set here.
  *	sv:   sysentvec describing the new ABI (user address range, stack
  *	      top, stack protection and optional maximum stack size).
  *
  *	Returns 0 on success, or the error from vmspace_exec() or
  *	vm_map_stack().
  */
 int
 exec_new_vmspace(imgp, sv)
 	struct image_params *imgp;
 	struct sysentvec *sv;
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_offset_t stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	/*
 	 * Mark the old address space as gone before touching it, so that
 	 * any failure from here on is seen as unrecoverable.
 	 */
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* May be called with Giant held */
 	EVENTHANDLER_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		/* Sole user and same address range: empty the map in place. */
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
 		error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
 		if (error)
 			return (error);
 		/* vmspace_exec() changed p->p_vmspace; reload both pointers. */
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Allocate a new stack */
 	if (sv->sv_maxssiz != NULL)
 		ssiz = *sv->sv_maxssiz;
 	else
 		ssiz = maxssiz;
 	/* The stack occupies the ssiz bytes ending at sv_usrstack. */
 	stack_addr = sv->sv_usrstack - ssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
 #endif
 
 	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 	 * VM_STACK case, but they are still used to monitor the size of the
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  *
  *	args:   zeroed and then filled in; on success args->buf owns the
  *	        buffer and must later be released with exec_free_args().
  *	fname:  path of the image, read from kernel or user space as
  *	        selected by segflg.
  *	argv:   user-space argument vector; must not be NULL.
  *	envv:   user-space environment vector, or NULL for none.
  *
  *	Returns 0 on success.  On failure the buffer is released and an
  *	errno value is returned: EFAULT for a NULL argv or an unreadable
  *	vector entry, E2BIG when the strings do not fit in ARG_MAX, ENOMEM
  *	if the buffer cannot be allocated, or the error from
  *	copystr()/copyinstr().
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	char *argp, *envp;
 	int error;
 	size_t length;
 
 	error = 0;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 	/*
 	 * Allocate temporary demand zeroed space for argument and
 	 *	environment strings:
 	 *
 	 * o ARG_MAX for argument and environment;
 	 * o PATH_MAX for the file name (stored at buf + ARG_MAX);
 	 * o MAXSHELLCMDLEN for the name of interpreters.
 	 */
 	args->buf = (char *) kmem_alloc_wait(exec_map,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	if (args->buf == NULL)
 		return (ENOMEM);
 	args->begin_argv = args->buf;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	args->fname = args->buf + ARG_MAX;
 
 	/*
 	 * Copy the file name.
 	 */
 	error = (segflg == UIO_SYSSPACE) ?
 	    copystr(fname, args->fname, PATH_MAX, &length) :
 	    copyinstr(fname, args->fname, PATH_MAX, &length);
 	if (error != 0)
 		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 *
 	 * Each vector entry is fetched with fuword(); a NULL entry ends the
 	 * vector and -1 is treated as a fault reading the entry.
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
 		if (argp == (caddr_t) -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
 			/* Running out of string space is reported as E2BIG. */
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	/* Environment strings are packed directly after the arguments. */
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
 			if (envp == (caddr_t)-1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Give the temporary argument/environment buffer back to exec_map and
  * forget it.  Calling this again, or on an args structure that never got a
  * buffer, is a no-op.
  */
 static void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf == NULL)
 		return;
 
 	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	args->buf = NULL;
 }
 
 /*
  * Copy strings out to the new process address space, constructing new arg
  * and env vector tables. Return a pointer to the base so that it can be used
  * as the initial stack pointer.
  *
  * Layout written below the ps_strings structure, from high to low
  * addresses: signal trampoline code, SPARE_USRSPACE, the packed argument
  * and environment strings, optional room for auxargs, then the
  * argv[]/envp[] pointer tables (each NULL-terminated).
  *
  * NOTE(review): the return values of copyout() and suword() are not
  * checked anywhere in this function.
  */
 register_t *
 exec_copyout_strings(imgp)
 	struct image_params *imgp;
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp, *destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	int szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	if (p->p_sysent->sv_szsigcode != NULL)
 		szsigcode = *(p->p_sysent->sv_szsigcode);
 	/*
 	 * ARG_MAX - stringspace is the number of string bytes collected;
 	 * it is rounded up to pointer size before being subtracted.
 	 */
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode)
 		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
 		    szsigcode), szsigcode);
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size) *
 		    sizeof(char *));
 
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
 		    sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 *
 	 * stringp walks the kernel copy of the strings while destp tracks
 	 * the matching user address; both advance past each terminating NUL.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Check permissions of file to execute.
  *	Called with imgp->vp locked.
  *	Return 0 for success or error code on failure.
  */
 int
 exec_check_permissions(imgp)
 	struct image_params *imgp;
 {
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
 	int error;
 
 	td = curthread;			/* XXXKSE */
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
 	if (error)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error)
 		return (error);
 #endif
 	
 	/*
 	 * 1) Check if file execution is disabled for the filesystem that this
 	 *	file resides on.
 	 * 2) Insure that at least one execute bit is on - otherwise root
 	 *	will always succeed, and we don't want to happen unless the
 	 *	file really is executable.
 	 * 3) Insure that the file is a regular file.
 	 */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    ((attr->va_mode & 0111) == 0) ||
 	    (attr->va_type != VREG))
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
 	 */
 	if (vp->v_writecount)
 		return (ETXTBSY);
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	return (error);
 }
 
 /*
  * Exec handler registration
  *
  * Append execsw_arg to the NULL-terminated execsw table.  A new table is
  * allocated, the existing entries are copied over, and the old table is
  * freed.  Returns 0 on success or ENOMEM.
  */
 int
 exec_register(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **table;
 	int nold, i;
 
 	/* Count the handlers that are already registered. */
 	nold = 0;
 	if (execsw != NULL)
 		while (execsw[nold] != NULL)
 			nold++;
 
 	/* Room for the old entries, the new one and the trailing NULL. */
 	table = malloc((nold + 2) * sizeof(*table), M_TEMP, M_WAITOK);
 	if (table == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nold; i++)
 		table[i] = execsw[i];
 	table[nold] = execsw_arg;
 	table[nold + 1] = NULL;
 
 	if (execsw != NULL)
 		free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
 
 /*
  * Remove execsw_arg from the NULL-terminated execsw table by building a
  * new table without it and freeing the old one.  Panics if no table
  * exists.  Returns 0 on success, ENOENT if the handler is not registered,
  * or ENOMEM.
  */
 int
 exec_unregister(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **table;
 	int found, nkeep, i;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* One pass: look for the handler and count the survivors. */
 	found = 0;
 	nkeep = 0;
 	for (i = 0; execsw[i] != NULL; i++) {
 		if (execsw[i] == execsw_arg)
 			found = 1;
 		else
 			nkeep++;
 	}
 	if (!found)
 		return (ENOENT);
 
 	/* Surviving entries plus the trailing NULL. */
 	table = malloc((nkeep + 1) * sizeof(*table), M_TEMP, M_WAITOK);
 	if (table == NULL)
 		return (ENOMEM);
 
 	nkeep = 0;
 	for (i = 0; execsw[i] != NULL; i++) {
 		if (execsw[i] != execsw_arg)
 			table[nkeep++] = execsw[i];
 	}
 	table[nkeep] = NULL;
 
 	free(execsw, M_TEMP);
 	execsw = table;
 	return (0);
 }
Index: head/sys/kern/kern_jail.c
===================================================================
--- head/sys/kern/kern_jail.c	(revision 175201)
+++ head/sys/kern/kern_jail.c	(revision 175202)
@@ -1,985 +1,985 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/taskqueue.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <net/if.h>
 #include <netinet/in.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Malloc type used for prison structures and their attachments. */
 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
 
 /*
  * security.jail.* tunables.  Each knob below is a plain int exported
  * read-write through sysctl.
  */
 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
     "Jail rules");
 
 int	jail_set_hostname_allowed = 1;
 SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
     &jail_set_hostname_allowed, 0,
     "Processes in jail can set their hostnames");
 
 int	jail_socket_unixiproute_only = 1;
 SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
     &jail_socket_unixiproute_only, 0,
     "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
 
 int	jail_sysvipc_allowed = 0;
 SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
     &jail_sysvipc_allowed, 0,
     "Processes in jail can use System V IPC primitives");
 
 /*
  * Mount visibility policy, as interpreted by prison_canseemount():
  * 0 disables the check, 2 exposes only the mount holding the jail root,
  * and any other value also exposes mounts located below the jail's path.
  */
 static int jail_enforce_statfs = 2;
 SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
     &jail_enforce_statfs, 0,
     "Processes in jail cannot see all mounted file systems");
 
 int	jail_allow_raw_sockets = 0;
 SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
     &jail_allow_raw_sockets, 0,
     "Prison root can create raw sockets");
 
 int	jail_chflags_allowed = 0;
 SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
     &jail_chflags_allowed, 0,
     "Processes in jail can alter system file flags");
 
 int	jail_mount_allowed = 0;
 SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
     &jail_mount_allowed, 0,
     "Processes in jail can mount/unmount jail-friendly file systems");
 
 /* allprison, lastprid, and prisoncount are protected by allprison_lock. */
 struct	prisonlist allprison;
 struct	sx allprison_lock;
 int	lastprid = 0;
 int	prisoncount = 0;
 
 /*
  * List of jail services. Protected by allprison_lock.
  */
 TAILQ_HEAD(prison_services_head, prison_service);
 static struct prison_services_head prison_services =
     TAILQ_HEAD_INITIALIZER(prison_services);
 /* Number of entries allocated in each prison's pr_slots array by jail(). */
 static int prison_service_slots = 0;
 
 /*
  * One registered jail service.  The callbacks are invoked as
  * ps_create(psrv, pr) after a prison has been added to allprison and as
  * ps_destroy(psrv, pr) after it has been removed.
  */
 struct prison_service {
 	prison_create_t ps_create;
 	prison_destroy_t ps_destroy;
 	/* NOTE(review): presumably this service's index into pr_slots. */
 	int		ps_slotno;
 	TAILQ_ENTRY(prison_service) ps_next;
 	/* Zero-length trailing array; presumably the service name -- confirm. */
 	char	ps_name[0];
 };
 
 static void		 init_prison(void *);
 static void		 prison_complete(void *context, int pending);
 static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
 
 /*
  * Boot-time initialisation of the global jail state: set up the
  * allprison_lock sx lock and the empty allprison list.  Registered as a
  * SYSINIT below; the data argument is unused.
  */
 static void
 init_prison(void *data __unused)
 {
 
 	sx_init(&allprison_lock, "allprison");
 	LIST_INIT(&allprison);
 }
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
 /*
  * struct jail_args {
  *	struct jail *jail;
  * };
  *
  * Create a new prison from the user-supplied struct jail (only version 0
  * is accepted), assign it the next free prison id, publish it on
  * allprison, notify every registered prison service, and attach the
  * calling process to it.
  *
  * On success the new jail id is returned in td->td_retval[0] and the
  * function returns 0.  On failure everything allocated here, including
  * the per-service slot array, is released and an errno value is returned.
  */
 int
 jail(struct thread *td, struct jail_args *uap)
 {
 	struct nameidata nd;
 	struct prison *pr, *tpr;
 	struct prison_service *psrv;
 	struct jail j;
 	struct jail_attach_args jaa;
 	int vfslocked, error, tryprid;
 
 	error = copyin(uap->jail, &j, sizeof(j));
 	if (error)
 		return (error);
 	if (j.version != 0)
 		return (EINVAL);
 
 	/* M_ZERO leaves pr_slots NULL until it is allocated below. */
 	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
 	/* Creation reference; dropped once jail_attach() has taken its own. */
 	pr->pr_ref = 1;
 	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
 	if (error)
 		goto e_killmtx;
 	/* Resolve the jail root; the vnode reference is kept in pr_root. */
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
 	    pr->pr_path, td);
 	error = namei(&nd);
 	if (error)
 		goto e_killmtx;
 	vfslocked = NDHASGIANT(&nd);
 	pr->pr_root = nd.ni_vp;
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VFS_UNLOCK_GIANT(vfslocked);
 	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
 	if (error)
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
 	pr->pr_securelevel = securelevel;
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
 	else {
 		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
 		    M_PRISON, M_ZERO | M_WAITOK);
 	}
 
 	/* Determine next pr_id and add prison to allprison list. */
 	sx_xlock(&allprison_lock);
 	tryprid = lastprid + 1;
 	if (tryprid == JAIL_MAX)
 		tryprid = 1;
 next:
 	/* Rescan from the start whenever the candidate id is already taken. */
 	LIST_FOREACH(tpr, &allprison, pr_list) {
 		if (tpr->pr_id == tryprid) {
 			tryprid++;
 			if (tryprid == JAIL_MAX) {
 				sx_xunlock(&allprison_lock);
 				error = EAGAIN;
 				goto e_dropvnref;
 			}
 			goto next;
 		}
 	}
 	pr->pr_id = jaa.jid = lastprid = tryprid;
 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
 	prisoncount++;
 	/* The list is consistent again; services only need a shared lock. */
 	sx_downgrade(&allprison_lock);
 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
 		psrv->ps_create(psrv, pr);
 	}
 	sx_sunlock(&allprison_lock);
 
 	error = jail_attach(td, &jaa);
 	if (error)
 		goto e_dropprref;
 	/* Drop the creation reference taken above. */
 	mtx_lock(&pr->pr_mtx);
 	pr->pr_ref--;
 	mtx_unlock(&pr->pr_mtx);
 	td->td_retval[0] = jaa.jid;
 	return (0);
 e_dropprref:
 	/* Unpublish the prison and let the services undo ps_create(). */
 	sx_xlock(&allprison_lock);
 	LIST_REMOVE(pr, pr_list);
 	prisoncount--;
 	sx_downgrade(&allprison_lock);
 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
 		psrv->ps_destroy(psrv, pr);
 	}
 	sx_sunlock(&allprison_lock);
 e_dropvnref:
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vrele(pr->pr_root);
 	VFS_UNLOCK_GIANT(vfslocked);
 e_killmtx:
 	/*
 	 * Release the per-service slot array, if it was allocated, before
 	 * freeing the prison itself; otherwise it would be leaked.
 	 */
 	if (pr->pr_slots != NULL)
 		free(pr->pr_slots, M_PRISON);
 	mtx_destroy(&pr->pr_mtx);
 	FREE(pr, M_PRISON);
 	return (error);
 }
 
 /*
  * struct jail_attach_args {
  *	int jid;
  * };
  *
  * Attach the calling process to the existing prison uap->jid: change the
  * process's current and root directories to the prison root, then install
  * a copy of the process credential that points at the prison.
  *
  * Requires PRIV_JAIL_ATTACH.  Returns 0 on success, EINVAL if no live
  * prison has that id, or the error from priv_check(), change_dir() or the
  * MAC chroot check.
  */
 int
 jail_attach(struct thread *td, struct jail_attach_args *uap)
 {
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
 	struct prison *pr;
 	int vfslocked, error;
 
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
 	 * different jails at the same time.  It is important for
 	 * user processes not to do this, or they might end up with
 	 * a process root from one prison, but attached to the jail
 	 * of another.
 	 */
 	error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
 
 	p = td->td_proc;
 	sx_slock(&allprison_lock);
 	/* prison_find() returns the prison with pr_mtx held. */
 	pr = prison_find(uap->jid);
 	if (pr == NULL) {
 		sx_sunlock(&allprison_lock);
 		return (EINVAL);
 	}
 	/* Reference that will belong to the new credential. */
 	pr->pr_ref++;
 	mtx_unlock(&pr->pr_mtx);
 	sx_sunlock(&allprison_lock);
 
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
-	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = change_dir(pr->pr_root, td)) != 0)
 		goto e_unlock;
 #ifdef MAC
 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
 		goto e_unlock;
 #endif
 	VOP_UNLOCK(pr->pr_root, 0, td);
 	change_root(pr->pr_root, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	/* Allocate the new credential before taking the process lock. */
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 	setsugid(p);
 	crcopy(newcred, oldcred);
 	newcred->cr_prison = pr;
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 e_unlock:
 	VOP_UNLOCK(pr->pr_root, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Give back the reference taken above. */
 	mtx_lock(&pr->pr_mtx);
 	pr->pr_ref--;
 	mtx_unlock(&pr->pr_mtx);
 	return (error);
 }
 
 /*
  * Look up the prison with id prid.  The caller must hold allprison_lock.
  * On success the prison is returned with its pr_mtx held; NULL is returned
  * when no prison has that id or when the matching prison has no
  * references left.
  */
 struct prison *
 prison_find(int prid)
 {
 	struct prison *candidate;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	LIST_FOREACH(candidate, &allprison, pr_list) {
 		if (candidate->pr_id != prid)
 			continue;
 
 		mtx_lock(&candidate->pr_mtx);
 		if (candidate->pr_ref != 0)
 			return (candidate);
 
 		/* Matching id but already dying: treat it as not found. */
 		mtx_unlock(&candidate->pr_mtx);
 		return (NULL);
 	}
 	return (NULL);
 }
 
 /*
  * Drop one reference on the prison.  When the count reaches zero the
  * actual teardown is handed to prison_complete() on taskqueue_thread.
  */
 void
 prison_free(struct prison *pr)
 {
 	int last;
 
 	mtx_lock(&pr->pr_mtx);
 	last = (--pr->pr_ref == 0);
 	mtx_unlock(&pr->pr_mtx);
 
 	if (last) {
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 	}
 }
 
 static void
 prison_complete(void *context, int pending)
 {
 	struct prison_service *psrv;
 	struct prison *pr;
 	int vfslocked;
 
 	pr = (struct prison *)context;
 
 	sx_xlock(&allprison_lock);
 	LIST_REMOVE(pr, pr_list);
 	prisoncount--;
 	sx_downgrade(&allprison_lock);
 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
 		psrv->ps_destroy(psrv, pr);
 	}
 	sx_sunlock(&allprison_lock);
 
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vrele(pr->pr_root);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	mtx_destroy(&pr->pr_mtx);
 	if (pr->pr_linux != NULL)
 		FREE(pr->pr_linux, M_PRISON);
 	FREE(pr, M_PRISON);
 }
 
 /*
  * Take an additional reference on a prison that is still alive.  The
  * reference count is updated under pr_mtx; holding a prison whose count
  * has already dropped to zero trips the KASSERT.
  */
 void
 prison_hold(struct prison *pr)
 {
 
 	mtx_lock(&pr->pr_mtx);
 	KASSERT(pr->pr_ref > 0,
 	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
 	pr->pr_ref++;
 	mtx_unlock(&pr->pr_mtx);
 }
 
 u_int32_t
 prison_getip(struct ucred *cred)
 {
 
 	return (cred->cr_prison->pr_ip);
 }
 
 /*
  * Check, and if necessary rewrite, a local IPv4 address used by a jailed
  * credential.
  *
  *	flag != 0: *ip is used as-is; flag == 0: *ip is converted with
  *	ntohl() on the way in and htonl() on the way out.
  *
  * INADDR_ANY and INADDR_LOOPBACK are replaced by the prison's address.
  * Returns 0 if the (possibly rewritten) address is acceptable or the
  * credential is not jailed, and 1 if the address is not the prison's.
  */
 int
 prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
 {
 	u_int32_t addr;
 
 	if (!jailed(cred))
 		return (0);
 
 	addr = flag ? *ip : ntohl(*ip);
 	if (addr == INADDR_ANY || addr == INADDR_LOOPBACK) {
 		/* Wildcard and loopback both map to the jail's own address. */
 		*ip = flag ? cred->cr_prison->pr_ip :
 		    htonl(cred->cr_prison->pr_ip);
 		return (0);
 	}
 	return (cred->cr_prison->pr_ip != addr ? 1 : 0);
 }
 
 /*
  * Rewrite a remote IPv4 address for a jailed credential: a loopback
  * destination is redirected to the prison's own address, anything else is
  * left untouched.  The flag argument selects the byte-order handling of
  * *ip exactly as in prison_ip().
  */
 void
 prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
 {
 	u_int32_t addr;
 
 	if (!jailed(cred))
 		return;
 
 	addr = flag ? *ip : ntohl(*ip);
 	if (addr != INADDR_LOOPBACK)
 		return;
 
 	*ip = flag ? cred->cr_prison->pr_ip : htonl(cred->cr_prison->pr_ip);
 }
 
 /*
  * Test a socket address against the prison of cred.
  *
  * For a non-AF_INET address the result is 1 when
  * jail_socket_unixiproute_only is set and 0 otherwise.  For an AF_INET
  * address the result is 1 when the address differs from the prison's IP
  * and 0 when it matches.
  */
 int
 prison_if(struct ucred *cred, struct sockaddr *sa)
 {
 	struct sockaddr_in *sin;
 
 	sin = (struct sockaddr_in *)sa;
 	if (sin->sin_family != AF_INET)
 		return (jail_socket_unixiproute_only ? 1 : 0);
 
 	if (cred->cr_prison->pr_ip != ntohl(sin->sin_addr.s_addr))
 		return (1);
 	return (0);
 }
 
 /*
  * Jail visibility check between two credentials.  An unjailed cred1 may
  * act on anything; a jailed cred1 may only act on a cred2 in the same
  * prison.  Returns 0 when permitted, otherwise ESRCH.
  */
 int
 prison_check(struct ucred *cred1, struct ucred *cred2)
 {
 
 	if (!jailed(cred1))
 		return (0);
 	if (!jailed(cred2) || cred2->cr_prison != cred1->cr_prison)
 		return (ESRCH);
 	return (0);
 }
 
 /*
  * Report whether a credential belongs to a jail: 1 if it has a prison
  * attached, 0 if not.
  */
 int
 jailed(struct ucred *cred)
 {
 
 	if (cred->cr_prison == NULL)
 		return (0);
 	return (1);
 }
 
 /*
  * Copy the hostname that applies to cred into buf (at most size bytes,
  * via strlcpy): the prison's hostname, read under its mutex, for a jailed
  * credential, and the global hostname otherwise.
  */
 void
 getcredhostname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	if (!jailed(cred)) {
 		strlcpy(buf, hostname, size);
 		return;
 	}
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_host, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Decide whether the subject described by cred is allowed to "see" the
  * status of mount point mp.
  * Returns: 0 if it may, ENOENT if it may not.
  * XXX: This function should be called cr_canseemount() and should be
  *      placed in kern_prot.c.
  */
 int
 prison_canseemount(struct ucred *cred, struct mount *mp)
 {
 	struct prison *pr;
 	const char *mnton;
 	size_t rootlen;
 
 	/* Unjailed subjects, or enforcement switched off: no restriction. */
 	if (!jailed(cred) || jail_enforce_statfs == 0)
 		return (0);
 
 	/* The file system holding the jail's root is always visible. */
 	pr = cred->cr_prison;
 	if (pr->pr_root->v_mount == mp)
 		return (0);
 
 	/* Strictest level: nothing but the root file system. */
 	if (jail_enforce_statfs == 2)
 		return (ENOENT);
 
 	/*
 	 * A jail rooted at "/" sees every mount point.  The literal
 	 * comparison is ugly, but "/" is the only jail path that ends in
 	 * a slash.
 	 */
 	if (strcmp(pr->pr_path, "/") == 0)
 		return (0);
 
 	/* Otherwise the mount point has to live below the jail's path. */
 	rootlen = strlen(pr->pr_path);
 	mnton = mp->mnt_stat.f_mntonname;
 	if (strncmp(pr->pr_path, mnton, rootlen) != 0)
 		return (ENOENT);
 
 	/*
 	 * The prefix must end on a path component boundary, so that a jail
 	 * at "/some/path" does not match a mount on "/some/pathpath".
 	 */
 	if (mnton[rootlen] == '\0' || mnton[rootlen] == '/')
 		return (0);
 	return (ENOENT);
 }
 
 /*
  * Rewrite sp->f_mntonname so that it is expressed relative to the jail of
  * the given credential.  Mount points the jail may not see are replaced by
  * "[restricted]".  Nothing is changed for unjailed credentials or when
  * jail_enforce_statfs is 0.
  */
 void
 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
 {
 	char relpath[MAXPATHLEN];
 	struct prison *jail;
 	size_t rootlen;
 
 	if (!jailed(cred) || jail_enforce_statfs == 0)
 		return;
 	jail = cred->cr_prison;
 
 	/* Hidden mount point: do not leak its real name. */
 	if (prison_canseemount(cred, mp) != 0) {
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		strlcpy(sp->f_mntonname, "[restricted]",
 		    sizeof(sp->f_mntonname));
 		return;
 	}
 
 	/* The file system holding the jail's root is reported as "/". */
 	if (jail->pr_root->v_mount == mp) {
 		/* Wipe the buffer first so no part of the real path remains. */
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		sp->f_mntonname[0] = '/';
 		return;
 	}
 
 	/* A jail whose path is "/" sees all mount points unmodified. */
 	if (strcmp(jail->pr_path, "/") == 0)
 		return;
 
 	/* Save the part of the mount point that follows the jail's path. */
 	rootlen = strlen(jail->pr_path);
 	strlcpy(relpath, sp->f_mntonname + rootlen, sizeof(relpath));
 
 	/* Wipe the buffer first so no part of the real path remains. */
 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 	if (relpath[0] != '\0') {
 		strlcpy(sp->f_mntonname, relpath, sizeof(sp->f_mntonname));
 	} else {
 		/* Should never happen. */
 		sp->f_mntonname[0] = '/';
 	}
 }
 
 /*
  * Check whether permission for a specific privilege is granted within a
  * jail.  We have a specific list of accepted privileges; the rest are
  * denied.
  */
 int
 prison_priv_check(struct ucred *cred, int priv)
 {
 
 	/* Credentials that are not jailed are not restricted here. */
 	if (!jailed(cred))
 		return (0);
 
 	/*
 	 * Returns 0 when the privilege is granted and EPERM when it is
 	 * denied.  The case labels below fall through on purpose: every
 	 * privilege listed before the first "return (0)" is granted
 	 * unconditionally.
 	 */
 	switch (priv) {
 
 		/*
 		 * Allow ktrace privileges for root in jail.
 		 */
 	case PRIV_KTRACE:
 
 #if 0
 		/*
 		 * Allow jailed processes to configure audit identity and
 		 * submit audit records (login, etc).  In the future we may
 		 * want to further refine the relationship between audit and
 		 * jail.
 		 */
 	case PRIV_AUDIT_GETAUDIT:
 	case PRIV_AUDIT_SETAUDIT:
 	case PRIV_AUDIT_SUBMIT:
 #endif
 
 		/*
 		 * Allow jailed processes to manipulate process UNIX
 		 * credentials in any way they see fit.
 		 */
 	case PRIV_CRED_SETUID:
 	case PRIV_CRED_SETEUID:
 	case PRIV_CRED_SETGID:
 	case PRIV_CRED_SETEGID:
 	case PRIV_CRED_SETGROUPS:
 	case PRIV_CRED_SETREUID:
 	case PRIV_CRED_SETREGID:
 	case PRIV_CRED_SETRESUID:
 	case PRIV_CRED_SETRESGID:
 
 		/*
 		 * Jail implements visibility constraints already, so allow
 		 * jailed root to override uid/gid-based constraints.
 		 */
 	case PRIV_SEEOTHERGIDS:
 	case PRIV_SEEOTHERUIDS:
 
 		/*
 		 * Jail implements inter-process debugging limits already, so
 		 * allow jailed root various debugging privileges.
 		 */
 	case PRIV_DEBUG_DIFFCRED:
 	case PRIV_DEBUG_SUGID:
 	case PRIV_DEBUG_UNPRIV:
 
 		/*
 		 * Allow jail to set various resource limits and login
 		 * properties, and for now, exceed process resource limits.
 		 */
 	case PRIV_PROC_LIMIT:
 	case PRIV_PROC_SETLOGIN:
 	case PRIV_PROC_SETRLIMIT:
 
 		/*
 		 * System V and POSIX IPC privileges are granted in jail.
 		 */
 	case PRIV_IPC_READ:
 	case PRIV_IPC_WRITE:
 	case PRIV_IPC_ADMIN:
 	case PRIV_IPC_MSGSIZE:
 	case PRIV_MQ_ADMIN:
 
 		/*
 		 * Jail implements its own inter-process limits, so allow
 		 * root processes in jail to change scheduling on other
 		 * processes in the same jail.  Likewise for signalling.
 		 */
 	case PRIV_SCHED_DIFFCRED:
 	case PRIV_SIGNAL_DIFFCRED:
 	case PRIV_SIGNAL_SUGID:
 
 		/*
 		 * Allow jailed processes to write to sysctls marked as jail
 		 * writable.
 		 */
 	case PRIV_SYSCTL_WRITEJAIL:
 
 		/*
 		 * Allow root in jail to manage a variety of quota
 		 * properties.  These should likely be conditional on a
 		 * configuration option.
 		 */
 	case PRIV_VFS_GETQUOTA:
 	case PRIV_VFS_SETQUOTA:
 
 		/*
 		 * Since Jail relies on chroot() to implement file system
 		 * protections, grant many VFS privileges to root in jail.
 		 * Be careful to exclude mount-related and NFS-related
 		 * privileges.
 		 */
 	case PRIV_VFS_READ:
 	case PRIV_VFS_WRITE:
 	case PRIV_VFS_ADMIN:
 	case PRIV_VFS_EXEC:
 	case PRIV_VFS_LOOKUP:
 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
 	case PRIV_VFS_CHFLAGS_DEV:
 	case PRIV_VFS_CHOWN:
 	case PRIV_VFS_CHROOT:
 	case PRIV_VFS_RETAINSUGID:
 	case PRIV_VFS_FCHROOT:
 	case PRIV_VFS_LINK:
 	case PRIV_VFS_SETGID:
 	case PRIV_VFS_STAT:
 	case PRIV_VFS_STICKYFILE:
 		return (0);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * setting system flags.
 		 */
 	case PRIV_VFS_SYSFLAGS:
 		if (jail_chflags_allowed)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * mounting/unmounting file systems.
 		 */
 	case PRIV_VFS_MOUNT:
 	case PRIV_VFS_UNMOUNT:
 	case PRIV_VFS_MOUNT_NONUSER:
 	case PRIV_VFS_MOUNT_OWNER:
 		if (jail_mount_allowed)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Allow jailed root to bind reserved ports and reuse in-use
 		 * ports.
 		 */
 	case PRIV_NETINET_RESERVEDPORT:
 	case PRIV_NETINET_REUSEPORT:
 		return (0);
 
 		/*
 		 * Conditionally allow creating raw sockets in jail.
 		 */
 	case PRIV_NETINET_RAW:
 		if (jail_allow_raw_sockets)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Since jail implements its own visibility limits on netstat
 		 * sysctls, allow getcred.  This allows identd to work in
 		 * jail.
 		 */
 	case PRIV_NETINET_GETCRED:
 		return (0);
 
 	default:
 		/*
 		 * In all remaining cases, deny the privilege request.  This
 		 * includes almost all network privileges, many system
 		 * configuration privileges.
 		 */
 		return (EPERM);
 	}
 }
 
 /*
  * Register jail service. Provides 'create' and 'destroy' methods.
  * 'create' method will be called for every existing jail and for all
  * jails created in the future, as they are being created.
  * 'destroy' method will be called for every jail going away and
  * for all existing jails at the time of service deregistration.
  */
 struct prison_service *
 prison_service_register(const char *name, prison_create_t create,
     prison_destroy_t destroy)
 {
 	struct prison_service *psrv, *psrv2;
 	struct prison *pr;
 	int reallocate = 1, slotno = 0;
 	void **slots, **oldslots;
 
 	/*
 	 * The service name is stored inline, directly after the structure,
 	 * hence the extra strlen(name) + 1 bytes.
 	 */
 	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
 	    M_WAITOK | M_ZERO);
 	psrv->ps_create = create;
 	psrv->ps_destroy = destroy;
 	strcpy(psrv->ps_name, name);
 	/*
 	 * Grab the allprison_lock here, so we won't miss any jail
 	 * creation/destruction.
 	 */
 	sx_xlock(&allprison_lock);
 #ifdef INVARIANTS
 	/*
 	 * Verify that the service is not already registered.
 	 */
 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
 		KASSERT(strcmp(psrv2->ps_name, name) != 0,
 		    ("jail service %s already registered", name));
 	}
 #endif
 	/*
 	 * Find a free slot. When there is no existing free slot available,
 	 * allocate one at the end.  The list is sorted by slot number, so
 	 * the first entry whose number differs from the running counter
 	 * marks a gap.  When the loop runs to completion psrv2 is NULL and
 	 * slotno equals the number of registered services.
 	 */
 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
 		if (psrv2->ps_slotno != slotno) {
 			KASSERT(slotno < psrv2->ps_slotno,
 			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
 			    slotno, psrv2->ps_slotno));
 			/* We found free slot. */
 			reallocate = 0;
 			break;
 		}
 		slotno++;
 	}
 	psrv->ps_slotno = slotno;
 	/*
 	 * Keep the list sorted by slot number.
 	 */
 	if (psrv2 != NULL) {
 		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
 		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
 	} else {
 		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
 		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
 	}
 	/* prison_service_slots counts the registered services. */
 	prison_service_slots++;
 	/* The list is updated; continue with the lock held shared. */
 	sx_downgrade(&allprison_lock);
 	/*
 	 * Allocate memory for the new slot if we didn't find an empty one.
 	 * Do not use realloc(9), because pr_slots is protected with a mutex,
 	 * so we can't sleep.
 	 */
 	LIST_FOREACH(pr, &allprison, pr_list) {
 		if (reallocate) {
 			/* First allocate memory with M_WAITOK. */
 			slots = malloc(sizeof(*slots) * prison_service_slots,
 			    M_PRISON, M_WAITOK);
 			/* Now grab the mutex and replace pr_slots. */
 			mtx_lock(&pr->pr_mtx);
 			oldslots = pr->pr_slots;
 			/*
 			 * Slot 0 means there was no previous array content
 			 * to carry over.
 			 */
 			if (psrv->ps_slotno > 0) {
 				bcopy(oldslots, slots,
 				    sizeof(*slots) * (prison_service_slots - 1));
 			}
 			slots[psrv->ps_slotno] = NULL;
 			pr->pr_slots = slots;
 			mtx_unlock(&pr->pr_mtx);
 			if (oldslots != NULL)
 				free(oldslots, M_PRISON);
 		}
 		/*
 		 * Call 'create' method for each existing jail.
 		 */
 		psrv->ps_create(psrv, pr);
 	}
 	sx_sunlock(&allprison_lock);
 
 	/* The returned handle is what prison_service_deregister() expects. */
 	return (psrv);
 }
 
 void
 prison_service_deregister(struct prison_service *psrv)
 {
 	struct prison *pr;
 	void **slots, **oldslots;
 	int last = 0;
 
 	sx_xlock(&allprison_lock);
 	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
 		last = 1;
 	TAILQ_REMOVE(&prison_services, psrv, ps_next);
 	prison_service_slots--;
 	sx_downgrade(&allprison_lock);
 	LIST_FOREACH(pr, &allprison, pr_list) {
 		/*
 		 * Call 'destroy' method for every currently existing jail.
 		 */
 		psrv->ps_destroy(psrv, pr);
 		/*
 		 * If this is the last slot, free the memory allocated for it.
 		 */
 		if (last) {
 			if (prison_service_slots == 0)
 				slots = NULL;
 			else {
 				slots = malloc(sizeof(*slots) * prison_service_slots,
 				    M_PRISON, M_WAITOK);
 			}
 			mtx_lock(&pr->pr_mtx);
 			oldslots = pr->pr_slots;
 			/*
 			 * We require setting slot to NULL after freeing it,
 			 * this way we can check for memory leaks here.
 			 */
 			KASSERT(oldslots[psrv->ps_slotno] == NULL,
 			    ("Slot %d (service %s, jailid=%d) still contains data?",
 			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
 			if (psrv->ps_slotno > 0) {
 				bcopy(oldslots, slots,
 				    sizeof(*slots) * prison_service_slots);
 			}
 			pr->pr_slots = slots;
 			mtx_unlock(&pr->pr_mtx);
 			KASSERT(oldslots != NULL, ("oldslots == NULL"));
 			free(oldslots, M_PRISON);
 		}
 	}
 	sx_sunlock(&allprison_lock);
 	free(psrv, M_PRISON);
 }
 
 /*
  * Function sets data for the given jail in slot assigned for the given
  * jail service.
  */
 void
 prison_service_data_set(struct prison_service *psrv, struct prison *pr,
     void *data)
 {
 	void **slot;
 
 	/* The caller has to hold the prison mutex. */
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	slot = &pr->pr_slots[psrv->ps_slotno];
 	*slot = data;
 }
 
 /*
  * Function clears slots assigned for the given jail service in the given
  * prison structure and returns current slot data.
  */
 void *
 prison_service_data_del(struct prison_service *psrv, struct prison *pr)
 {
 	void **slot, *old;
 
 	/* The caller has to hold the prison mutex. */
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	slot = &pr->pr_slots[psrv->ps_slotno];
 	old = *slot;
 	*slot = NULL;
 	return (old);
 }
 
 /*
  * Function returns current data from the slot assigned to the given jail
  * service for the given jail.
  */
 void *
 prison_service_data_get(struct prison_service *psrv, struct prison *pr)
 {
 	void *data;
 
 	/* The caller has to hold the prison mutex. */
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	data = pr->pr_slots[psrv->ps_slotno];
 	return (data);
 }
 
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
 	struct xprison *xp, *sxp;
 	struct prison *pr;
 	int count, error;
 
 	if (jailed(req->td->td_ucred))
 		return (0);
 
 	sx_slock(&allprison_lock);
 	if ((count = prisoncount) == 0) {
 		sx_sunlock(&allprison_lock);
 		return (0);
 	}
 
 	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
 
 	LIST_FOREACH(pr, &allprison, pr_list) {
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = pr->pr_id;
 		xp->pr_ip = pr->pr_ip;
 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
 		mtx_lock(&pr->pr_mtx);
 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
 		mtx_unlock(&pr->pr_mtx);
 		xp++;
 	}
 	sx_sunlock(&allprison_lock);
 
 	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
 	free(sxp, M_TEMP);
 	return (error);
 }
 
 SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
     NULL, 0, sysctl_jail_list, "S", "List of active jails");
 
 /*
  * Sysctl handler reporting whether the requesting thread's credential is
  * jailed (1) or not (0).
  */
 static int
 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
 {
 	int injail;
 
 	injail = jailed(req->td->td_ucred);
 	return (SYSCTL_OUT(req, &injail, sizeof(injail)));
 }
 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
     NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
Index: head/sys/kern/kern_ktrace.c
===================================================================
--- head/sys/kern/kern_ktrace.c	(revision 175201)
+++ head/sys/kern/kern_ktrace.c	(revision 175202)
@@ -1,1011 +1,1011 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_ktrace.c	8.2 (Berkeley) 9/23/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/ktrace.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The ktrace facility allows the tracing of certain key events in user space
  * processes, such as system calls, signal delivery, context switches, and
  * user generated events using utrace(2).  It works by streaming event
  * records and data to a vnode associated with the process using the
  * ktrace(2) system call.  In general, records can be written directly from
  * the context that generates the event.  One important exception to this is
  * during a context switch, where sleeping is not permitted.  To handle this
  * case, trace events are generated using in-kernel ktr_request records, and
  * then delivered to disk at a convenient moment -- either immediately, the
  * next traceable event, at system call return, or at process exit.
  *
  * When dealing with multiple threads or processes writing to the same event
  * log, ordering guarantees are weak: specifically, if an event has multiple
  * records (i.e., system call enter and return), they may be interlaced with
  * records from another event.  Process and thread ID information is provided
  * in the record, and user applications can de-interlace events if required.
  */
 
 /* malloc(9) type used for the ktrace allocations made in this file. */
 static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
 
 #ifdef KTRACE
 
 /*
  * Default number of preallocated request records; only defined here when
  * the build has not already provided a value.
  */
 #ifndef KTRACE_REQUEST_POOL
 #define	KTRACE_REQUEST_POOL	100
 #endif
 
 /*
  * In-kernel representation of one trace record.  Records are taken from
  * the ktr_free list by ktr_getrequest() and handed back by
  * ktr_freerequest().
  */
 struct ktr_request {
 	/* Record header; type, time, pid, tid, comm and len are filled in. */
 	struct	ktr_header ktr_header;
 	/* Optional M_KTRACE payload; freed by ktr_freerequest(). */
 	void	*ktr_buffer;
 	/* Fixed-size, type-specific part of the record. */
 	union {
 		struct	ktr_syscall ktr_syscall;
 		struct	ktr_sysret ktr_sysret;
 		struct	ktr_genio ktr_genio;
 		struct	ktr_psig ktr_psig;
 		struct	ktr_csw ktr_csw;
 	} ktr_data;
 	/* Linkage on ktr_free, a process' p_ktr queue or a local queue. */
 	STAILQ_ENTRY(ktr_request) ktr_list;
 };
 
 /*
  * Size in bytes of the fixed, type-specific part of a record, one entry per
  * record type in the order named by the per-line comments; 0 where a type
  * has no fixed part.
  * NOTE(review): the consumer is not visible in this chunk; presumably the
  * table is indexed by the record type when a request is written -- confirm.
  */
 static int data_lengths[] = {
 	0,					/* none */
 	offsetof(struct ktr_syscall, ktr_args),	/* KTR_SYSCALL */
 	sizeof(struct ktr_sysret),		/* KTR_SYSRET */
 	0,					/* KTR_NAMEI */
 	sizeof(struct ktr_genio),		/* KTR_GENIO */
 	sizeof(struct ktr_psig),		/* KTR_PSIG */
 	sizeof(struct ktr_csw),			/* KTR_CSW */
 	0					/* KTR_USER */
 };
 
 /* Pool of unused request records. */
 static STAILQ_HEAD(, ktr_request) ktr_free;
 
 static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
 
 /* Number of request records in the pool; loader tunable. */
 static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
 TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
 
 /* Upper bound on the payload copied for a single KTR_GENIO record. */
 static u_int ktr_geniosize = PAGE_SIZE;
 TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
 SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
     0, "Maximum size of genio event payload");
 
 /*
  * Non-zero while the "Out of ktrace request objects." message may still be
  * printed; cleared after printing and set again when the pool is resized.
  */
 static int print_message = 1;
 /* Held around accesses to ktr_free, the pool size and p_ktr queues. */
 struct mtx ktrace_mtx;
 /* Held exclusively while records are drained and written out. */
 static struct sx ktrace_sx;
 
 static void ktrace_init(void *dummy);
 static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
 static u_int ktrace_resize_pool(u_int newsize);
 static struct ktr_request *ktr_getrequest(int type);
 static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
 static void ktr_freerequest(struct ktr_request *req);
 static void ktr_writerequest(struct thread *td, struct ktr_request *req);
 static int ktrcanset(struct thread *,struct proc *);
 static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
 static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
 
 /*
  * ktrace itself generates events, such as context switches, which we do not
  * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
  * whether or not it is in a region where tracing of events should be
  * suppressed.
  */
 static void
 ktrace_enter(struct thread *td)
 {
 
 	/* Entering again without having left in between is a bug. */
 	KASSERT((td->td_pflags & TDP_INKTRACE) == 0,
 	    ("ktrace_enter: flag set"));
 	td->td_pflags = td->td_pflags | TDP_INKTRACE;
 }
 
 static void
 ktrace_exit(struct thread *td)
 {
 
 	/* Leaving is only valid after a matching ktrace_enter(). */
 	KASSERT((td->td_pflags & TDP_INKTRACE) != 0,
 	    ("ktrace_exit: flag not set"));
 	td->td_pflags = td->td_pflags & ~TDP_INKTRACE;
 }
 
 static void
 ktrace_assert(struct thread *td)
 {
 
 	/* Check that the thread is inside a ktrace_enter()/exit() region. */
 	KASSERT((td->td_pflags & TDP_INKTRACE) != 0,
 	    ("ktrace_assert: flag not set"));
 }
 
 /*
  * Set up the ktrace locks and fill the free list with ktr_requestpool
  * request records.
  */
 static void
 ktrace_init(void *dummy)
 {
 	struct ktr_request *req;
 	int n;
 
 	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
 	sx_init(&ktrace_sx, "ktrace_sx");
 	STAILQ_INIT(&ktr_free);
 
 	n = 0;
 	while (n < ktr_requestpool) {
 		req = malloc(sizeof(*req), M_KTRACE, M_WAITOK);
 		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 		n++;
 	}
 }
 SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
 
 static int
 sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td;
 	u_int newsize, oldsize, wantsize;
 	int error;
 
 	/* Handle easy read-only case first to avoid warnings from GCC. */
 	if (!req->newptr) {
 		mtx_lock(&ktrace_mtx);
 		oldsize = ktr_requestpool;
 		mtx_unlock(&ktrace_mtx);
 		return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
 	}
 
 	error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
 	if (error)
 		return (error);
 	td = curthread;
 	ktrace_enter(td);
 	mtx_lock(&ktrace_mtx);
 	oldsize = ktr_requestpool;
 	newsize = ktrace_resize_pool(wantsize);
 	mtx_unlock(&ktrace_mtx);
 	ktrace_exit(td);
 	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
 	if (error)
 		return (error);
 	if (wantsize > oldsize && newsize < wantsize)
 		return (ENOSPC);
 	return (0);
 }
 SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
     &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
 
 /*
  * Grow or shrink the request pool towards newsize and return the resulting
  * pool size.  Called with ktrace_mtx held; the mutex is dropped around each
  * malloc()/free() and re-taken afterwards.
  */
 static u_int
 ktrace_resize_pool(u_int newsize)
 {
 	struct ktr_request *req;
 	int delta;
 
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	/* Re-arm the "out of request objects" message. */
 	print_message = 1;
 	delta = newsize - ktr_requestpool;
 	if (delta == 0)
 		return (ktr_requestpool);
 
 	if (delta < 0) {
 		/* Shrink pool down to newsize if possible. */
 		for (; delta < 0; delta++) {
 			req = STAILQ_FIRST(&ktr_free);
 			/* Only records currently on the free list can go. */
 			if (req == NULL)
 				break;
 			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 			ktr_requestpool--;
 			mtx_unlock(&ktrace_mtx);
 			free(req, M_KTRACE);
 			mtx_lock(&ktrace_mtx);
 		}
 	} else {
 		/* Grow pool up to newsize. */
 		for (; delta > 0; delta--) {
 			mtx_unlock(&ktrace_mtx);
 			req = malloc(sizeof(*req), M_KTRACE, M_WAITOK);
 			mtx_lock(&ktrace_mtx);
 			STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 			ktr_requestpool++;
 		}
 	}
 	return (ktr_requestpool);
 }
 
 /*
  * Take a request record of the given type from the free pool and fill in
  * its header.  On success the calling thread is left inside the ktrace
  * region (TDP_INKTRACE set).  NULL is returned, with the region left again,
  * when the type is not being traced or the pool is empty.
  */
 static struct ktr_request *
 ktr_getrequest(int type)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct ktr_request *req;
 	int warn;
 
 	ktrace_enter(td);	/* XXX: In caller instead? */
 	mtx_lock(&ktrace_mtx);
 	if (!KTRCHECK(td, type)) {
 		mtx_unlock(&ktrace_mtx);
 		ktrace_exit(td);
 		return (NULL);
 	}
 
 	req = STAILQ_FIRST(&ktr_free);
 	if (req == NULL) {
 		/*
 		 * Pool exhausted: remember that a record was dropped and
 		 * print the message only until print_message is set again.
 		 */
 		p->p_traceflag |= KTRFAC_DROP;
 		warn = print_message;
 		print_message = 0;
 		mtx_unlock(&ktrace_mtx);
 		if (warn)
 			printf("Out of ktrace request objects.\n");
 		ktrace_exit(td);
 		return (NULL);
 	}
 
 	STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 	req->ktr_header.ktr_type = type;
 	/* Flag the first record that follows a drop. */
 	if (p->p_traceflag & KTRFAC_DROP) {
 		req->ktr_header.ktr_type |= KTR_DROP;
 		p->p_traceflag &= ~KTRFAC_DROP;
 	}
 	mtx_unlock(&ktrace_mtx);
 
 	microtime(&req->ktr_header.ktr_time);
 	req->ktr_header.ktr_pid = p->p_pid;
 	req->ktr_header.ktr_tid = td->td_tid;
 	bcopy(td->td_name, req->ktr_header.ktr_comm, MAXCOMLEN + 1);
 	req->ktr_buffer = NULL;
 	req->ktr_header.ktr_len = 0;
 	return (req);
 }
 
 /*
  * Some trace generation environments don't permit direct access to VFS,
  * such as during a context switch where sleeping is not allowed.  Under these
  * circumstances, queue a request to the thread to be written asynchronously
  * later.
  */
 static void
 ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
 {
 
 	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
 	ktrace_exit(td);
 }
 
 /*
  * Drain any pending ktrace records from the per-process queue to disk.  This
  * is used both internally before committing other records, and also on
  * system call return.  We drain all the ones we can find at the time when
  * drain is requested, but don't keep draining after that as those events
  * may be approximately "after" the current event.
  */
 static void
 ktr_drain(struct thread *td)
 {
 	struct ktr_request *queued_req;
 	STAILQ_HEAD(, ktr_request) local_queue;
 
 	ktrace_assert(td);
 	sx_assert(&ktrace_sx, SX_XLOCKED);
 
 	STAILQ_INIT(&local_queue);	/* XXXRW: needed? */
 
 	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
 		mtx_lock(&ktrace_mtx);
 		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
 		mtx_unlock(&ktrace_mtx);
 
 		while ((queued_req = STAILQ_FIRST(&local_queue))) {
 			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
 			ktr_writerequest(td, queued_req);
 			ktr_freerequest(queued_req);
 		}
 	}
 }
 
 /*
  * Submit a trace record for immediate commit to disk -- to be used only
  * where entering VFS is OK.  First drain any pending records that may have
  * been cached in the thread.
  */
 static void
 ktr_submitrequest(struct thread *td, struct ktr_request *req)
 {
 
 	/* The thread must still be inside the ktrace region. */
 	ktrace_assert(td);
 
 	/*
 	 * With ktrace_sx held exclusively, write the queued records first
 	 * and this one afterwards; the request is returned to the pool.
 	 */
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	ktr_writerequest(td, req);
 	ktr_freerequest(req);
 	sx_xunlock(&ktrace_sx);
 
 	/* Leave the ktrace region. */
 	ktrace_exit(td);
 }
 
 /*
  * Release a request's payload buffer, if any, and put the request back on
  * the free list.
  */
 static void
 ktr_freerequest(struct ktr_request *req)
 {
 	void *buf;
 
 	buf = req->ktr_buffer;
 	if (buf != NULL)
 		free(buf, M_KTRACE);
 
 	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
 }
 
 /*
  * Record a system call entry: the syscall code, the number of arguments
  * and a copy of the narg argument words from args.
  */
 void
 ktrsyscall(int code, int narg, register_t args[])
 {
 	struct ktr_request *req;
 	struct ktr_syscall *ktp;
 	size_t buflen;
 	char *buf = NULL;
 
 	/* The arguments are copied before a request is taken. */
 	buflen = sizeof(register_t) * narg;
 	if (buflen > 0) {
 		buf = malloc(buflen, M_KTRACE, M_WAITOK);
 		bcopy(args, buf, buflen);
 	}
 	req = ktr_getrequest(KTR_SYSCALL);
 	if (req == NULL) {
 		/* No request available: drop the copy again. */
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	ktp = &req->ktr_data.ktr_syscall;
 	ktp->ktr_code = code;
 	ktp->ktr_narg = narg;
 	if (buflen > 0) {
 		/* The request now owns buf; ktr_freerequest() frees it. */
 		req->ktr_header.ktr_len = buflen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 void
 ktrsysret(code, error, retval)
 	int code, error;
 	register_t retval;
 {
 	struct ktr_request *req;
 	struct ktr_sysret *ktp;
 
 	req = ktr_getrequest(KTR_SYSRET);
 	if (req == NULL)
 		return;
 	ktp = &req->ktr_data.ktr_sysret;
 	ktp->ktr_code = code;
 	ktp->ktr_error = error;
 	ktp->ktr_retval = retval;		/* what about val2 ? */
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * When a process exits, drain per-process asynchronous trace records.
  */
 void
 ktrprocexit(struct thread *td)
 {
 
 	/*
 	 * Enter the ktrace region, write out everything queued for the
 	 * process while holding ktrace_sx exclusively, then leave again.
 	 */
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 /*
  * When a thread returns, drain any asynchronous records generated by the
  * system call.
  */
 void
 ktruserret(struct thread *td)
 {
 
 	/*
 	 * Enter the ktrace region, write out everything queued for the
 	 * process while holding ktrace_sx exclusively, then leave again.
 	 */
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 /*
  * Record a path name.  The name is stored without its terminating NUL; an
  * empty path produces a record without payload.
  */
 void
 ktrnamei(char *path)
 {
 	struct ktr_request *req;
 	int namelen;
 	char *buf = NULL;
 
 	/* The name is copied before a request is taken. */
 	namelen = strlen(path);
 	if (namelen > 0) {
 		buf = malloc(namelen, M_KTRACE, M_WAITOK);
 		bcopy(path, buf, namelen);
 	}
 	req = ktr_getrequest(KTR_NAMEI);
 	if (req == NULL) {
 		/* No request available: drop the copy again. */
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	if (namelen > 0) {
 		/* The request now owns buf; ktr_freerequest() frees it. */
 		req->ktr_header.ktr_len = namelen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 void
 ktrgenio(fd, rw, uio, error)
 	int fd;
 	enum uio_rw rw;
 	struct uio *uio;
 	int error;
 {
 	struct ktr_request *req;
 	struct ktr_genio *ktg;
 	int datalen;
 	char *buf;
 
 	if (error) {
 		free(uio, M_IOV);
 		return;
 	}
 	uio->uio_offset = 0;
 	uio->uio_rw = UIO_WRITE;
 	datalen = imin(uio->uio_resid, ktr_geniosize);
 	buf = malloc(datalen, M_KTRACE, M_WAITOK);
 	error = uiomove(buf, datalen, uio);
 	free(uio, M_IOV);
 	if (error) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	req = ktr_getrequest(KTR_GENIO);
 	if (req == NULL) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	ktg = &req->ktr_data.ktr_genio;
 	ktg->ktr_fd = fd;
 	ktg->ktr_rw = rw;
 	req->ktr_header.ktr_len = datalen;
 	req->ktr_buffer = buf;
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Record a signal event.  The record is queued on the process instead of
  * being written right away.
  */
 void
 ktrpsig(int sig, sig_t action, sigset_t *mask, int code)
 {
 	struct ktr_request *req;
 	struct ktr_psig	*kp;
 
 	req = ktr_getrequest(KTR_PSIG);
 	if (req == NULL)
 		return;
 	kp = &req->ktr_data.ktr_psig;
 	kp->signo = (char)sig;
 	kp->action = action;
 	kp->mask = *mask;
 	kp->code = code;
 	ktr_enqueuerequest(curthread, req);
 }
 
 /*
  * Record a context switch event.  The record is queued on the process
  * instead of being written right away.
  */
 void
 ktrcsw(int out, int user)
 {
 	struct ktr_request *req;
 	struct ktr_csw *kc;
 
 	req = ktr_getrequest(KTR_CSW);
 	if (req == NULL)
 		return;
 	kc = &req->ktr_data.ktr_csw;
 	kc->out = out;
 	kc->user = user;
 	ktr_enqueuerequest(curthread, req);
 }
 #endif /* KTRACE */
 
 /* Interface and common routines */
 
 /*
  * Argument structure of ktrace(); only defined here when <sys/sysproto.h>
  * has not already been seen.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ktrace_args {
 	/* User-space path of the trace file. */
 	char	*fname;
 	/* Operation (extracted with KTROP()) plus KTRFLAG_DESCEND. */
 	int	ops;
 	/* Mask of trace facilities to set or clear. */
 	int	facs;
 	/* Process id, or a negated process group id. */
 	int	pid;
 };
 #endif
 /* ARGSUSED */
 /*
  * ktrace system call: enable or disable tracing for a process, a process
  * group (negative pid) or, with KTRFLAG_DESCEND, a process and all its
  * descendants; KTROP_CLEARFILE stops every use of the given trace file.
  * Returns 0 or an errno value; ENOSYS when built without KTRACE.
  */
 int
 ktrace(struct thread *td, struct ktrace_args *uap)
 {
 #ifdef KTRACE
 	struct vnode *vp = NULL;
 	struct proc *p;
 	struct pgrp *pg;
 	int facs = uap->facs & ~KTRFAC_ROOT;
 	int ops = KTROP(uap->ops);
 	int descend = uap->ops & KTRFLAG_DESCEND;
 	int nfound, ret = 0;
 	int flags, error = 0, vfslocked;
 	struct nameidata nd;
 	struct ucred *cred;
 
 	/*
 	 * Need something to (un)trace.
 	 */
 	if (ops != KTROP_CLEARFILE && facs == 0)
 		return (EINVAL);
 
 	ktrace_enter(td);
 	if (ops != KTROP_CLEAR) {
 		/*
 		 * an operation which requires a file argument.
 		 */
 		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
 		    uap->fname, td);
 		flags = FREAD | FWRITE | O_NOFOLLOW;
 		error = vn_open(&nd, &flags, 0, NULL);
 		if (error) {
 			ktrace_exit(td);
 			return (error);
 		}
 		vfslocked = NDHASGIANT(&nd);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vp = nd.ni_vp;
 		VOP_UNLOCK(vp, 0, td);
 		/* Only regular files are accepted as trace files. */
 		if (vp->v_type != VREG) {
 			(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 			VFS_UNLOCK_GIANT(vfslocked);
 			ktrace_exit(td);
 			return (EACCES);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	/*
 	 * Clear all uses of the tracefile.
 	 */
 	if (ops == KTROP_CLEARFILE) {
 		int vrele_count;
 
 		vrele_count = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_tracevp == vp) {
 				if (ktrcanset(td, p)) {
 					mtx_lock(&ktrace_mtx);
 					cred = p->p_tracecred;
 					p->p_tracecred = NULL;
 					p->p_tracevp = NULL;
 					p->p_traceflag = 0;
 					mtx_unlock(&ktrace_mtx);
 					vrele_count++;
 					crfree(cred);
 				} else
 					error = EPERM;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		/*
 		 * The vnode references are only counted in the loop above
 		 * and released here, after the process locks are dropped.
 		 */
 		if (vrele_count > 0) {
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			while (vrele_count-- > 0)
 				vrele(vp);
 			VFS_UNLOCK_GIANT(vfslocked);
 		}
 		goto done;
 	}
 	/*
 	 * do it
 	 */
 	sx_slock(&proctree_lock);
 	if (uap->pid < 0) {
 		/*
 		 * by process group
 		 */
 		pg = pgfind(-uap->pid);
 		if (pg == NULL) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 		/*
 		 * ktrops() may call vrele(). Lock pg_members
 		 * by the proctree_lock rather than pg_mtx.
 		 */
 		PGRP_UNLOCK(pg);
 		nfound = 0;
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p_cansee(td, p) != 0) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			PROC_UNLOCK(p);
 			nfound++;
 			if (descend)
 				ret |= ktrsetchildren(td, p, ops, facs, vp);
 			else
 				ret |= ktrops(td, p, ops, facs, vp);
 		}
 		if (nfound == 0) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 	} else {
 		/*
 		 * by pid
 		 */
 		p = pfind(uap->pid);
 		if (p == NULL) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 		error = p_cansee(td, p);
 		/*
 		 * The slock of the proctree lock will keep this process
 		 * from going away, so unlocking the proc here is ok.
 		 */
 		PROC_UNLOCK(p);
 		if (error) {
 			sx_sunlock(&proctree_lock);
 			goto done;
 		}
 		if (descend)
 			ret |= ktrsetchildren(td, p, ops, facs, vp);
 		else
 			ret |= ktrops(td, p, ops, facs, vp);
 	}
 	sx_sunlock(&proctree_lock);
 	/* ret stays 0 when the operation was permitted on no process. */
 	if (!ret)
 		error = EPERM;
 done:
 	/* Close the file opened above. */
 	if (vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		(void) vn_close(vp, FWRITE, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	ktrace_exit(td);
 	return (error);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
 /* ARGSUSED */
 /*
  * utrace system call: record uap->len bytes of user-supplied data as a
  * KTR_USER record.  Returns 0 without doing anything when KTR_USER is not
  * being traced for the thread, EINVAL when the data exceeds
  * KTR_USER_MAXLEN, the copyin() error, ENOMEM when no request could be
  * obtained, and ENOSYS when built without KTRACE.
  */
 int
 utrace(struct thread *td, struct utrace_args *uap)
 {
 
 #ifdef KTRACE
 	struct ktr_request *req;
 	void *cp;
 	int error;
 
 	if (!KTRPOINT(td, KTR_USER))
 		return (0);
 	if (uap->len > KTR_USER_MAXLEN)
 		return (EINVAL);
 	/* The data is copied in before a request is taken. */
 	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
 	error = copyin(uap->addr, cp, uap->len);
 	if (error) {
 		free(cp, M_KTRACE);
 		return (error);
 	}
 	req = ktr_getrequest(KTR_USER);
 	if (req == NULL) {
 		free(cp, M_KTRACE);
 		return (ENOMEM);
 	}
 	/* The request now owns cp; ktr_freerequest() frees it. */
 	req->ktr_buffer = cp;
 	req->ktr_header.ktr_len = uap->len;
 	ktr_submitrequest(td, req);
 	return (0);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
 #ifdef KTRACE
 /*
  * Apply a trace operation to a single process.  KTROP_SET installs vp and
  * the caller's credential as the trace target and adds the facilities in
  * facs; any other operation removes the facilities in facs and stops
  * tracing once none remain.  Returns 1 when the operation was applied and
  * 0 when ktrcanset() refused it.
  */
 static int
 ktrops(struct thread *td, struct proc *p, int ops, int facs, struct vnode *vp)
 {
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 
 	PROC_LOCK(p);
 	if (!ktrcanset(td, p)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	mtx_lock(&ktrace_mtx);
 	if (ops == KTROP_SET) {
 		if (p->p_tracevp != vp) {
 			/*
 			 * if trace file already in use, relinquish below
 			 */
 			tracevp = p->p_tracevp;
 			VREF(vp);
 			p->p_tracevp = vp;
 		}
 		if (p->p_tracecred != td->td_ucred) {
 			tracecred = p->p_tracecred;
 			p->p_tracecred = crhold(td->td_ucred);
 		}
 		p->p_traceflag |= facs;
 		if (priv_check(td, PRIV_KTRACE) == 0)
 			p->p_traceflag |= KTRFAC_ROOT;
 	} else {
 		/* KTROP_CLEAR */
 		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
 			/* no more tracing */
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
 	/*
 	 * The replaced vnode and credential are released only after both
 	 * locks have been dropped.
 	 */
 	if (tracevp != NULL) {
 		int vfslocked;
 
 		vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 
 	return (1);
 }
 
 static int
 ktrsetchildren(td, top, ops, facs, vp)
 	struct thread *td;
 	struct proc *top;
 	int ops, facs;
 	struct vnode *vp;
 {
 	register struct proc *p;
 	register int ret = 0;
 
 	p = top;
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (;;) {
 		ret |= ktrops(td, p, ops, facs, vp);
 		/*
 		 * If this process has children, descend to them next,
 		 * otherwise do any siblings, and if done with this level,
 		 * follow back up the tree (but not past top).
 		 */
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top)
 				return (ret);
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 	}
 	/*NOTREACHED*/
 }
 
 /*
  * Write one trace request to the calling process's trace vnode.
  *
  * td  - current thread; the vnode and credential come from td->td_proc.
  * req - request to write: its header, then the fixed-size data selected by
  *       the record type (if any), then the variable-length buffer (if any),
  *       appended to the file as a single write.
  *
  * If the process has no trace vnode the request is silently dropped.  If
  * the write fails, tracing to this vnode is switched off for every process
  * in the system that uses it.
  */
 static void
 ktr_writerequest(struct thread *td, struct ktr_request *req)
 {
 	struct ktr_header *kth;
 	struct vnode *vp;
 	struct proc *p;
 	struct ucred *cred;
 	struct uio auio;
 	struct iovec aiov[3];
 	struct mount *mp;
 	int datalen, buflen, vrele_count;
 	int error, vfslocked;
 
 	/*
 	 * We hold the vnode and credential for use in I/O in case ktrace is
 	 * disabled on the process as we write out the request.
 	 *
 	 * XXXRW: This is not ideal: we could end up performing a write after
 	 * the vnode has been closed.
 	 */
 	mtx_lock(&ktrace_mtx);
 	vp = td->td_proc->p_tracevp;
 	if (vp != NULL)
 		VREF(vp);
 	cred = td->td_proc->p_tracecred;
 	if (cred != NULL)
 		crhold(cred);
 	mtx_unlock(&ktrace_mtx);
 
 	/*
 	 * If vp is NULL, the vp has been cleared out from under this
 	 * request, so just drop it.  Make sure the credential and vnode are
 	 * in sync: we should have both or neither.
 	 */
 	if (vp == NULL) {
 		KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
 		return;
 	}
 	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
 
 	/*
 	 * Build the scatter list: header first, then the per-type data,
 	 * then the caller-supplied buffer.
 	 */
 	kth = &req->ktr_header;
 	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
 	buflen = kth->ktr_len;
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	aiov[0].iov_base = (caddr_t)kth;
 	aiov[0].iov_len = sizeof(struct ktr_header);
 	auio.uio_resid = sizeof(struct ktr_header);
 	auio.uio_iovcnt = 1;
 	auio.uio_td = td;
 	if (datalen != 0) {
 		aiov[1].iov_base = (caddr_t)&req->ktr_data;
 		aiov[1].iov_len = datalen;
 		auio.uio_resid += datalen;
 		auio.uio_iovcnt++;
 		/* The header's length field covers the fixed data too. */
 		kth->ktr_len += datalen;
 	}
 	if (buflen != 0) {
 		KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
 		aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
 		aiov[auio.uio_iovcnt].iov_len = buflen;
 		auio.uio_resid += buflen;
 		auio.uio_iovcnt++;
 	}
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_start_write(vp, &mp, V_WAIT);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	(void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
 #ifdef MAC
 	error = mac_vnode_check_write(cred, NOCRED, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * Release the credential reference taken by crhold() above.  The
 	 * write is finished, and the error path below reuses 'cred' as a
 	 * scratch variable, so without this the reference would be leaked on
 	 * every call.
 	 */
 	crfree(cred);
 	if (!error)
 		return;
 	/*
 	 * If error encountered, give up tracing on this vnode.  We defer
 	 * all the vrele()'s on the vnode until after we are finished walking
 	 * the various lists to avoid needlessly holding locks.
 	 */
 	log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
 	    error);
 	vrele_count = 0;
 	/*
 	 * First, clear this vnode from being used by any processes in the
 	 * system.
 	 * XXX - If one process gets an EPERM writing to the vnode, should
 	 * we really do this?  Other processes might have suitable
 	 * credentials for the operation.
 	 */
 	cred = NULL;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_tracevp == vp) {
 			mtx_lock(&ktrace_mtx);
 			p->p_tracevp = NULL;
 			p->p_traceflag = 0;
 			cred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 			vrele_count++;
 		}
 		PROC_UNLOCK(p);
 		/* Drop the process's trace credential outside its lock. */
 		if (cred != NULL) {
 			crfree(cred);
 			cred = NULL;
 		}
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * We can't clear any pending requests in threads that have cached
 	 * them but not yet committed them, as those are per-thread.  The
 	 * thread will have to clear it itself on system call return.
 	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	while (vrele_count-- > 0)
 		vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 /*
  * Decide whether the caller may change the ktrace state of a target
  * process.  The target must not hold more permissions than the caller.
  * When KTRFAC_ROOT is set on the target, root configured the tracing, and
  * the caller must pass the PRIV_KTRACE privilege check to change it.
  *
  * The target's process lock must be held.  Returns 1 if the change is
  * allowed and 0 otherwise.
  */
 static int
 ktrcanset(td, targetp)
 	struct thread *td;
 	struct proc *targetp;
 {
 	int root_traced;
 
 	PROC_LOCK_ASSERT(targetp, MA_OWNED);
 
 	/* The privilege check is only made for root-established traces. */
 	root_traced = (targetp->p_traceflag & KTRFAC_ROOT) != 0;
 	if (root_traced && priv_check(td, PRIV_KTRACE) != 0)
 		return (0);
 
 	return (p_candebug(td, targetp) == 0);
 }
 
 #endif /* KTRACE */
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c	(revision 175201)
+++ head/sys/kern/kern_proc.c	(revision 175202)
@@ -1,1628 +1,1627 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sbuf.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/filedesc.h>
 #include <sys/tty.h>
 #include <sys/signalvar.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/jail.h>
 #include <sys/vnode.h>
 #include <sys/eventhandler.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 
 MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
 MALLOC_DEFINE(M_SESSION, "session", "session header");
 static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
 MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
 static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static int proc_ctor(void *mem, int size, void *arg, int flags);
 static void proc_dtor(void *mem, int size, void *arg);
 static int proc_init(void *mem, int size, int flags);
 static void proc_fini(void *mem, int size);
 
 /*
  * Other process lists
  *
  * The hash tables, lists, locks and proc_zone below are set up in
  * procinit().
  */
 struct pidhashhead *pidhashtbl;	/* pid hash buckets, from hashinit() */
 u_long pidhash;			/* filled in by hashinit() for pidhashtbl */
 struct pgrphashhead *pgrphashtbl;	/* pgrp hash buckets, from hashinit() */
 u_long pgrphash;		/* filled in by hashinit() for pgrphashtbl */
 struct proclist allproc;	/* walked by sysctl_kern_proc() */
 struct proclist zombproc;	/* searched by zpfind() */
 struct sx allproc_lock;		/* held while walking the lists above */
 struct sx proctree_lock;	/* asserted by the pgrp/session routines */
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;		/* "PROC" UMA zone */
 uma_zone_t ithread_zone;
 
 int kstack_pages = KSTACK_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
 
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 
 /*
  * Initialize global process hashing structures.
  *
  * Sets up the allproc and proctree sx locks and the p_peers mutex, empties
  * the allproc and zombproc lists, allocates the pid and process-group hash
  * tables (both sized from maxproc / 4), creates the "PROC" UMA zone with
  * the proc_ctor/proc_dtor/proc_init/proc_fini callbacks, and finally calls
  * uihashinit().
  */
 void
 procinit()
 {
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
 	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
 	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	/* UMA_ZONE_NOFREE: see proc_fini(), which panics if ever called. */
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
 /*
  * UMA constructor for the proc zone: run the process_ctor event handlers
  * on the proc being handed out.  Always returns 0.
  */
 static int
 proc_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct proc *newp = (struct proc *)mem;
 
 	EVENTHANDLER_INVOKE(process_ctor, newp);
 
 	return (0);
 }
 
 /*
  * Reclaim a proc after use.
  *
  * UMA destructor for the proc zone.  If the proc still has a first thread,
  * it checks (under INVARIANTS) that exactly one thread remains and that the
  * p_ktr queue is empty, and disposes of that thread's alternate kernel
  * stack for P_KTHREAD processes.  It then runs the process_dtor event
  * handlers and asserts that p_ksi, if present, is not queued.
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
 {
 	struct proc *p;
 	struct thread *td;
 
 	/* INVARIANTS checks go here */
 	p = (struct proc *)mem;
 	td = FIRST_THREAD_IN_PROC(p);
 	if (td != NULL) {
 #ifdef INVARIANTS
 		KASSERT((p->p_numthreads == 1),
 		    ("bad number of threads in exiting process"));
 		KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
 #endif
 		/* Dispose of an alternate kstack, if it exists.
 		 * XXX What if there are more than one thread in the proc?
 		 *     The first thread in the proc is special and not
 		 *     freed, so you gotta do this here.
 		 */
 		if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
 			vm_thread_dispose_altkstack(td);
 	}
 	EVENTHANDLER_INVOKE(process_dtor, p);
 	if (p->p_ksi != NULL)
 		KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
 }
 
 /*
  * Initialize type-stable parts of a proc (when newly created).
  *
  * UMA init callback for the proc zone.  Sets up the scheduler-private area,
  * the process mutex and spin lock, and the thread list, then runs the
  * process_init event handlers and allocates the pstats structure.
  * Always returns 0.
  */
 static int
 proc_init(void *mem, int size, int flags)
 {
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	/* The scheduler-private data starts right after struct proc. */
 	p->p_sched = (struct p_sched *)&p[1];
 	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
 	EVENTHANDLER_INVOKE(process_init, p);
 	p->p_stats = pstats_alloc();
 	return (0);
 }
 
 /*
  * UMA should ensure that this function is never called.
  * Freeing a proc structure would violate type stability.
  *
  * Unless 'notnow' is defined this simply panics.  The disabled branch
  * shows the teardown that would mirror proc_init(): run the process_fini
  * handlers, free the pstats, free the first thread, destroy the process
  * mutex and free p_ksi.
  */
 static void
 proc_fini(void *mem, int size)
 {
 #ifdef notnow
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	EVENTHANDLER_INVOKE(process_fini, p);
 	pstats_free(p->p_stats);
 	thread_free(FIRST_THREAD_IN_PROC(p));
 	mtx_destroy(&p->p_mtx);
 	if (p->p_ksi != NULL)
 		ksiginfo_free(p->p_ksi);
 #else
 	panic("proc reclaimed");
 #endif
 }
 
 /*
  * Report whether p is the current process or one of its descendants.
  *
  * Follows p_pptr links upwards from p.  Returns 1 when curproc is reached
  * and 0 when the walk reaches pid 0 first.  The caller must hold
  * proctree_lock.
  */
 int
 inferior(p)
 	register struct proc *p;
 {
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	while (p != curproc) {
 		if (p->p_pid == 0)
 			return (0);
 		p = p->p_pptr;
 	}
 	return (1);
 }
 
 /*
  * Locate a process by number; return only "live" processes -- i.e., neither
  * zombies nor newly born but incompletely initialized processes.  By not
  * returning processes in the PRS_NEW state, we allow callers to avoid
  * testing for that condition to avoid dereferencing p_ucred, et al.
  */
 struct proc *
 pfind(pid)
 	register pid_t pid;
 {
 	register struct proc *p;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash)
 		if (p->p_pid == pid) {
 			if (p->p_state == PRS_NEW) {
 				p = NULL;
 				break;
 			}
 			PROC_LOCK(p);
 			break;
 		}
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
 
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
  */
 struct pgrp *
 pgfind(pgid)
 	register pid_t pgid;
 {
 	register struct pgrp *pgrp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
 		if (pgrp->pg_id == pgid) {
 			PGRP_LOCK(pgrp);
 			return (pgrp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Create a new process group.
  * pgid must be equal to the pid of p.
  * Begin a new session if required.
  *
  * p    - process that becomes the first member of the new group.
  * pgid - id of the new group; asserted to equal p->p_pid and to be unused.
  * pgrp - caller-allocated group structure, initialized here.
  * sess - caller-allocated session structure to start a new session with p
  *        as leader, or NULL to place the group in p's current session.
  *
  * The caller must hold proctree_lock exclusively.  Always returns 0.
  */
 int
 enterpgrp(p, pgid, pgrp, sess)
 	register struct proc *p;
 	pid_t pgid;
 	struct pgrp *pgrp;
 	struct session *sess;
 {
 	struct pgrp *pgrp2;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
 	KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
 	KASSERT(p->p_pid == pgid,
 	    ("enterpgrp: new pgrp and pid != pgid"));
 
 	pgrp2 = pgfind(pgid);
 
 	KASSERT(pgrp2 == NULL,
 	    ("enterpgrp: pgrp with pgid exists"));
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
 	mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 
 	if (sess != NULL) {
 		/*
 		 * new session
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
 		mtx_lock(&Giant);       /* XXX TTY */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
 		PGRP_LOCK(pgrp);
 		sess->s_leader = p;
 		sess->s_sid = p->p_pid;
 		sess->s_count = 1;
 		sess->s_ttyvp = NULL;
 		sess->s_ttyp = NULL;
 		/* The login name is carried over from the old session. */
 		bcopy(p->p_session->s_login, sess->s_login,
 			    sizeof(sess->s_login));
 		pgrp->pg_session = sess;
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
 		/* Join p's existing session and take a reference on it. */
 		mtx_lock(&Giant);       /* XXX TTY */
 		pgrp->pg_session = p->p_session;
 		SESS_LOCK(pgrp->pg_session);
 		pgrp->pg_session->s_count++;
 		SESS_UNLOCK(pgrp->pg_session);
 		PGRP_LOCK(pgrp);
 	}
 	/* Both branches leave Giant and the pgrp lock held here. */
 	pgrp->pg_id = pgid;
 	LIST_INIT(&pgrp->pg_members);
 
 	/*
 	 * As we have an exclusive lock of proctree_lock,
 	 * this should not deadlock.
 	 */
 	LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
 	mtx_unlock(&Giant);       /* XXX TTY */
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to an existing process group
  *
  * The caller must hold proctree_lock exclusively and must not hold the
  * process lock, either group lock, or the session lock.  The target group
  * is asserted to be in p's session and to differ from p's current group.
  * The move itself is done by doenterpgrp().  Always returns 0.
  */
 int
 enterthispgrp(p, pgrp)
 	register struct proc *p;
 	struct pgrp *pgrp;
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 	KASSERT(pgrp->pg_session == p->p_session,
 		("%s: pgrp's session %p, p->p_session %p.\n",
 		__func__,
 		pgrp->pg_session,
 		p->p_session));
 	KASSERT(pgrp != p->p_pgrp,
 		("%s: p belongs to pgrp.", __func__));
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to a process group
  *
  * Adjusts the job-control counts of the affected groups, moves p from its
  * current group's member list to pgrp's, and deletes the old group if it
  * has no members left.  The caller must hold proctree_lock exclusively and
  * none of the process, group or session locks.
  */
 static void
 doenterpgrp(p, pgrp)
 	struct proc *p;
 	struct pgrp *pgrp;
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 
 	/* Remember the old group; p->p_pgrp is overwritten below. */
 	savepgrp = p->p_pgrp;
 
 	/*
 	 * Adjust eligibility of affected pgrps to participate in job control.
 	 * Increment eligibility counts before decrementing, otherwise we
 	 * could reach 0 spuriously during the first call.
 	 */
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
 	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = pgrp;
 	PROC_UNLOCK(p);
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
 	mtx_unlock(&Giant);     /* XXX TTY */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
 
 /*
  * remove process from process group
  *
  * Unlinks p from its group's member list and clears p->p_pgrp; the group
  * is deleted if that leaves it empty.  The caller must hold proctree_lock
  * exclusively.  Always returns 0.
  */
 int
 leavepgrp(p)
 	register struct proc *p;
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	savepgrp = p->p_pgrp;
 	mtx_lock(&Giant);	/* XXX TTY */
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = NULL;
 	PROC_UNLOCK(p);
 	PGRP_UNLOCK(savepgrp);
 	mtx_unlock(&Giant);	/* XXX TTY */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 	return (0);
 }
 
 /*
  * delete a process group
  *
  * Detaches sigio owners that refer to the group, clears the session
  * terminal's t_pgrp if it points at this group, removes the group from the
  * pgrp hash, drops the group's session reference, and frees the group.
  * The caller must hold proctree_lock exclusively and neither the group
  * lock nor the session lock.
  */
 static void
 pgdelete(pgrp)
 	register struct pgrp *pgrp;
 {
 	struct session *savesess;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pgid.
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
 	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	if (pgrp->pg_session->s_ttyp != NULL &&
 	    pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
 		pgrp->pg_session->s_ttyp->t_pgrp = NULL;
 	LIST_REMOVE(pgrp, pg_hash);
 	savesess = pgrp->pg_session;
 	SESSRELE(savesess);
 	PGRP_UNLOCK(pgrp);
 	/* The mutex is destroyed only after it has been released. */
 	mtx_destroy(&pgrp->pg_mtx);
 	FREE(pgrp, M_PGRP);
 	mtx_unlock(&Giant);     /* XXX TTY */
 }
 
 /*
  * Adjust a process group's job-control count under the group lock:
  * increment it when 'entering' is non-zero, otherwise decrement it and
  * call orphanpg() if the count has dropped to zero.
  */
 static void
 pgadjustjobc(pgrp, entering)
 	struct pgrp *pgrp;
 	int entering;
 {
 
 	PGRP_LOCK(pgrp);
 	if (!entering) {
 		pgrp->pg_jobc--;
 		if (pgrp->pg_jobc == 0)
 			orphanpg(pgrp);
 	} else {
 		pgrp->pg_jobc++;
 	}
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Adjust pgrp jobc counters when specified process changes process group.
  * We count the number of processes in each process group that "qualify"
  * the group for terminal job control (those with a parent in a different
  * process group of the same session).  If that count reaches zero, the
  * process group becomes orphaned.  Check both the specified process'
  * process group and that of its children.
  * entering == 0 => p is leaving specified group.
  * entering == 1 => p is entering specified group.
  *
  * The caller must hold proctree_lock and must not hold the process lock,
  * the group lock or the session lock.
  */
 void
 fixjobc(p, pgrp, entering)
 	register struct proc *p;
 	register struct pgrp *pgrp;
 	int entering;
 {
 	register struct pgrp *hispgrp;
 	register struct session *mysession;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Check p's parent to see whether p qualifies its own process
 	 * group; if so, adjust count for p's process group.
 	 */
 	mysession = pgrp->pg_session;
 	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
 	    hispgrp->pg_session == mysession)
 		pgadjustjobc(pgrp, entering);
 
 	/*
 	 * Check this process' children to see whether they qualify
 	 * their process groups; if so, adjust counts for children's
 	 * process groups.
 	 */
 	/* From here on 'p' is reused as the iterator over the children. */
 	LIST_FOREACH(p, &p->p_children, p_sibling) {
 		hispgrp = p->p_pgrp;
 		if (hispgrp == pgrp ||
 		    hispgrp->pg_session != mysession)
 			continue;
 		/* Zombie children are not counted. */
 		PROC_LOCK(p);
 		if (p->p_state == PRS_ZOMBIE) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		PROC_UNLOCK(p);
 		pgadjustjobc(hispgrp, entering);
 	}
 }
 
 /*
  * A process group has become orphaned;
  * if there are any stopped processes in the group,
  * hang-up all process in that group.
  *
  * Must be called with the group lock held.  As soon as one member
  * satisfies P_SHOULDSTOP(), every member is sent SIGHUP followed by
  * SIGCONT and the function returns.
  */
 static void
 orphanpg(pg)
 	struct pgrp *pg;
 {
 	register struct proc *p;
 
 	PGRP_LOCK_ASSERT(pg, MA_OWNED);
 
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		if (P_SHOULDSTOP(p)) {
 			PROC_UNLOCK(p);
 			/*
 			 * Restart from the head of the list; reusing 'p' is
 			 * safe because we return right after this loop.
 			 */
 			LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 				PROC_LOCK(p);
 				psignal(p, SIGHUP);
 				psignal(p, SIGCONT);
 				PROC_UNLOCK(p);
 			}
 			return;
 		}
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Drop one reference on a session.  When the count reaches zero, release
  * the session's tty (if it has one), destroy the session mutex and free
  * the session.
  */
 void
 sessrele(struct session *s)
 {
 	int remaining;
 
 	SESS_LOCK(s);
 	remaining = --s->s_count;
 	SESS_UNLOCK(s);
 
 	/* Other references remain: nothing to tear down. */
 	if (remaining != 0)
 		return;
 
 	if (s->s_ttyp != NULL)
 		ttyrel(s->s_ttyp);
 	mtx_destroy(&s->s_mtx);
 	FREE(s, M_SESSION);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB "show pgrpdump": print every non-empty bucket of the process-group
  * hash table, each group in it, and each member process of that group.
  */
 DB_SHOW_COMMAND(pgrpdump, pgrpdump)
 {
 	struct pgrp *pg;
 	struct proc *member;
 	int bucket;
 
 	for (bucket = 0; bucket <= pgrphash; bucket++) {
 		/* Empty buckets produce no output at all. */
 		if (LIST_EMPTY(&pgrphashtbl[bucket]))
 			continue;
 		printf("\tindx %d\n", bucket);
 		LIST_FOREACH(pg, &pgrphashtbl[bucket], pg_hash) {
 			printf(
 		    "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
 			    (void *)pg, (long)pg->pg_id,
 			    (void *)pg->pg_session,
 			    pg->pg_session->s_count,
 			    (void *)LIST_FIRST(&pg->pg_members));
 			LIST_FOREACH(member, &pg->pg_members, p_pglist) {
 				printf("\t\tpid %ld addr %p pgrp %p\n",
 				    (long)member->p_pid, (void *)member,
 				    (void *)member->p_pgrp);
 			}
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Clear kinfo_proc and fill in any information that is common
  * to all threads in the process.
  * Must be called with the target process locked.
  *
  * p  - source process; its process lock must be held on entry.
  * kp - destination; zeroed first, then filled from p, its credential,
  *      signal actions, vmspace, process group, session and controlling tty.
  */
 static void
 fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *td0;
 	struct tty *tp;
 	struct session *sp;
 	struct ucred *cred;
 	struct sigacts *ps;
 
 	bzero(kp, sizeof(*kp));
 
 	kp->ki_structsize = sizeof(*kp);
 	kp->ki_paddr = p;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */
 	kp->ki_args = p->p_args;
 	kp->ki_textvp = p->p_textvp;
 #ifdef KTRACE
 	kp->ki_tracep = p->p_tracevp;
 	mtx_lock(&ktrace_mtx);
 	kp->ki_traceflag = p->p_traceflag;
 	mtx_unlock(&ktrace_mtx);
 #endif
 	kp->ki_fd = p->p_fd;
 	kp->ki_vmspace = p->p_vmspace;
 	kp->ki_flag = p->p_flag;
 	cred = p->p_ucred;
 	if (cred) {
 		kp->ki_uid = cred->cr_uid;
 		kp->ki_ruid = cred->cr_ruid;
 		kp->ki_svuid = cred->cr_svuid;
 		/* XXX bde doesn't like KI_NGROUPS */
 		kp->ki_ngroups = min(cred->cr_ngroups, KI_NGROUPS);
 		bcopy(cred->cr_groups, kp->ki_groups,
 		    kp->ki_ngroups * sizeof(gid_t));
 		kp->ki_rgid = cred->cr_rgid;
 		kp->ki_svgid = cred->cr_svgid;
 		/* If jailed(cred), emulate the old P_JAILED flag. */
 		if (jailed(cred)) {
 			kp->ki_flag |= P_JAILED;
 			/* If inside a jail, use 0 as a jail ID. */
 			if (!jailed(curthread->td_ucred))
 				kp->ki_jid = cred->cr_prison->pr_id;
 		}
 	}
 	ps = p->p_sigacts;
 	if (ps) {
 		mtx_lock(&ps->ps_mtx);
 		kp->ki_sigignore = ps->ps_sigignore;
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/* Memory sizes and run-time accounting are read under p_slock. */
 	PROC_SLOCK(p);
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
 		struct vmspace *vm = p->p_vmspace;
 
 		kp->ki_size = vm->vm_map.size;
 		kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
 		/* Kernel stack pages are added to the resident size. */
 		FOREACH_THREAD_IN_PROC(p, td0) {
 			if (!TD_IS_SWAPPED(td0))
 				kp->ki_rssize += td0->td_kstack_pages;
 			if (td0->td_altkstack_obj != NULL)
 				kp->ki_rssize += td0->td_altkstack_pages;
 		}
 		kp->ki_swrss = vm->vm_swrss;
 		kp->ki_tsize = vm->vm_tsize;
 		kp->ki_dsize = vm->vm_dsize;
 		kp->ki_ssize = vm->vm_ssize;
 	} else if (p->p_state == PRS_ZOMBIE)
 		kp->ki_stat = SZOMB;
 	if (kp->ki_flag & P_INMEM)
 		kp->ki_sflag = PS_INMEM;
 	else
 		kp->ki_sflag = 0;
 	/* Calculate legacy swtime as seconds since 'swtick'. */
 	kp->ki_swtime = (ticks - p->p_swtick) / hz;
 	kp->ki_pid = p->p_pid;
 	kp->ki_nice = p->p_nice;
 	rufetch(p, &kp->ki_rusage);
 	kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
 	PROC_SUNLOCK(p);
 	if ((p->p_flag & P_INMEM) && p->p_stats != NULL) {
 		/* p_start is offset by boottime before being reported. */
 		kp->ki_start = p->p_stats->p_start;
 		timevaladd(&kp->ki_start, &boottime);
 		PROC_SLOCK(p);
 		calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
 		PROC_SUNLOCK(p);
 		calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 
 		/* Some callers want child-times in a single value */
 		kp->ki_childtime = kp->ki_childstime;
 		timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 	}
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
 		kp->ki_jobc = p->p_pgrp->pg_jobc;
 		sp = p->p_pgrp->pg_session;
 
 		if (sp != NULL) {
 			kp->ki_sid = sp->s_sid;
 			SESS_LOCK(sp);
 			strlcpy(kp->ki_login, sp->s_login,
 			    sizeof(kp->ki_login));
 			if (sp->s_ttyvp)
 				kp->ki_kiflag |= KI_CTTY;
 			if (SESS_LEADER(p))
 				kp->ki_kiflag |= KI_SLEADER;
 			tp = sp->s_ttyp;
 			SESS_UNLOCK(sp);
 		}
 	}
 	/* tty fields are filled only for a process with P_CONTROLT set. */
 	if ((p->p_flag & P_CONTROLT) && tp != NULL) {
 		kp->ki_tdev = dev2udev(tp->t_dev);
 		kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
 		if (tp->t_session)
 			kp->ki_tsid = tp->t_session->s_sid;
 	} else
 		kp->ki_tdev = NODEV;
 	if (p->p_comm[0] != '\0')
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
 	if (p->p_sysent && p->p_sysent->sv_name != NULL &&
 	    p->p_sysent->sv_name[0] != '\0')
 		strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
 	kp->ki_siglist = p->p_siglist;
 	kp->ki_xstat = p->p_xstat;
 	kp->ki_acflag = p->p_acflag;
 	kp->ki_lock = p->p_lock;
 	if (p->p_pptr)
 		kp->ki_ppid = p->p_pptr->p_pid;
 }
 
 /*
  * Fill in information that is thread specific.
  * Must be called with p_slock locked.
  *
  * td - thread to describe; its thread lock is taken for the duration.
  * kp - destination, already populated with the process-wide fields.  The
  *      thread's pending signals are OR'ed into the existing ki_siglist
  *      rather than replacing it.
  */
 static void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	thread_lock(td);
 	if (td->td_wmesg != NULL)
 		strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
 	else
 		bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
 	if (td->td_name[0] != '\0')
 		strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm));
 	if (TD_ON_LOCK(td)) {
 		kp->ki_kiflag |= KI_LOCKBLOCK;
 		strlcpy(kp->ki_lockname, td->td_lockname,
 		    sizeof(kp->ki_lockname));
 	} else {
 		/* Clear any lock state left in kp by a previous thread. */
 		kp->ki_kiflag &= ~KI_LOCKBLOCK;
 		bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
 	}
 
 	/* Derive the ps-style state from the process and thread states. */
 	if (p->p_state == PRS_NORMAL) { /*  XXXKSE very approximate */
 		if (TD_ON_RUNQ(td) ||
 		    TD_CAN_RUN(td) ||
 		    TD_IS_RUNNING(td)) {
 			kp->ki_stat = SRUN;
 		} else if (P_SHOULDSTOP(p)) {
 			kp->ki_stat = SSTOP;
 		} else if (TD_IS_SLEEPING(td)) {
 			kp->ki_stat = SSLEEP;
 		} else if (TD_ON_LOCK(td)) {
 			kp->ki_stat = SLOCK;
 		} else {
 			kp->ki_stat = SWAIT;
 		}
 	} else if (p->p_state == PRS_ZOMBIE) {
 		kp->ki_stat = SZOMB;
 	} else {
 		kp->ki_stat = SIDL;
 	}
 
 	/* Things in the thread */
 	kp->ki_wchan = td->td_wchan;
 	kp->ki_pri.pri_level = td->td_priority;
 	kp->ki_pri.pri_native = td->td_base_pri;
 	kp->ki_lastcpu = td->td_lastcpu;
 	kp->ki_oncpu = td->td_oncpu;
 	kp->ki_tdflags = td->td_flags;
 	kp->ki_tid = td->td_tid;
 	kp->ki_numthreads = p->p_numthreads;
 	kp->ki_pcb = td->td_pcb;
 	kp->ki_kstack = (void *)td->td_kstack;
 	kp->ki_pctcpu = sched_pctcpu(td);
 	kp->ki_estcpu = td->td_estcpu;
 	kp->ki_slptime = (ticks - td->td_slptick) / hz;
 	kp->ki_pri.pri_class = td->td_pri_class;
 	kp->ki_pri.pri_user = td->td_user_pri;
 
 	/* We can't get this anymore but ps etc never used it anyway. */
 	kp->ki_rqindex = 0;
 
 	SIGSETOR(kp->ki_siglist, td->td_siglist);
 	kp->ki_sigmask = td->td_sigmask;
 	thread_unlock(td);
 }
 
 /*
  * Build a complete kinfo_proc for a process: the process-wide fields
  * first, then the thread-specific fields of its first thread, if it has
  * one.  The target process must be locked by the caller.
  */
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *first;
 
 	fill_kinfo_proc_only(p, kp);
 
 	/* fill_kinfo_thread() requires p_slock to be held. */
 	PROC_SLOCK(p);
 	first = FIRST_THREAD_IN_PROC(p);
 	if (first != NULL)
 		fill_kinfo_thread(first, kp);
 	PROC_SUNLOCK(p);
 }
 
 /*
  * Allocate a zero-filled pstats structure from M_SUBPROC.  The allocation
  * uses M_WAITOK.
  */
 struct pstats *
 pstats_alloc(void)
 {
 	struct pstats *ps;
 
 	ps = malloc(sizeof(*ps), M_SUBPROC, M_WAITOK | M_ZERO);
 	return (ps);
 }
 
 /*
  * Copy parts of p_stats; zero the rest of p_stats (statistics).
  */
 void
 pstats_fork(struct pstats *src, struct pstats *dst)
 {
 
 	bzero(&dst->pstat_startzero,
 	    __rangeof(struct pstats, pstat_startzero, pstat_endzero));
 	bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
 	    __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
 }
 
 /*
  * Release a pstats structure back to M_SUBPROC, the malloc type used by
  * pstats_alloc().
  */
 void
 pstats_free(struct pstats *ps)
 {
 
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Look up a process by pid on the zombie list.
  *
  * Returns the process with its lock held, or NULL if no entry on zombproc
  * has that pid.
  */
 struct proc *
 zpfind(pid_t pid)
 {
 	struct proc *zp;
 
 	sx_slock(&allproc_lock);
 	/* zp is NULL after the loop when the list held no match. */
 	LIST_FOREACH(zp, &zombproc, p_list)
 		if (zp->p_pid == pid)
 			break;
 	if (zp != NULL)
 		PROC_LOCK(zp);
 	sx_sunlock(&allproc_lock);
 	return (zp);
 }
 
 #define KERN_PROC_ZOMBMASK	0x3
 #define KERN_PROC_NOTHREADS	0x4
 
 /*
  * Must be called with the process locked and will return with it unlocked.
  *
  * Copies kinfo_proc records for p out to the sysctl request: a single
  * record built from the first thread when KERN_PROC_NOTHREADS is set,
  * otherwise one record per thread (or one record if there are no threads).
  *
  * Returns the SYSCTL_OUT() error if a copy fails.  Otherwise the pid is
  * looked up again (in the zombie list when any KERN_PROC_ZOMBMASK bit is
  * set) and EAGAIN is returned if it no longer resolves to the same proc;
  * 0 is returned on success.
  */
 static int
 sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags)
 {
 	struct thread *td;
 	struct kinfo_proc kinfo_proc;
 	int error = 0;
 	struct proc *np;
 	/* Saved now because p is unlocked before the re-lookup below. */
 	pid_t pid = p->p_pid;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fill_kinfo_proc_only(p, &kinfo_proc);
 	if (flags & KERN_PROC_NOTHREADS) {
 		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc);
 		PROC_SUNLOCK(p);
 		error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 				   sizeof(kinfo_proc));
 	} else {
 		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			FOREACH_THREAD_IN_PROC(p, td) {
 				fill_kinfo_thread(td, &kinfo_proc);
 				error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 						   sizeof(kinfo_proc));
 				if (error)
 					break;
 			}
 		else
 			error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 					   sizeof(kinfo_proc));
 		PROC_SUNLOCK(p);
 	}
 	PROC_UNLOCK(p);
 	if (error)
 		return (error);
 	/* Verify that the pid still names the process we just reported. */
 	if (flags & KERN_PROC_ZOMBMASK)
 		np = zpfind(pid);
 	else {
 		if (pid == 0)
 			return (0);
 		np = pfind(pid);
 	}
 	if (np == NULL)
 		return EAGAIN;
 	if (np != p) {
 		PROC_UNLOCK(np);
 		return EAGAIN;
 	}
 	PROC_UNLOCK(np);
 	return (0);
 }
 
 /*
  * Handler shared by the kern.proc process-table sysctl nodes.
  *
  * arg1/arg2 carry the remaining OID name components (the selector
  * argument, e.g. a pid or uid) and oidp->oid_number identifies which
  * selector is in use.  Each matching process is written to the request
  * with sysctl_out_proc().
  *
  * Returns 0 on success, EINVAL when the number of name components does
  * not fit the selector, ESRCH when a KERN_PROC_PID lookup fails, or the
  * error produced by p_cansee() (KERN_PROC_PID only),
  * sysctl_wire_old_buffer(), SYSCTL_OUT() or sysctl_out_proc().
  */
 static int
 sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, doingzomb, oid_number;
 	int error = 0;
 
 	/*
 	 * Per-thread output is requested only by KERN_PROC_ALL or by nodes
 	 * registered with KERN_PROC_INC_THREAD; every other node gets
 	 * KERN_PROC_NOTHREADS.  The INC_THREAD bit is stripped so that the
 	 * comparisons below see the plain selector value.
 	 */
 	oid_number = oidp->oid_number;
 	if (oid_number != KERN_PROC_ALL &&
 	    (oid_number & KERN_PROC_INC_THREAD) == 0)
 		flags = KERN_PROC_NOTHREADS;
 	else {
 		flags = 0;
 		oid_number &= ~KERN_PROC_INC_THREAD;
 	}
 	/* Single-pid lookup: no need to walk the process lists. */
 	if (oid_number == KERN_PROC_PID) {
 		if (namelen != 1) 
 			return (EINVAL);
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);		
 		p = pfind((pid_t)name[0]);
 		if (!p)
 			return (ESRCH);
 		if ((error = p_cansee(curthread, p))) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 		/* sysctl_out_proc() drops the process lock before returning. */
 		error = sysctl_out_proc(p, req, flags);
 		return (error);
 	}
 
 	/* Validate the number of selector arguments for the list cases. */
 	switch (oid_number) {
 	case KERN_PROC_ALL:
 		if (namelen != 0)
 			return (EINVAL);
 		break;
 	case KERN_PROC_PROC:
 		if (namelen != 0 && namelen != 1)
 			return (EINVAL);
 		break;
 	default:
 		if (namelen != 1)
 			return (EINVAL);
 		break;
 	}
 	
 	if (!req->oldptr) {
 		/* overestimate by 5 procs */
 		error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
 		if (error)
 			return (error);
 	}
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sx_slock(&allproc_lock);
 	/* Pass 0 walks allproc, pass 1 walks zombproc. */
 	for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
 		if (!doingzomb)
 			p = LIST_FIRST(&allproc);
 		else
 			p = LIST_FIRST(&zombproc);
 		for (; p != 0; p = LIST_NEXT(p, p_list)) {
 			/*
 			 * Skip embryonic processes.
 			 */
 			PROC_SLOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_SUNLOCK(p);
 				continue;
 			}
 			PROC_SUNLOCK(p);
 			PROC_LOCK(p);
 			KASSERT(p->p_ucred != NULL,
 			    ("process credential is NULL for non-NEW proc"));
 			/*
 			 * Show a user only appropriate processes.
 			 */
 			if (p_cansee(curthread, p)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * TODO - make more efficient (see notes below).
 			 * do by session.
 			 */
 			/*
 			 * Each case unlocks p and moves on to the next
 			 * process when p does not match the selector.
 			 */
 			switch (oid_number) {
 
 			case KERN_PROC_GID:
 				if (p->p_ucred->cr_gid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
 				if (p->p_pgrp == NULL ||
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RGID:
 				if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_SESSION:
 				if (p->p_session == NULL ||
 				    p->p_session->s_sid != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_TTY:
 				if ((p->p_flag & P_CONTROLT) == 0 ||
 				    p->p_session == NULL) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				/* s_ttyp is examined under the session lock. */
 				SESS_LOCK(p->p_session);
 				if (p->p_session->s_ttyp == NULL ||
 				    dev2udev(p->p_session->s_ttyp->t_dev) != 
 				    (dev_t)name[0]) {
 					SESS_UNLOCK(p->p_session);
 					PROC_UNLOCK(p);
 					continue;
 				}
 				SESS_UNLOCK(p->p_session);
 				break;
 
 			case KERN_PROC_UID:
 				if (p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
 				if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PROC:
 				break;
 
 			default:
 				break;
 
 			}
 
 			/*
 			 * doingzomb (0 or 1) is OR'ed into flags;
 			 * sysctl_out_proc() tests KERN_PROC_ZOMBMASK to
 			 * choose between zpfind() and pfind() when it
 			 * re-validates the process.  It also unlocks p.
 			 */
 			error = sysctl_out_proc(p, req, flags | doingzomb);
 			if (error) {
 				sx_sunlock(&allproc_lock);
 				return (error);
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return (0);
 }
 
 /*
  * Allocate a pargs structure with room for len bytes of argument data
  * after the header.  The result starts out with a reference count of one
  * and ar_length set to len; the argument bytes themselves are not
  * initialized here.
  */
 struct pargs *
 pargs_alloc(int len)
 {
 	struct pargs *args;
 
 	MALLOC(args, struct pargs *, sizeof(*args) + len, M_PARGS, M_WAITOK);
 	args->ar_length = len;
 	refcount_init(&args->ar_ref, 1);
 	return (args);
 }
 
 /*
  * Release the memory of a pargs structure unconditionally; the reference
  * count is not consulted here (pargs_drop() is the counted release).
  */
 void
 pargs_free(struct pargs *pa)
 {
 
 	FREE(pa, M_PARGS);
 }
 
 /*
  * Take an additional reference on pa.  A NULL pa is accepted and ignored.
  */
 void
 pargs_hold(struct pargs *pa)
 {
 
 	if (pa != NULL)
 		refcount_acquire(&pa->ar_ref);
 }
 
 /*
  * Give up one reference on pa and free the structure once
  * refcount_release() reports that it was the last one.  A NULL pa is
  * accepted and ignored.
  */
 void
 pargs_drop(struct pargs *pa)
 {
 
 	if (pa != NULL && refcount_release(&pa->ar_ref))
 		pargs_free(pa);
 }
 
 /*
  * This sysctl allows a process to retrieve the argument list or process
  * title for another process without groping around in the address space
  * of the other process.  It also allows a process to set its own process
  * title to a string of its own choice.
  *
  * name[0] is the target pid.  Returns EINVAL for a wrong number of name
  * components, ESRCH if the process is not found, the p_cansee() error if
  * it is not visible, EPERM when a new value is supplied for a process
  * other than the caller, ENOMEM when the new value exceeds
  * ps_arg_cache_limit, or the SYSCTL_OUT()/SYSCTL_IN() error.
  */
 static int
 sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct pargs *newpa, *pa;
 	struct proc *p;
 	int error = 0;
 
 	if (namelen != 1) 
 		return (EINVAL);
 
 	p = pfind((pid_t)name[0]);
 	if (!p)
 		return (ESRCH);
 
 	if ((error = p_cansee(curthread, p)) != 0) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	/* Only the process itself may replace its argument string. */
 	if (req->newptr && curproc != p) {
 		PROC_UNLOCK(p);
 		return (EPERM);
 	}
 
 	/*
 	 * Hold a reference on the current args so they can be copied out
 	 * after the process lock has been dropped.
 	 */
 	pa = p->p_args;
 	pargs_hold(pa);
 	PROC_UNLOCK(p);
 	if (req->oldptr != NULL && pa != NULL)
 		error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
 	pargs_drop(pa);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* From here on p == curproc (checked above). */
 	if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
 		return (ENOMEM);
 	newpa = pargs_alloc(req->newlen);
 	error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
 	if (error != 0) {
 		pargs_free(newpa);
 		return (error);
 	}
 	/* Swap in the new args under the lock, then release the old ones. */
 	PROC_LOCK(p);
 	pa = p->p_args;
 	p->p_args = newpa;
 	PROC_UNLOCK(p);
 	pargs_drop(pa);
 	return (0);
 }
 
 /*
  * This sysctl allows a process to retrieve the path of the executable for
  * itself or another process.
  *
  * The single name component is the target pid, with -1 meaning the
  * calling process.  Returns EINVAL for a wrong number of name components,
  * ESRCH if the pid is not found, the p_cansee() error if it is not
  * visible, the vn_fullpath() error, or the SYSCTL_OUT() result.  When the
  * process has no p_textvp, 0 is returned without writing anything.
  */
 static int
 sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct vnode *vp;
 	char *retbuf, *freebuf;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 	} else {
 		p = pfind(*pidp);
 		if (p == NULL)
 			return (ESRCH);
 		if ((error = p_cansee(curthread, p)) != 0) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 	}
 
 	/*
 	 * The process lock is held only on the pfind() path, hence the
 	 * "*pidp != -1" tests guarding each PROC_UNLOCK() below.
 	 */
 	vp = p->p_textvp;
 	if (vp == NULL) {
 		if (*pidp != -1)
 			PROC_UNLOCK(p);
 		return (0);
 	}
 	/* Reference the vnode so it can be used after unlocking p. */
 	vref(vp);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
 	error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
 	vrele(vp);
 	if (error)
 		return (error);
 	/* retbuf is the string to report; freebuf is what must be freed. */
 	error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
 	free(freebuf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	char *sv_name;
 	int *name;
 	int namelen;
 	int error;
 
 	namelen = arg2;
 	if (namelen != 1) 
 		return (EINVAL);
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	sv_name = p->p_sysent->sv_name;
 	PROC_UNLOCK(p);
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
 /*
  * Sysctl handler that writes one struct kinfo_vmentry for every entry of
  * the target process's VM map; sub-map entries are skipped.
  *
  * name[0] is the target pid.  Returns ESRCH if the process is not found
  * or has P_WEXIT set, the p_candebug() error if the caller may not debug
  * it, or the first SYSCTL_OUT() error; otherwise 0.
  */
 static int
 sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
 {
 	vm_map_entry_t entry, tmp_entry;
 	unsigned int last_timestamp;
 	char *fullpath, *freepath;
 	struct kinfo_vmentry *kve;
 	int error, *name;
 	struct vnode *vp;
 	struct proc *p;
 	vm_map_t map;
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	if (p->p_flag & P_WEXIT) {
 		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
 	/* On success this leaves error == 0 for the loop below. */
 	if ((error = p_candebug(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	/* Hold the process across the unlocked work; released by PRELE(). */
 	_PHOLD(p);
 	PROC_UNLOCK(p);
 
 	/* One scratch record, reused for every map entry. */
 	kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
 
 	map = &p->p_vmspace->vm_map;	/* XXXRW: More locking required? */
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		vm_object_t obj, tobj, lobj;
 		vm_offset_t addr;
 		int vfslocked;
 
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		bzero(kve, sizeof(*kve));
 		kve->kve_structsize = sizeof(*kve);
 
 		kve->kve_private_resident = 0;
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			/* obj stays locked until the VM_OBJECT_UNLOCK(obj) below. */
 			VM_OBJECT_LOCK(obj);
 			if (obj->shadow_count == 1)
 				kve->kve_private_resident =
 				    obj->resident_page_count;
 		}
 		/* Count the pages of the entry for which pmap_extract() is non-zero. */
 		kve->kve_resident = 0;
 		addr = entry->start;
 		while (addr < entry->end) {
 			if (pmap_extract(map->pmap, addr))
 				kve->kve_resident++;
 			addr += PAGE_SIZE;
 		}
 
 		/*
 		 * Follow the backing_object chain to its last object (lobj).
 		 * Each object after obj is locked before the previous one is
 		 * unlocked; obj itself remains locked throughout.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_LOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 			lobj = tobj;
 		}
 
 		freepath = NULL;
 		fullpath = "";
 		if (lobj) {
 			vp = NULL;
 			/* The reported type is that of the last object in the chain. */
 			switch(lobj->type) {
 			case OBJT_DEFAULT:
 				kve->kve_type = KVME_TYPE_DEFAULT;
 				break;
 			case OBJT_VNODE:
 				kve->kve_type = KVME_TYPE_VNODE;
 				vp = lobj->handle;
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				kve->kve_type = KVME_TYPE_SWAP;
 				break;
 			case OBJT_DEVICE:
 				kve->kve_type = KVME_TYPE_DEVICE;
 				break;
 			case OBJT_PHYS:
 				kve->kve_type = KVME_TYPE_PHYS;
 				break;
 			case OBJT_DEAD:
 				kve->kve_type = KVME_TYPE_DEAD;
 				break;
 			default:
 				kve->kve_type = KVME_TYPE_UNKNOWN;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_UNLOCK(lobj);
 
 			/* Counts come from the entry's own (top) object. */
 			kve->kve_ref_count = obj->ref_count;
 			kve->kve_shadow_count = obj->shadow_count;
 			VM_OBJECT_UNLOCK(obj);
 			if (vp != NULL) {
 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY,
-				    curthread);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				/*
 				 * The return value is not checked; fullpath
 				 * was preset to "" above.
 				 */
 				vn_fullpath(curthread, vp, &fullpath,
 				    &freepath);
 				vput(vp);
 				VFS_UNLOCK_GIANT(vfslocked);
 			}
 		} else {
 			kve->kve_type = KVME_TYPE_NONE;
 			kve->kve_ref_count = 0;
 			kve->kve_shadow_count = 0;
 		}
 
 		kve->kve_start = (void*)entry->start;
 		kve->kve_end = (void*)entry->end;
 
 		if (entry->protection & VM_PROT_READ)
 			kve->kve_protection |= KVME_PROT_READ;
 		if (entry->protection & VM_PROT_WRITE)
 			kve->kve_protection |= KVME_PROT_WRITE;
 		if (entry->protection & VM_PROT_EXECUTE)
 			kve->kve_protection |= KVME_PROT_EXEC;
 
 		if (entry->eflags & MAP_ENTRY_COW)
 			kve->kve_flags |= KVME_FLAG_COW;
 		if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 
 		strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/*
 		 * The map lock is dropped around the copy-out.  If the
 		 * timestamp afterwards is not exactly one more than the value
 		 * saved here (presumably meaning the map changed meanwhile --
 		 * TODO confirm), the entry is looked up again by the last
 		 * address of the range just reported (addr == entry->end
 		 * after the residency loop above).
 		 */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		error = SYSCTL_OUT(req, kve, sizeof(*kve));
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		if (last_timestamp + 1 != map->timestamp) {
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	PRELE(p);
 	free(kve, M_TEMP);
 	return (error);
 }
 
 #if defined(STACK) || defined(DDB)
 /*
  * Sysctl handler that writes one struct kinfo_kstack per thread of the
  * target process, containing the thread id, a state code and a textual
  * stack trace.
  *
  * name[0] is the target pid.  Returns ESRCH if the process is not found
  * or has P_WEXIT or P_INEXEC set, the p_candebug() error if the caller may
  * not debug it, or the first SYSCTL_OUT() error; otherwise 0.
  */
 static int
 sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_kstack *kkstp;
 	int error, i, *name, numthreads;
 	lwpid_t *lwpidarray;
 	struct thread *td;
 	struct stack *st;
 	struct sbuf sb;
 	struct proc *p;
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	/* XXXRW: Not clear ESRCH is the right error during proc execve(). */
 	if (p->p_flag & P_WEXIT || p->p_flag & P_INEXEC) {
 		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
 	/* On success this leaves error == 0 for the loops below. */
 	if ((error = p_candebug(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	/* Hold the process across the unlocked work; released by _PRELE(). */
 	_PHOLD(p);
 	PROC_UNLOCK(p);
 
 	kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
 	st = stack_create();
 
 	/*
 	 * Size lwpidarray for the current thread count.  PROC_SLOCK is
 	 * dropped around the M_WAITOK allocation, so the count is re-checked
 	 * afterwards and the array reallocated if the process grew.
 	 */
 	lwpidarray = NULL;
 	numthreads = 0;
 	PROC_SLOCK(p);
 repeat:
 	if (numthreads < p->p_numthreads) {
 		if (lwpidarray != NULL) {
 			free(lwpidarray, M_TEMP);
 			lwpidarray = NULL;
 		}
 		numthreads = p->p_numthreads;
 		PROC_SUNLOCK(p);
 		lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
 		    M_WAITOK | M_ZERO);
 		PROC_SLOCK(p);
 		goto repeat;
 	}
 	PROC_SUNLOCK(p);
 	i = 0;
 
 	/*
 	 * XXXRW: During the below loop, execve(2) and countless other sorts
 	 * of changes could have taken place.  Should we check to see if the
 	 * vmspace has been replaced, or the like, in order to prevent
 	 * giving a snapshot that spans, say, execve(2), with some threads
 	 * before and some after?  Among other things, the credentials could
 	 * have changed, in which case the right to extract debug info might
 	 * no longer be assured.
 	 */
 	/* Snapshot the thread ids first ... */
 	PROC_LOCK(p);
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(i < numthreads,
 		    ("sysctl_kern_proc_kstack: numthreads"));
 		lwpidarray[i] = td->td_tid;
 		i++;
 	}
 	numthreads = i;
 	/*
 	 * ... then look each one up again by id, because the process lock
 	 * is dropped around every copy-out and a thread may be gone by the
 	 * time its turn comes (thread_find() returning NULL).
 	 */
 	for (i = 0; i < numthreads; i++) {
 		td = thread_find(p, lwpidarray[i]);
 		if (td == NULL) {
 			continue;
 		}
 		bzero(kkstp, sizeof(*kkstp));
 		/* The sbuf writes directly into the record's kkst_trace buffer. */
 		(void)sbuf_new(&sb, kkstp->kkst_trace,
 		    sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
 		thread_lock(td);
 		kkstp->kkst_tid = td->td_tid;
 		/* The stack is captured only for threads neither swapped nor running. */
 		if (TD_IS_SWAPPED(td))
 			kkstp->kkst_state = KKST_STATE_SWAPPED;
 		else if (TD_IS_RUNNING(td))
 			kkstp->kkst_state = KKST_STATE_RUNNING;
 		else {
 			kkstp->kkst_state = KKST_STATE_STACKOK;
 			stack_save_td(st, td);
 		}
 		thread_unlock(td);
 		PROC_UNLOCK(p);
 		/*
 		 * NOTE(review): st is printed for every state, including the
 		 * two above in which it was not refreshed for this thread.
 		 */
 		stack_sbuf_print(&sb, st);
 		sbuf_finish(&sb);
 		sbuf_delete(&sb);
 		error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
 		/* Re-take the lock before testing error: the exit path expects it held. */
 		PROC_LOCK(p);
 		if (error)
 			break;
 	}
 	_PRELE(p);
 	PROC_UNLOCK(p);
 	if (lwpidarray != NULL)
 		free(lwpidarray, M_TEMP);
 	stack_destroy(st);
 	free(kkstp, M_TEMP);
 	return (error);
 }
 #endif
 
 /*
  * Registration of the kern.proc sysctl tree.
  *
  * The selector nodes (gid, pgrp, ...) all share sysctl_kern_proc(), which
  * tells them apart by oid_number.  Each selector is registered twice: once
  * under its plain number (one record per process) and once OR'ed with
  * KERN_PROC_INC_THREAD under a "_td" name (one record per thread).
  */
 SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD,  0, "Process table");
 
 SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
 	0, 0, sysctl_kern_proc, "S,proc", "Return entire process table");
 
 /* Per-process variants. */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD,
 	sysctl_kern_proc, "Return process table, no threads");
 
 /* Per-process attribute nodes with their own handlers. */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
 	CTLFLAG_RW | CTLFLAG_ANYBODY,
 	sysctl_kern_proc_args, "Process argument list");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD,
 	sysctl_kern_proc_pathname, "Process executable path");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD,
 	sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)");
 
 /* Per-thread variants (KERN_PROC_INC_THREAD set). */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
 	sid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 /*
  * The description used to read "no threads", copied from the plain
  * KERN_PROC_PROC node; with KERN_PROC_INC_THREAD set sysctl_kern_proc()
  * does not pass KERN_PROC_NOTHREADS, so threads are reported.
  */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Return process table, including threads");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD,
 	sysctl_kern_proc_vmmap, "Process vm map entries");
 
 #if defined(STACK) || defined(DDB)
 static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD,
 	sysctl_kern_proc_kstack, "Process kernel stacks");
 #endif
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 175201)
+++ head/sys/kern/kern_sig.c	(revision 175202)
@@ -1,3327 +1,3327 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kse.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 /* Forward declarations of file-local helpers. */
 static int	coredump(struct thread *);
 static char	*expand_name(const char *, uid_t, pid_t);
 static int	killpg1(struct thread *td, int sig, int pgid, int all);
 static int	issignal(struct thread *p);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static void	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 #ifdef KSE
 static int	do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *);
 #endif
 static void	sigqueue_start(void);
 
 /*
  * UMA zone that ksiginfo structures are allocated from.  It stays NULL
  * until sigqueue_start() creates it; ksiginfo_alloc() and sigqueue_add()
  * check for that.
  */
 static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops =
 	{ 0, filt_sigattach, filt_sigdetach, filt_signal };
 
 int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, 
     &kern_logsigexit, 0, 
     "Log processes quitting on abnormal signals to syslog(3)");
 
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
 
 /* Limit checked against p_pendingcnt in sigqueue_add(). */
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 /* Item count handed to uma_prealloc() in sigqueue_start(). */
 static int	preallocate_siginfo = 1024;
 TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 /* Counts sigqueue_add() calls refused because of max_pending_per_proc. */
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 /* Counts sigqueue_add() calls in which ksiginfo_alloc() returned NULL. */
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  *
  * True when cr1's effective uid is 0 or when either of cr1's real or
  * effective uid equals either of cr2's real or effective uid.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 int sugid_coredump;
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, 
     &sugid_coredump, 0, "Enable coredumping set user/group ID processes");
 
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties:
  */
 #define	SA_KILL		0x01		/* terminates process by default */
 #define	SA_CORE		0x02		/* ditto and coredumps */
 #define	SA_STOP		0x04		/* suspend process */
 #define	SA_TTYSTOP	0x08		/* ditto, from tty */
 #define	SA_IGNORE	0x10		/* ignore by default */
 #define	SA_CONT		0x20		/* continue if suspended */
 #define	SA_CANTMASK	0x40		/* non-maskable, catchable */
 #define	SA_PROC		0x80		/* deliverable to any thread */
 
 /*
  * Property bits per signal, looked up by sigprop() as
  * sigproptbl[_SIG_IDX(sig)].  Slots without an explicit initializer are
  * zero (no properties) by the usual C initialization rules.
  */
 static int sigproptbl[NSIG] = {
         SA_KILL|SA_PROC,		/* SIGHUP */
         SA_KILL|SA_PROC,		/* SIGINT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGQUIT */
         SA_KILL|SA_CORE,		/* SIGILL */
         SA_KILL|SA_CORE,		/* SIGTRAP */
         SA_KILL|SA_CORE,		/* SIGABRT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGEMT */
         SA_KILL|SA_CORE,		/* SIGFPE */
         SA_KILL|SA_PROC,		/* SIGKILL */
         SA_KILL|SA_CORE,		/* SIGBUS */
         SA_KILL|SA_CORE,		/* SIGSEGV */
         SA_KILL|SA_CORE,		/* SIGSYS */
         SA_KILL|SA_PROC,		/* SIGPIPE */
         SA_KILL|SA_PROC,		/* SIGALRM */
         SA_KILL|SA_PROC,		/* SIGTERM */
         SA_IGNORE|SA_PROC,		/* SIGURG */
         SA_STOP|SA_PROC,		/* SIGSTOP */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTSTP */
         SA_IGNORE|SA_CONT|SA_PROC,	/* SIGCONT */
         SA_IGNORE|SA_PROC,		/* SIGCHLD */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTIN */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTOU */
         SA_IGNORE|SA_PROC,		/* SIGIO */
         SA_KILL,			/* SIGXCPU */
         SA_KILL,			/* SIGXFSZ */
         SA_KILL|SA_PROC,		/* SIGVTALRM */
         SA_KILL|SA_PROC,		/* SIGPROF */
         SA_IGNORE|SA_PROC,		/* SIGWINCH  */
         SA_IGNORE|SA_PROC,		/* SIGINFO */
         SA_KILL|SA_PROC,		/* SIGUSR1 */
         SA_KILL|SA_PROC,		/* SIGUSR2 */
 };
 
 /*
  * SYSINIT hook for the signal queue code: creates the UMA zone used for
  * ksiginfo structures, asks UMA to preallocate preallocate_siginfo items,
  * and records the realtime-signal settings through p31b_setcfg().
  */
 static void
 sigqueue_start(void)
 {
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 /*
  * Allocate a zeroed ksiginfo from ksiginfo_zone.  When wait is zero the
  * allocation is made with M_NOWAIT.  Returns NULL if the zone has not
  * been created yet, otherwise whatever uma_zalloc() returns.
  */
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int mflags;
 
 	if (ksiginfo_zone == NULL)
 		return (NULL);
 	mflags = wait ? M_ZERO : (M_ZERO | M_NOWAIT);
 	return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, mflags));
 }
 
 /*
  * Return ksi to ksiginfo_zone unconditionally (contrast with
  * ksiginfo_tryfree(), which leaves KSI_EXT entries alone).
  */
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 /*
  * Free ksi back to ksiginfo_zone unless it carries KSI_EXT.
  * Returns 1 if the entry was freed and 0 if it was left alone.
  */
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 	if ((ksi->ksi_flags & KSI_EXT) != 0)
 		return (0);
 	uma_zfree(ksiginfo_zone, ksi);
 	return (1);
 }
 
 /*
  * Put a signal queue into its empty initial state: no queued entries, no
  * pending bits, owner set to p and SQ_INIT recorded in sq_flags.  p may be
  * NULL (sigqueue_delete_set_proc() does this for its local work list).
  */
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 	list->sq_proc = p;
 	list->sq_flags = SQ_INIT;
 	TAILQ_INIT(&list->sq_list);
 	SIGEMPTYSET(list->sq_kill);
 	SIGEMPTYSET(list->sq_signals);
 }
 
 /*
  * Get a signal's ksiginfo.
  *
  * Consumes one pending instance of signo from sq.  An instance recorded
  * only as a bit in sq_kill is consumed in preference to a queued
  * ksiginfo; in that case *si receives nothing but the signal number.
  * Otherwise the first queued entry for signo is unlinked, copied to *si
  * and handed to ksiginfo_tryfree().  The bit in sq_signals is cleared
  * only when no further instance of signo remains.
  *
  * Return:
  * 	0	-	signal not found
  *	others	-	signal number
  */ 
 int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi, *next;
 	/* Instances of signo seen so far; the scan stops at the second. */
 	int count = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		count++;
 		SIGDELSET(sq->sq_kill, signo);
 	}
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (ksi->ksi_signo == signo) {
 			/* Dequeue only if nothing has been consumed yet. */
 			if (count == 0) {
 				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 				ksi->ksi_sigq = NULL;
 				ksiginfo_copy(ksi, si);
 				if (ksiginfo_tryfree(ksi) && p != NULL)
 					p->p_pendingcnt--;
 			}
 			/* A second instance means signo stays pending. */
 			if (++count > 1)
 				break;
 		}
 	}
 
 	if (count <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 /*
  * Unlink ksi from the queue it currently sits on, if any.  The owning
  * process's pending count is decremented unless the entry is KSI_EXT, and
  * the signal's bit in sq_signals is cleared when neither another queued
  * entry nor an sq_kill bit for the same signal remains.  A NULL ksi or an
  * entry that is not queued is ignored.
  */
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *other;
 	struct proc *owner;
 	sigqueue_t *queue;
 
 	if (ksi == NULL)
 		return;
 	queue = ksi->ksi_sigq;
 	if (queue == NULL)
 		return;
 
 	owner = queue->sq_proc;
 	TAILQ_REMOVE(&queue->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	if (owner != NULL && (ksi->ksi_flags & KSI_EXT) == 0)
 		owner->p_pendingcnt--;
 
 	/* Is another entry for the same signal still queued? */
 	TAILQ_FOREACH(other, &queue->sq_list, ksi_link) {
 		if (other->ksi_signo == ksi->ksi_signo)
 			break;
 	}
 	if (other == NULL && !SIGISMEMBER(queue->sq_kill, ksi->ksi_signo))
 		SIGDELSET(queue->sq_signals, ksi->ksi_signo);
 }
 
 /*
  * Record signal signo as pending on sq, queueing a copy of *si when
  * possible.
  *
  * Returns 0 on success.  Returns EAGAIN, leaving sq_signals untouched,
  * when the owning process already has max_pending_per_proc entries queued
  * or a ksiginfo cannot be allocated -- unless si carries KSI_TRAP, in
  * which case the signal is recorded as a bare sq_kill bit and 0 is
  * returned.
  */
 int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 	
 	/* These are recorded as a bit only; no ksiginfo is queued. */
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	/* Zone not created yet: fall back to a bare pending bit. */
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 	
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	/* KSI_TRAP signals are never refused: degrade to a bare bit. */
 	if ((si->ksi_flags & KSI_TRAP) != 0) {
 		if (ret != 0)
 			SIGADDSET(sq->sq_kill, signo);
 		ret = 0;
 		goto out_set_bit;
 	}
 
 	if (ret != 0)
 		return (ret);
 	
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 /*
  * Empty a signal queue: unlink every queued ksiginfo (freeing those that
  * ksiginfo_tryfree() accepts and adjusting the owner's pending count for
  * them) and clear both pending sets.  If the queue has an owning process
  * its lock is asserted to be held.
  */
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *owner = sq->sq_proc;
 	ksiginfo_t *cur, *nxt;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (owner != NULL)
 		PROC_LOCK_ASSERT(owner, MA_OWNED);
 
 	TAILQ_FOREACH_SAFE(cur, &sq->sq_list, ksi_link, nxt) {
 		TAILQ_REMOVE(&sq->sq_list, cur, ksi_link);
 		cur->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(cur) && owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_signals);
 	SIGEMPTYSET(sq->sq_kill);
 }
 
 /*
  * Add to *set every signal that sq holds, either as a queued ksiginfo or
  * as a bit in sq_kill.  Bits already present in *set are kept.
  */
 void
 sigqueue_collect_set(sigqueue_t *sq, sigset_t *set)
 {
 	ksiginfo_t *entry;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	for (entry = TAILQ_FIRST(&sq->sq_list); entry != NULL;
 	    entry = TAILQ_NEXT(entry, ksi_link)) {
 		SIGADDSET(*set, entry->ksi_signo);
 	}
 	SIGSETOR(*set, sq->sq_kill);
 }
 
 /*
  * Move every pending signal that is a member of *setp from src to dst:
  * queued ksiginfo entries are relinked onto dst and the matching bits of
  * sq_kill and sq_signals are transferred.  The owners' pending counts
  * (when the queues have owners) are adjusted for each entry moved.
  */
 void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp)
 {
 	sigset_t tmp, set;
 	struct proc *p1, *p2;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 	/*
 	 * make a copy, this allows setp to point to src or dst
 	 * sq_signals without trouble.
 	 */
 	set = *setp;
 	p1 = src->sq_proc;
 	p2 = dst->sq_proc;
 	/* Move siginfo to target list */
 	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
 			if (p1 != NULL)
 				p1->p_pendingcnt--;
 			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = dst;
 			if (p2 != NULL)
 				p2->p_pendingcnt++;
 		}
 	}
 
 	/* Move pending bits to target list */
 	/* tmp = (src bits AND set): add those to dst, remove them from src. */
 	tmp = src->sq_kill;
 	SIGSETAND(tmp, set);
 	SIGSETOR(dst->sq_kill, tmp);
 	SIGSETNAND(src->sq_kill, tmp);
 
 	tmp = src->sq_signals;
 	SIGSETAND(tmp, set);
 	SIGSETOR(dst->sq_signals, tmp);
 	SIGSETNAND(src->sq_signals, tmp);
 
 	/* Finally, rescan src queue and set pending bits for it */
 	sigqueue_collect_set(src, &src->sq_signals);
 }
 
 /*
  * Move all pending instances of the single signal signo from src to dst;
  * a one-member wrapper around sigqueue_move_set().
  */
 void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 	sigqueue_move_set(src, dst, &single);
 }
 
 /*
  * Discard every pending signal in sq that is a member of *set: matching
  * queued entries are unlinked (and freed when ksiginfo_tryfree() accepts
  * them, adjusting the owner's pending count) and the matching bits are
  * cleared from sq_kill and sq_signals.
  */
 void
 sigqueue_delete_set(sigqueue_t *sq, sigset_t *set)
 {
 	struct proc *p = sq->sq_proc;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	/* Remove siginfo queue */
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = NULL;
 			if (ksiginfo_tryfree(ksi) && p != NULL)
 				p->p_pendingcnt--;
 		}
 	}
 	SIGSETNAND(sq->sq_kill, *set);
 	SIGSETNAND(sq->sq_signals, *set);
 	/* Finally, rescan queue and set pending bits for it */
 	sigqueue_collect_set(sq, &sq->sq_signals);
 }
 
 /*
  * Discard all pending instances of the single signal signo from sq;
  * a one-member wrapper around sigqueue_delete_set().
  */
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 	sigqueue_delete_set(sq, &single);
 }
 
 /*
  * Remove a set of signals for a process.
  *
  * Matching signals are first moved from the process queue and from every
  * thread's queue onto a local, ownerless work list, which is then flushed
  * in one go.  The caller must hold the process lock (asserted).
  */
 void
 sigqueue_delete_set_proc(struct proc *p, sigset_t *set)
 {
 	sigqueue_t worklist;
 	struct thread *td0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_init(&worklist, NULL);
 	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
 
 	/* The thread list is walked with PROC_SLOCK held. */
 	PROC_SLOCK(p);
 	FOREACH_THREAD_IN_PROC(p, td0)
 		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
 	PROC_SUNLOCK(p);
 
 	sigqueue_flush(&worklist);
 }
 
 /*
  * Remove the single signal signo from process p and all of its threads;
  * a one-member wrapper around sigqueue_delete_set_proc().
  */
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 	sigqueue_delete_set_proc(p, &single);
 }
 
 /*
  * Remove the pending stop signals (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU)
  * from process p and all of its threads.
  */
 void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t stops;
 
 	SIGEMPTYSET(stops);
 	SIGADDSET(stops, SIGTTOU);
 	SIGADDSET(stops, SIGTTIN);
 	SIGADDSET(stops, SIGTSTP);
 	SIGADDSET(stops, SIGSTOP);
 	sigqueue_delete_set_proc(p, &stops);
 }
 
 /*
  * Return the signal that should be delivered to the current process (the
  * one td belongs to), or 0 if there is none.  If a stop signal with
  * default action is pending, the process stops inside issignal().
  *
  * The process lock and the sigacts mutex must be held and the thread lock
  * must not be; all three are asserted.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 	if (!SIGPENDING(td))
 		return (0);
 	return (issignal(td));
 }
 
 /*
  * Arrange for ast() to handle unmasked pending signals on return to user
  * mode.  This must be called whenever a signal is added to td_sigqueue or
  * unmasked in td_sigmask.
  *
  * The caller must hold the process lock (asserted).
  */
 void
 signotify(struct thread *td)
 {
 	struct proc *p;
 #ifdef KSE
 	sigset_t set, saved;
 #else
 	sigset_t set;
 #endif
 
 	p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * If our mask changed we may have to move signal that were
 	 * previously masked by all threads to our sigqueue.
 	 */
 	set = p->p_sigqueue.sq_signals;
 #ifdef KSE
 	/* Remember the process-level pending set to detect a change below. */
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 #endif
 	/* set = process-pending signals that this thread does not mask. */
 	SIGSETNAND(set, td->td_sigmask);
 	if (! SIGISEMPTY(set))
 		sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
 	if (SIGPENDING(td)) {
 		thread_lock(td);
 		td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 #ifdef KSE
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 #endif
 }
 
 /*
  * Report whether the stack pointer value 'sp' lies on the current
  * thread's alternate signal stack.  Returns 0 when the thread has no
  * alternate stack enabled (TDP_ALTSTACK clear).
  */
 int
 sigonstack(size_t sp)
 {
 	struct thread *td;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_ALTSTACK) == 0)
 		return (0);
 #if defined(COMPAT_43)
 	/*
 	 * A zero-sized stack is what osigstack() installs; it carries no
 	 * bounds, so fall back to the SS_ONSTACK flag.
 	 */
 	if (td->td_sigstk.ss_size == 0)
 		return (td->td_sigstk.ss_flags & SS_ONSTACK);
 #endif
 	/* Unsigned subtraction: true only for ss_sp <= sp < ss_sp + ss_size. */
 	return ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size);
 }
 
 /*
  * Look up the property bits of a signal in sigproptbl[].  Signal numbers
  * outside the range 1 .. NSIG-1 have no properties and yield 0.
  */
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig <= 0 || sig >= NSIG)
 		return (0);
 	return (sigproptbl[_SIG_IDX(sig)]);
 }
 
 /*
  * Find the first set bit in a signal set.  Bits are numbered the way
  * ffs() numbers them (starting at 1), continuing across words in steps
  * of 32.  Returns 0 when no bit is set.
  */
 int
 sig_ffs(sigset_t *set)
 {
 	int word;
 
 	for (word = 0; word < _SIG_WORDS; word++) {
 		if (set->__bits[word] == 0)
 			continue;
 		return (ffs(set->__bits[word]) + (word * 32));
 	}
 	return (0);
 }
 
 /*
  * kern_sigaction
  * sigaction
  * freebsd4_sigaction
  * osigaction
  *
  * Common back end for the sigaction-family system calls listed above.
  *
  * If 'oact' is non-NULL it is filled in with the current disposition of
  * 'sig'; if 'act' is non-NULL the disposition is then replaced.  Both
  * steps run under the process lock and the sigacts mutex, which are taken
  * and released here.  'flags' may carry KSA_FREEBSD4 or KSA_OSIGSET; a
  * handler installed with one of them is recorded in ps_freebsd4 or
  * ps_osigset respectively.
  *
  * Returns 0 on success, or EINVAL if 'sig' fails _SIG_VALID() or if a
  * non-default action is requested for SIGKILL or SIGSTOP.
  */
 int
 kern_sigaction(td, sig, act, oact, flags)
 	struct thread *td;
 	register int sig;
 	struct sigaction *act, *oact;
 	int flags;
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if (oact) {
 		/* Rebuild the sigaction view from the per-signal sets. */
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		oact->sa_flags = 0;
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		/* ps_sigintr holds the signals installed without SA_RESTART. */
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig)) {
 			oact->sa_flags |= SA_SIGINFO;
 			oact->sa_sigaction =
 			    (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
 		} else
 			oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		/* The child-related flags are reported for SIGCHLD only. */
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		/* SIGKILL and SIGSTOP may only be (re)set to SIG_DFL. */
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (act->sa_flags & SA_SIGINFO) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		/* Mirror each sa_flags bit into its per-signal set. */
 		if (!(act->sa_flags & SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (act->sa_flags & SA_ONSTACK)
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (act->sa_flags & SA_RESETHAND)
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (act->sa_flags & SA_NODEFER)
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			/* Track whether SIGCHLD is explicitly ignored. */
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SA_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 #ifdef KSE
 			if ((p->p_flag & P_SA) &&
 			     SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) {
 				p->p_flag |= P_SIGEVENT;
 				wakeup(&p->p_siglist);
 			}
 #endif
 			/* never to be seen again */
 			/*
 			 * NOTE(review): sigqueue_delete_proc() ends up in
 			 * sigqueue_delete_set_proc(), which takes PROC_SLOCK
 			 * itself, so the spin lock is acquired twice here --
 			 * confirm that p_slock is initialized as recursive.
 			 */
 			PROC_SLOCK(p);
 			sigqueue_delete_proc(p, sig);
 			PROC_SUNLOCK(p);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			/* Not ignored: caught unless the action is SIG_DFL. */
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 #ifdef COMPAT_FREEBSD4
 		/* Only real handlers installed with KSA_FREEBSD4 are marked. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		/* Likewise for handlers installed with KSA_OSIGSET. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 sigaction(td, uap)
 	struct thread *td;
 	register struct sigaction_args *uap;
 {
 	struct sigaction act, oact;
 	register struct sigaction *actp, *oactp;
 	int error;
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * FreeBSD 4 compatible sigaction(): identical to sigaction() except that
  * kern_sigaction() is called with KSA_FREEBSD4.
  */
 int
 freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap)
 {
 	struct sigaction newact, oldact;
 	struct sigaction *newp, *oldp;
 	int error;
 
 	newp = NULL;
 	if (uap->act != NULL) {
 		error = copyin(uap->act, &newact, sizeof(newact));
 		if (error)
 			return (error);
 		newp = &newact;
 	}
 	oldp = (uap->oact != NULL) ? &oldact : NULL;
 
 	error = kern_sigaction(td, uap->sig, newp, oldp, KSA_FREEBSD4);
 	if (error == 0 && oldp != NULL)
 		error = copyout(oldp, uap->oact, sizeof(oldact));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 int
 osigaction(td, uap)
 	struct thread *td;
 	register struct osigaction_args *uap;
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	register struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /*
  * Shared osigreturn() stub for everything but i386, so the same stub
  * need not be replicated elsewhere; the call is forwarded to nosys().
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 	struct nosys_args *nargs;
 
 	nargs = (struct nosys_args *)uap;
 	return (nosys(td, nargs));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0: every signal whose default
  * action is to be ignored (SA_IGNORE in its properties) is entered in
  * ps_sigignore, with the exception of SIGCONT.
  */
 void
 siginit(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	for (sig = 1; sig <= NSIG; sig++) {
 		/* SIGCONT never goes into the ignore set. */
 		if (sig == SIGCONT)
 			continue;
 		if ((sigprop(sig) & SA_IGNORE) != 0)
 			SIGADDSET(ps->ps_sigignore, sig);
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  *
  * The caller must hold the process lock; the sigacts mutex is taken and
  * released here.  Every caught signal is returned to SIG_DFL, the
  * alternate signal stack of the first thread in the process is disabled,
  * and the PS_NOCLDWAIT / PS_CLDSIGIGN flags are cleared.
  */
 void
 execsigs(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	td = FIRST_THREAD_IN_PROC(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* Drain ps_sigcatch one signal at a time. */
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		SIGDELSET(ps->ps_sigcatch, sig);
 		if (sigprop(sig) & SA_IGNORE) {
 			/* Default is "ignore": mark it and drop pending ones. */
 			if (sig != SIGCONT)
 				SIGADDSET(ps->ps_sigignore, sig);
 			/*
 			 * NOTE(review): sigqueue_delete_proc() reaches
 			 * sigqueue_delete_set_proc(), which takes PROC_SLOCK
 			 * again -- confirm that p_slock is recursive.
 			 */
 			PROC_SLOCK(p);
 			sigqueue_delete_proc(p, sig);
 			PROC_SUNLOCK(p);
 		}
 		ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	}
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	/* An explicitly ignored SIGCHLD goes back to the default action. */
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Examine and/or change the signal mask of thread td.
  *
  *	When 'oset' is non-NULL the current mask is stored there first.
  *	When 'set' is non-NULL it is applied according to 'how'
  *	(SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK); any other 'how' yields
  *	EINVAL.  A non-zero 'old' makes SIG_SETMASK use SIGSETLO()
  *	instead of a full assignment.  Note that '*set' itself is
  *	modified by SIG_CANTMASK() for SIG_BLOCK and SIG_SETMASK.
  */
 int
 kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
     int old)
 {
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	error = 0;
 
 	PROC_LOCK(p);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	if (set != NULL) {
 		if (how == SIG_BLOCK) {
 			SIG_CANTMASK(*set);
 			SIGSETOR(td->td_sigmask, *set);
 		} else if (how == SIG_UNBLOCK) {
 			SIGSETNAND(td->td_sigmask, *set);
 			signotify(td);
 		} else if (how == SIG_SETMASK) {
 			SIG_CANTMASK(*set);
 			if (old)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			signotify(td);
 		} else {
 			error = EINVAL;
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 int
 sigprocmask(td, uap)
 	register struct thread *td;
 	struct sigprocmask_args *uap;
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 /*
  * Old-style sigprocmask(): the mask is passed by value in the old
  * format and the previous mask is handed back, also in the old format,
  * through td_retval[0].
  */
 int
 osigprocmask(struct thread *td, struct osigprocmask_args *uap)
 {
 	sigset_t newmask, oldmask;
 	int error;
 
 	OSIG2SIG(uap->mask, newmask);
 	error = kern_sigprocmask(td, uap->how, &newmask, &oldmask, 1);
 	SIG2OSIG(oldmask, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 int
 sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 int
 sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * Back end shared by sigwait(), sigtimedwait() and sigwaitinfo().
  *
  * Waits until one of the signals in 'waitset' (after SIG_CANTMASK has
  * been applied to it) is pending for the calling thread, removes it from
  * the thread's queue and describes it in '*ksi'.  'timeout', when
  * non-NULL, is a relative time limit; NULL means no limit.
  *
  * Returns 0 with ksi->ksi_signo set to the signal taken; EINVAL when a
  * timeout with an out-of-range tv_nsec was given and no signal was
  * already pending; EAGAIN when the deadline has passed; or the error from
  * msleep() (ERESTART is turned into EINTR when a timeout was given).
  * The process lock is taken and released here.
  */
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t savedmask;
 	struct proc *p;
 	int error, sig, hz, i, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	sig = 0;
 	ets.tv_sec = 0;
 	ets.tv_nsec = 0;
 	SIG_CANTMASK(waitset);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	/* The caller's mask is put back at 'out'. */
 	savedmask = td->td_sigmask;
 	if (timeout) {
 		/* Only a well-formed timeout yields a deadline (ets). */
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			getnanouptime(&rts);
 		 	ets = rts;
 			timespecadd(&ets, timeout);
 		}
 	}
 
 restart:
 	for (i = 1; i <= _SIG_MAXSIG; ++i) {
 		if (!SIGISMEMBER(waitset, i))
 			continue;
 		if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) {
 			/* Not on our queue: try the process-wide queue. */
 			if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) {
 #ifdef KSE
 				if (p->p_flag & P_SA) {
 					p->p_flag |= P_SIGEVENT;
 					wakeup(&p->p_siglist);
 				}
 #endif
 				sigqueue_move(&p->p_sigqueue,
 					&td->td_sigqueue, i);
 			} else
 				continue;
 		}
 
 		/*
 		 * Signal i is pending for this thread.  Fill the mask,
 		 * apply SIG_CANTMASK and leave i unblocked before asking
 		 * cursig() which signal to take.
 		 */
 		SIGFILLSET(td->td_sigmask);
 		SIG_CANTMASK(td->td_sigmask);
 		SIGDELSET(td->td_sigmask, i);
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		if (sig)
 			goto out;
 		else {
 			/*
 			 * Because cursig() may have stopped current thread,
 			 * after it is resumed, things may have already been 
 			 * changed, it should rescan any pending signals.
 			 */
 			goto restart;
 		}
 	}
 
 	/* Nothing pending: give up if the previous sleep failed. */
 	if (error)
 		goto out;
 
 	/*
 	 * POSIX says this must be checked after looking for pending
 	 * signals.
 	 */
 	if (timeout) {
 		if (!timevalid) {
 			error = EINVAL;
 			goto out;
 		}
 		getnanouptime(&rts);
 		if (timespeccmp(&rts, &ets, >=)) {
 			error = EAGAIN;
 			goto out;
 		}
 		/* Convert the time left until the deadline into ticks. */
 		ts = ets;
 		timespecsub(&ts, &rts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		hz = tvtohz(&tv);
 	} else
 		hz = 0;
 
 	/* Sleep with the caller's mask minus the signals being waited for. */
 	td->td_sigmask = savedmask;
 	SIGSETNAND(td->td_sigmask, waitset);
 	signotify(td);
 	/* The wait channel is the address of the local variable 'ps'. */
 	error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz);
 	if (timeout) {
 		if (error == ERESTART) {
 			/* timeout can not be restarted. */
 			error = EINTR;
 		} else if (error == EAGAIN) {
 			/* will calculate timeout by ourselves. */
 			error = 0;
 		}
 	}
 	goto restart;
 
 out:
 	td->td_sigmask = savedmask;
 	signotify(td);
 	if (sig) {
 		/* Dequeue the signal and report it; this overrides 'error'. */
 		ksiginfo_init(ksi);
 		sigqueue_get(&td->td_sigqueue, sig, ksi);
 		ksi->ksi_signo = sig;
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 		error = 0;
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, 0);
 		}
 #endif
 		/* SIGKILL is acted upon right away rather than reported. */
 		if (sig == SIGKILL)
 			sigexit(td, sig);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 /*
  * sigpending() system call: copy out the union of the signals pending
  * on the process-wide queue and on the calling thread's queue.
  */
 int
 sigpending(struct thread *td, struct sigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pend;
 
 	p = td->td_proc;
 
 	/* Take the snapshot under the process lock, copy out without it. */
 	PROC_LOCK(p);
 	pend = p->p_sigqueue.sq_signals;
 	SIGSETOR(pend, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 
 	return (copyout(&pend, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 /*
  * Old-style sigpending(): the union of the process-wide and per-thread
  * pending sets is returned in the old format through td_retval[0].
  */
 int
 osigpending(struct thread *td, struct osigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pend;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	pend = p->p_sigqueue.sq_signals;
 	SIGSETOR(pend, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 
 	SIG2OSIG(pend, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(td, uap)
 	struct thread *td;
 	register struct osigvec_args *uap;
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	register struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigblock(): OR the given old-format mask into the calling
  * thread's signal mask.  The previous mask is returned, in the old
  * format, through td_retval[0].
  */
 int
 osigblock(struct thread *td, struct osigblock_args *uap)
 {
 	struct proc *p;
 	sigset_t extra;
 
 	p = td->td_proc;
 
 	OSIG2SIG(uap->mask, extra);
 	SIG_CANTMASK(extra);
 
 	PROC_LOCK(p);
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETOR(td->td_sigmask, extra);
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigsetmask(): install the given old-format mask with
  * SIGSETLO() and call signotify() for the thread.  The previous mask
  * is returned, in the old format, through td_retval[0].
  */
 int
 osigsetmask(struct thread *td, struct osigsetmask_args *uap)
 {
 	struct proc *p;
 	sigset_t newmask;
 
 	p = td->td_proc;
 
 	OSIG2SIG(uap->mask, newmask);
 	SIG_CANTMASK(newmask);
 
 	PROC_LOCK(p);
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETLO(td->td_sigmask, newmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Suspend calling thread until signal, providing mask to be set in the
  * meantime. 
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sigsuspend(td, uap)
 	struct thread *td;
 	struct sigsuspend_args *uap;
 {
 	sigset_t mask;
 	int error;
 
 	error = copyin(uap->sigmask, &mask, sizeof(mask));
 	if (error)
 		return (error);
 	return (kern_sigsuspend(td, mask));
 }
 
 /*
  * Back end of sigsuspend(): install 'mask' (after SIG_CANTMASK) as the
  * thread's signal mask and sleep for as long as msleep() keeps returning
  * 0.  Always returns EINTR.
  */
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	/*
 	 * The mask in effect on entry has to come back once the signal
 	 * handler has finished, so stash it in td_oldsigmask and flag
 	 * that with TDP_OLDMASK.
 	 */
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 	SIG_CANTMASK(mask);
 	td->td_sigmask = mask;
 	signotify(td);
 	for (;;) {
 		if (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
 		    0) != 0)
 			break;
 	}
 	PROC_UNLOCK(p);
 	/* always return EINTR rather than ERESTART... */
 	return (EINTR);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  *
  * The current mask is stashed in td_oldsigmask (flagged by TDP_OLDMASK),
  * the old-format mask is installed with SIGSETLO(), and the thread sleeps
  * for as long as msleep() keeps returning 0.  Always returns EINTR.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(struct thread *td, struct osigsuspend_args *uap)
 {
 	struct proc *p;
 	sigset_t newmask;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 	OSIG2SIG(uap->mask, newmask);
 	SIG_CANTMASK(newmask);
 	SIGSETLO(td->td_sigmask, newmask);
 	signotify(td);
 	for (;;) {
 		if (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause",
 		    0) != 0)
 			break;
 	}
 	PROC_UNLOCK(p);
 	/* always return EINTR rather than ERESTART... */
 	return (EINTR);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /*
  * Old-style sigstack(): optionally install a new signal stack pointer
  * (recorded with a size of 0) and optionally report the previous stack
  * pointer together with whether the thread is currently running on it.
  */
 /* ARGSUSED */
 int
 osigstack(struct thread *td, struct osigstack_args *uap)
 {
 	struct sigstack newstk, oldstk;
 	int error;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &newstk, sizeof(newstk));
 		if (error)
 			return (error);
 	}
 
 	/* Capture the old state before anything is overwritten. */
 	oldstk.ss_sp = td->td_sigstk.ss_sp;
 	oldstk.ss_onstack = sigonstack(cpu_getstack(td));
 
 	if (uap->nss != NULL) {
 		td->td_sigstk.ss_sp = newstk.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= newstk.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 
 	if (uap->oss == NULL)
 		return (0);
 	return (copyout(&oldstk, uap->oss, sizeof(oldstk)));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /* ARGSUSED */
 int
 sigaltstack(td, uap)
 	struct thread *td;
 	register struct sigaltstack_args *uap;
 {
 	stack_t ss, oss;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &ss, sizeof(ss));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
 	    (uap->oss != NULL) ? &oss : NULL);
 	if (error)
 		return (error);
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(stack_t));
 	return (error);
 }
 
 /*
  * Back end of sigaltstack().
  *
  * When 'oss' is non-NULL it receives the thread's current alternate
  * stack, with ss_flags rewritten to SS_DISABLE (no alternate stack
  * enabled), SS_ONSTACK (currently running on it) or 0.  When 'ss' is
  * non-NULL the alternate stack is replaced or disabled.
  *
  * Returns EPERM if a change is requested while running on the alternate
  * stack, EINVAL if ss_flags has bits other than SS_DISABLE set, ENOMEM
  * if the new stack is smaller than the ABI's sv_minsigstksz, else 0.
  */
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	struct proc *p;
 	int onstack;
 
 	p = td->td_proc;
 	onstack = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		if ((td->td_pflags & TDP_ALTSTACK) == 0)
 			oss->ss_flags = SS_DISABLE;
 		else if (onstack)
 			oss->ss_flags = SS_ONSTACK;
 		else
 			oss->ss_flags = 0;
 	}
 
 	if (ss == NULL)
 		return (0);
 
 	if (onstack)
 		return (EPERM);
 	if ((ss->ss_flags & ~SS_DISABLE) != 0)
 		return (EINVAL);
 	if ((ss->ss_flags & SS_DISABLE) != 0) {
 		td->td_pflags &= ~TDP_ALTSTACK;
 		return (0);
 	}
 	if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 		return (ENOMEM);
 
 	td->td_sigstk = *ss;
 	td->td_pflags |= TDP_ALTSTACK;
 	return (0);
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  * td is the calling thread.
  *
  * With 'all' non-zero the signal goes to every process in the system
  * except those with pid <= 1, P_SYSTEM processes, the caller's own
  * process and processes still in state PRS_NEW.  Otherwise it goes to
  * the members of process group 'pgid' (0 meaning the caller's own
  * group), again skipping pid <= 1, P_SYSTEM and PRS_NEW processes.
  *
  * Only processes for which p_cansignal() returns 0 are counted and
  * signalled; when 'sig' is 0 they are counted but nothing is sent.
  * Returns 0 if at least one process was counted, otherwise ESRCH
  * (also returned when the process group does not exist).
  */
 static int
 killpg1(td, sig, pgid, all)
 	register struct thread *td;
 	int sig, pgid, all;
 {
 	register struct proc *p;
 	struct pgrp *pgrp;
 	int nfound = 0;
 
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			/*
 			 * NOTE(review): pgfind() presumably returns the
 			 * group already locked, since it is released with
 			 * PGRP_UNLOCK() below without a PGRP_LOCK() on this
 			 * path -- confirm.
 			 */
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);	      
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 				p->p_state == PRS_NEW ) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (nfound ? 0 : ESRCH);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /*
  * kill() system call.
  *
  * A positive pid names a single process (looked up with pfind() and,
  * failing that, zpfind()); -1 broadcasts; 0 addresses the caller's own
  * process group; any other negative value addresses process group -pid.
  * A signum of 0 performs the permission check without sending anything.
  * Returns EINVAL for a signal number above _SIG_MAXSIG and ESRCH when
  * the single target process cannot be found.
  */
 /* ARGSUSED */
 int
 kill(struct thread *td, struct kill_args *uap)
 {
 	struct proc *p;
 	int error;
 
 	AUDIT_ARG(signum, uap->signum);
 	AUDIT_ARG(pid, uap->pid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		p = pfind(uap->pid);
 		if (p == NULL) {
 			p = zpfind(uap->pid);
 			if (p == NULL)
 				return (ESRCH);
 		}
 		AUDIT_ARG(process, p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			psignal(p, uap->signum);
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	/* broadcast signal */
 	if (uap->pid == -1)
 		return (killpg1(td, uap->signum, 0, 1));
 	/* signal own process group */
 	if (uap->pid == 0)
 		return (killpg1(td, uap->signum, 0, 0));
 	/* negative explicit process group */
 	return (killpg1(td, uap->signum, -uap->pid, 0));
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /*
  * Old-style killpg(): validate the signal number and send it to the
  * given process group through killpg1().
  */
 /* ARGSUSED */
 int
 okillpg(struct thread *td, struct okillpg_args *uap)
 {
 
 	AUDIT_ARG(signum, uap->signum);
 	AUDIT_ARG(pid, uap->pgid);
 
 	if ((u_int)uap->signum <= _SIG_MAXSIG)
 		return (killpg1(td, uap->signum, uap->pgid, 0));
 	return (EINVAL);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
 /*
  * sigqueue() system call: queue a signal, together with a caller
  * supplied value, to exactly one process.  A signum of 0 performs only
  * the permission check.  Returns EINVAL for a signal number above
  * _SIG_MAXSIG or a non-positive pid, ESRCH when the process cannot be
  * found, otherwise the result of p_cansignal() / tdsignal().
  */
 int
 sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	/*
 	 * Besides the range check on the signal number, the
 	 * specification says sigqueue can only send a signal to a
 	 * single process, so group and broadcast pids are refused.
 	 */
 	if ((u_int)uap->signum > _SIG_MAXSIG || uap->pid <= 0)
 		return (EINVAL);
 
 	p = pfind(uap->pid);
 	if (p == NULL) {
 		p = zpfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 	}
 
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum != 0) {
 		/* Describe the sender and the payload, then post it. */
 		ksiginfo_init(&ksi);
 		ksi.ksi_signo = uap->signum;
 		ksi.ksi_code = SI_QUEUE;
 		ksi.ksi_pid = td->td_proc->p_pid;
 		ksi.ksi_uid = td->td_ucred->cr_ruid;
 		ksi.ksi_value.sival_ptr = uap->value;
 		error = tdsignal(p, NULL, ksi.ksi_signo, &ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Send a signal to the process group with the given id.  Nothing is
  * done when pgid is 0 or when no such group is found.
  */
 void
 gsignal(int pgid, int sig)
 {
 	struct pgrp *pgrp;
 
 	if (pgid == 0)
 		return;
 
 	sx_slock(&proctree_lock);
 	pgrp = pgfind(pgid);
 	sx_sunlock(&proctree_lock);
 	if (pgrp == NULL)
 		return;
 
 	pgsignal(pgrp, sig, 0);
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Send a signal to a process group.  If checktty is 1,
  * limit to members which have a controlling terminal.
  */
 void
 pgsignal(pgrp, sig, checkctty)
 	struct pgrp *pgrp;
 	int sig, checkctty;
 {
 	register struct proc *p;
 
 	if (pgrp) {
 		PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (checkctty == 0 || p->p_flag & P_CONTROLT)
 				psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 	}
 }
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  *
  * The signal number and code are taken from 'ksi'.  Immediate delivery
  * (through the ABI's sv_sendsig hook) happens only when the process is
  * not traced, the signal is caught and td does not mask it; in every
  * other case the signal is posted with tdsignal().  The process lock is
  * taken and released here.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
 #ifdef KSE
 	int error;
 #endif
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 #ifdef KSE
 	if (td->td_pflags & TDP_SA) {
 		if (td->td_mailbox == NULL)
 			thread_user_enter(td);
 		PROC_LOCK(p);
 		SIGDELSET(td->td_sigmask, sig);
 		thread_lock(td);
 		/*
 		 * Force scheduling an upcall, so UTS has chance to
 		 * process the signal before thread runs again in
 		 * userland.
 		 */
 		if (td->td_upcall)
 			td->td_upcall->ku_flags |= KUF_DOUPCALL;
 		thread_unlock(td);
 	} else {
 		PROC_LOCK(p);
 	}
 #else
 	PROC_LOCK(p);
 #endif
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 		/* Caught and unmasked: deliver right here. */
 		td->td_ru.ru_nsignals++;
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 #ifdef KSE
 		if (!(td->td_pflags & TDP_SA))
 			(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #else
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #endif
 #ifdef KSE
 		/* These branches continue the TDP_SA test made just above. */
 		else if (td->td_mailbox == NULL) {
 			mtx_unlock(&ps->ps_mtx);
 			/* UTS caused a sync signal */
 			p->p_code = code;	/* XXX for core dump/debugger */
 			p->p_sig = sig;		/* XXX to verify code */
 			sigexit(td, sig);
 		} else {
 			mtx_unlock(&ps->ps_mtx);
 			SIGADDSET(td->td_sigmask, sig);
 			/* The process lock is dropped around the copyout. */
 			PROC_UNLOCK(p);
 			error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
 			    sizeof(siginfo_t));
 			PROC_LOCK(p);
 			/* UTS memory corrupted */
 			if (error)
 				sigexit(td, SIGSEGV);
 			mtx_lock(&ps->ps_mtx);
 		}
 #endif
 		/*
 		 * Block the handler's catch mask, plus the signal itself
 		 * unless it was installed with SA_NODEFER.
 		 */
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			/* Unmask it and force the default action. */
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
 		tdsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Pick the thread of process p that should receive signal 'sig'.
  *
  * The current thread is preferred when it belongs to p and does not
  * mask the signal.  Otherwise the first thread found that does not mask
  * it is chosen, falling back to the first thread of the process when
  * every thread masks it.  The process lock must be held.  The 'prop'
  * argument is not used.
  */
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *cand, *pick;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Check if current thread can handle the signal without
 	 * switching context to another thread.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 
 	pick = NULL;
 	PROC_SLOCK(p);
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (SIGISMEMBER(cand->td_sigmask, sig))
 			continue;
 		pick = cand;
 		break;
 	}
 	if (pick == NULL)
 		pick = FIRST_THREAD_IN_PROC(p);
 	PROC_SUNLOCK(p);
 
 	return (pick);
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  * 
  * NB: This function may be entered from the debugger via the "kill" DDB
  * command.  There is little that can be done to mitigate the possibly messy
  * side effects of this unwise possibility.
  *
  * psignal() itself is a thin wrapper: it calls tdsignal() with no target
  * thread and no ksiginfo, and discards the return value.
  */
 void
 psignal(struct proc *p, int sig)
 {
 	(void) tdsignal(p, NULL, sig, NULL);
 }
 
 /*
  * Deliver the signal described by a sigevent to process p.
  *
  * The signal number and value are copied from sigev into ksi; ksi_code
  * and the remaining fields must already have been filled in by the
  * caller, and ksi must not be on a signal queue.  When the notification
  * type is SIGEV_THREAD_ID the signal is directed at the thread named by
  * sigev_notify_thread_id, and ESRCH is returned if thread_find() does not
  * locate it.  Otherwise the result of tdsignal() is returned.
  *
  * The proc lock must be held by the caller.
  */
 int
 psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *target;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue"));
 
 	ksi->ksi_signo = sigev->sigev_signo;
 	ksi->ksi_value = sigev->sigev_value;
 
 	/* Process-directed notification: let tdsignal() pick a thread. */
 	if (sigev->sigev_notify != SIGEV_THREAD_ID)
 		return (tdsignal(p, NULL, ksi->ksi_signo, ksi));
 
 	target = thread_find(p, sigev->sigev_notify_thread_id);
 	if (target == NULL)
 		return (ESRCH);
 	return (tdsignal(p, target, ksi->ksi_signo, ksi));
 }
 
 /*
  * Post signal sig to process p, or to the specific thread td when td is
  * not NULL.  ksi, when not NULL, is the siginfo to queue with the signal.
  * The proc lock must be held by the caller.  The return value is 0, or
  * the error reported by sigqueue_add().
  *
  * When KSE is compiled in, tdsignal() is a thin wrapper around
  * do_tdsignal(): for a P_SA process it compares the process pending set
  * before and after the call and, if it changed and P_SIGEVENT was not
  * already set, sets P_SIGEVENT and wakes up sleepers on p_siglist.
  * Without KSE the function body below is tdsignal() itself.
  */
 int
 tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #ifdef KSE
 	sigset_t saved;
 	int ret;
 
 	/* 'saved' is only read below under the same P_SA condition. */
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 	ret = do_tdsignal(p, td, sig, ksi);
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 	return (ret);
 }
 
 static int
 do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #endif
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 #ifdef KSE
 		panic("do_tdsignal(): invalid signal %d", sig);
 #else
 		panic("tdsignal(): invalid signal %d", sig);
 #endif
 
 #ifdef KSE
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue"));
 #else
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue"));
 #endif
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * If the signal is blocked and not destined for this thread, then
 	 * assign it to the process so that we can find it later in the first
 	 * thread that unblocks it.  Otherwise, assign it to this thread now.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		if (SIGISMEMBER(td->td_sigmask, sig))
 			sigqueue = &p->p_sigqueue;
 		else
 			sigqueue = &td->td_sigqueue;
 	} else {
 		KASSERT(td->td_proc == p, ("invalid thread"));
 		sigqueue = &td->td_sigqueue;
 	}
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	/* Value handed to sleepq_abort() when a sleep is interrupted. */
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	if (prop & SA_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SA_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SA_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		PROC_SLOCK(p);
 		sigqueue_delete_proc(p, SIGCONT);
 		PROC_SUNLOCK(p);
 		if (p->p_flag & P_CONTINUED) {
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 	/*
 	 * SIGKILL: Remove procfs STOPEVENTs.
 	 */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediately, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try do the per-process part here.
 	 *
 	 * From here on the proc spin lock is held; every path must
 	 * release it before reaching 'out' (asserted there).
 	 */
 	PROC_SLOCK(p);
 	if (P_SHOULDSTOP(p)) {
 		/*
 		 * The process is in stopped mode. All the threads should be
 		 * either winding down or already on the suspended queue.
 		 */
 		if (p->p_flag & P_TRACED) {
 			/*
 			 * The traced process is already stopped,
 			 * so no further action is necessary.
 			 * No signal can restart us.
 			 */
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (sig == SIGKILL) {
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SA_CONT) {
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * The spin lock is dropped while the
 				 * parent is locked and notified, then
 				 * retaken.
 				 */
 				PROC_SUNLOCK(p);
 				p->p_flag |= P_CONTINUED;
 				p->p_xstat = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 				PROC_SLOCK(p);
 			}
 			if (action == SIG_DFL) {
 				thread_unsuspend(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete(sigqueue, sig);
 				goto out;
 			}
 			if (action == SIG_CATCH) {
 #ifdef KSE
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 * It would seem that the answer would be to
 				 * run an upcall in the next KSE to run, and
 				 * deliver the signal that way. In a NON KSE
 				 * process, we need to make sure that the
 				 * single thread is runnable asap.
 				 * XXXKSE for now however, make them all run.
 				 */
 #endif
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (prop & SA_STOP) {
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			PROC_SUNLOCK(p);
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			sleepq_abort(td, intrval);
 		thread_unlock(td);
 		PROC_SUNLOCK(p);
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			thread_lock(td);
 			tdsigwakeup(td, sig, action, intrval);
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SA_STOP) {
 			if (p->p_flag & P_PPWAIT) {
 				PROC_SUNLOCK(p);
 				goto out;
 			}
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xstat = sig;
 			sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete_proc(p, p->p_xstat);
 			} else
 				PROC_SUNLOCK(p);
 			goto out;
 		} 
 		else
 			goto runfast;
 		/* NOTREACHED */
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		PROC_SUNLOCK(p);
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 
 runfast:
 	/* Reached with the proc spin lock still held. */
 	thread_lock(td);
 	tdsigwakeup(td, sig, action, intrval);
 	thread_unlock(td);
 	thread_unsuspend(p);
 	PROC_SUNLOCK(p);
 out:
 	/* If we jump here, proc slock should not be owned. */
 	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  *
  * Called with the proc lock, the proc spin lock and the thread lock of
  * td all held (asserted below); all three are held again on return.
  * 'action' is the disposition computed by the caller and 'intrval' is
  * the value passed to sleepq_abort() if the sleep is interrupted.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	register int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	prop = sigprop(sig);
 
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.
 	 */
 	if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
 		sched_prio(td, PUSER);
 
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			return;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SA_CONT) && action == SIG_DFL) {
 			/*
 			 * The thread lock and the proc spin lock are
 			 * released around the sigqueue_delete() calls and
 			 * retaken afterwards, so the caller's unlock
 			 * sequence stays balanced.
 			 */
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			PROC_SLOCK(p);
 			thread_lock(td);
 			return;
 		}
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER)
 			sched_prio(td, PUSER);
 
 		sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 }
 
 /*
  * Walk every thread of process p on behalf of a stop request.
  *
  * A thread that is sleeping or swapped out, sleeping interruptibly
  * (TDF_SINTR) and not yet suspended is suspended on the spot with
  * thread_suspend_one().  Any other thread gets TDF_ASTPENDING set --
  * except td itself when 'sending' is zero -- and, on SMP, is sent
  * forward_signal() if it is currently running and is not td.
  *
  * The proc lock and the proc spin lock must be held by the caller; each
  * thread is locked individually while it is examined.
  */
 static void
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *other;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, other) {
 		thread_lock(other);
 		if (!((TD_IS_SLEEPING(other) || TD_IS_SWAPPED(other)) &&
 		    (other->td_flags & TDF_SINTR) &&
 		    !TD_IS_SUSPENDED(other))) {
 			/* Not suspendable right now: flag it instead. */
 			if (sending || td != other)
 				other->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 			if (TD_IS_RUNNING(other) && other != td)
 				forward_signal(other);
 #endif
 		} else {
 			thread_suspend_one(other);
 		}
 		thread_unlock(other);
 	}
 }
 
 /*
  * Stop thread td because signal sig arrived while its process is traced.
  *
  * The thread marks itself with TDF_XSIG, records sig in td_xsig and then
  * suspends itself for as long as the process remains traced and TDF_XSIG
  * stays set.  If the process is exiting (P_SINGLE_EXIT) at the top of the
  * loop, TDF_XSIG is cleared and sig is returned immediately.  Otherwise
  * the value found in td_xsig when the loop ends is returned.
  * NOTE(review): presumably the debugger rewrites td_xsig and clears
  * TDF_XSIG while the thread is suspended -- not visible here, confirm
  * against the ptrace code.
  *
  * The proc lock must be held by the caller.
  */
 int
 ptracestop(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
 	thread_lock(td);
 	td->td_flags |= TDF_XSIG;
 	thread_unlock(td);
 	td->td_xsig = sig;
 	PROC_SLOCK(p);
 	while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) {
 		if (p->p_flag & P_SINGLE_EXIT) {
 			thread_lock(td);
 			td->td_flags &= ~TDF_XSIG;
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			return (sig);
 		}
 		/*
 		 * Just make wait() to work, the last stopped thread
 		 * will win.
 		 */
 		p->p_xstat = sig;
 		p->p_xthread = td;
 		p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
 		sig_suspend_threads(td, p, 0);
 stopme:
 		thread_suspend_switch(td);
 		if (!(p->p_flag & P_TRACED)) {
 			break;
 		}
 		/*
 		 * Still marked TDF_DBSUSPEND: go straight back to sleep
 		 * without republishing the stop, unless the process is
 		 * exiting.
 		 */
 		if (td->td_flags & TDF_DBSUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
 			goto stopme;
 		}
 	}
 	PROC_SUNLOCK(p);
 	return (td->td_xsig);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  *
  * Returns 0 when no deliverable signal is pending on td.  Both the proc
  * lock and the sigacts mutex must be held by the caller; the sigacts
  * mutex is dropped and retaken around stopevent(), ptracestop() and the
  * default-action stop.
  */
 static int
 issignal(td)
 	struct thread *td;
 {
 	struct proc *p;
 	struct sigacts *ps;
 	sigset_t sigpending;
 	int sig, prop, newsig;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		/* Sampled once per pass, before anything below can sleep. */
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		/* Pending on this thread and not masked by it. */
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		if (p->p_flag & P_PPWAIT)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		sig = sig_ffs(&sigpending);
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 #endif
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
 			/*
 			 * If traced, always stop.
 			 */
 			mtx_unlock(&ps->ps_mtx);
 			newsig = ptracestop(td, sig);
 			mtx_lock(&ps->ps_mtx);
 
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 
 #endif
 			if (sig != newsig) {
 				ksiginfo_t ksi;
 				/*
 				 * clear old signal.
 				 * XXX shrug off debugger, it causes siginfo to
 				 * be thrown away.
 				 */
 				sigqueue_get(&td->td_sigqueue, sig, &ksi);
 
 				/*
 				 * If parent wants us to take the signal,
 				 * then it will leave it in p->p_xstat;
 				 * otherwise we just look for signals again.
 				 */
 				if (newsig == 0)
 					continue;
 				sig = newsig;
 
 				/*
 				 * Put the new signal into td_sigqueue. If the
 				 * signal is being masked, look for other signals.
 				 */
 				SIGADDSET(td->td_sigqueue.sq_signals, sig);
 #ifdef KSE
 				if (td->td_pflags & TDP_SA)
 					SIGDELSET(td->td_sigmask, sig);
 #endif
 				if (SIGISMEMBER(td->td_sigmask, sig))
 					continue;
 				signotify(td);
 			}
 
 			/*
 			 * If the traced bit got turned off, go back up
 			 * to the top to rescan signals.  This ensures
 			 * that p_sig* and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0)
 				continue;
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process
 			 * with default action, stop here,
 			 * then clear the signal.  However,
 			 * if process is member of an orphaned
 			 * process group, ignore tty stop signals.
 			 */
 			if (prop & SA_STOP) {
 				if (p->p_flag & P_TRACED ||
 		    		    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SA_TTYSTOP))
 					break;	/* == ignore */
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xstat = sig;
 				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
 				thread_suspend_switch(td);
 				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				break;
 			} else if (prop & SA_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SA_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		/* Every 'break' above lands here: discard and rescan. */
 		sigqueue_delete(&td->td_sigqueue, sig);		/* take the signal! */
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Tell the parent that process p has stopped, once all of its threads
  * are accounted for.
  *
  * The suspended-thread count is compared with the number of threads,
  * counting the calling thread as stopped when p is the current process.
  * If P_STOPPED_SIG is set and the counts match, P_WAITED is cleared and
  * childproc_stopped() is called with the parent locked, reporting
  * CLD_TRAPPED for a traced process and CLD_STOPPED otherwise.
  *
  * Called with the proc lock and the proc spin lock held.  The spin lock
  * is released around the notification and held again on return.
  */
 void
 thread_stopped(struct proc *p)
 {
 	int nstopped;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	nstopped = p->p_suspcount;
 	if (p == curproc)
 		nstopped += 1;
 
 	/* Nothing to report unless this is a complete signal stop. */
 	if (!(p->p_flag & P_STOPPED_SIG) || nstopped != p->p_numthreads)
 		return;
 
 	PROC_SUNLOCK(p);
 	p->p_flag &= ~P_WAITED;
 	PROC_LOCK(p->p_pptr);
 	if (p->p_flag & P_TRACED)
 		childproc_stopped(p, CLD_TRAPPED);
 	else
 		childproc_stopped(p, CLD_STOPPED);
 	PROC_UNLOCK(p->p_pptr);
 	PROC_SLOCK(p);
 }
  
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  *
  * Operates on curthread: the signal is removed from the thread's queue,
  * and then either the process exits through sigexit() (default action)
  * or the handler is set up through the ABI's sv_sendsig hook (or, for a
  * KSE TDP_SA thread, handed to thread_signal_add()).  The proc lock and
  * the sigacts mutex must be held by the caller.
  */
 void
 postsig(sig)
 	register int sig;
 {
 	struct thread *td = curthread;
 	register struct proc *p = td->td_proc;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask;
 	int code;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
 	sigqueue_get(&td->td_sigqueue, sig, &ksi);
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, 0);
 #endif
 	if (p->p_stops & S_SIG) {
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 #ifdef KSE
 	if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) {
 #else
 	if (action == SIG_DFL) {
 #endif
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 #ifdef KSE
 		if (td->td_pflags & TDP_SA) {
 			if (sig == SIGKILL) {
 				mtx_unlock(&ps->ps_mtx);
 				sigexit(td, sig);
 			}
 		}
 
 #endif
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action"));
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		td->td_ru.ru_nsignals++;
 		/*
 		 * Consume the p_sig/p_code pair if it was recorded for this
 		 * signal.  Note that 'code' is computed but not passed to
 		 * anything below.
 		 */
 		if (p->p_sig != sig) {
 			code = 0;
 		} else {
 			code = p->p_code;
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
 #ifdef KSE
 		if (td->td_pflags & TDP_SA)
 			thread_signal_add(curthread, &ksi);
 		else
 			(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #else
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #endif
 	}
 }
 
 /*
  * Kill the current process for stated reason.
  */
 void
 killproc(p, why)
 	struct proc *p;
 	char *why;
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
 		p, p->p_pid, p->p_comm);
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
 		p->p_ucred ? p->p_ucred->cr_uid : -1, why);
 	psignal(p, SIGKILL);
 }
 
 /*
  * Force the current process to exit with the specified signal, dumping core
  * if appropriate.  We bypass the normal tests for masked and caught signals,
  * allowing unrecoverable failures to terminate the process without changing
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  *
  * Entered with the proc lock held.  On the core-dump path the lock is
  * expected to be released inside coredump(); otherwise it is released
  * here before exit1() is called.
  */
 void
 sigexit(td, sig)
 	struct thread *td;
 	int sig;
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_acflag |= AXSIG;
 	/*
 	 * We must be single-threading to generate a core dump.  This
 	 * ensures that the registers in the core file are up-to-date.
 	 * Also, the ELF dump handler assumes that the thread list doesn't
 	 * change out from under it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
 		p->p_sig = sig;
 		/*
 		 * Log signals which would cause core dumps
 		 * (Log as LOG_INFO to appease those who don't want
 		 * these messages.)
 		 * XXX : Todo, as well as euid, write out ruid too
 		 * Note that coredump() drops proc lock.
 		 */
 		/* WCOREFLAG is folded into sig only when the dump succeeded. */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), uid %d: exited on signal %d%s\n",
 			    p->p_pid, p->p_comm,
 			    td->td_ucred ? td->td_ucred->cr_uid : -1,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		PROC_UNLOCK(p);
 	exit1(td, W_EXITCODE(0, sig));
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi);
 }
 
 /*
  * Report a job-control state change of child p to its parent.
  *
  * The parent is flagged with P_STATCHILD and anything sleeping on it is
  * woken.  SIGCHLD is additionally sent through sigparent() unless the
  * parent has PS_NOCLDSTOP set.  Both the child's and the parent's proc
  * locks must be held.
  */
 static void
 childproc_jobstate(struct proc *p, int reason, int status)
 {
 	struct sigacts *pacts;
 	int want_sigchld;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * Wake up parent sleeping in kern_wait(), also send
 	 * SIGCHLD to parent, but SIGCHLD does not guarantee
 	 * that parent will awake, because parent may have masked
 	 * the signal.
 	 */
 	p->p_pptr->p_flag |= P_STATCHILD;
 	wakeup(p->p_pptr);
 
 	/* Sample the flag under the sigacts mutex, act after dropping it. */
 	pacts = p->p_pptr->p_sigacts;
 	mtx_lock(&pacts->ps_mtx);
 	want_sigchld = ((pacts->ps_flag & PS_NOCLDSTOP) == 0);
 	mtx_unlock(&pacts->ps_mtx);
 
 	if (want_sigchld)
 		sigparent(p, reason, status);
 }
 
 /*
  * Report to the parent that child p has stopped.  'reason' is passed
  * through as the SIGCHLD code and the child's p_xstat is used as the
  * status.
  */
 void
 childproc_stopped(struct proc *p, int reason)
 {
 	childproc_jobstate(p, reason, p->p_xstat);
 }
 
 /*
  * Report to the parent that child p has been continued: the SIGCHLD code
  * is CLD_CONTINUED and the status is SIGCONT.
  */
 void
 childproc_continued(struct proc *p)
 {
 	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
 }
 
 /*
  * Report to the parent that child p has exited.
  *
  * The SIGCHLD code is derived from the child's exit status: CLD_DUMPED
  * if it dumped core, CLD_KILLED if it was terminated by a signal, and
  * CLD_EXITED otherwise.
  */
 void
 childproc_exited(struct proc *p)
 {
 	int status, reason;
 
 	status = p->p_xstat;	/* convert to int */
 
 	if (WCOREDUMP(status))
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(status))
 		reason = CLD_KILLED;
 	else
 		reason = CLD_EXITED;
 
 	/*
 	 * XXX avoid calling wakeup(p->p_pptr), the work is
 	 * done in exit1().
 	 */
 	sigparent(p, reason, status);
 }
 
 /*
  * Format string used to build core file names; expanded per process by
  * expand_name().  Exported read/write as the kern.corefile sysctl.
  */
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
 	      sizeof(corefilename), "process corefile name format string");
 
 /*
  * expand_name(name, uid, pid)
  * Expand the name described in corefilename, using name, uid, and pid.
  * corefilename is a printf-like string, with the following format
  * specifiers:
  *	%N	name of process ("name")
  *	%P	process id (pid)
  *	%U	user id (uid)
  *	%%	a literal '%'
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  *
  * An unknown specifier, or a lone '%' at the very end of the format, is
  * logged and expands to nothing.
  *
  * Returns a NUL-terminated string in a MAXPATHLEN-byte buffer allocated
  * from M_TEMP, which the caller must free.  Returns NULL if the buffer
  * cannot be allocated or if the expanded name, including its terminator,
  * does not fit in MAXPATHLEN bytes.
  */
 
 static char *
 expand_name(name, uid, pid)
 	const char *name;
 	uid_t uid;
 	pid_t pid;
 {
 	const char *format, *appendstr;
 	char *temp;
 	char buf[11];		/* Buffer for pid/uid -- max 4B */
 	size_t i, l, n;
 
 	format = corefilename;
 	/*
 	 * M_ZERO: every byte not written below stays NUL, so the result is
 	 * terminated as long as at most MAXPATHLEN - 1 bytes are stored.
 	 */
 	temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
 	if (temp == NULL)
 		return (NULL);
 	for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) {
 		switch (format[i]) {
 		case '%':	/* Format character */
 			i++;
 			switch (format[i]) {
 			case '%':
 				appendstr = "%";
 				break;
 			case 'N':	/* process name */
 				appendstr = name;
 				break;
 			case 'P':	/* process id */
 				sprintf(buf, "%u", pid);
 				appendstr = buf;
 				break;
 			case 'U':	/* user id */
 				sprintf(buf, "%u", uid);
 				appendstr = buf;
 				break;
 			case '\0':
 				/*
 				 * Lone '%' at the end of the format.  Step
 				 * back so that the loop's i++ lands on the
 				 * terminator instead of skipping over it
 				 * and reading whatever follows in the
 				 * buffer.
 				 */
 				appendstr = "";
 				log(LOG_ERR,
 				    "Trailing %% in corefile format `%s'\n",
 				    format);
 				i--;
 				break;
 			default:
 				appendstr = "";
 			  	log(LOG_ERR,
 				    "Unknown format character %c in `%s'\n",
 				    format[i], format);
 			}
 			l = strlen(appendstr);
 			if ((n + l) >= MAXPATHLEN)
 				goto toolong;
 			memcpy(temp + n, appendstr, l);
 			n += l;
 			break;
 		default:
 			/*
 			 * Same bound as for expansions: always leave the
 			 * last byte of the buffer for the terminator.
 			 */
 			if ((n + 1) >= MAXPATHLEN)
 				goto toolong;
 			temp[n++] = format[i];
 		}
 	}
 	if (format[i] != '\0')
 		goto toolong;
 	return (temp);
 toolong:
 	log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n",
 	    (long)pid, name, (u_long)uid);
 	free(temp, M_TEMP);
 	return (NULL);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  *
  * Other return values produced here: EINVAL when the core file name
  * cannot be built, EFAULT when dumping is disabled or refused for a
  * set-id process or when the target is not a regular file with exactly
  * one link, EFBIG when the core size limit is zero, and any error from
  * opening or closing the file.
  *
  * Entered with the proc lock held.  The lock is released on every
  * return path, because the caller goes on to exit the process and
  * counts on this function to have dropped it.
  */
 
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	register struct vnode *vp;
 	register struct ucred *cred = td->td_ucred;
 	struct flock lf;
 	struct nameidata nd;
 	struct vattr vattr;
 	int error, error1, flags, locked;
 	struct mount *mp;
 	char *name;			/* name of corefile */
 	off_t limit;
 	int vfslocked;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
 	if (name == NULL) {
 		/*
 		 * Drop the proc lock here as on every other exit path;
 		 * returning with it held would leave it locked for the
 		 * caller's exit processing.
 		 */
 		PROC_UNLOCK(p);
 #ifdef AUDIT
 		audit_proc_coredump(td, NULL, EINVAL);
 #endif
 		return (EINVAL);
 	}
 	if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
 		PROC_UNLOCK(p);
 #ifdef AUDIT
 		audit_proc_coredump(td, name, EFAULT);
 #endif
 		free(name, M_TEMP);
 		return (EFAULT);
 	}
 	
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(p, RLIMIT_CORE);
 	PROC_UNLOCK(p);
 	if (limit == 0) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, EFBIG);
 #endif
 		free(name, M_TEMP);
 		return (EFBIG);
 	}
 
 restart:
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
 	flags = O_CREAT | FWRITE | O_NOFOLLOW;
 	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL);
 	if (error) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, error);
 #endif
 		free(name, M_TEMP);
 		return (error);
 	}
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* Don't dump to non-regular files or files with links. */
 	if (vp->v_type != VREG ||
 	    VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
 		VOP_UNLOCK(vp, 0, td);
 		error = EFAULT;
 		goto close;
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	/* Try for an advisory write lock; failure to get it is not fatal. */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		/*
 		 * Writes cannot start right now: undo the lock and the
 		 * open, wait until writing is possible, then start over.
 		 */
 		lf.l_type = F_UNLCK;
 		if (locked)
 			VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
 			goto out;
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto restart;
 	}
 
 	/* Truncate the file and optionally mark it UF_NODUMP. */
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_LEASE(vp, td, cred, LEASE_WRITE);
 	VOP_SETATTR(vp, &vattr, cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	error = p->p_sysent->sv_coredump ?
 	  p->p_sysent->sv_coredump(td, vp, limit) :
 	  ENOSYS;
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 close:
 	/* A close failure is reported only if nothing failed earlier. */
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 out:
 #ifdef AUDIT
 	audit_proc_coredump(td, name, error);
 #endif
 	free(name, M_TEMP);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  *
  * SIGSYS is posted to the calling thread's process under the proc lock
  * and ENOSYS is always returned.  The args pointer is not examined.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 nosys(td, args)
 	struct thread *td;
 	struct nosys_args *args;
 {
 	struct proc *self;
 
 	self = td->td_proc;
 
 	PROC_LOCK(self);
 	psignal(self, SIGSYS);
 	PROC_UNLOCK(self);
 
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  *
  * *sigiop is read under the sigio lock; nothing is done if it is NULL.
  * A positive sio_pgid selects the single process sio_proc, a negative
  * one every member of the process group sio_pgrp.  Each target is
  * signalled only if CANSIGIO() allows it for the stored credentials,
  * and, for group delivery with a non-zero checkctty, only if the member
  * has P_CONTROLT set.
  */
 void
 pgsigio(sigiop, sig, checkctty)
 	struct sigio **sigiop;
 	int sig, checkctty;
 {
 	struct sigio *sigio;
 	struct proc *member;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 
 	if (sigio->sio_pgid > 0) {
 		/* Single process target. */
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		/* Process group target: visit every member. */
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(member, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(member);
 			if (CANSIGIO(sigio->sio_ucred, member->p_ucred) &&
 			    (checkctty == 0 || (member->p_flag & P_CONTROLT)))
 				psignal(member, sig);
 			PROC_UNLOCK(member);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 
 	SIGIO_UNLOCK();
 }
 
 /*
  * Attach a signal knote to the current process.
  *
  * The knote records the process, is forced to EV_CLEAR, and is added to
  * the process's knote list.  Always returns 0.
  */
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *owner;
 
 	owner = curproc;
 
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 	kn->kn_ptr.p_proc = owner;
 	knlist_add(&owner->p_klist, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach a signal knote: remove it from the knote list of the process
  * recorded in the knote.
  */
 static void
 filt_sigdetach(struct knote *kn)
 {
 
 	knlist_remove(&kn->kn_ptr.p_proc->p_klist, kn, 0);
 }
 
 /*
  * signal knotes are shared with proc knotes, so we apply a mask to
  * the hint in order to differentiate them from process hints.  This
  * could be avoided by using a signal-specific knote list, but probably
  * isn't worth the trouble.
  *
  * A hint carrying NOTE_SIGNAL whose remaining bits equal the knote's
  * identifier bumps the knote's counter.  The filter reports the knote
  * as active whenever that counter is non-zero.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 	long signo;
 
 	if (hint & NOTE_SIGNAL) {
 		signo = hint & ~NOTE_SIGNAL;
 		if (kn->kn_id == signo)
 			kn->kn_data++;
 	}
 	return (kn->kn_data != 0);
 }
 
 /*
  * Allocate a zeroed sigacts structure holding a single reference, with
  * its mutex initialized.  The allocation uses M_WAITOK.
  */
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *acts;
 
 	acts = malloc(sizeof(*acts), M_SUBPROC, M_WAITOK | M_ZERO);
 	mtx_init(&acts->ps_mtx, "sigacts", NULL, MTX_DEF);
 	acts->ps_refcnt = 1;
 	return (acts);
 }
 
 /*
  * Drop one reference on a sigacts structure.  When the last reference
  * goes away the mutex is destroyed and the structure is freed.
  */
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	mtx_lock(&ps->ps_mtx);
 	ps->ps_refcnt--;
 	if (ps->ps_refcnt != 0) {
 		/* Still referenced elsewhere. */
 		mtx_unlock(&ps->ps_mtx);
 		return;
 	}
 	/* Last reference: the mutex is destroyed while still held. */
 	mtx_destroy(&ps->ps_mtx);
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Take an additional reference on a sigacts structure, under its mutex,
  * and return the same pointer for the caller's convenience.
  */
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 
 	mtx_lock(&ps->ps_mtx);
 	ps->ps_refcnt += 1;
 	mtx_unlock(&ps->ps_mtx);
 
 	return (ps);
 }
 
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 /*
  * Report whether a sigacts structure has more than one reference.
  * Returns 1 if it is shared and 0 otherwise; the count is sampled under
  * the structure's mutex.
  */
 int
 sigacts_shared(struct sigacts *ps)
 {
 	int is_shared;
 
 	mtx_lock(&ps->ps_mtx);
 	is_shared = (ps->ps_refcnt > 1);
 	mtx_unlock(&ps->ps_mtx);
 
 	return (is_shared);
 }
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c	(revision 175201)
+++ head/sys/kern/uipc_mqueue.c	(revision 175202)
@@ -1,2489 +1,2488 @@
 /*-
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted, each message queue appears
  *    in mounted directory, user can change queue's permission and
  *    ownership, or remove a queue. Manually creating a file in the
  *    directory causes a message queue to be created in the kernel with
  *    default message queue attributes applied and same name used, this
  *    method is not advocated since mq_open syscall allows user to specify
  *    different attributes. Also the file system can be mounted multiple
  *    times at different mount points but shows same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
  *    but directly operate on internal data structure, this allows user to
  *    use the IPC facility without having to mount mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/buf.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,	/* not a valid type for a live node */
 	mqfstype_root,		/* root directory of the file system */
 	mqfstype_dir,		/* directory */
 	mqfstype_this,		/* "." entry of a directory */
 	mqfstype_parent,	/* ".." entry of a directory */
 	mqfstype_file,		/* regular file (VREG / DT_REG) */
 	mqfstype_symlink,	/* symbolic link (VLNK / DT_LNK) */
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	/* sx lock taken exclusively around node tree operations */
 	struct sx		mi_lock;
 	/* root directory node of the instance */
 	struct mqfs_node	*mi_root;
 	/* unit number allocator that hands out file numbers */
 	struct unrhdr		*mi_unrhdr;
 };
 
 /*
  * mqfs_vdata: ties one vnode to the mqfs node it represents; stored in
  * the vnode's v_data and linked on the node's mn_vnodes list.
  */
 struct mqfs_vdata {
 	LIST_ENTRY(mqfs_vdata)	mv_link;	/* linkage on mn_vnodes */
 	struct mqfs_node	*mv_node;	/* node this vnode stands for */
 	struct vnode		*mv_vnode;	/* the vnode itself */
 	struct task		mv_task;	/* task that runs do_recycle() */
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	/* node name, at most MQFS_NAMELEN bytes plus the terminator */
 	char			mn_name[MQFS_NAMELEN+1];
 	/* owning mqfs instance; inherited from the parent on insertion */
 	struct mqfs_info	*mn_info;
 	/* containing directory; cleared when the node is unlinked */
 	struct mqfs_node	*mn_parent;
 	/* child nodes of a directory, linked through mn_sibling */
 	LIST_HEAD(,mqfs_node)	mn_children;
 	LIST_ENTRY(mqfs_node)	mn_sibling;
 	/* vnodes currently attached to this node (struct mqfs_vdata) */
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
 	/* reference count, updated with atomic_fetchadd_int() */
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	/* non-zero once do_unlink() has removed the node */
 	int			mn_deleted;
 	/* file number; 0 means none has been allocated yet */
 	u_int32_t		mn_fileno;
 	/* per-node payload; a struct mqueue for files (see VTOMQ) */
 	void			*mn_data;
 	/* creation, change, access and modification times */
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	/* ownership and permission bits */
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;
 	struct sigevent			nt_sigev;
 	ksiginfo_t			nt_ksi;
 	struct proc			*nt_proc;
 };
 
 /*
  * mqueue: in-kernel state of one message queue.  Only the counters
  * reported by mqfs_read() are described with certainty here; the other
  * notes are assumptions drawn from the field names.
  */
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;	/* presumably MQ_RSEL / MQ_WSEL -- TODO confirm */
 	long		mq_maxmsg;	/* reported as MAXMSG by mqfs_read() */
 	long		mq_msgsize;	/* reported as MSGSIZE by mqfs_read() */
 	long		mq_curmsgs;	/* reported as CURMSG by mqfs_read() */
 	long		mq_totalbytes;	/* reported as QSIZE by mqfs_read() */
 	struct msgq	mq_msgq;	/* tail queue of struct mqueue_msg */
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
 	"POSIX real time message queue");
 
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 static uma_zone_t		mqnode_zone;
 static uma_zone_t		mqueue_zone;
 static uma_zone_t		mvdata_zone;
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 
 /*
  * Directory structure construction and manipulation
  */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 
 /*
  * Message queue construction and manipulation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 struct filterops mq_rfiltops =
 	{ 1, NULL, filt_mqdetach, filt_mqread };
 struct filterops mq_wfiltops =
 	{ 1, NULL, filt_mqdetach, filt_mqwrite };
 
 /*
  * Set up the file number allocator of a mqfs instance.  Numbers are taken
  * from the range 1..INT_MAX.
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 
 	mi->mi_unrhdr = new_unrhdr(1, INT_MAX, NULL);
 }
 
 /*
  * Tear down the file number allocator.  The instance's pointer is cleared
  * before the allocator itself is deleted.
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *old;
 
 	old = mi->mi_unrhdr;
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(old);
 }
 
 /*
  * Allocate a file number
  *
  * Gives mn a file number, first making sure (recursively) that its parent
  * has one.  Root, directory, file and symlink nodes draw a fresh number
  * from the instance's allocator; "." reuses the number of its directory
  * and ".." the number of that directory's parent, or of the directory
  * itself when it is the root.
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	struct mqfs_node *dir;
 
 	/* the parent has to be numbered before the child */
 	dir = mn->mn_parent;
 	if (dir != NULL && dir->mn_fileno == 0)
 		mqfs_fileno_alloc(mi, dir);
 
 	switch (mn->mn_type) {
 	case mqfstype_this:
 		KASSERT(dir != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = dir->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(dir != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (dir != mi->mi_root) {
 			KASSERT(dir->mn_parent != NULL,
 			    ("mqfstype_parent node has no grandparent"));
 			mn->mn_fileno = dir->mn_parent->mn_fileno;
 		} else
 			mn->mn_fileno = dir->mn_fileno;
 		break;
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  *
  * Only node types that own their number give it back to the allocator;
  * "." and ".." merely borrow one and are left alone.
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 
 	switch (mn->mn_type) {
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* these do not "own" their file number */
 		return;
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		return;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d",
 			mn->mn_type));
 		return;
 	}
 }
 
 /*
  * Get a zero-filled node from the node zone (M_WAITOK).
  */
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 
 	return (uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO));
 }
 
 /*
  * Give a node back to the node zone.
  */
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 
 	uma_zfree(mqnode_zone, node);
 }
 
 /*
  * Atomically add one reference to a node.
  */
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 
 	(void)atomic_fetchadd_int(&node->mn_refcount, 1);
 }
 
 /*
  * Atomically drop one reference and destroy the node when the count seen
  * before the decrement was the last meaningful one: 1 for ordinary nodes,
  * 3 for directories, whose "." and ".." entries account for two more.
  */
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	int prev, last;
 
 	prev = atomic_fetchadd_int(&node->mn_refcount, -1);
 	last = (node->mn_type == mqfstype_dir ||
 	    node->mn_type == mqfstype_root) ? 3 : 1;
 	if (prev == last)
 		mqfs_destroy(node);
 }
 
 /*
  * Add a node to a directory
  *
  * The node inherits the parent's mqfs instance, gets empty child and
  * vnode lists, is inserted at the head of the parent's child list, and
  * the parent gains a reference.  Always returns 0.
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	/* the node starts out with no children and no vnodes */
 	LIST_INIT(&node->mn_vnodes);
 	LIST_INIT(&node->mn_children);
 
 	/* hook it up below the parent */
 	node->mn_parent = parent;
 	node->mn_info = parent->mn_info;
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 	return (0);
 }
 
 /*
  * Allocate and initialize a node that is not yet linked into the tree.
  *
  * name/namelen give the node name (name does not have to be
  * NUL-terminated at name[namelen]); at most MQFS_NAMELEN bytes of it are
  * kept.  cred supplies the owner uid/gid, mode the permission bits and
  * nodetype the mqfs_type_t value.  The node starts with one reference and
  * all four timestamps set to the current time.
  */
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	/*
 	 * mn_name holds MQFS_NAMELEN bytes plus the terminator; never copy
 	 * more than that, whatever length the caller passes in.
 	 */
 	if (namelen < 0)
 		namelen = 0;
 	else if (namelen > MQFS_NAMELEN)
 		namelen = MQFS_NAMELEN;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_name[namelen] = '\0';
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	getnanotime(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file
  *
  * Builds a file node with the given name, owner and mode and links it
  * into parent.  Returns the new node, or NULL when it could not be added
  * to the directory.
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *file;
 
 	file = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, file) == 0)
 		return (file);
 	mqnode_free(file);
 	return (NULL);
 }
 
 /*
  * Add . and .. to a directory
  *
  * Both entries are created with one reference each and linked into
  * parent, "." first.  Returns 0 on success and -1 when an entry could not
  * be added.
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *ent;
 	int i;
 
 	for (i = 0; i < 2; i++) {
 		ent = mqnode_alloc();
 		ent->mn_name[0] = '.';
 		if (i == 1)
 			ent->mn_name[1] = '.';
 		ent->mn_type = (i == 0) ? mqfstype_this : mqfstype_parent;
 		ent->mn_refcount = 1;
 		if (mqfs_add_node(parent, ent) != 0) {
 			mqnode_free(ent);
 			return (-1);
 		}
 	}
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory
  *
  * Builds a directory node, links it into parent and populates it with
  * "." and "..".  Returns the new node, or NULL when either step fails.
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *dir;
 
 	dir = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, dir) != 0) {
 		/* never linked: just give the memory back */
 		mqnode_free(dir);
 		dir = NULL;
 	} else if (mqfs_fixup_dir(dir) != 0) {
 		/* already linked: tear it down properly */
 		mqfs_destroy(dir);
 		dir = NULL;
 	}
 	return (dir);
 }
 
 /*
  * Create a symlink
  *
  * Builds a symlink node and links it into parent.  Returns the new node,
  * or NULL when it could not be added to the directory.
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *lnk;
 
 	lnk = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, lnk) == 0)
 		return (lnk);
 	mqnode_free(lnk);
 	return (NULL);
 }
 
 #endif
 
 /*
  * Destroy a node or a tree of nodes
  *
  * Children of a directory are destroyed first, then the node is taken
  * off its parent's child list, its file number and message queue (if
  * any) are released and the node itself is freed.  Always returns 0.
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *child, *up;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* destroy children; each call unlinks the child it destroys */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) {
 		while ((child = LIST_FIRST(&node->mn_children)) != NULL)
 			mqfs_destroy(child);
 	}
 
 	/* unlink from parent */
 	up = node->mn_parent;
 	if (up != NULL) {
 		KASSERT(up->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	/* release what the node owns, then the node */
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance
  *
  * Every mount points at the single system-wide mqfs_data instance.
  * Update mounts are refused with EOPNOTSUPP; otherwise the mount is
  * flagged local and MP-safe, gets a file system id and a fixed set of
  * statfs values, and 0 is returned.
  */
 static int
 mqfs_mount(struct mount *mp, struct thread *td)
 {
 	struct statfs *st;
 
 	if ((mp->mnt_flag & MNT_UPDATE) != 0)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 
 	vfs_getnewfsid(mp);
 	vfs_mountedfrom(mp, "mqueue");
 
 	/* static file system statistics */
 	st = &mp->mnt_stat;
 	st->f_bsize = PAGE_SIZE;
 	st->f_iosize = PAGE_SIZE;
 	st->f_blocks = 1;
 	st->f_bfree = 0;
 	st->f_bavail = 0;
 	st->f_files = 1;
 	st->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance
  *
  * Flushes the mount's vnodes, forcibly when MNT_FORCE was requested, and
  * returns the result of vflush().
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
 	int flushflags;
 
 	flushflags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
 	return (vflush(mp, 0, flushflags, td));
 }
 
 /*
  * Return a root vnode
  *
  * Obtains a vnode for the instance's root node through mqfs_allocv()
  * while holding the mqfs lock, and returns that call's result.
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
 {
 	struct mqfs_info *mi;
 	int error;
 
 	mi = VFSTOMQFS(mp);
 	sx_xlock(&mi->mi_lock);
 	error = mqfs_allocv(mp, vpp, mi->mi_root);
 	sx_xunlock(&mi->mi_lock);
 	return (error);
 }
 
 /*
  * Return filesystem stats
  *
  * Nothing is filled in here yet; the call simply succeeds.
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  *
  * Creates the UMA zones, initializes the instance lock, builds the root
  * directory (including "." and ".."), registers the process-exit handler
  * and the fd-close hook, and advertises POSIX message passing.  Always
  * returns 0.
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_info *info = &mqfs_data;
 	struct mqfs_node *rootnode;
 
 	/* backing zones for nodes, queues, vnode data and notifiers */
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	sx_init(&info->mi_lock, "mqfs lock");
 
 	/* set up the root directory */
 	rootnode = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 	    mqfstype_root);
 	rootnode->mn_info = info;
 	LIST_INIT(&rootnode->mn_children);
 	LIST_INIT(&rootnode->mn_vnodes);
 	info->mi_root = rootnode;
 	mqfs_fileno_init(info);
 	mqfs_fileno_alloc(info, rootnode);
 	mqfs_fixup_dir(rootnode);
 
 	/* hooks into the rest of the kernel */
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  *
  * Refused with EOPNOTSUPP unless the "unloadable" switch is set.
  * Otherwise the process-exit handler is removed, the node tree, file
  * number allocator, lock and zones are destroyed, and 0 is returned.
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 
 	if (unloadable == 0)
 		return (EOPNOTSUPP);
 
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 
 	mqfs_destroy(mqfs_data.mi_root);
 	mqfs_data.mi_root = NULL;
 	mqfs_fileno_uninit(&mqfs_data);
 	sx_destroy(&mqfs_data.mi_lock);
 
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * task routine
  *
  * context is the vnode queued by do_unlink(): recycle it, then drop the
  * hold that was taken when the task was enqueued.
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *victim;
 
 	victim = context;
 	vrecycle(victim, curthread);
 	vdrop(victim);
 }
 
 /*
  * Allocate a vnode
  *
  * Returns in *vpp a vnode for node pn on mount mp, locked exclusively.
  * A vnode already attached to pn for this mount is reused when vget()
  * succeeds; otherwise a new vnode is created, tied to pn through a
  * mqfs_vdata record and given a vnode type matching the node type.
  * Returns 0 on success or the error from getnewvnode()/insmntque().
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	int error;
 
 	/* look for a vnode that already represents pn on this mount */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp)
 			break;
 	}
 
 	if (vd != NULL) {
 		if (vget(vd->mv_vnode, 0, curthread) == 0) {
 			*vpp = vd->mv_vnode;
-			vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE,
-			    curthread);
+			vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE);
 			return (0);
 		}
 		/* XXX if this can happen, we're in trouble */
 	}
 
 	/* nothing reusable: build a fresh vnode */
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp);
 	if (error)
 		return (error);
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(*vpp, mp);
 	if (error != 0) {
 		/*
 		 * NOTE(review): presumably insmntque() disposes of the
 		 * vnode itself on failure -- confirm.
 		 */
 		*vpp = NULLVP;
 		return (error);
 	}
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	/* the vnode holds a node reference; mqfs_reclaim() releases it */
 	mqnode_addref(pn);
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 		/* no break: continues into the panic below */
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	return (0);
 }
 
 /*
  * Search a directory entry
  *
  * pd is the directory node to scan; name points at len bytes of name and
  * does not have to be NUL-terminated at name[len].  Returns the child
  * whose name is exactly those len bytes, or NULL when there is none.
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len)
 {
 	struct mqfs_node *pn;
 
 	/* a name that cannot fit in mn_name cannot match any node */
 	if (len < 0 || len > MQFS_NAMELEN)
 		return (NULL);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/*
 		 * strncmp() alone only proves that the first len bytes
 		 * agree, which would let "foo" match a node named
 		 * "foobar"; also require the stored name to end there.
 		 */
 		if (strncmp(pn->mn_name, name, len) == 0 &&
 		    pn->mn_name[len] == '\0')
 			return (pn);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a file or directory.
  *
  * Resolves the component described by ap->a_cnp inside directory
  * ap->a_dvp and stores the resulting vnode in *ap->a_vpp.  ".", ".." and
  * named children are handled in that order.  Returns 0 on success,
  * ENOTDIR if a_dvp is not a directory, EINVAL for a non-LOOKUP operation
  * on "." or ".." as the last component, EIO for ".." at the root, ENOENT
  * if the name is too long or not present, EJUSTRETURN when the last
  * component is missing but may be created, or the error returned by
  * VOP_ACCESS()/mqfs_allocv().
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* the caller needs search permission on the directory */
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		/* dvp is unlocked while the parent's vnode is obtained */
 		VOP_UNLOCK(dvp, 0, cnp->cn_thread);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	/* named node */
 	pn = mqfs_search(pd, pname, namelen);
 	
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error)
 				return (error);
 			/*
 			 * NOTE(review): *vpp was set to NULLVP above and is
 			 * not assigned again before this test, so this
 			 * branch looks unreachable -- confirm.
 			 */
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		/* ask for the name to be kept for the upcoming create */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation
  *
  * Runs mqfs_lookupx() with the instance's mqfs lock held exclusively and
  * returns its result.
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	struct mqfs_info *mi;
 	int error;
 
 	mi = VFSTOMQFS(ap->a_dvp->v_mount);
 	sx_xlock(&mi->mi_lock);
 	error = mqfs_lookupx(ap);
 	sx_xunlock(&mi->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  *
  * Creates a message queue with default attributes together with a file
  * node of the requested name in directory ap->a_dvp, and returns a vnode
  * for it in *ap->a_vpp.  Returns ENOTDIR if a_dvp is not a directory
  * node, EAGAIN if no queue could be allocated, ENOSPC if the node could
  * not be created, or the error from mqfs_allocv().  The queue is freed
  * again on every failure.
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 
 	sx_xlock(&mqfs->mi_lock);
 #if 0
 	/* named node */
 	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
 	if (pn != NULL) {
 		mqueue_free(mq);
 		sx_xunlock(&mqfs->mi_lock);
 		return (EEXIST);
 	}
 #else
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 #endif
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL) {
 		error = ENOSPC;
 		goto out;
 	}
 	error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 	if (error != 0)
 		mqfs_destroy(pn);
 	else
 		pn->mn_data = mq;	/* the node now owns the queue */
 out:
 	sx_xunlock(&mqfs->mi_lock);
 	if (error != 0)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry
  *
  * Must be called with the mqfs lock held.  Only the node's owner or a
  * credential holding PRIV_MQ_ADMIN may unlink; others get EACCES.  An
  * already deleted node yields ENOENT.  Otherwise the node is detached
  * from its directory, every vnode attached to it is purged from the name
  * cache and queued for recycling, and one reference each on the node and
  * on its former parent is dropped.  Returns 0 on success.
  */
 static int
 do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *dir;
 	struct mqfs_vdata *vd;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	if (ucred->cr_uid != pn->mn_uid &&
 	    priv_check_cred(ucred, PRIV_MQ_ADMIN, 0) != 0)
 		return (EACCES);
 	if (pn->mn_deleted)
 		return (ENOENT);
 
 	/* detach from the directory */
 	dir = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 
 	/* get rid of the vnodes; the hold is dropped in do_recycle() */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		cache_purge(vd->mv_vnode);
 		vhold(vd->mv_vnode);
 		taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
 	}
 
 	mqnode_release(pn);
 	mqnode_release(dir);
 	return (0);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation
  *
  * Directories cannot be removed this way (EPERM).  Anything else is
  * unlinked through do_unlink() with the mqfs lock held, using the
  * credential from the component name.
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mi;
 	int error;
 
 	mi = VFSTOMQFS(ap->a_dvp->v_mount);
 	if (ap->a_vp->v_type == VDIR)
 		return (EPERM);
 
 	sx_xlock(&mi->mi_lock);
 	error = do_unlink(VTON(ap->a_vp), ap->a_cnp->cn_cred);
 	sx_xunlock(&mi->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Inactive vnode operation: a vnode whose node has been marked deleted is
  * recycled right away.  Always returns 0.
  */
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 
 	if (VTON(ap->a_vp)->mn_deleted != 0)
 		vrecycle(ap->a_vp, ap->a_td);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Reclaim vnode operation: cut the tie between the vnode and its node.
  * Under the mqfs lock the vnode's mqfs_vdata record is unlinked and
  * freed and the node reference taken in mqfs_allocv() is released.
  * Always returns 0.
  */
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_vdata *vdata;
 	struct mqfs_node *node;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	vdata = vp->v_data;
 	node = vdata->mv_node;
 
 	sx_xlock(&mi->mi_lock);
 	LIST_REMOVE(vdata, mv_link);
 	vp->v_data = NULL;
 	uma_zfree(mvdata_zone, vdata);
 	mqnode_release(node);
 	sx_xunlock(&mi->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	int a_fdidx;
 };
 #endif
 
 /*
  * Open vnode operation: nothing to do, always succeeds.
  */
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Close vnode operation: nothing to do, always succeeds.
  */
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  *
  * Fetches the vnode's attributes and lets vaccess() judge the requested
  * access mode against its mode, owner and group.  Returns the error from
  * VOP_GETATTR() if that fails, otherwise the verdict of vaccess().
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vattr va;
 	int error;
 
 	error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred, ap->a_td);
 	if (error != 0)
 		return (error);
 	return (vaccess(ap->a_vp->v_type, va.va_mode, va.va_uid, va.va_gid,
 	    ap->a_mode, ap->a_cred, NULL));
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Get file attributes
  *
  * Fills *ap->a_vap from the node behind the vnode: type, mode, owner,
  * file number and the four timestamps come from the node; size, byte
  * count, generation, flags, rdev and file revision are reported as zero
  * and the link count as one.  Always returns 0.
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp;
 	struct vattr *vap;
 	struct mqfs_node *node;
 
 	vp = ap->a_vp;
 	vap = ap->a_vap;
 	node = VTON(vp);
 
 	VATTR_NULL(vap);
 
 	/* identity */
 	vap->va_type = vp->v_type;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = node->mn_fileno;
 	vap->va_nlink = 1;
 
 	/* ownership and permissions */
 	vap->va_mode = node->mn_mode;
 	vap->va_uid = node->mn_uid;
 	vap->va_gid = node->mn_gid;
 
 	/* timestamps */
 	vap->va_atime = node->mn_atime;
 	vap->va_mtime = node->mn_mtime;
 	vap->va_ctime = node->mn_ctime;
 	vap->va_birthtime = node->mn_birth;
 
 	/* everything else is constant */
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_size = 0;
 	vap->va_bytes = 0;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = 0;
 	vap->va_filerev = 0;
 	vap->va_vaflags = 0;
 	return (0);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 /*
  * Set attributes
  *
  * Applies ownership, mode and access/modification time changes from
  * ap->a_vap to the node behind ap->a_vp.  A request that touches any
  * other attribute is rejected with EINVAL.  Returns 0 on success or the
  * error from VOP_ACCESS()/priv_check().  The node's change time is
  * refreshed whenever something was modified.
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	/* refuse attempts to set attributes that cannot be changed */
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	/* c records whether anything was actually changed */
 	error = c = 0;
 	/* an unset uid/gid means "keep the current value" */
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* only the owner or a privileged caller may change the mode */
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file
  *
  * Reading a queue file yields a single text line with the queue's total
  * byte count, message limit, current message count and message size.
  * Returns EINVAL for anything but a regular file, otherwise the result
  * of uiomove_frombuf().
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	char line[80];
 	struct mqueue *mq;
 	int len;
 
 	if (ap->a_vp->v_type != VREG)
 		return (EINVAL);
 
 	mq = VTOMQ(ap->a_vp);
 	snprintf(line, sizeof(line),
 	    "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 	    mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs,
 	    mq->mq_msgsize);
 	line[sizeof(line) - 1] = '\0';
 	len = strlen(line);
 	return (uiomove_frombuf(line, len, ap->a_uio));
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  *
  * Walks the children of the directory node behind ap->a_vp under the mqfs
  * lock and hands one fixed-size dirent per child to vfs_read_dirent(),
  * skipping children that lie before uio_offset.  Returns ENOTDIR when the
  * vnode is not a directory, EINVAL for a negative offset, otherwise 0 or
  * the error from vfs_read_dirent().  On return uio_offset is set to the
  * offset reached by the walk.
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * Report zero cookies and hide the cookie pointer for the duration
 	 * of the walk; it is put back before returning.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	error = 0;
 	offset = 0;
 
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/*
 		 * The record is passed to vfs_read_dirent() with d_reclen
 		 * covering the whole structure; clear it first so that the
 		 * bytes after the name never carry stale stack contents.
 		 */
 		bzero(&entry, sizeof(entry));
 		entry.d_reclen = sizeof(entry);
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_name[i] = 0;
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		/* stop once the caller's buffer cannot take another record */
 		if (entry.d_reclen > uio->uio_resid)
 			break;
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
 			if (error)
 				break;
 		}
 		offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  *
  * The parent must be the root or a directory node (ENOTDIR otherwise).
  * The new node is created under the mqfs lock with the name taken from
  * the componentname and the mode taken from the supplied attributes, and
  * a vnode for it is returned through a_vpp.  ENOSPC is returned when the
  * node cannot be created.
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 #if 0
 	/* named node */
 	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
 	if (pn != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (EEXIST);
 	}
 #else
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 #endif
 	/*
 	 * The credential is carried by the componentname; the vattr only
 	 * supplies the requested mode.
 	 */
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL)
 		error = ENOSPC;
 	else
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 	sx_xunlock(&mqfs->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * Allocate a message queue.
  *
  * Returns NULL when the number of existing queues has reached maxmq.
  * Otherwise a zeroed queue is allocated, its limits are taken from attr
  * (or from the defaults when attr is NULL), its mutex and knote lists
  * are set up, and the global queue count is incremented.
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *newq;
 
 	if (curmq >= maxmq)
 		return (NULL);
 
 	newq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	newq->mq_maxmsg = (attr != NULL) ? attr->mq_maxmsg : default_maxmsg;
 	newq->mq_msgsize = (attr != NULL) ? attr->mq_msgsize : default_msgsize;
 	TAILQ_INIT(&newq->mq_msgq);
 
 	/* Both knote lists are protected by the queue mutex. */
 	mtx_init(&newq->mq_mutex, "mqueue", NULL, MTX_DEF);
 	knlist_init(&newq->mq_rsel.si_note, &newq->mq_mutex, NULL, NULL, NULL);
 	knlist_init(&newq->mq_wsel.si_note, &newq->mq_mutex, NULL, NULL, NULL);
 
 	atomic_add_int(&curmq, 1);
 	return (newq);
 }
 
 /*
  * Destroy a message queue.
  *
  * Frees every message still queued, tears down the mutex and the knote
  * lists, returns the queue to its zone and decrements the global queue
  * count.
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *pending;
 
 	/* Drain whatever is still sitting in the queue. */
 	for (;;) {
 		pending = TAILQ_FIRST(&mq->mq_msgq);
 		if (pending == NULL)
 			break;
 		TAILQ_REMOVE(&mq->mq_msgq, pending, msg_link);
 		FREE(pending, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space.
  *
  * Allocates a message header followed by msg_size bytes of payload and
  * copies the payload in from msg_ptr.  Returns the new message with its
  * size and priority filled in, or NULL if the copyin fails.
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *msg;
 	size_t total;
 
 	total = sizeof(struct mqueue_msg) + msg_size;
 	MALLOC(msg, struct mqueue_msg *, total, M_MQUEUEDATA, M_WAITOK);
 
 	/* The payload lives immediately after the header. */
 	if (copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
 	    msg_size) != 0) {
 		FREE(msg, M_MQUEUEDATA);
 		return (NULL);
 	}
 
 	msg->msg_size = msg_size;
 	msg->msg_prio = msg_prio;
 	return (msg);
 }
 
 /*
  * Save a message to user space.
  *
  * Copies the payload (stored right after the header) out to msg_ptr and,
  * when msg_prio is not NULL, the priority as well.  Returns 0 or the
  * first copyout error.
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int rc;
 
 	rc = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size);
 	if (rc != 0)
 		return (rc);
 	if (msg_prio == NULL)
 		return (0);
 	return (copyout(&msg->msg_prio, msg_prio, sizeof(int)));
 }
 
 /*
  * Free a message's memory.
  *
  * Releases the single allocation holding both the header and the payload
  * (see mqueue_loadmsg()).
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	FREE(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message.
  *
  * The payload is copied in from user space first.  If waitok is false
  * the thread is never blocked: a full queue yields EAGAIN.  If waitok is
  * true and abs_timeout is NULL the thread sleeps until there is room.
  * Otherwise abs_timeout (a user-space pointer) is an absolute deadline
  * that is only read and validated once a non-blocking attempt has found
  * the queue full.
  *
  * Returns 0 on success; EINVAL for a bad priority or a malformed
  * timeout, EMSGSIZE if msg_len exceeds the queue's message size, EFAULT
  * if the message cannot be copied in, ETIMEDOUT when the deadline
  * passes, or the error from _mqueue_send()/copyin().  On any failure
  * after the message was loaded, the message is freed here.
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ets, ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		/* A negative timeout tells _mqueue_send() not to sleep. */
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	/* The queue is full: now the deadline actually matters. */
 	error = copyin(abs_timeout, &ets, sizeof(ets));
 	if (error != 0)
 		goto bad;
 	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	for (;;) {
 		/* ts2 = time remaining until the absolute deadline. */
 		ts2 = ets;
 		getnanotime(&ts);
 		timespecsub(&ts2, &ts);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		/*
 		 * A timed-out sleep loops back so the deadline is
 		 * re-checked against the current time; anything else
 		 * (success or another error) ends the loop.
 		 */
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	/* The message was never queued, so it is still ours to free. */
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message.
  *
  * timo < 0 means "do not sleep": EAGAIN is returned if the queue is
  * full.  Otherwise the thread sleeps (interruptibly) on mq_senders with
  * timo as the msleep() timeout until there is room or the sleep fails;
  * an EAGAIN result from msleep() is reported as ETIMEDOUT.
  *
  * On success the message is linked into the queue in priority order,
  * the counters are updated, and readers are notified (sleeping receiver,
  * registered notifier, select/poll waiters and knotes).  Returns 0 on
  * success; on failure the message is not queued.
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		/* mq_senders counts threads sleeping for room. */
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	/* Still full: the sleep failed, so report why. */
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	/* There is room now, so any sleep error is discarded. */
 	error = 0;
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		/*
 		 * The list is kept in descending priority order.  A message
 		 * whose priority does not exceed the last one goes to the
 		 * tail; otherwise it goes in front of the first message
 		 * with a strictly lower priority, so equal priorities stay
 		 * in arrival order.
 		 */
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	/* A sleeping receiver takes precedence over the notifier. */
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Deliver the notification requested through mq_notify and clear the
  * registration.
  *
  * Must be called with the queue mutex held.  Unless the registered
  * sigevent is SIGEV_NONE, the event is posted to the registering process
  * (skipped if the notifier's ksiginfo is already queued).  In every case
  * the queue's notifier pointer is reset to NULL.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *notifier;
 	struct proc *target;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 
 	notifier = mq->mq_notifier;
 	if (notifier->nt_sigev.sigev_notify != SIGEV_NONE) {
 		target = notifier->nt_proc;
 		PROC_LOCK(target);
 		if (!KSI_ONQ(&notifier->nt_ksi))
 			psignal_event(target, &notifier->nt_sigev,
 			    &notifier->nt_ksi);
 		PROC_UNLOCK(target);
 	}
 
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Get a message.
  *
  * If waitok is false the thread is never blocked: an empty queue yields
  * EAGAIN.  If waitok is true and abs_timeout is NULL the thread sleeps
  * until a message arrives.  Otherwise abs_timeout (a user-space pointer)
  * is an absolute deadline that is only read and validated once a
  * non-blocking attempt has found the queue empty.
  *
  * The caller's buffer must be at least as large as the queue's message
  * size (EMSGSIZE otherwise).  On success the payload and, if msg_prio is
  * not NULL, the priority are copied out and the message size is stored
  * in td_retval[0].  The dequeued message is freed whether or not the
  * copyout succeeds.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ets, ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		/* A negative timeout tells _mqueue_recv() not to sleep. */
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	/* The queue is empty: now the deadline actually matters. */
 	error = copyin(abs_timeout, &ets, sizeof(ets));
 	if (error != 0)
 		return (error);
 	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	for (;;) {
 		/* ts2 = time remaining until the absolute deadline. */
 		ts2 = ets;
 		getnanotime(&ts);
 		timespecsub(&ts2, &ts);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		/*
 		 * A timed-out sleep loops back so the deadline is
 		 * re-checked against the current time.
 		 */
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	/* The message has left the queue; it is freed in all cases. */
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message.
  *
  * timo < 0 means "do not sleep": EAGAIN is returned if the queue is
  * empty.  Otherwise the thread sleeps (interruptibly) on mq_receivers
  * with timo as the msleep() timeout until a message is available or the
  * sleep fails; an EAGAIN result from msleep() is reported as ETIMEDOUT.
  *
  * On success the head of the queue is unlinked and returned through
  * *msg, the counters are updated and writers are notified (sleeping
  * sender, select/poll waiters and knotes).  Returns 0 on success or the
  * sleep error, in which case *msg is NULL.
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 	
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		/* mq_receivers counts threads sleeping for a message. */
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		/* A message is available, so any sleep error is discarded. */
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	/*
 	 * If messages remain, nobody is sleeping for them and a notifier
 	 * is registered, fire the notification now.
 	 */
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 /*
  * Allocate a zero-filled notifier from the notifier zone.  The
  * allocation uses M_WAITOK.
  */
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 /*
  * Return a notifier to the notifier zone.
  */
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 /*
  * Look up the notifier that process p registered for descriptor fd.
  * Returns NULL when the process has no notifier for that descriptor.
  */
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *cur;
 
 	LIST_FOREACH(cur, &p->p_mqnotifier, nt_link) {
 		if (cur->nt_ksi.ksi_mqd == fd)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Link a notifier at the head of the process's notifier list.
  */
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 /*
  * Unlink a notifier from the list it is on and free it.  The process
  * argument is not used by the body.
  */
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 /*
  * Drop the notifier that process p registered for descriptor fd, if any.
  *
  * Must be called with the queue mutex held.  If the notifier is the one
  * currently attached to the queue, the queue's registration is cleared.
  * The notifier's ksiginfo is taken off any signal queue before the
  * notifier is unlinked and freed.
  */
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *victim;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 
 	PROC_LOCK(p);
 	victim = notifier_search(p, fd);
 	if (victim == NULL) {
 		PROC_UNLOCK(p);
 		return;
 	}
 	if (mq->mq_notifier == victim)
 		mq->mq_notifier = NULL;
 	sigqueue_take(&victim->nt_ksi);
 	notifier_delete(p, victim);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Syscall to open a message queue.
  *
  * The name must start with a slash, contain no further slashes, and
  * must not be "/." or "/..".  With O_CREAT a missing queue is created,
  * using the caller-supplied attributes when given (validated against
  * maxmsg/maxmsgsize) and the defaults otherwise.  An existing queue is
  * subject to an access check, and O_CREAT|O_EXCL on an existing queue
  * fails with EEXIST.
  *
  * On success the new descriptor is marked close-on-exec and returned in
  * td_retval[0].  Errors: EINVAL (bad flags, attributes or name), ENOENT,
  * ENFILE (queue limit reached), ENOSPC (node creation failed), EEXIST,
  * or whatever copyin/copyinstr/falloc/vaccess report.
  */
 int
 kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mq_attr attr, *pattr;
 	struct mqfs_node *pn;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, flags, cmode;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	flags = FFLAGS(uap->flags);
 	cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	if ((flags & O_CREAT) && (uap->attr != NULL)) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 		if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize)
 			return (EINVAL);
 		pattr = &attr;
 	} else
 		pattr = NULL;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
 	if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash  (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters.
 	 */
 	len = strlen(path);
 	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	/*
 	 * Directory child lists also carry the "this" and "parent" nodes
 	 * (see mqfs_readdir()), so "." and ".." must never be looked up
 	 * or created as queue names.
 	 */
 	if (path[1] == '.' && (len == 2 || (len == 3 && path[2] == '.')))
 		return (EINVAL);
 
 	error = falloc(td, &fp, &fd);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(pattr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		/* Attach the freshly allocated queue to its new node. */
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			int acc_mode = 0;
 
 			if (flags & FREAD)
 				acc_mode |= VREAD;
 			if (flags & FWRITE)
 				acc_mode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 				    pn->mn_gid, acc_mode, td->td_ucred, NULL);
 		}
 	}
 
 	if (error) {
 		/* Undo the descriptor reserved by falloc(). */
 		sx_xunlock(&mqfs_data.mi_lock);
 		fdclose(fdp, fp, fd, td);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/* The open file holds its own reference on the node. */
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
 	    &mqueueops);
 
 	/* Mark close-on-exec, unless the slot was reused meanwhile. */
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[fd] == fp)
 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
 	FILEDESC_XUNLOCK(fdp);
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
  * Syscall to unlink a message queue.
  *
  * The name follows the same rules as in kmq_open(): a leading slash, no
  * further slashes, and neither "/." nor "/..".  Returns EINVAL for a
  * malformed name, ENOENT if no such queue exists, otherwise the result
  * of do_unlink().
  */
 int
 kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
 	if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	/*
 	 * Directory child lists also carry the "this" and "parent" nodes
 	 * (see mqfs_readdir()); refuse to look them up so they can never
 	 * be unlinked through this syscall.
 	 */
 	if (path[1] == '.' && (len == 2 || (len == 3 && path[2] == '.')))
 		return (EINVAL);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 typedef int (*_fgetf)(struct thread *, int, struct file **);
 
 /*
  * Get message queue by giving file slot
  */
 static int
 _getmq(struct thread *td, int fd, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = func(td, fd, fpp);
 	if (error)
 		return (error);
 	if (&mqueueops != (*fpp)->f_ops) {
 		fdrop(*fpp, td);
 		return (EBADF);
 	}
 	pn = (*fpp)->f_data;
 	if (ppn)
 		*ppn = pn;
 	if (pmq)
 		*pmq = pn->mn_data;
 	return (0);
 }
 
 /*
  * Look up a message queue descriptor using fget() as the file lookup
  * routine; see _getmq() for the meaning of the output arguments.
  */
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget, fpp, ppn, pmq);
 }
 
 /*
  * Look up a message queue descriptor using fget_read() as the file
  * lookup routine; see _getmq() for the meaning of the output arguments.
  */
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget_read, fpp, ppn, pmq);
 }
 
 /*
  * Look up a message queue descriptor using fget_write() as the file
  * lookup routine; see _getmq() for the meaning of the output arguments.
  */
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget_write, fpp, ppn, pmq);
 }
 
 /*
  * Syscall to query and optionally change the attributes of a queue.
  *
  * When uap->attr is given, only O_NONBLOCK may be set in mq_flags
  * (EINVAL otherwise) and the descriptor's O_NONBLOCK flag is replaced
  * atomically by the requested value.  When uap->oattr is given, the
  * queue limits, the current message count and the O_NONBLOCK state as it
  * was before any change are copied out.
  */
 int
 kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct mq_attr attr, oattr;
 	u_int oflag, flag;
 	int error;
 
 	if (uap->attr) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 		if (attr.mq_flags & ~O_NONBLOCK)
 			return (EINVAL);
 	}
 	error = getmq(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	/*
 	 * The whole structure is copied out below, so clear it first:
 	 * any member or padding not assigned here must not expose stale
 	 * kernel stack contents.
 	 */
 	bzero(&oattr, sizeof(oattr));
 	oattr.mq_maxmsg  = mq->mq_maxmsg;
 	oattr.mq_msgsize = mq->mq_msgsize;
 	oattr.mq_curmsgs = mq->mq_curmsgs;
 	if (uap->attr) {
 		/* Retry until the flag word is swapped without interference. */
 		do {
 			oflag = flag = fp->f_flag;
 			flag &= ~O_NONBLOCK;
 			flag |= (attr.mq_flags & O_NONBLOCK);
 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
 	} else
 		oflag = fp->f_flag;
 	oattr.mq_flags = (O_NONBLOCK & oflag);
 	fdrop(fp, td);
 	if (uap->oattr)
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	return (error);
 }
 
 /*
  * Syscall to receive a message, optionally bounded by an absolute
  * timeout.  The descriptor must be open for reading; blocking is allowed
  * unless O_NONBLOCK is set on it.
  */
 int
 kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	int rc, may_block;
 
 	rc = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (rc != 0)
 		return (rc);
 
 	may_block = ((fp->f_flag & O_NONBLOCK) == 0);
 	rc = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio,
 	    may_block, uap->abs_timeout);
 
 	fdrop(fp, td);
 	return (rc);
 }
 
 /*
  * Syscall to send a message, optionally bounded by an absolute timeout.
  * The descriptor must be open for writing; blocking is allowed unless
  * O_NONBLOCK is set on it.
  */
 int
 kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	int rc, may_block;
 
 	rc = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (rc != 0)
 		return (rc);
 
 	may_block = ((fp->f_flag & O_NONBLOCK) == 0);
 	rc = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio,
 	    may_block, uap->abs_timeout);
 
 	fdrop(fp, td);
 	return (rc);
 }
 
 /*
  * Syscall to register or remove a notification for a message queue.
  *
  * With a non-NULL sigev the calling process is registered as the queue's
  * notifier (EBUSY if the queue already has one); only SIGEV_SIGNAL,
  * SIGEV_THREAD_ID and SIGEV_NONE are accepted, and the signal number is
  * validated for the first two.  With a NULL sigev the process's notifier
  * for this descriptor is removed.
  *
  * A notifier cannot be allocated while the locks are held, so when one
  * is needed the locks are dropped, the notifier is allocated and the
  * whole sequence is retried from "again".
  */
 int
 kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	if (uap->sigev) {
 		error = copyin(uap->sigev, &ev, sizeof(ev));
 		if (error)
 			return (error);
 		if (ev.sigev_notify != SIGEV_SIGNAL &&
 		    ev.sigev_notify != SIGEV_THREAD_ID &&
 		    ev.sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((ev.sigev_notify == SIGEV_SIGNAL ||
 		     ev.sigev_notify == SIGEV_THREAD_ID) &&
 			!_SIG_VALID(ev.sigev_signo))
 			return (EINVAL);
 	}
 	error = getmq(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	/*
 	 * Make sure the descriptor still refers to the same file, and take
 	 * the queue mutex before letting go of the descriptor table lock.
 	 */
 	FILEDESC_SLOCK(fdp);
 	if (fget_locked(fdp, uap->mqd) != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
 	if (uap->sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, uap->mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					/*
 					 * Need a new notifier: drop the
 					 * locks, allocate, and start over.
 					 */
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				/*
 				 * Reuse the existing notifier; a spare one
 				 * allocated on an earlier pass is not needed.
 				 */
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = uap->mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = ev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * if there is no receivers and message queue
 			 * is not empty, we should send notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, uap->mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	/* Free a notifier that was allocated but never installed. */
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 /*
  * Per-descriptor close handling for message queues.
  *
  * Expects the descriptor table lock to be held.  For a message queue
  * file, the calling process's notifier for this descriptor is removed
  * and pending select/poll waiters on both directions are woken up.
  * Other file types are ignored.
  */
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct filedesc *fdp;
 	struct mqueue *mq;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	if (fp->f_ops != &mqueueops)
 		return;
 
 	mq = FPTOMQ(fp);
 	mtx_lock(&mq->mq_mutex);
 	notifier_remove(td->td_proc, mq, fd);
 
 	/* have to wakeup thread in same process */
 	if ((mq->mq_flags & MQ_RSEL) != 0) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	if ((mq->mq_flags & MQ_WSEL) != 0) {
 		mq->mq_flags &= ~MQ_WSEL;
 		selwakeup(&mq->mq_wsel);
 	}
 	mtx_unlock(&mq->mq_mutex);
 }
 
 /*
  * Process exit hook: remove every notifier the exiting process still has
  * registered.  Each open descriptor that refers to a message queue is
  * visited under the descriptor table lock; afterwards the process's
  * notifier list is expected to be empty.
  */
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd;
 
 	FILEDESC_SLOCK(fdp);
 	for (fd = 0; fd < fdp->fd_nfiles; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL || fp->f_ops != &mqueueops)
 			continue;
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(p, FPTOMQ(fp), fd);
 		mtx_unlock(&mq->mq_mutex);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 /*
  * fo_read for message queue descriptors: always fails with EOPNOTSUPP.
  */
 static int
 mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * fo_write for message queue descriptors: always fails with EOPNOTSUPP.
  */
 static int
 mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * fo_truncate for message queue descriptors: always fails with EINVAL.
  */
 static int
 mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*
  * fo_ioctl for message queue descriptors: always fails with ENOTTY.
  */
 static int
 mqf_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
 	return (ENOTTY);
 }
 
 /*
  * fo_poll for message queue descriptors.
  *
  * Readable (POLLIN/POLLRDNORM) when the queue holds at least one
  * message; writable (POLLOUT) when the queue is below its message limit.
  * For a condition that does not hold yet, the thread is recorded on the
  * matching selinfo and the corresponding MQ_RSEL/MQ_WSEL flag is set so
  * the wakeup side knows someone is waiting.
  */
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int rd_events = events & (POLLIN | POLLRDNORM);
 	int ready = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	if (rd_events != 0) {
 		if (mq->mq_curmsgs != 0) {
 			ready |= rd_events;
 		} else {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
 		}
 	}
 	if ((events & POLLOUT) != 0) {
 		if (mq->mq_curmsgs < mq->mq_maxmsg) {
 			ready |= POLLOUT;
 		} else {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (ready);
 }
 
 /*
  * fo_close for message queue descriptors: detach the node from the file,
  * switch the file over to badfileops and drop the file's reference on
  * the node under the mqfs lock.
  */
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *node = fp->f_data;
 
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(node);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * fo_stat for message queue descriptors: the stat structure is cleared
  * and then filled with the node's ownership, timestamps and mode; the
  * file type is reported as S_IFIFO.
  */
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqfs_node *node = fp->f_data;
 
 	bzero(st, sizeof(*st));
 
 	/* Ownership and permissions. */
 	st->st_uid = node->mn_uid;
 	st->st_gid = node->mn_gid;
 	st->st_mode = S_IFIFO | node->mn_mode;
 
 	/* Timestamps. */
 	st->st_atimespec = node->mn_atime;
 	st->st_mtimespec = node->mn_mtime;
 	st->st_ctimespec = node->mn_ctime;
 	st->st_birthtimespec = node->mn_birth;
 	return (0);
 }
 
 /*
  * fo_kqfilter for message queue descriptors: attach the knote to the
  * read or write knote list of the queue, depending on the requested
  * filter.  Any other filter is rejected with EINVAL.
  */
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Detach a knote from the queue's read or write knote list.  A knote
  * with any other filter type is a programming error and panics.
  */
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 		break;
 	case EVFILT_WRITE:
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 		break;
 	default:
 		panic("filt_mqdetach");
 	}
 }
 
 /*
  * Read filter: reports the knote as ready when the queue holds at least
  * one message.  The queue mutex must be held by the caller.
  */
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 /*
  * Write filter: reports the knote as ready when the queue is below its
  * message limit.  The queue mutex must be held by the caller.
  */
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 /*
  * File operations for message queue descriptors.  kmq_open() installs
  * this vector with finit(), and comparing f_ops against it is how the
  * rest of the file recognizes a message queue descriptor.
  */
 static struct fileops mqueueops = {
 	.fo_read		= mqf_read,
 	.fo_write		= mqf_write,
 	.fo_truncate		= mqf_truncate,
 	.fo_ioctl		= mqf_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close
 };
 
 /*
  * Vnode operations for mqueuefs.  Operations not listed here fall back
  * to default_vnodeops; write, mkdir and rmdir are explicitly wired to
  * VOP_EOPNOTSUPP.
  */
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
 
 /*
  * Filesystem-level operations for mqueuefs; registered with VFS_SET()
  * at the end of the file.
  */
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 /*
  * Module glue: one SYSCALL_MODULE_HELPER() per message queue syscall
  * implemented in this file, followed by the registration of the
  * filesystem itself as "mqueuefs" (flagged VFCF_SYNTHETIC) at module
  * version 1.
  */
 SYSCALL_MODULE_HELPER(kmq_open);
 SYSCALL_MODULE_HELPER(kmq_setattr);
 SYSCALL_MODULE_HELPER(kmq_timedsend);
 SYSCALL_MODULE_HELPER(kmq_timedreceive);
 SYSCALL_MODULE_HELPER(kmq_notify);
 SYSCALL_MODULE_HELPER(kmq_unlink);
 
 VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC);
 MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c	(revision 175201)
+++ head/sys/kern/uipc_syscalls.c	(revision 175202)
@@ -1,2629 +1,2629 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * sendfile(2) and related extensions:
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sctp.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/filedesc.h>
 #include <sys/event.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_peeloff.h>
 #endif /* SCTP */
 
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
 static int accept1(struct thread *td, struct accept_args *uap, int compat);
 static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
 static int getsockname1(struct thread *td, struct getsockname_args *uap,
 			int compat);
 static int getpeername1(struct thread *td, struct getpeername_args *uap,
 			int compat);
 
 /*
  * NSFBUFS-related variables and associated sysctls
  */
 int nsfbufs;		/* exported as kern.ipc.nsfbufs (CTLFLAG_RDTUN) */
 int nsfbufspeak;	/* exported as kern.ipc.nsfbufspeak (CTLFLAG_RD) */
 int nsfbufsused;	/* exported as kern.ipc.nsfbufsused (CTLFLAG_RD) */
 
 /* Publish the three counters under the _kern_ipc sysctl node. */
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
     "Maximum number of sendfile(2) sf_bufs available");
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
     "Number of sendfile(2) sf_bufs at peak usage");
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
     "Number of sendfile(2) sf_bufs in use");
 
 /*
  * Convert a user file descriptor to a kernel file entry.  A reference on the
  * file entry is held upon returning.  This is lighter weight than
  * fgetsock(), which bumps the socket reference count and drops the file
  * reference count instead; holding only the file reference avoids several
  * additional mutex operations associated with the additional reference
  * count.  If requested (fflagp != NULL), return the open file flags.
  *
  * Returns 0 with *fpp set on success.  On failure *fpp is NULL and the
  * result is EBADF (no descriptor table, or no such descriptor) or ENOTSOCK
  * (the descriptor does not refer to a socket).
  */
 static int
 getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
 {
 	struct file *found = NULL;
 	int error = EBADF;
 
 	if (fdp != NULL) {
 		FILEDESC_SLOCK(fdp);
 		found = fget_locked(fdp, fd);
 		if (found != NULL && found->f_type != DTYPE_SOCKET) {
 			/* Right descriptor, wrong kind of file. */
 			found = NULL;
 			error = ENOTSOCK;
 		} else if (found != NULL) {
 			/* Take the reference while the table is locked. */
 			fhold(found);
 			if (fflagp != NULL)
 				*fflagp = found->f_flag;
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = found;
 	return (error);
 }
 
 /*
  * System call interface to the socket abstraction.
  */
 #if defined(COMPAT_43)
 #define COMPAT_OLDSOCK
 #endif
 
 int
 socket(td, uap)
 	struct thread *td;
 	struct socket_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 	} */ *uap;
 {
 	struct filedesc *fdp;
 	struct socket *so;
 	struct file *fp;
 	int fd, error;
 
 #ifdef MAC
 	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
 	    uap->protocol);
 	if (error)
 		return (error);
 #endif
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd);
 	if (error)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error) {
 		fdclose(fdp, fp, fd, td);
 	} else {
 		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 bind(td, uap)
 	struct thread *td;
 	struct bind_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
 		return (error);
 
 	error = kern_bind(td, uap->s, sa);
 	free(sa, M_SONAME);
 	return (error);
 }
 
 int
 kern_bind(td, fd, sa)
 	struct thread *td;
 	int fd;
 	struct sockaddr *sa;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		return (error);
 	so = fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_bind(td->td_ucred, so, sa);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto done;
 #endif
 	error = sobind(so, sa, td);
 #ifdef MAC
 done:
 #endif
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 listen(td, uap)
 	struct thread *td;
 	struct listen_args /* {
 		int	s;
 		int	backlog;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
 		SOCK_LOCK(so);
 		error = mac_socket_check_listen(td->td_ucred, so);
 		SOCK_UNLOCK(so);
 		if (error)
 			goto done;
 #endif
 		error = solisten(so, uap->backlog, td);
 #ifdef MAC
 done:
 #endif
 		fdrop(fp, td);
 	}
 	return(error);
 }
 
 /*
  * accept1()
  */
 static int
 accept1(td, uap, compat)
 	struct thread *td;
 	struct accept_args /* {
 		int	s;
 		struct sockaddr	* __restrict name;
 		socklen_t	* __restrict anamelen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *name;
 	socklen_t namelen;
 	struct file *fp;
 	int error;
 
 	if (uap->name == NULL)
 		return (kern_accept(td, uap->s, NULL, NULL, NULL));
 
 	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
 	if (error)
 		return (error);
 
 	error = kern_accept(td, uap->s, &name, &namelen, &fp);
 
 	/*
 	 * return a namelen of zero for older code which might
 	 * ignore the return value from accept.
 	 */
 	if (error) {
 		(void) copyout(&namelen,
 		    uap->anamelen, sizeof(*uap->anamelen));
 		return (error);
 	}
 
 	if (error == 0 && name != NULL) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)name)->sa_family =
 			    name->sa_family;
 #endif
 		error = copyout(name, uap->name, namelen);
 	}
 	if (error == 0)
 		error = copyout(&namelen, uap->anamelen,
 		    sizeof(namelen));
 	if (error)
 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
 	fdrop(fp, td);
 	free(name, M_SONAME);
 	return (error);
 }
 
 /*
  * Accept one completed connection from the listening socket behind
  * descriptor `s'.
  *
  * A new descriptor is allocated with falloc() and stored in td_retval[0];
  * the first socket on head->so_comp is attached to it.
  *
  * name/namelen: if `name' is non-NULL, *name receives the sockaddr
  *	returned by soaccept() (ownership passes to the caller) and
  *	*namelen is clamped to its sa_len; *namelen is set to 0 when
  *	soaccept() fails or yields no address.
  * fp:	if non-NULL, receives the new file with its reference on
  *	success, or NULL on failure.
  *
  * Errors produced here: EINVAL if the socket is not accepting connections,
  * EWOULDBLOCK for a non-blocking socket with an empty completed queue,
  * ECONNABORTED if the socket can no longer receive, plus whatever
  * getsock(), falloc(), msleep() or soaccept() return.
  */
 int
 kern_accept(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, struct file **fp)
 {
 	struct filedesc *fdp;
 	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 	pid_t pgid;
 	int tmp;
 
 	if (name) {
 		*name = NULL;
 		/* NOTE(review): only effective if socklen_t is signed. */
 		if (*namelen < 0)
 			return (EINVAL);
 	}
 
 	fdp = td->td_proc->p_fd;
 	error = getsock(fdp, s, &headfp, &fflag);
 	if (error)
 		return (error);
 	head = headfp->f_data;
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto done;
 	}
 #ifdef MAC
 	SOCK_LOCK(head);
 	error = mac_socket_check_accept(td->td_ucred, head);
 	SOCK_UNLOCK(head);
 	if (error != 0)
 		goto done;
 #endif
 	error = falloc(td, &nfp, &fd);
 	if (error)
 		goto done;
 	ACCEPT_LOCK();
 	/* Non-blocking listener with nothing queued: fail immediately. */
 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
 		ACCEPT_UNLOCK();
 		error = EWOULDBLOCK;
 		goto noconnection;
 	}
 	/* Sleep until a connection completes or an error is posted. */
 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			head->so_error = ECONNABORTED;
 			break;
 		}
 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
 		    "accept", 0);
 		if (error) {
 			ACCEPT_UNLOCK();
 			goto noconnection;
 		}
 	}
 	/* Consume a pending socket error: report it once and clear it. */
 	if (head->so_error) {
 		error = head->so_error;
 		head->so_error = 0;
 		ACCEPT_UNLOCK();
 		goto noconnection;
 	}
 	so = TAILQ_FIRST(&head->so_comp);
 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
 	SOCK_LOCK(so);			/* soref() and so_state update */
 	soref(so);			/* file descriptor reference */
 
 	/* Detach the socket from the listener's completed queue. */
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	so->so_state |= (head->so_state & SS_NBIO);
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
 
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
 
 	/* Propagate the listener's SIGIO owner, if any, to the new socket. */
 	pgid = fgetown(&head->so_sigio);
 	if (pgid != 0)
 		fsetown(pgid, &so->so_sigio);
 
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	/* Sync socket nonblocking/async state with file flags */
 	tmp = fflag & FNONBLOCK;
 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 	tmp = fflag & FASYNC;
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 	sa = 0;
 	error = soaccept(so, &sa);
 	if (error) {
 		/*
 		 * return a namelen of zero for older code which might
 		 * ignore the return value from accept.
 		 */
 		if (name)
 			*namelen = 0;
 		goto noconnection;
 	}
 	/* Accepted, but the protocol supplied no peer address. */
 	if (sa == NULL) {
 		if (name)
 			*namelen = 0;
 		goto done;
 	}
 	if (name) {
 		/* check sa_len before it is destroyed */
 		if (*namelen > sa->sa_len)
 			*namelen = sa->sa_len;
 		/* Hand the address to the caller; clear sa so it is not freed. */
 		*name = sa;
 		sa = NULL;
 	}
 noconnection:
 	if (sa)
 		FREE(sa, M_SONAME);
 
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error)
 		fdclose(fdp, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.  We return
 	 * a reference on nfp to the caller on success if they request it.
 	 */
 done:
 	if (fp != NULL) {
 		if (error == 0) {
 			*fp = nfp;
 			nfp = NULL;
 		} else
 			*fp = NULL;
 	}
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fdrop(headfp, td);
 	return (error);
 }
 
 /*
  * accept(2) entry point: accept1() with compat == 0.
  */
 int
 accept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap, 0));
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old-style accept entry point: accept1() with compat == 1, which makes
  * accept1() rewrite the returned address through struct osockaddr.
  */
 int
 oaccept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 /* ARGSUSED */
 int
 connect(td, uap)
 	struct thread *td;
 	struct connect_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error)
 		return (error);
 
 	error = kern_connect(td, uap->s, sa);
 	free(sa, M_SONAME);
 	return (error);
 }
 
 
 /*
  * Connect the socket behind descriptor `fd' to the kernel-space address
  * `sa'.
  *
  * Returns EALREADY if a connect is already in progress, EINPROGRESS for a
  * non-blocking socket whose connect has been started, otherwise waits for
  * the connect to finish and returns the socket's so_error (cleared once
  * read).  ERESTART from the sleep is reported as EINTR.
  */
 int
 kern_connect(td, fd, sa)
 	struct thread *td;
 	int fd;
 	struct sockaddr *sa;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 	int interrupted = 0;
 
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		return (error);
 	so = fp->f_data;
 	if (so->so_state & SS_ISCONNECTING) {
 		error = EALREADY;
 		goto done1;
 	}
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_connect(td->td_ucred, so, sa);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto bad;
 #endif
 	error = soconnect(so, sa, td);
 	if (error)
 		goto bad;
 	/* Non-blocking socket: report progress instead of sleeping. */
 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 		error = EINPROGRESS;
 		goto done1;
 	}
 	SOCK_LOCK(so);
 	/* Sleep until the connect completes or an error is posted. */
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 		    "connec", 0);
 		if (error) {
 			if (error == EINTR || error == ERESTART)
 				interrupted = 1;
 			break;
 		}
 	}
 	if (error == 0) {
 		error = so->so_error;
 		so->so_error = 0;
 	}
 	SOCK_UNLOCK(so);
 bad:
 	/*
 	 * Leave SS_ISCONNECTING set when the sleep was interrupted, so a
 	 * later connect() on this socket sees EALREADY above.
 	 */
 	if (!interrupted)
 		so->so_state &= ~SS_ISCONNECTING;
 	if (error == ERESTART)
 		error = EINTR;
 done1:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * socketpair(2): create two sockets of the requested domain/type/protocol,
  * connect them to each other, install each behind its own descriptor and
  * copy the two descriptor numbers out to uap->rsv.
  *
  * Failure unwinds through the free4..free1 labels in reverse order of
  * acquisition.  Once finit() has attached the sockets to their files,
  * so1/so2 are set to NULL so the unwind path does not soclose() them
  * directly.
  */
 int
 socketpair(td, uap)
 	struct thread *td;
 	struct socketpair_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 		int	*rsv;
 	} */ *uap;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
 	int fd, error, sv[2];
 
 #ifdef MAC
 	/* We might want to have a separate check for socket pairs. */
 	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
 	    uap->protocol);
 	if (error)
 		return (error);
 #endif
 
 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
 		return (error);
 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
 		goto free1;
 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 	error = falloc(td, &fp1, &fd);
 	if (error)
 		goto free2;
 	sv[0] = fd;
 	fp1->f_data = so1;	/* so1 already has ref count */
 	error = falloc(td, &fp2, &fd);
 	if (error)
 		goto free3;
 	fp2->f_data = so2;	/* so2 already has ref count */
 	sv[1] = fd;
 	error = soconnect2(so1, so2);
 	if (error)
 		goto free4;
 	if (uap->type == SOCK_DGRAM) {
 		/*
 		 * Datagram socket connection is asymmetric.
 		 */
 		 error = soconnect2(so2, so1);
 		 if (error)
 			goto free4;
 	}
 	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
 	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
 	/* From here on the unwind labels must not soclose() the sockets. */
 	so1 = so2 = NULL;
 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
 	if (error)
 		goto free4;
 	fdrop(fp1, td);
 	fdrop(fp2, td);
 	return (0);
 free4:
 	fdclose(fdp, fp2, sv[1], td);
 	fdrop(fp2, td);
 free3:
 	fdclose(fdp, fp1, sv[0], td);
 	fdrop(fp1, td);
 free2:
 	if (so2 != NULL)
 		(void)soclose(so2);
 free1:
 	if (so1 != NULL)
 		(void)soclose(so1);
 	return (error);
 }
 
 /*
  * Common user-space front end for the send family of system calls.
  *
  * Copies the destination address (mp->msg_name) and the control data
  * (mp->msg_control) in from user space, then calls kern_sendit() with the
  * payload still described by user-space iovecs (UIO_USERSPACE).
  * mp->msg_name is overwritten with the kernel copy of the address, which
  * is freed before returning.  For MSG_COMPAT callers the control data is
  * wrapped in an SCM_RIGHTS cmsghdr.
  */
 static int
 sendit(td, s, mp, flags)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	int flags;
 {
 	struct mbuf *control;
 	struct sockaddr *to;
 	int error;
 
 	if (mp->msg_name != NULL) {
 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 		if (error) {
 			to = NULL;
 			goto bad;
 		}
 		mp->msg_name = to;
 	} else {
 		to = NULL;
 	}
 
 	if (mp->msg_control) {
 		/* Control data must at least hold a cmsghdr (not for MSG_COMPAT). */
 		if (mp->msg_controllen < sizeof(struct cmsghdr)
 #ifdef COMPAT_OLDSOCK
 		    && mp->msg_flags != MSG_COMPAT
 #endif
 		) {
 			error = EINVAL;
 			goto bad;
 		}
 		error = sockargs(&control, mp->msg_control,
 		    mp->msg_controllen, MT_CONTROL);
 		if (error)
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT) {
 			struct cmsghdr *cm;
 
 			/* Old callers pass bare rights: prepend a header. */
 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
 			if (control == 0) {
 				error = ENOBUFS;
 				goto bad;
 			} else {
 				cm = mtod(control, struct cmsghdr *);
 				cm->cmsg_len = control->m_len;
 				cm->cmsg_level = SOL_SOCKET;
 				cm->cmsg_type = SCM_RIGHTS;
 			}
 		}
 #endif
 	} else {
 		control = NULL;
 	}
 
 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 
 bad:
 	if (to)
 		FREE(to, M_SONAME);
 	return (error);
 }
 
 /*
  * Send the message described by `mp' on the socket behind descriptor `s'.
  *
  * mp->msg_iov/msg_iovlen describe the payload, located in the address
  * space named by `segflg'; mp->msg_name is passed to sosend() as the
  * destination address.  On success the number of bytes sent is stored in
  * td_retval[0].  A partial transfer interrupted by ERESTART, EINTR or
  * EWOULDBLOCK is reported as success.  EPIPE raises SIGPIPE unless
  * suppressed by SO_NOSIGPIPE on the socket or MSG_NOSIGNAL in `flags'.
  *
  * Ownership of `control' passes to this function: it is handed to
  * sosend(), and on every path that fails before sosend() is reached it is
  * freed here so that callers never have to (and must not) free it.
  */
 int
 kern_sendit(td, s, mp, flags, control, segflg)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	int flags;
 	struct mbuf *control;
 	enum uio_seg segflg;
 {
 	struct file *fp;
 	struct uio auio;
 	struct iovec *iov;
 	struct socket *so;
 	int i;
 	int len, error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error) {
 		/* sosend() is never reached: release the control chain. */
 		if (control != NULL)
 			m_freem(control);
 		return (error);
 	}
 	so = (struct socket *)fp->f_data;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error) {
 		if (control != NULL)
 			m_freem(control);
 		goto bad;
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	/* Sum the iovec lengths, rejecting a total that goes negative. */
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			error = EINVAL;
 			if (control != NULL)
 				m_freem(control);
 			goto bad;
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	/* From here on `control' belongs to sosend(). */
 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 	if (error) {
 		/* Some data went out before the interruption: not an error. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(s, UIO_WRITE, ktruio, error);
 	}
 #endif
 bad:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sendto(td, uap)
 	struct thread *td;
 	struct sendto_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		caddr_t	to;
 		int	tolen;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	msg.msg_name = uap->to;
 	msg.msg_namelen = uap->tolen;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = 0;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 osend(td, uap)
 	struct thread *td;
 	struct osend_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = 0;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	return (error);
 }
 
 int
 osendmsg(td, uap)
 	struct thread *td;
 	struct osendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 	msg.msg_flags = MSG_COMPAT;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sendmsg(td, uap)
 	struct thread *td;
 	struct sendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 
 /*
  * Receive a message on the socket behind descriptor `s' into the buffers
  * described by `mp'.
  *
  * The payload is always delivered through user-space iovecs
  * (UIO_USERSPACE).  `fromseg' only selects how the source address is
  * returned through mp->msg_name: copyout() for UIO_USERSPACE, bcopy()
  * otherwise.  mp->msg_namelen and mp->msg_controllen are updated to the
  * lengths actually stored, and mp->msg_flags is passed to soreceive() by
  * reference.
  *
  * controlp: if non-NULL, the control mbuf chain is returned through it on
  *	success instead of being copied out to mp->msg_control; in every
  *	other case the chain is freed here.
  *
  * On success the number of bytes received is stored in td_retval[0].  A
  * partial transfer interrupted by ERESTART, EINTR or EWOULDBLOCK is
  * reported as success.
  */
 int
 kern_recvit(td, s, mp, fromseg, controlp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	enum uio_seg fromseg;
 	struct mbuf **controlp;
 {
 	struct uio auio;
 	struct iovec *iov;
 	int i;
 	socklen_t len;
 	int error;
 	struct mbuf *m, *control = 0;
 	caddr_t ctlbuf;
 	struct file *fp;
 	struct socket *so;
 	struct sockaddr *fromsa = 0;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	if(controlp != NULL)
 		*controlp = 0;
 
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error)
 		return (error);
 	so = fp->f_data;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_receive(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error) {
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	/* Sum the iovec lengths, rejecting a total that goes negative. */
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	/* Ask for control data only if some caller-side sink exists for it. */
 	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
 	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
 	    &mp->msg_flags);
 	if (error) {
 		/* Some data arrived before the interruption: not an error. */
 		if (auio.uio_resid != (int)len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = (int)len - auio.uio_resid;
 		ktrgenio(s, UIO_READ, ktruio, error);
 	}
 #endif
 	if (error)
 		goto out;
 	td->td_retval[0] = (int)len - auio.uio_resid;
 	/* Return the source address, truncated to the caller's buffer. */
 	if (mp->msg_name) {
 		len = mp->msg_namelen;
 		if (len <= 0 || fromsa == 0)
 			len = 0;
 		else {
 			/* save sa_len before it is destroyed by MSG_COMPAT */
 			len = MIN(len, fromsa->sa_len);
 #ifdef COMPAT_OLDSOCK
 			if (mp->msg_flags & MSG_COMPAT)
 				((struct osockaddr *)fromsa)->sa_family =
 				    fromsa->sa_family;
 #endif
 			if (fromseg == UIO_USERSPACE) {
 				error = copyout(fromsa, mp->msg_name,
 				    (unsigned)len);
 				if (error)
 					goto out;
 			} else
 				bcopy(fromsa, mp->msg_name, len);
 		}
 		mp->msg_namelen = len;
 	}
 	/* Copy control data out only when it is not returned via controlp. */
 	if (mp->msg_control && controlp == NULL) {
 #ifdef COMPAT_OLDSOCK
 		/*
 		 * We assume that old recvmsg calls won't receive access
 		 * rights and other control info, esp. as control info
 		 * is always optional and those options didn't exist in 4.3.
 		 * If we receive rights, trim the cmsghdr; anything else
 		 * is tossed.
 		 */
 		if (control && mp->msg_flags & MSG_COMPAT) {
 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
 			    SOL_SOCKET ||
 			    mtod(control, struct cmsghdr *)->cmsg_type !=
 			    SCM_RIGHTS) {
 				mp->msg_controllen = 0;
 				goto out;
 			}
 			control->m_len -= sizeof (struct cmsghdr);
 			control->m_data += sizeof (struct cmsghdr);
 		}
 #endif
 		len = mp->msg_controllen;
 		m = control;
 		mp->msg_controllen = 0;
 		ctlbuf = mp->msg_control;
 
 		/* Walk the chain, flagging MSG_CTRUNC if the buffer is short. */
 		while (m && len > 0) {
 			unsigned int tocopy;
 
 			if (len >= m->m_len)
 				tocopy = m->m_len;
 			else {
 				mp->msg_flags |= MSG_CTRUNC;
 				tocopy = len;
 			}
 
 			if ((error = copyout(mtod(m, caddr_t),
 					ctlbuf, tocopy)) != 0)
 				goto out;
 
 			ctlbuf += tocopy;
 			len -= tocopy;
 			m = m->m_next;
 		}
 		/* Report how many control bytes were actually stored. */
 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
 	}
 out:
 	fdrop(fp, td);
 	if (fromsa)
 		FREE(fromsa, M_SONAME);
 
 	/* Hand the control chain to the caller on success, else free it. */
 	if (error == 0 && controlp != NULL)  
 		*controlp = control;
 	else  if (control)
 		m_freem(control);
 
 	return (error);
 }
 
 /*
  * Common user-space front end for the receive family of system calls:
  * run kern_recvit() with a user-space address buffer and, if `namelenp'
  * is non-NULL, copy the resulting address length out to it.  A failure of
  * that copyout is ignored for MSG_COMPAT callers.
  */
 static int
 recvit(td, s, mp, namelenp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	void *namelenp;
 {
 	int error;
 
 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
 	if (error)
 		return (error);
 	if (namelenp) {
 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags & MSG_COMPAT)
 			error = 0;	/* old recvfrom didn't check */
 #endif
 	}
 	return (error);
 }
 
 int
 recvfrom(td, uap)
 	struct thread *td;
 	struct recvfrom_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		struct sockaddr * __restrict	from;
 		socklen_t * __restrict fromlenaddr;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
 		if (error)
 			goto done2;
 	} else {
 		msg.msg_namelen = 0;
 	}
 	msg.msg_name = uap->from;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
 done2:
 	return(error);
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old recvfrom(): force MSG_COMPAT into the caller's flags and reuse
  * recvfrom().
  */
 int
 orecvfrom(td, uap)
 	struct thread *td;
 	struct recvfrom_args *uap;
 {
 
 	uap->flags |= MSG_COMPAT;
 	return (recvfrom(td, uap));
 }
 #endif
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old recv(): wrap the single user buffer in a msghdr with one iovec, no
  * address buffer and no control data, then call recvit().
  */
 int
 orecv(td, uap)
 	struct thread *td;
 	struct orecv_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct iovec aiov;
 	struct msghdr msg;
 
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_name = NULL;
 	msg.msg_namelen = 0;
 	msg.msg_control = NULL;
 	msg.msg_flags = uap->flags;
 	return (recvit(td, uap->s, &msg, NULL));
 }
 
 /*
  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
  * overlays the new one, missing only the flags, and with the (old) access
  * rights where the control fields are now.
  */
 int
 orecvmsg(td, uap)
 	struct thread *td;
 	struct orecvmsg_args /* {
 		int	s;
 		struct	omsghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags | MSG_COMPAT;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
 	if (msg.msg_controllen && error == 0)
 		error = copyout(&msg.msg_controllen,
 		    &uap->msg->msg_accrightslen, sizeof (int));
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 recvmsg(td, uap)
 	struct thread *td;
 	struct recvmsg_args /* {
 		int	s;
 		struct	msghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *uiov, *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags &= ~MSG_COMPAT;
 #endif
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, NULL);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 		error = copyout(&msg, uap->msg, sizeof(msg));
 	}
 	free(iov, M_IOV);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 shutdown(td, uap)
 	struct thread *td;
 	struct shutdown_args /* {
 		int	s;
 		int	how;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, uap->how);
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * setsockopt(2): forward to kern_setsockopt() with the option value
  * located in user space (UIO_USERSPACE).
  */
 /* ARGSUSED */
 int
 setsockopt(td, uap)
 	struct thread *td;
 	struct setsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		caddr_t	val;
 		int	valsize;
 	} */ *uap;
 {
 
 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, uap->valsize));
 }
 
 int
 kern_setsockopt(td, s, level, name, val, valseg, valsize)
 	struct thread *td;
 	int s;
 	int level;
 	int name;
 	void *val;
 	enum uio_seg valseg;
 	socklen_t valsize;
 {
 	int error;
 	struct socket *so;
 	struct file *fp;
 	struct sockopt sopt;
 
 	if (val == NULL && valsize != 0)
 		return (EFAULT);
 	if ((int)valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = valsize;
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_setsockopt called with bad valseg");
 	}
 
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
 		fdrop(fp, td);
 	}
 	return(error);
 }
 
 /* ARGSUSED */
 int
 getsockopt(td, uap)
 	struct thread *td;
 	struct getsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		void * __restrict	val;
 		socklen_t * __restrict avalsize;
 	} */ *uap;
 {
 	socklen_t valsize;
 	int	error;
 
 	if (uap->val) {
 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
 		if (error)
 			return (error);
 	}
 
 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, &valsize);
 
 	if (error == 0)
 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
 	return (error);
 }
 
 /*
  * Kernel version of getsockopt.
  * optval can be a userland or userspace. optlen is always a kernel pointer.
  */
 int
 kern_getsockopt(td, s, level, name, val, valseg, valsize)
 	struct thread *td;
 	int s;
 	int level;
 	int name;
 	void *val;
 	enum uio_seg valseg;
 	socklen_t *valsize;
 {
 	int error;
 	struct  socket *so;
 	struct file *fp;
 	struct	sockopt sopt;
 
 	if (val == NULL)
 		*valsize = 0;
 	if ((int)*valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_GET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_getsockopt called with bad valseg");
 	}
 
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
 		*valsize = sopt.sopt_valsize;
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * getsockname1() - Get socket name.
  */
 /* ARGSUSED */
 static int
 getsockname1(td, uap, compat)
 	struct thread *td;
 	struct getsockname_args /* {
 		int	fdes;
 		struct sockaddr * __restrict asa;
 		socklen_t * __restrict alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof(len));
 	if (error)
 		return (error);
 
 	error = kern_getsockname(td, uap->fdes, &sa, &len);
 	if (error)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 /*
  * Fetch the local address of the socket behind descriptor `fd'.
  *
  * On success *sa holds the address returned by the protocol's
  * pru_sockaddr method (possibly NULL) and *alen is reduced to
  * MIN(*alen, sa_len), or 0 when no address was returned.  Ownership of
  * *sa passes to the caller.  On a protocol error any address already
  * stored in *sa is freed and *sa is reset to NULL.
  */
 int
 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	/* NOTE(review): only effective if socklen_t is signed. */
 	if (*alen < 0)
 		return (EINVAL);
 
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		return (error);
 	so = fp->f_data;
 	*sa = NULL;
 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
 	if (error)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 bad:
 	fdrop(fp, td);
 	if (error && *sa) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 	return (error);
 }
 
 /*
  * getsockname system call entry point: forwards to getsockname1() with
  * the compat flag cleared.
  */
 int
 getsockname(td, uap)
 	struct thread *td;
 	struct getsockname_args *uap;
 {
 	int error;
 
 	error = getsockname1(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old-sockaddr flavour of getsockname: forwards to getsockname1() with
  * the compat flag set.
  */
 int
 ogetsockname(td, uap)
 	struct thread *td;
 	struct getsockname_args *uap;
 {
 	int error;
 
 	error = getsockname1(td, uap, 1);
 	return (error);
 }
 #endif /* COMPAT_OLDSOCK */
 
 /*
  * getpeername1() - Get name of peer for connected socket.
  */
 /* ARGSUSED */
 static int
 getpeername1(td, uap, compat)
 	struct thread *td;
 	struct getpeername_args /* {
 		int	fdes;
 		struct sockaddr * __restrict	asa;
 		socklen_t * __restrict	alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof (len));
 	if (error)
 		return (error);
 
 	error = kern_getpeername(td, uap->fdes, &sa, &len);
 	if (error)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 /*
  * Fetch the peer address of the connected socket behind descriptor 'fd'.
  *
  * Fails with ENOTCONN unless the socket has SS_ISCONNECTED or
  * SS_ISCONFIRMING set.  On success *sa holds whatever the protocol's
  * pru_peeraddr method returned (possibly NULL) and *alen is clamped to the
  * smaller of its incoming value and that address's sa_len (0 when *sa is
  * NULL).  A non-NULL *sa is released by the caller with
  * free(*sa, M_SONAME).
  * On failure an errno value is returned and *sa is always NULL.
  */
 int
 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	/*
 	 * Initialise the out pointer before any early exit (including the
 	 * ENOTCONN path below) so callers never see an indeterminate value.
 	 */
 	*sa = NULL;
 
 	/*
 	 * NOTE(review): if socklen_t is an unsigned type this test can
 	 * never be true -- confirm and drop if so.
 	 */
 	if (*alen < 0)
 		return (EINVAL);
 
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		return (error);
 	so = fp->f_data;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 		error = ENOTCONN;
 		goto done;
 	}
 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 	if (error)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 bad:
 	/* Do not leak an address the protocol allocated before failing. */
 	if (error && *sa) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 done:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * getpeername system call entry point: forwards to getpeername1() with
  * the compat flag cleared.
  */
 int
 getpeername(td, uap)
 	struct thread *td;
 	struct getpeername_args *uap;
 {
 	int error;
 
 	error = getpeername1(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old-sockaddr flavour of getpeername: forwards to getpeername1() with
  * the compat flag set.
  */
 int
 ogetpeername(td, uap)
 	struct thread *td;
 	struct ogetpeername_args *uap;
 {
 	struct getpeername_args *args;
 
 	/* XXX uap should have type `getpeername_args *' to begin with. */
 	args = (struct getpeername_args *)uap;
 	return (getpeername1(td, args, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 int
 sockargs(mp, buf, buflen, type)
 	struct mbuf **mp;
 	caddr_t buf;
 	int buflen, type;
 {
 	struct sockaddr *sa;
 	struct mbuf *m;
 	int error;
 
 	if ((u_int)buflen > MLEN) {
 #ifdef COMPAT_OLDSOCK
 		if (type == MT_SONAME && (u_int)buflen <= 112)
 			buflen = MLEN;		/* unix domain compat. hack */
 		else
 #endif
 			if ((u_int)buflen > MCLBYTES)
 				return (EINVAL);
 	}
 	m = m_get(M_TRYWAIT, type);
 	if (m == NULL)
 		return (ENOBUFS);
 	if ((u_int)buflen > MLEN) {
 		MCLGET(m, M_TRYWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 			return (ENOBUFS);
 		}
 	}
 	m->m_len = buflen;
 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
 	if (error)
 		(void) m_free(m);
 	else {
 		*mp = m;
 		if (type == MT_SONAME) {
 			sa = mtod(m, struct sockaddr *);
 
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 				sa->sa_family = sa->sa_len;
 #endif
 			sa->sa_len = buflen;
 		}
 	}
 	return (error);
 }
 
 int
 getsockaddr(namp, uaddr, len)
 	struct sockaddr **namp;
 	caddr_t uaddr;
 	size_t len;
 {
 	struct sockaddr *sa;
 	int error;
 
 	if (len > SOCK_MAXADDRLEN)
 		return (ENAMETOOLONG);
 	if (len < offsetof(struct sockaddr, sa_data[0]))
 		return (EINVAL);
 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
 	error = copyin(uaddr, sa, len);
 	if (error) {
 		FREE(sa, M_SONAME);
 	} else {
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 			sa->sa_family = sa->sa_len;
 #endif
 		sa->sa_len = len;
 		*namp = sa;
 	}
 	return (error);
 }
 
 /*
  * External-storage free routine for sendfile mbufs: release the sf_buf
  * passed in 'args', unwire the page it mapped and, if nobody else owns
  * that page any more, free it.  'addr' is unused.
  */
 void
 sf_buf_mext(void *addr, void *args)
 {
 	vm_page_t pg;
 
 	/* Remember the page before the sf_buf that names it is released. */
 	pg = sf_buf_page(args);
 	sf_buf_free(args);
 
 	vm_page_lock_queues();
 	vm_page_unwire(pg, 0);
 	/*
 	 * No reference is held on the backing object, so it may have
 	 * gone away in the meantime.  In that case the page is ours
 	 * to free.
 	 */
 	if (pg->object == NULL && pg->wire_count == 0)
 		vm_page_free(pg);
 	vm_page_unlock_queues();
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Transmit the file referenced by 'fd', beginning at 'offset', over the
  * socket 's'.  'nbytes' limits the amount of file data; 0 means "until
  * EOF".  An optional header and/or trailer may be supplied through 'hdtr',
  * and when 'sbytes' is given the total number of bytes sent is stored
  * there.
  *
  * This entry point simply calls do_sendfile() with compat == 0.
  */
 int
 sendfile(struct thread *td, struct sendfile_args *uap)
 {
 	int error;
 
 	error = do_sendfile(td, uap, 0);
 	return (error);
 }
 
 static int
 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	int error;
 
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
 			if (error)
 				goto out;
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
 			if (error)
 				goto out;
 
 		}
 	}
 
 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
 out:
 	if (hdr_uio)
 		free(hdr_uio, M_IOV);
 	if (trl_uio)
 		free(trl_uio, M_IOV);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible sendfile entry point: repack the arguments into a
  * struct sendfile_args and call do_sendfile() with compat == 1.
  */
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args native;
 	int error;
 
 	/* Descriptors. */
 	native.fd = uap->fd;
 	native.s = uap->s;
 	/* Extent of the transfer. */
 	native.offset = uap->offset;
 	native.nbytes = uap->nbytes;
 	/* User pointers and flags. */
 	native.hdtr = uap->hdtr;
 	native.sbytes = uap->sbytes;
 	native.flags = uap->flags;
 
 	error = do_sendfile(td, &native, 1);
 	return (error);
 }
 #endif /* COMPAT_FREEBSD4 */
 
 /*
  * Kernel implementation of sendfile.
  *
  * Sends data from the file uap->fd, starting at uap->offset, over the
  * stream socket uap->s.  uap->nbytes limits the file data sent; 0 means
  * up to EOF.  'hdr_uio', if non-NULL, is converted to mbufs and sent in
  * front of the file data; 'trl_uio', if non-NULL, is written after it with
  * kern_writev().  A non-zero 'compat' subtracts the header size from
  * uap->nbytes first.  The total number of bytes sent is copied out to
  * uap->sbytes when that pointer is non-NULL, on success and on error.
  *
  * Returns 0 or an errno value; ERESTART is reported as EINTR.
  */
 int
 kern_sendfile(struct thread *td, struct sendfile_args *uap,
     struct uio *hdr_uio, struct uio *trl_uio, int compat)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj = NULL;
 	struct socket *so = NULL;
 	struct mbuf *m = NULL;
 	struct sf_buf *sf;
 	struct vm_page *pg;
 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
 	int error, hdrlen = 0, mnw = 0;
 	int vfslocked;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
 		goto out;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	obj = vp->v_object;
 	if (obj != NULL) {
 		/*
 		 * Temporarily increase the backing VM object's reference
 		 * count so that a forced reclamation of its vnode does not
 		 * immediately destroy it.
 		 */
 		VM_OBJECT_LOCK(obj);
 		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_reference_locked(obj);
 			VM_OBJECT_UNLOCK(obj);
 		} else {
 			VM_OBJECT_UNLOCK(obj);
 			obj = NULL;
 		}
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (obj == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	if (uap->offset < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 * Remember if it is a blocking or non-blocking socket.
 	 */
 	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
 	    NULL)) != 0)
 		goto out;
 	so = sock_fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		error = ENOTCONN;
 		goto out;
 	}
 	/*
 	 * Do not wait on memory allocations but return ENOMEM for
 	 * caller to retry later.
 	 * XXX: Experimental.
 	 */
 	if (uap->flags & SF_MNOWAIT)
 		mnw = 1;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto out;
 #endif
 
 	/* If headers are specified copy them into mbufs. */
 	if (hdr_uio != NULL) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
 		if (hdr_uio->uio_resid > 0) {
 			/*
 			 * In FBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
 			    0, 0, 0);
 			if (m == NULL) {
 				error = mnw ? EAGAIN : ENOBUFS;
 				goto out;
 			}
 			hdrlen = m_length(m, NULL);
 		}
 	}
 
 	/* Protect against multiple writers to the socket. */
 	(void) sblock(&so->so_snd, M_WAITOK);
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = uap->offset, rem = uap->nbytes; ; ) {
 		int loopbytes = 0;
 		int space = 0;
 		int done = 0;
 
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(&so->so_snd);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * Reduce space in the socket buffer by the size of
 		 * the header mbuf chain.
 		 * hdrlen is set to 0 after the first loop.
 		 */
 		space -= hdrlen;
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		while(space > loopbytes) {
 			vm_pindex_t pindex;
 			vm_offset_t pgoff;
 			struct mbuf *m0;
 
 			VM_OBJECT_LOCK(obj);
 			/*
 			 * Calculate the amount to transfer.
 			 * Not to exceed a page, the EOF,
 			 * or the passed in nbytes.
 			 */
 			pgoff = (vm_offset_t)(off & PAGE_MASK);
 			xfsize = omin(PAGE_SIZE - pgoff,
 			    obj->un_pager.vnp.vnp_size - uap->offset -
 			    fsbytes - loopbytes);
 			if (uap->nbytes)
 				rem = (uap->nbytes - fsbytes - loopbytes);
 			else
 				rem = obj->un_pager.vnp.vnp_size -
 				    uap->offset - fsbytes - loopbytes;
 			xfsize = omin(rem, xfsize);
 			if (xfsize <= 0) {
 				VM_OBJECT_UNLOCK(obj);
 				done = 1;		/* all data sent */
 				break;
 			}
 			/*
 			 * Don't overflow the send buffer.
 			 * Stop here and send out what we've
 			 * already got.
 			 */
 			if (space < loopbytes + xfsize) {
 				VM_OBJECT_UNLOCK(obj);
 				break;
 			}
 
 			/*
 			 * Attempt to look up the page.  Allocate
 			 * if not found or wait and loop if busy.
 			 */
 			pindex = OFF_TO_IDX(off);
 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
 			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
 
 			/*
 			 * Check if page is valid for what we need,
 			 * otherwise initiate I/O.
 			 * If we already turned some pages into mbufs,
 			 * send them off before we come here again and
 			 * block.
 			 */
 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
 				VM_OBJECT_UNLOCK(obj);
 			else if (m != NULL)
 				error = EAGAIN;	/* send what we already got */
 			else if (uap->flags & SF_NODISKIO)
 				error = EBUSY;
 			else {
 				int bsize, resid;
 
 				/*
 				 * Ensure that our page is still around
 				 * when the I/O completes.
 				 */
 				vm_page_io_start(pg);
 				VM_OBJECT_UNLOCK(obj);
 
 				/*
 				 * Get the page from backing store.
 				 */
 				bsize = vp->v_mount->mnt_stat.f_iosize;
 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-				vn_lock(vp, LK_SHARED | LK_RETRY, td);
+				vn_lock(vp, LK_SHARED | LK_RETRY);
 
 				/*
 				 * XXXMAC: Because we don't have fp->f_cred
 				 * here, we pass in NOCRED.  This is probably
 				 * wrong, but is consistent with our original
 				 * implementation.
 				 */
 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
 				    td->td_ucred, NOCRED, &resid, td);
 				VOP_UNLOCK(vp, 0, td);
 				VFS_UNLOCK_GIANT(vfslocked);
 				VM_OBJECT_LOCK(obj);
 				vm_page_io_finish(pg);
 				/*
 				 * On error the object stays locked; the
 				 * error path below unlocks it.
 				 */
 				if (!error)
 					VM_OBJECT_UNLOCK(obj);
 				mbstat.sf_iocnt++;
 			}
 			if (error) {
 				vm_page_lock_queues();
 				vm_page_unwire(pg, 0);
 				/*
 				 * See if anyone else might know about
 				 * this page.  If not and it is not valid,
 				 * then free it.
 				 */
 				if (pg->wire_count == 0 && pg->valid == 0 &&
 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
 				    pg->hold_count == 0) {
 					vm_page_free(pg);
 				}
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(obj);
 				if (error == EAGAIN)
 					error = 0;	/* not a real error */
 				break;
 			}
 
 			/*
 			 * Get a sendfile buf.  We usually wait as long
 			 * as necessary, but this wait can be interrupted.
 			 */
 			if ((sf = sf_buf_alloc(pg,
 			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
 				mbstat.sf_allocfail++;
 				vm_page_lock_queues();
 				vm_page_unwire(pg, 0);
 				/*
 				 * XXX: Not same check as above!?
 				 */
 				if (pg->wire_count == 0 && pg->object == NULL)
 					vm_page_free(pg);
 				vm_page_unlock_queues();
 				error = (mnw ? EAGAIN : EINTR);
 				break;
 			}
 
 			/*
 			 * Get an mbuf and set it up as having
 			 * external storage.
 			 */
 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 			if (m0 == NULL) {
 				error = (mnw ? EAGAIN : ENOBUFS);
 				sf_buf_mext((void *)sf_buf_kva(sf), sf);
 				break;
 			}
 			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
 			    sf, M_RDONLY, EXT_SFBUF);
 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 			m0->m_len = xfsize;
 
 			/* Append to mbuf chain. */
 			if (m != NULL)
 				m_cat(m, m0);
 			else
 				m = m0;
 
 			/* Keep track of bits processed. */
 			loopbytes += xfsize;
 			off += xfsize;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		if (m != NULL) {
 			int mlen, err;
 
 			mlen = m_length(m, NULL);
 			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 				error = EPIPE;
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 			/* Avoid error aliasing. */
 			err = (*so->so_proto->pr_usrreqs->pru_send)
 				    (so, 0, m, NULL, NULL, td);
 			if (err == 0) {
 				/*
 				 * We need two counters to get the
 				 * file offset and nbytes to send
 				 * right:
 				 * - sbytes contains the total amount
 				 *   of bytes sent, including headers.
 				 * - fsbytes contains the total amount
 				 *   of bytes sent from the file.
 				 */
 				sbytes += mlen;
 				fsbytes += mlen;
 				if (hdrlen) {
 					fsbytes -= hdrlen;
 					hdrlen = 0;
 				}
 			} else if (error == 0)
 				error = err;
 			m = NULL;	/* pru_send always consumes */
 		}
 
 		/*
 		 * Quit the outer loop on error or when we're done.  An
 		 * error skips the trailers; normal completion must fall
 		 * out of the loop so that the trailer code below is
 		 * actually reached.
 		 */
 		if (error)
 			goto done;
 		if (done)
 			break;
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 * The send buffer lock taken with sblock() above is released
 	 * first and the 'done' label is bypassed so it is not released
 	 * twice.
 	 * NOTE(review): this assumes the socket write path behind
 	 * kern_writev() takes the send buffer lock itself -- confirm.
 	 */
 	if (trl_uio != NULL) {
 		sbunlock(&so->so_snd);
 		error = kern_writev(td, uap->s, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	sbunlock(&so->so_snd);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (uap->sbytes != NULL) {
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 /*
  * SCTP syscalls.
  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
  * otherwise all return EOPNOTSUPP.
  * XXX: We should make this loadable one day.
  */
 
 /*
  * sctp_peeloff() - branch the association uap->name off the socket
  * uap->sd onto a new socket and descriptor.
  *
  * On success the new descriptor number is stored in td->td_retval[0] and
  * 0 is returned.  On failure an errno value is returned and the new
  * descriptor, if one had been allocated, is closed again.
  */
 int
 sctp_peeloff(td, uap)
 	struct thread *td;
 	struct sctp_peeloff_args /* {
 		int	sd;
 		caddr_t	name;
 	} */ *uap;
 {
 #ifdef SCTP
 	struct filedesc *fdp;
 	struct file *nfp = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 
 	fdp = td->td_proc->p_fd;
 	error = fgetsock(td, uap->sd, &head, &fflag);
 	if (error)
 		goto done2;
 	/*
 	 * From here on 'head' is referenced, so every exit has to pass
 	 * through the fputsock() at 'done'.
 	 */
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error)
 		goto done;
 	/*
 	 * At this point we know we do have a assoc to pull
 	 * we proceed to get the fd setup. This may block
 	 * but that is ok.
 	 */
 
 	error = falloc(td, &nfp, &fd);
 	if (error)
 		goto done;
 	td->td_retval[0] = fd;
 
 	so = sonewconn(head, SS_ISCONNECTED);
 	if (so == NULL) {
 		/*
 		 * Report the failure; without this 'error' would still
 		 * be 0 and the caller would be handed a descriptor that
 		 * has no socket behind it.
 		 */
 		error = ENOMEM;
 		goto noconnection;
 	}
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
 	SOCK_LOCK(so);
 	soref(so);			/* file descriptor reference */
 	SOCK_UNLOCK(so);
 
 	ACCEPT_LOCK();
 
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	so->so_state |= (head->so_state & SS_NBIO);
 	so->so_state &= ~SS_NOFDREF;
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 	ACCEPT_UNLOCK();
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
 	if (error)
 		goto noconnection;
 	if (head->so_sigio != NULL)
 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
 
 noconnection:
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error)
 		fdclose(fdp, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.
 	 */
 done:
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fputsock(head);
 done2:
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_sendmsg() - send one user buffer on an SCTP socket.
  *
  * Sends uap->mlen bytes from uap->msg on socket uap->sd through
  * sctp_lower_sosend(), optionally to the address uap->to (uap->tolen
  * bytes) and with the send parameters in uap->sinfo.  On success the
  * number of bytes consumed is stored in td->td_retval[0].  A partial
  * transfer interrupted by ERESTART, EINTR or EWOULDBLOCK is reported as
  * success.  EPIPE raises SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is
  * set.  Returns EOPNOTSUPP when SCTP is not compiled in.
  */
 int
 sctp_generic_sendmsg (td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_args /* {
 		int sd, 
 		caddr_t msg, 
 		int mlen, 
 		caddr_t to, 
 		__socklen_t tolen, 
 		struct sctp_sndrcvinfo *sinfo, 
 		int flags
 	} */ *uap;
 {
 #ifdef SCTP
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	int use_rcvinfo = 1;
 	int error = 0, len;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec iov[1];
 
 	if (uap->sinfo) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	if (uap->tolen) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 	}
 
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error)
 		goto sctp_bad;
 
 	iov[0].iov_base = uap->msg;
 	iov[0].iov_len = uap->mlen;
 
 	so = (struct socket *)fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	len = auio.uio_resid = uap->mlen;
 #ifdef KTRACE
 	/*
 	 * Snapshot the request before the send consumes it; otherwise
 	 * ktruio stays NULL and the ktrgenio() call below never runs.
 	 */
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	error = sctp_lower_sosend(so, to, &auio,
 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
 		    uap->flags, use_rcvinfo, u_sinfo, td);
 	if (error) {
 		/* Some data went out before the interruption: not an error. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket. */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	if (fp)
 		fdrop(fp, td);
 sctp_bad2:
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_sendmsg_iov() - scatter/gather variant of
  * sctp_generic_sendmsg().
  *
  * Copies in the uap->iovlen element iovec array uap->iov and sends the
  * data it describes on socket uap->sd through sctp_lower_sosend(),
  * optionally to the address uap->to (uap->tolen bytes) and with the send
  * parameters in uap->sinfo.  A total length that goes negative while
  * summing is rejected with EINVAL.  On success the number of bytes
  * consumed is stored in td->td_retval[0].  A partial transfer interrupted
  * by ERESTART, EINTR or EWOULDBLOCK is reported as success.  EPIPE raises
  * SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is set.  Returns EOPNOTSUPP
  * when SCTP is not compiled in.
  */
 int
 sctp_generic_sendmsg_iov(td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_iov_args /* {
 		int sd, 
 		struct iovec *iov, 
 		int iovlen, 
 		caddr_t to, 
 		__socklen_t tolen, 
 		struct sctp_sndrcvinfo *sinfo, 
 		int flags
 	} */ *uap;
 {
 #ifdef SCTP
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	int use_rcvinfo = 1;
 	int error=0, len, i;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec *iov, *tiov;
 
 	if (uap->sinfo) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	if (uap->tolen) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 	}
 
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error)
 		goto sctp_bad1;
 
 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error)
 		goto sctp_bad1;
 
 	so = (struct socket *)fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	/* Sum the segment lengths, rejecting a total that goes negative. */
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto sctp_bad;
 		}
 	}
 	len = auio.uio_resid;
 #ifdef KTRACE
 	/*
 	 * Snapshot the request before the send consumes it; otherwise
 	 * ktruio stays NULL and the ktrgenio() call below never runs.
 	 */
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	error = sctp_lower_sosend(so, to, &auio,
 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
 		    uap->flags, use_rcvinfo, u_sinfo, td);
 	if (error) {
 		/* Some data went out before the interruption: not an error. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	free(iov, M_IOV);
 sctp_bad1:
 	if (fp)
 		fdrop(fp, td);
 sctp_bad2:
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_recvmsg() - receive a message on an SCTP socket.
  *
  * Copies in the uap->iovlen element iovec array uap->iov and receives
  * into it from socket uap->sd through sctp_sorecvmsg().  On success the
  * number of bytes received is stored in td->td_retval[0]; the receive
  * info is copied out to uap->sinfo, the source address to uap->from (its
  * length to uap->fromlenaddr) and the message flags to uap->msg_flags,
  * each only when the corresponding pointer was supplied.  A partial
  * transfer interrupted by ERESTART, EINTR or EWOULDBLOCK is reported as
  * success.  Returns EOPNOTSUPP when SCTP is not compiled in.
  */
 int
 sctp_generic_recvmsg(td, uap)
 	struct thread *td;
 	struct sctp_generic_recvmsg_args /* {
 		int sd, 
 		struct iovec *iov, 
 		int iovlen,
 		struct sockaddr *from, 
 		__socklen_t *fromlenaddr,
 		struct sctp_sndrcvinfo *sinfo, 
 		int *msg_flags
 	} */ *uap;
 {
 #ifdef SCTP
 	u_int8_t sockbufstore[256];
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	struct sctp_sndrcvinfo sinfo;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *fromsa;
 	int fromlen;
 	int len, i, msg_flags;
 	int error = 0;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error) {
 		return (error);
 	}
 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error) {
 		goto out1;
 	}
 
 	so = fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_socket_check_receive(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error) {
 		goto out;
 	}
 #endif /* MAC */
 
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &fromlen, sizeof (fromlen));
 		if (error) {
 			goto out;
 		}
 	} else {
 		fromlen = 0;
 	}
 	if(uap->msg_flags) {
 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
 		if (error) {
 			goto out;
 		}
 	} else {
 		msg_flags = 0;
 	}
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	/* Sum the segment lengths, rejecting a total that goes negative. */
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto out;
 		}
 	}
 	len = auio.uio_resid;
 	/* The source address is received into the on-stack buffer. */
 	fromsa = (struct sockaddr *)sockbufstore;
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
 		    fromsa, fromlen, &msg_flags,
 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
 	if (error) {
 		/* Some data arrived before the interruption: not an error. */
 		if (auio.uio_resid != (int)len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	} else {
 		if (uap->sinfo)
 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = (int)len - auio.uio_resid;
 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
 	}
 #endif /* KTRACE */
 	if (error)
 		goto out;
 	td->td_retval[0] = (int)len - auio.uio_resid;
 
 	if (fromlen && uap->from) {
 		/* 'len' is reused from here on for the address length. */
 		len = fromlen;
 		if (len <= 0 || fromsa == 0)
 			len = 0;
 		else {
 			len = MIN(len, fromsa->sa_len);
 			error = copyout(fromsa, uap->from, (unsigned)len);
 			if (error)
 				goto out;
 		}
 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
 		if (error) {
 			goto out;
 		}
 	}
 	if (uap->msg_flags) {
 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
 		if (error) {
 			goto out;
 		}
 	}
 out:
 	free(iov, M_IOV);
 out1:
 	if (fp) 
 		fdrop(fp, td);
 
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
Index: head/sys/kern/vfs_acl.c
===================================================================
--- head/sys/kern/vfs_acl.c	(revision 175201)
+++ head/sys/kern/vfs_acl.c	(revision 175202)
@@ -1,431 +1,431 @@
 /*-
  * Copyright (c) 1999-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert Watson for the TrustedBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*
  * Developed by the TrustedBSD Project.
  *
  * ACL system calls and other functions common across different ACL types.
  * Type-specific routines go into subr_acl_<type>.c.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/acl.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 uma_zone_t	acl_zone;
 static int	vacl_set_acl(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 static int	vacl_get_acl(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 static int	vacl_aclcheck(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 
 /*
  * These calls wrap the real vnode operations, and are called by the syscall
  * code once the syscall has converted the path or file descriptor to a vnode
  * (unlocked).  The aclp pointer is assumed still to point to userland, so
  * this should not be consumed within the kernel except by syscall code.
  * Other code should directly invoke VOP_{SET,GET}ACL.
  */
 
 /*
  * Given a vnode, set its ACL.
  */
 static int
 vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernacl;
 	struct mount *mp;
 	int error;
 
 	error = copyin(aclp, &inkernacl, sizeof(struct acl));
 	if (error)
 		return(error);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_setacl(td->td_ucred, vp, type, &inkernacl);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return(error);
 }
 
 /*
  * Given a vnode, get its ACL.
  */
 static int
 vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernelacl;
 	int error;
 
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_getacl(td->td_ucred, vp, type);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	if (error == 0)
 		error = copyout(&inkernelacl, aclp, sizeof(struct acl));
 	return (error);
 }
 
 /*
  * Given a vnode, delete its ACL.
  */
 static int
 vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
 {
 	struct mount *mp;
 	int error;
 
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
 	if (error)
 		goto out;
 #endif
 	error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Given a vnode, check whether an ACL is appropriate for it
  */
 static int
 vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernelacl;
 	int error;
 
 	error = copyin(aclp, &inkernelacl, sizeof(struct acl));
 	if (error)
 		return(error);
 	error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
 	return (error);
 }
 
 /*
  * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.  Don't
  * need to lock, as the vacl_ code will get/release any locks required.
  */
 
 /*
  * Given a file path, get an ACL for it
  */
 int
 __acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, get an ACL for it; don't follow links.
  */
 int
 __acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, set an ACL for it.
  */
 int
 __acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, set an ACL for it; don't follow links.
  */
 int
 __acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file descriptor, get an ACL for it.
  */
 int
 __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
  * Given a file descriptor, set an ACL for it.
  */
 int
 __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it.
  */
 int
 __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_delete(td, nd.ni_vp, uap->type);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it; don't follow links.
  */
 int
 __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_delete(td, nd.ni_vp, uap->type);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it.
  */
 int
 __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_delete(td, fp->f_vnode, uap->type);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
  * Given a file path, check an ACL for it.
  */
 int
 __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
 {
 	struct nameidata	nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, check an ACL for it; don't follow links.
  */
 int
 __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
 {
 	struct nameidata	nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file descriptor, check an ACL for it.
  */
 int
 __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /* ARGUSED */
 
 static void
 aclinit(void *dummy __unused)
 {
 
 	acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c	(revision 175201)
+++ head/sys/kern/vfs_aio.c	(revision 175202)
@@ -1,2323 +1,2323 @@
 /*-
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/protosw.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 #include "opt_vfs_aio.h"
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #define JOBST_NULL		0
 #define JOBST_JOBQSOCK		1
 #define JOBST_JOBQGLOBAL	2
 #define JOBST_JOBRUNNING	3
 #define JOBST_JOBFINISHED	4
 #define JOBST_JOBQBUF		5
 #define JOBST_JOBQSYNC		6
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_AIO_PROCS
 #define MAX_AIO_PROCS		32
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef TARGET_AIO_PROCS
 #define TARGET_AIO_PROCS	4
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 #ifndef AIOD_TIMEOUT_DEFAULT
 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
 #endif
 
 #ifndef AIOD_LIFETIME_DEFAULT
 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
 #endif
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
 	CTLFLAG_RW, &max_aio_procs, 0,
 	"Maximum number of kernel threads to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
 	CTLFLAG_RD, &num_aio_procs, 0,
 	"Number of presently active kernel threads for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
 	0, "Preferred number of ready kernel threads for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 /* Number of async I/O thread in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_timeout;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
     "Timeout value for synchronous aio operations");
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int unloadable = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
     "Allow unload of aio (not recommended)");
 
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0, "Maximum active aio requests per process (stored in the process)");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process (stored in the process)");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process (stored in the process)");
 
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 
 /*
  * Below is a key of locks used to protect each member of struct aiocblist
  * aioliojob and kaioinfo and any backends.
  *
  * * - need not protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * Current, there is only two backends: BIO and generic file I/O.
  * socket I/O is served by generic file I/O, this is not a good idea, since
  * disk file I/O and any other types without O_NONBLOCK flag can block daemon
  * threads, if there is no thread to serve socket I/O, the socket I/O will be
  * delayed too long or starved, we should create some threads dedicated to
  * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O
  * systems we really need non-blocking interface, fiddling O_NONBLOCK in file
  * structure is not safe because there is race between userland and aio
  * daemons.
  */
 
 struct aiocblist {
 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list of for backend */
 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
 	int	jobflags;		/* (a) job flags */
 	int	jobstate;		/* (b) job state */
 	int	inputcharge;		/* (*) input blockes */
 	int	outputcharge;		/* (*) output blockes */
 	struct	buf *bp;		/* (*) private to BIO backend,
 				  	 * buffer pointer
 					 */
 	struct	proc *userproc;		/* (*) user process */
 	struct  ucred *cred;		/* (*) active credential when created */
 	struct	file *fd_file;		/* (*) pointer to file structure */
 	struct	aioliojob *lio;		/* (*) optional lio job */
 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
 	struct	knlist klist;		/* (a) list of knotes */
 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
 	ksiginfo_t ksi;			/* (a) realtime signal info */
 	struct	task biotask;		/* (*) private to BIO backend */
 	uint64_t seqno;			/* (*) job number */
 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
 
 /* jobflags */
 #define AIOCBLIST_DONE		0x01
 #define AIOCBLIST_BUFDONE	0x02
 #define AIOCBLIST_RUNDOWN	0x04
 #define AIOCBLIST_CHECKSYNC	0x08
 
 /*
  * AIO process info
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aiothreadlist {
 	int aiothreadflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
 	struct thread *aiothread;		/* (*) the AIO thread */
 };
 
 /*
  * data-structure for lio signal management
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags */
 	int	lioj_count;			/* (a) listio flags */
 	int	lioj_finished_count;		/* (a) listio flags */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct  knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  */
 struct kaioinfo {
 	struct mtx	kaio_mtx;	/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_qallowed_count;	/* (*) maxiumu size of AIO queue */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
 	int	kaio_buffer_count;	/* (a) number of physio buffers */
 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
 						 *  NOT USED YET.
 						 */
 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	struct	task	kaio_task;	/* (*) task to kick aio threads */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
 
 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static struct mtx aio_sock_mtx;
 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 void		aio_init_aioinfo(struct proc *p);
 static void	aio_onceonly(void);
 static int	aio_free_entry(struct aiocblist *aiocbe);
 static void	aio_process(struct aiocblist *aiocbe);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *job,
 			struct aioliojob *lio, int type, int osigev);
 static void	aio_physwakeup(struct buf *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
 static void	biohelper(void *, int);
 static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
 #define DONE_BUF	1
 #define DONE_QUEUE	2
 static int	do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiop	async io thread data
  *	aiocb	async io jobs
  *	aiol	list io job pointer - internal to aio_suspend XXX
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
 
 /* kqueue filters for aio */
 static struct filterops aio_filtops =
 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
 static struct filterops lio_filtops =
 	{ 0, filt_lioattach, filt_liodetach, filt_lio };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_bio);
 
 /*
  * Main operations function for use as a kernel module.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		aio_onceonly();
 		break;
 	case MOD_UNLOAD:
 		error = aio_unload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 SYSCALL_MODULE_HELPER(aio_cancel);
 SYSCALL_MODULE_HELPER(aio_error);
 SYSCALL_MODULE_HELPER(aio_fsync);
 SYSCALL_MODULE_HELPER(aio_read);
 SYSCALL_MODULE_HELPER(aio_return);
 SYSCALL_MODULE_HELPER(aio_suspend);
 SYSCALL_MODULE_HELPER(aio_waitcomplete);
 SYSCALL_MODULE_HELPER(aio_write);
 SYSCALL_MODULE_HELPER(lio_listio);
 SYSCALL_MODULE_HELPER(oaio_read);
 SYSCALL_MODULE_HELPER(oaio_write);
 SYSCALL_MODULE_HELPER(olio_listio);
 
 DECLARE_MODULE(aio, aio_mod,
 	SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization
  */
 static void
 aio_onceonly(void)
 {
 
 	/* XXX: should probably just use so->callback */
 	aio_swake = &aio_swake_cb;
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	async_io_version = _POSIX_VERSION;
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 }
 
 /*
  * Callback for unload of AIO when used as a module.
  */
 static int
 aio_unload(void)
 {
 	int error;
 
 	/*
 	 * XXX: no unloads by default, it's too dangerous.
 	 * perhaps we could do it if locked out callers and then
 	 * did an aio_proc_rundown() on each process.
 	 *
 	 * jhb: aio_proc_rundown() needs to run on curproc though,
 	 * so I don't think that would fly.
 	 */
 	if (!unloadable)
 		return (EOPNOTSUPP);
 
 	error = kqueue_del_filteropts(EVFILT_AIO);
 	if (error)
 		return error;
 	error = kqueue_del_filteropts(EVFILT_LIO);
 	if (error)
 		return error;
 	async_io_version = 0;
 	aio_swake = NULL;
 	taskqueue_free(taskqueue_aiod_bio);
 	delete_unrhdr(aiod_unr);
 	uma_zdestroy(kaio_zone);
 	uma_zdestroy(aiop_zone);
 	uma_zdestroy(aiocb_zone);
 	uma_zdestroy(aiol_zone);
 	uma_zdestroy(aiolio_zone);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
 	mtx_destroy(&aio_job_mtx);
 	mtx_destroy(&aio_sock_mtx);
 	sema_destroy(&aio_newproc_sem);
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
 	ki->kaio_flags = 0;
 	ki->kaio_maxactive_count = max_aio_per_proc;
 	ki->kaio_active_count = 0;
 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
 	ki->kaio_count = 0;
 	ki->kaio_ballowed_count = max_buf_aio;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_bufqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_sockqueue);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	while (num_aio_procs < target_aio_procs)
 		aio_newproc(NULL);
 }
 
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	int ret = 0;
 
 	PROC_LOCK(p);
 	if (!KSI_ONQ(ksi)) {
 		ksi->ksi_code = SI_ASYNCIO;
 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		ret = psignal_event(p, sigev, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (ret);
 }
 
 /*
  * Free a job entry.  Wait for completion if it is currently active, but don't
  * delay forever.  If we delay, we return a flag that says that we have to
  * restart the queue scan.
  */
 static int
 aio_free_entry(struct aiocblist *aiocbe)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = aiocbe->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
 
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
 
 	lj = aiocbe->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* aiocbe is going away, we need to destroy any knotes */
 	knlist_delete(&aiocbe->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&aiocbe->ksi);
 	PROC_UNLOCK(p);
 
 	MPASS(aiocbe->bp == NULL);
 	aiocbe->jobstate = JOBST_NULL;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * an aiocblist from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete().  This does mean that it is possible for the
 	 * thread pointer at close time to differ from the thread pointer
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
 	fdrop(aiocbe->fd_file, curthread);
 	crfree(aiocbe->cred);
 	uma_zfree(aiocb_zone, aiocbe);
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 /*
  * Adapter that lets aio_proc_rundown() be invoked through a callback
  * signature carrying an extra image_params argument.  The image
  * parameters are ignored; arg and p are forwarded unchanged.
  */
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
 {
    	aio_proc_rundown(arg, p);
 }
 
 /*
  * Rundown the jobs for a given process.
  *
  * Must be called by a thread belonging to p (asserted below).  The
  * function marks the process' kaioinfo as running down, cancels every
  * request that is still waiting on a queue, waits for requests that are
  * already in flight, frees all completed requests and leftover LIO jobs,
  * and finally destroys the kaioinfo itself.  It returns immediately when
  * the process has no AIO state.  The arg parameter is not used.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct aiocblist *cbe, *cbn;
 	struct file *fp;
 	struct socket *so;
 	int remove;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	/*
 	 * With KAIO_RUNDOWN set, aio_bio_done_notify() skips signal and
 	 * kevent delivery for jobs that complete from now on.
 	 */
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
 		remove = 0;
 		mtx_lock(&aio_job_mtx);
 		/*
 		 * The job state tells which secondary queue (linked through
 		 * the "list" entry) the job currently sits on.  Jobs in any
 		 * other state are left alone and waited for below.
 		 */
 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 			TAILQ_REMOVE(&aio_jobs, cbe, list);
 			remove = 1;
 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
 			fp = cbe->fd_file;
 			MPASS(fp->f_type == DTYPE_SOCKET);
 			so = fp->f_data;
 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
 			remove = 1;
 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
 			remove = 1;
 		}
 		mtx_unlock(&aio_job_mtx);
 
 		if (remove) {
 			/* Complete the job as cancelled and move it to kaio_done. */
 			cbe->jobstate = JOBST_JOBFINISHED;
 			cbe->uaiocb._aiocb_private.status = -1;
 			cbe->uaiocb._aiocb_private.error = ECANCELED;
 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
 		}
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
 		/*
 		 * Ask aio_bio_done_notify() for a wakeup, sleep for at most
 		 * hz ticks, then rescan from the top since the queues may
 		 * have changed while the lock was released.
 		 */
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(cbe);
 
 	/*
 	 * Any LIO job still listed must have no member requests left;
 	 * tear it down, or panic if its count is non-zero.
 	 */
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	/* Let any queued kaio_task finish before the kaioinfo is freed. */
 	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
 	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  *
  * Walks the global aio_jobs queue and takes the first job whose owning
  * process is still below its kaio_maxactive_count limit.  The chosen job
  * is unlinked from aio_jobs, marked JOBST_JOBRUNNING and counted in the
  * owner's kaio_active_count.  Returns NULL when no job is eligible (the
  * TAILQ_FOREACH iterator is NULL after a complete traversal).
  *
  * The caller must hold aio_job_mtx.  The aiop argument is not used by
  * the body.
  */
 static struct aiocblist *
 aio_selectjob(struct aiothreadlist *aiop)
 {
 	struct aiocblist *aiocbe;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
 		userp = aiocbe->userproc;
 		ki = userp->p_aioinfo;
 
 		/* Skip jobs of processes already at their concurrency limit. */
 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			aiocbe->jobstate = JOBST_JOBRUNNING;
 			break;
 		}
 	}
 	return (aiocbe);
 }
 
 /*
  *  Move all data to a permanent storage device, this code
  *  simulates fsync syscall.
  *
  *  Returns 0 on success, or the error from vn_start_write() or
  *  VOP_FSYNC().  When vn_start_write() fails the vnode is never
  *  locked and only the Giant state is released.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
 	int vfslocked;
 	int error;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* Clean the pages of the backing VM object, if any, before the fsync. */
 	if (vp->v_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * The AIO processing activity.  This is the code that does the I/O request for
  * the non-physio version of the operations.  The normal vn operations are used,
  * and this code should work in all instances for every type of file, including
  * pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  *
  * The calling thread temporarily runs with the credentials stored in the
  * job (aiocbe->cred); its own td_ucred is restored on every return path.
  * The outcome is written to the kernel copy of the aiocb
  * (_aiocb_private.error and _aiocb_private.status); nothing is returned.
  */
 static void
 aio_process(struct aiocblist *aiocbe)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct aiocb *cb;
 	struct file *fp;
 	struct socket *so;
 	struct uio auio;
 	struct iovec aiov;
 	int cnt;
 	int error;
 	int oublock_st, oublock_end;
 	int inblock_st, inblock_end;
 
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = aiocbe->cred;
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
 	/* A sync request does no data transfer; status is always 0. */
 	if (cb->aio_lio_opcode == LIO_SYNC) {
 		error = 0;
 		cnt = 0;
 		if (fp->f_vnode != NULL)
 			error = aio_fsync_vnode(td, fp->f_vnode);
 		cb->_aiocb_private.error = error;
 		cb->_aiocb_private.status = 0;
 		td->td_ucred = td_savedcred;
 		return;
 	}
 
 	/* Describe the user buffer as a single-segment uio. */
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = cb->aio_offset;
 	auio.uio_resid = cb->aio_nbytes;
 	/*
 	 * NOTE(review): cnt is an int; confirm aio_nbytes can never exceed
 	 * what an int holds on this path.
 	 */
 	cnt = cb->aio_nbytes;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
 	/* Snapshot the block I/O counters so the delta can be charged to the job. */
 	inblock_st = td->td_ru.ru_inblock;
 	oublock_st = td->td_ru.ru_oublock;
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
 		/* A zero-length read succeeds without calling fo_read(). */
 		if (auio.uio_resid == 0)
 			error = 0;
 		else
 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
 	inblock_end = td->td_ru.ru_inblock;
 	oublock_end = td->td_ru.ru_oublock;
 
 	aiocbe->inputcharge = inblock_end - inblock_st;
 	aiocbe->outputcharge = oublock_end - oublock_st;
 
 	/*
 	 * Only reached when an error was returned but some bytes were
 	 * transferred: interruption-type errors are dropped so the partial
 	 * count is reported, and EPIPE on a write raises SIGPIPE in the
 	 * owning process unless the socket has SO_NOSIGPIPE set.
 	 */
 	if ((error) && (auio.uio_resid != cnt)) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
 			int sigpipe = 1;
 			if (fp->f_type == DTYPE_SOCKET) {
 				so = fp->f_data;
 				if (so->so_options & SO_NOSIGPIPE)
 					sigpipe = 0;
 			}
 			if (sigpipe) {
 				PROC_LOCK(aiocbe->userproc);
 				psignal(aiocbe->userproc, SIGPIPE);
 				PROC_UNLOCK(aiocbe->userproc);
 			}
 		}
 	}
 
 	/* status = number of bytes actually transferred. */
 	cnt -= auio.uio_resid;
 	cb->_aiocb_private.error = error;
 	cb->_aiocb_private.status = cnt;
 	td->td_ucred = td_savedcred;
 }
 
 /*
  * Record the completion of a request and deliver completion
  * notifications.
  *
  * Must be called with the kaioinfo lock of userp held.  The callers
  * visible in this file unlink the job from kaio_jobqueue first; this
  * function then appends it to kaio_done and marks it JOBST_JOBFINISHED.
  * The type argument selects which "done" flag is recorded: DONE_QUEUE
  * sets AIOCBLIST_DONE, anything else sets AIOCBLIST_BUFDONE.
  *
  * Unless the process is being run down (KAIO_RUNDOWN), the per-request
  * signal and knote are posted, and, when this request was the last
  * outstanding member of its LIO job, the LIO job's kevent and/or signal
  * are posted as well.  Independently of rundown, sync requests waiting
  * on this job are released and sleepers flagged by KAIO_WAKEUP are
  * woken.
  */
 static void
 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct aiocblist *scb, *scbn;
 	int lj_done;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = aiocbe->lio;
 	lj_done = 0;
 	if (lj) {
 		/* The LIO job is done once every member has finished. */
 		lj->lioj_finished_count++;
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	if (type == DONE_QUEUE) {
 		aiocbe->jobflags |= AIOCBLIST_DONE;
 	} else {
 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
 	aiocbe->jobstate = JOBST_JOBFINISHED;
 
 	/* No notifications are delivered while the process is run down. */
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
 
 	KNOTE_LOCKED(&aiocbe->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		/* Send the LIO signal only if requested and not yet posted. */
 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL
 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	/*
 	 * aio_aqueue() sets AIOCBLIST_CHECKSYNC on jobs that a later sync
 	 * request on the same file is waiting for.  Decrement the pending
 	 * count of each such sync request and, when it reaches zero, move
 	 * the sync request to the global job queue.
 	 */
 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
 			if (aiocbe->fd_file == scb->fd_file &&
 			    aiocbe->seqno < scb->seqno) {
 				if (--scb->pending == 0) {
 					mtx_lock(&aio_job_mtx);
 					scb->jobstate = JOBST_JOBQGLOBAL;
 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
 					aio_kick_nowait(userp);
 					mtx_unlock(&aio_job_mtx);
 				}
 			}
 		}
 	}
 	/* Wake threads sleeping on &p_aioinfo (aio_suspend, rundown). */
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process,
  * but the setup (and address space mgmt) is done in this routine.
  *
  * _id carries the unit number that aio_newproc() allocated from
  * aiod_unr; it is released with free_unr() when the daemon exits.
  *
  * The main loop runs with aio_job_mtx held except while a job is being
  * processed or an address space is being switched.  For each selected
  * job the daemon borrows the owning process' vmspace (taking a
  * reference), performs the I/O, and reports completion.  Once no job is
  * selectable it switches back to its own vmspace, puts itself on the
  * free list and sleeps.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct aiocblist *aiocbe;
 	struct aiothreadlist *aiop;
 	struct kaioinfo *ki;
 	struct proc *curcp, *mycp, *userp;
 	struct vmspace *myvm, *tmpvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Local copies of curproc (cp) and vmspace (myvm)
 	 */
 	mycp = td->td_proc;
 	myvm = mycp->p_vmspace;
 
 	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aiothread = td;
 	aiop->aiothreadflags = 0;
 
 	/* The daemon resides in its own pgrp. */
 	setsid(td, NULL);
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * curcp is the current daemon process context.
 		 * userp is the current user process context.
 		 */
 		curcp = mycp;
 
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aiothreadflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aiothreadflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
 			/* The job mutex is not held while the I/O is performed. */
 			mtx_unlock(&aio_job_mtx);
 			userp = aiocbe->userproc;
 
 			/*
 			 * Connect to process address space for user program.
 			 */
 			if (userp != curcp) {
 				/*
 				 * Save the current address space that we are
 				 * connected to.
 				 */
 				tmpvm = mycp->p_vmspace;
 
 				/*
 				 * Point to the new user address space, and
 				 * refer to it.
 				 */
 				mycp->p_vmspace = userp->p_vmspace;
 				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
 
 				/* Activate the new mapping. */
 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
 
 				/*
 				 * If the old address space wasn't the daemons
 				 * own address space, then we need to remove the
 				 * daemon's reference from the other process
 				 * that it was acting on behalf of.
 				 */
 				if (tmpvm != myvm) {
 					vmspace_free(tmpvm);
 				}
 				curcp = userp;
 			}
 
 			ki = userp->p_aioinfo;
 
 			/* Do the I/O function. */
 			aio_process(aiocbe);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 			mtx_unlock(&aio_job_mtx);
 
 			/* Move the job to the done queue and notify the owner. */
 			AIO_LOCK(ki);
 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
 			AIO_UNLOCK(ki);
 
 			mtx_lock(&aio_job_mtx);
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (curcp != mycp) {
 
 			mtx_unlock(&aio_job_mtx);
 
 			/* Get the user address space to disconnect from. */
 			tmpvm = mycp->p_vmspace;
 
 			/* Get original address space for daemon. */
 			mycp->p_vmspace = myvm;
 
 			/* Activate the daemon's address space. */
 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
 #ifdef DIAGNOSTIC
 			if (tmpvm == myvm) {
 				printf("AIOD: vmspace problem -- %d\n",
 				    mycp->p_pid);
 			}
 #endif
 			/* Remove our vmspace reference. */
 			vmspace_free(tmpvm);
 
 			curcp = mycp;
 
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected, that should be
 			 * curcp == mycp.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		/* Advertise this daemon as idle so the kick routines can wake it. */
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime)) {
 			/*
 			 * Non-zero return from msleep(): exit only if no
 			 * work is queued, this daemon is still on the free
 			 * list, and there are more daemons than the target.
 			 */
 			if (TAILQ_EMPTY(&aio_jobs)) {
 				if ((aiop->aiothreadflags & AIOP_FREE) &&
 				    (num_aio_procs > target_aio_procs)) {
 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
 					num_aio_procs--;
 					mtx_unlock(&aio_job_mtx);
 					uma_zfree(aiop_zone, aiop);
 					free_unr(aiod_unr, id);
 #ifdef DIAGNOSTIC
 					if (mycp->p_vmspace->vm_refcnt <= 1) {
 						printf("AIOD: bad vm refcnt for"
 						    " exiting daemon: %d\n",
 						    mycp->p_vmspace->vm_refcnt);
 					}
 #endif
 					kproc_exit(0);
 				}
 			}
 		}
 	}
 	/* Not reached: the loop above has no break. */
 	mtx_unlock(&aio_job_mtx);
 	panic("shouldn't be here\n");
 }
 
 /*
  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  *
  * A unit number is allocated from aiod_unr and used both as the daemon's
  * argument and in its name ("aiod%d").  On success the function waits
  * on aio_newproc_sem until the daemon has posted it, then bumps
  * num_aio_procs and, if start is non-NULL, decrements *start under
  * aio_job_mtx.  On failure the unit number is released.
  *
  * Returns 0 on success or the error from kproc_create().
  */
 static int
 aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
 	int id;
 
 	id = alloc_unr(aiod_unr);
 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
 		RFNOWAIT, 0, "aiod%d", id);
 	if (error == 0) {
 		/*
 		 * Wait until daemon is started.
 		 */
 		sema_wait(&aio_newproc_sem);
 		mtx_lock(&aio_job_mtx);
 		num_aio_procs++;
 		if (start != NULL)
 			(*start)--;
 		mtx_unlock(&aio_job_mtx);
 	} else {
 		free_unr(aiod_unr, id);
 	}
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead physio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  *
  * Return value: 0 when the request was handed to the device via
  * dev_strategy(); -1 when the request is not eligible for this path;
  * a positive errno when vn_isdisk() reports an error other than ENOTBLK
  * or when the user buffer cannot be mapped (EFAULT).  aio_aqueue()
  * currently treats every non-zero return as "queue for a daemon".
  */
 static int
 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct buf *bp;
 	struct vnode *vp;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	int error;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 
 	/*
 	 * If its not a disk, we don't want to return a positive error.
 	 * It causes the aio code to not fall through to try the thread
 	 * way when you're talking to a regular file.
 	 */
 	if (!vn_isdisk(vp, &error)) {
 		if (error == ENOTBLK)
 			return (-1);
 		else
 			return (error);
 	}
 
 	/* A zero block size would make the modulo check below divide by zero. */
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 
  	/* The transfer must be a whole number of device blocks. */
  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
 		return (-1);
 
 	/* Limit the size to MAXPHYS less the buffer's offset within its page. */
 	if (cb->aio_nbytes >
 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
 		return (-1);
 
 	/* Respect the per-process limit on concurrent buffer requests. */
 	ki = p->p_aioinfo;
 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
 		return (-1);
 
 	/* Create and build a buffer header for a transfer. */
 	bp = (struct buf *)getpbuf(NULL);
 	BUF_KERNPROC(bp);
 
 	/* Account for the request up front; undone at doerror on failure. */
 	AIO_LOCK(ki);
 	ki->kaio_count++;
 	ki->kaio_buffer_count++;
 	lj = aiocbe->lio;
 	if (lj)
 		lj->lioj_count++;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get a copy of the kva from the physical buffer.
 	 */
 	error = 0;
 
 	bp->b_bcount = cb->aio_nbytes;
 	bp->b_bufsize = cb->aio_nbytes;
 	bp->b_iodone = aio_physwakeup;
 	bp->b_saveaddr = bp->b_data;
 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
 	bp->b_offset = cb->aio_offset;
 	bp->b_iooffset = cb->aio_offset;
 	bp->b_blkno = btodb(cb->aio_offset);
 	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
 
 	/*
 	 * Bring buffer into kernel space.
 	 */
 	if (vmapbuf(bp) < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
 
 	/* Link job and buffer to each other and put the job on kaio_bufqueue. */
 	AIO_LOCK(ki);
 	aiocbe->bp = bp;
 	bp->b_caller1 = (void *)aiocbe;
 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	aiocbe->jobstate = JOBST_JOBQBUF;
 	cb->_aiocb_private.status = cb->aio_nbytes;
 	AIO_UNLOCK(ki);
 
 	atomic_add_int(&num_queue_count, 1);
 	atomic_add_int(&num_buf_aio, 1);
 
 	bp->b_error = 0;
 
 	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
 
 	/* Perform transfer. */
 	dev_strategy(vp->v_rdev, bp);
 	return (0);
 
 doerror:
 	/* Roll back the accounting done above and release the pbuf. */
 	AIO_LOCK(ki);
 	ki->kaio_count--;
 	ki->kaio_buffer_count--;
 	if (lj)
 		lj->lioj_count--;
 	aiocbe->bp = NULL;
 	AIO_UNLOCK(ki);
 	relpbuf(bp, NULL);
 	return (error);
 }
 
 /*
  * Wake up aio requests that may be serviceable now.
  *
  * sb selects the direction: the socket's send buffer maps to LIO_WRITE
  * requests, any other buffer to LIO_READ requests.  SB_AIO is cleared
  * on the buffer and every request of the matching direction queued on
  * the socket is moved to the global aio_jobs queue, with a daemon
  * kicked for each one.  Runs with the sockbuf lock and aio_job_mtx held.
  */
 static void
 aio_swake_cb(struct socket *so, struct sockbuf *sb)
 {
 	struct aiocblist *cb, *cbn;
 	int opcode;
 
 	if (sb == &so->so_snd)
 		opcode = LIO_WRITE;
 	else
 		opcode = LIO_READ;
 
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_AIO;
 	mtx_lock(&aio_job_mtx);
 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
 		if (opcode == cb->uaiocb.aio_lio_opcode) {
 			if (cb->jobstate != JOBST_JOBQSOCK)
 				panic("invalid queue value");
 			/* XXX
 			 * We don't have actual sockets backend yet,
 			 * so we simply move the requests to the generic
 			 * file I/O backend.
 			 */
 			/*
 			 * NOTE(review): jobstate is left at JOBST_JOBQSOCK
 			 * although the job now sits on aio_jobs, while
 			 * aio_cancel() and aio_proc_rundown() choose the
 			 * queue to unlink from by jobstate -- confirm this
 			 * is intended.
 			 */
 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
 			aio_kick_nowait(cb->userproc);
 		}
 	}
 	mtx_unlock(&aio_job_mtx);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
  * technique is done in this code.
  *
  * Parameters:
  *   td       - calling thread; its process owns the request.
  *   job      - userspace address of the caller's aiocb.
  *   lj       - LIO job this request belongs to, or NULL.
  *   type     - opcode to force (LIO_READ, LIO_WRITE, ...); with LIO_NOP
  *              the opcode already stored in the copied-in aiocb is used.
  *   oldsigev - non-zero when job uses the old (oaiocb/osigevent) layout.
  *
  * Returns 0 when the request was queued (or was a no-op), otherwise an
  * errno value.  On most failure paths the error is also stored into the
  * user's aiocb via suword().
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
 	int type, int oldsigev)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp;
 	struct socket *so;
 	struct aiocblist *aiocbe, *cb;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	struct sockbuf *sb;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 
 	/* Per-process AIO state is created on first use. */
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	/* Reset the private result fields in the user's aiocb. */
 	suword(&job->_aiocb_private.status, -1);
 	suword(&job->_aiocb_private.error, 0);
 	suword(&job->_aiocb_private.kernelinfo, -1);
 
 	/* Enforce the global and the per-process queue limits. */
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= ki->kaio_qallowed_count) {
 		suword(&job->_aiocb_private.error, EAGAIN);
 		return (EAGAIN);
 	}
 
 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	aiocbe->inputcharge = 0;
 	aiocbe->outputcharge = 0;
 	knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL);
 
 	if (oldsigev) {
 		/*
 		 * Old layout: copy in the shorter oaiocb, then move the
 		 * osigevent found in the __spare__ area to aio_sigevent.
 		 */
 		bzero(&aiocbe->uaiocb, sizeof(struct aiocb));
 		error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb));
 		bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent,
 			sizeof(struct osigevent));
 	} else {
 		error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb));
 	}
 	if (error) {
 		suword(&job->_aiocb_private.error, error);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (error);
 	}
 
 	/* Only these four notification methods are accepted. */
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		suword(&job->_aiocb_private.error, EINVAL);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (EINVAL);
 	}
 	
 	/*
 	 * NOTE(review): unlike the other early failures, this path does
 	 * not store EINVAL into the user's aiocb -- confirm intended.
 	 */
 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
 		uma_zfree(aiocb_zone, aiocbe);
 		return (EINVAL);
 	}
 
 	ksiginfo_init(&aiocbe->ksi);
 
 	/* Save userspace address of the job info. */
 	aiocbe->uuaiocb = job;
 
 	/* Get the opcode. */
 	if (type != LIO_NOP)
 		aiocbe->uaiocb.aio_lio_opcode = type;
 	opcode = aiocbe->uaiocb.aio_lio_opcode;
 
 	/* Fetch the file object for the specified file descriptor. */
 	fd = aiocbe->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 		error = fget_write(td, fd, &fp);
 		break;
 	case LIO_READ:
 		error = fget_read(td, fd, &fp);
 		break;
 	default:
 		error = fget(td, fd, &fp);
 	}
 	if (error) {
 		uma_zfree(aiocb_zone, aiocbe);
 		suword(&job->_aiocb_private.error, error);
 		return (error);
 	}
 
 	/* A sync request needs a vnode-backed file. */
 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	/* An offset of -1 is rejected for data transfers. */
 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	aiocbe->fd_file = fp;
 
 	/* Allocate the job id and the ordering sequence number. */
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	aiocbe->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = suword(&job->_aiocb_private.kernelinfo, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	/* A no-op request succeeds immediately without being queued. */
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (0);
 	}
 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE) &&
 	    (opcode != LIO_SYNC)) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	/* Register an EVFILT_AIO event on the kqueue named by the caller. */
 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 	kev.data = (intptr_t)aiocbe;
 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, 1);
 	/*
 	 * Shared exit for the failures above; it is also fallen into after
 	 * kqfd_register(), where error == 0 simply continues below.
 	 */
 aqueue_fail:
 	if (error) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
 		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
 no_kqueue:
 
 	suword(&job->_aiocb_private.error, EINPROGRESS);
 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
 	aiocbe->userproc = p;
 	aiocbe->cred = crhold(td->td_ucred);
 	aiocbe->jobflags = 0;
 	aiocbe->lio = lj;
 
 	if (opcode == LIO_SYNC)
 		goto queueit;
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		/*
 		 * Alternate queueing for socket ops: Reach down into the
 		 * descriptor to get the socket data.  Then check to see if the
 		 * socket is ready to be read or written (based on the requested
 		 * operation).
 		 *
 		 * If it is not ready for io, then queue the aiocbe on the
 		 * socket, and set the flags so we get a call when sbnotify()
 		 * happens.
 		 *
 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
 		 * and unlock the snd sockbuf for no reason.
 		 */
 		so = fp->f_data;
 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
 		SOCKBUF_LOCK(sb);
 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
 		    LIO_WRITE) && (!sowriteable(so)))) {
 			sb->sb_flags |= SB_AIO;
 
 			mtx_lock(&aio_job_mtx);
 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
 			mtx_unlock(&aio_job_mtx);
 
 			AIO_LOCK(ki);
 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
 			aiocbe->jobstate = JOBST_JOBQSOCK;
 			ki->kaio_count++;
 			if (lj)
 				lj->lioj_count++;
 			AIO_UNLOCK(ki);
 			SOCKBUF_UNLOCK(sb);
 			atomic_add_int(&num_queue_count, 1);
 			error = 0;
 			goto done;
 		}
 		SOCKBUF_UNLOCK(sb);
 	}
 
 	/*
 	 * Try the direct physio path first.  Any non-zero return falls
 	 * through to daemon queueing, because the positive-error handling
 	 * below is compiled out.
 	 */
 	if ((error = aio_qphysio(p, aiocbe)) == 0)
 		goto done;
 #if 0
 	if (error > 0) {
 		aiocbe->uaiocb._aiocb_private.error = error;
 		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
 #endif
 queueit:
 	/* No buffer for daemon I/O. */
 	aiocbe->bp = NULL;
 	atomic_add_int(&num_queue_count, 1);
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	if (opcode == LIO_SYNC) {
 		/*
 		 * Count the earlier non-sync requests on the same file that
 		 * are still outstanding and flag them so that their
 		 * completion decrements this request's pending count (see
 		 * aio_bio_done_notify()).
 		 */
 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
 			if (cb->fd_file == aiocbe->fd_file &&
 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    cb->seqno < aiocbe->seqno) {
 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
 				aiocbe->pending++;
 			}
 		}
 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
 			if (cb->fd_file == aiocbe->fd_file &&
 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    cb->seqno < aiocbe->seqno) {
 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
 				aiocbe->pending++;
 			}
 		}
 		/* Park the sync request until its predecessors are done. */
 		if (aiocbe->pending != 0) {
 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
 			aiocbe->jobstate = JOBST_JOBQSYNC;
 			AIO_UNLOCK(ki);
 			goto done;
 		}
 	}
 	/* Hand the request to the daemons through the global queue. */
 	mtx_lock(&aio_job_mtx);
 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
 	aio_kick_nowait(p);
 	mtx_unlock(&aio_job_mtx);
 	AIO_UNLOCK(ki);
 	error = 0;
 done:
 	return (error);
 }
 
 /*
  * Get a daemon working on the global job queue without creating one
  * inline.  If an idle daemon is on the free list it is taken off and
  * woken; otherwise, when both the global daemon limit and the process'
  * active-job limit leave room, the process' kaio_task is enqueued on
  * taskqueue_aiod_bio.  The caller must hold aio_job_mtx.
  */
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aiothreadlist *aiop;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags &= ~AIOP_FREE;
 		wakeup(aiop->aiothread);
 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
 	    ((ki->kaio_active_count + num_aio_resv_start) <
 	    ki->kaio_maxactive_count)) {
 		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
 	}
 }
 
 /*
  * Get a daemon working on the global job queue, creating a new daemon
  * if necessary.  An idle daemon is woken if one exists; otherwise, when
  * the limits allow, a new daemon is created with aio_newproc().
  * aio_job_mtx must be held on entry; it is dropped around the daemon
  * creation and held again on return.
  *
  * Returns 0 when a daemon was woken or created, -1 when neither was
  * possible.
  */
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aiothreadlist *aiop;
 	int error, ret = 0;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 retryproc:
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags &= ~AIOP_FREE;
 		wakeup(aiop->aiothread);
 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
 	    ((ki->kaio_active_count + num_aio_resv_start) <
 	    ki->kaio_maxactive_count)) {
 		/*
 		 * Reserve a slot while the mutex is dropped.  On success
 		 * aio_newproc() gives the reservation back through its
 		 * start pointer; on failure it is undone here and the
 		 * checks are retried.
 		 */
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error) {
 			num_aio_resv_start--;
 			goto retryproc;
 		}
 	} else {
 		ret = -1;
 	}
 	return (ret);
 }
 
 /*
  * Task handler: context is the struct proc to kick on behalf of and
  * pending is the number of kicks to attempt.  Calls aio_kick() up to
  * pending times under aio_job_mtx and stops early as soon as aio_kick()
  * reports that no daemon could be woken or created.
  */
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp = context;
 
 	mtx_lock(&aio_job_mtx);
 	while (--pending >= 0) {
 		if (aio_kick(userp))
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  *
  * Looks the request up on the process' done queue by the userspace
  * aiocb address.  When found, its status becomes the syscall return
  * value, its accumulated block I/O charge is added to the calling
  * thread's resource usage, the kernel entry is freed, and the final
  * error/status are copied out to the user's aiocb.  The function result
  * is the request's stored error value, or EINVAL when the process has
  * no AIO state or no such finished request exists.
  */
 int
 aio_return(struct thread *td, struct aio_return_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct aiocblist *cb;
 	struct aiocb *uaiocb;
 	struct kaioinfo *ki;
 	int status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	uaiocb = uap->aiocbp;
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
 		if (cb->uuaiocb == uaiocb)
 			break;
 	}
 	if (cb != NULL) {
 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
 		/* Read the results before the entry is freed below. */
 		status = cb->uaiocb._aiocb_private.status;
 		error = cb->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 			td->td_ru.ru_oublock += cb->outputcharge;
 			cb->outputcharge = 0;
 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 			td->td_ru.ru_inblock += cb->inputcharge;
 			cb->inputcharge = 0;
 		}
 		aio_free_entry(cb);
 		AIO_UNLOCK(ki);
 		suword(&uaiocb->_aiocb_private.error, error);
 		suword(&uaiocb->_aiocb_private.status, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  *
  * uap->aiocbp is a userspace array of uap->nent aiocb pointers (NULL
  * entries are skipped) and uap->timeout is an optional timespec.
  *
  * Returns 0 as soon as one listed request is finished, when the list
  * has no non-NULL entries, or when none of the listed requests is known
  * to the kernel.  Returns EINVAL for a bad nent or timeout, EAGAIN when
  * the process has no AIO state, a copyin() error, or the error from
  * msleep() (with ERESTART mapped to EINTR).
  */
 int
 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct timespec ts;
 	struct aiocb *const *cbptr, *cbp;
 	struct kaioinfo *ki;
 	struct aiocblist *cb, *cbfirst;
 	struct aiocb **ujoblist;
 	int njoblist;
 	int error;
 	int timo;
 	int i;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	/* timo stays 0 when no timeout was supplied. */
 	timo = 0;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 
 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	/* Gather the non-NULL user aiocb pointers into a kernel array. */
 	njoblist = 0;
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	cbptr = uap->aiocbp;
 
 	for (i = 0; i < uap->nent; i++) {
 		/*
 		 * NOTE(review): the fuword() result is not checked for a
 		 * fault indication -- confirm against fuword() semantics.
 		 */
 		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 		if (cbp == 0)
 			continue;
 		ujoblist[njoblist] = cbp;
 		njoblist++;
 	}
 
 	if (njoblist == 0) {
 		uma_zfree(aiol_zone, ujoblist);
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	for (;;) {
 		cbfirst = NULL;
 		error = 0;
 		/*
 		 * Scan all requests of the process for one that is on the
 		 * caller's list; cbfirst remembers whether any listed
 		 * request is still known to the kernel.
 		 */
 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (cb->uuaiocb == ujoblist[i]) {
 					if (cbfirst == NULL)
 						cbfirst = cb;
 					if (cb->jobstate == JOBST_JOBFINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/* All tasks were finished. */
 		if (cbfirst == NULL)
 			break;
 
 		/* Sleep until aio_bio_done_notify() wakes us or timo expires. */
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-physio aio operations not currently in
  * progress.
  *
  * uap->fd selects the descriptor; uap->aiocbp, when non-NULL, restricts
  * the operation to that single request.  The function itself returns 0
  * unless the descriptor lookup fails; the outcome is reported through
  * td_retval[0] as AIO_CANCELED, AIO_NOTCANCELED or AIO_ALLDONE.  For a
  * descriptor that refers to a disk device the answer is always
  * AIO_NOTCANCELED.
  */
 int
 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct aiocblist *cbe, *cbn;
 	struct file *fp;
 	struct socket *so;
 	int error;
 	int remove;
 	int cancelled = 0;
 	int notcancelled = 0;
 	struct vnode *vp;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	/* Without AIO state nothing can be pending: report AIO_ALLDONE. */
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		goto done;
 
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vn_isdisk(vp, &error)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
 		    ((uap->aiocbp == NULL) ||
 		     (uap->aiocbp == cbe->uuaiocb))) {
 			remove = 0;
 
 			/*
 			 * Only jobs still waiting on a queue can be
 			 * cancelled; the job state tells which queue.
 			 */
 			mtx_lock(&aio_job_mtx);
 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 				TAILQ_REMOVE(&aio_jobs, cbe, list);
 				remove = 1;
 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
 				MPASS(fp->f_type == DTYPE_SOCKET);
 				so = fp->f_data;
 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
 				remove = 1;
 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
 				remove = 1;
 			}
 			mtx_unlock(&aio_job_mtx);
 
 			if (remove) {
 				/* Complete the job with ECANCELED. */
 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
 				cbe->uaiocb._aiocb_private.status = -1;
 				cbe->uaiocb._aiocb_private.error = ECANCELED;
 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
 				cancelled++;
 			} else {
 				notcancelled++;
 			}
 			/* A specific request was named; stop after it. */
 			if (uap->aiocbp != NULL)
 				break;
 		}
 	}
 	AIO_UNLOCK(ki);
 
 done:
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL) {
 		if (cancelled) {
 			td->td_retval[0] = AIO_CANCELED;
 			return (0);
 		}
 	}
 
 	/* Any request that could not be cancelled takes precedence. */
 	if (notcancelled) {
 		td->td_retval[0] = AIO_NOTCANCELED;
 		return (0);
 	}
 
 	if (cancelled) {
 		td->td_retval[0] = AIO_CANCELED;
 		return (0);
 	}
 
 	td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
 
 /*
  * aio_error is implemented in the kernel level for compatibility purposes
  * only.  For a user mode async implementation, it would be best to do it in
  * a userland subroutine.
  *
  * The function itself always returns 0; the request's error status is
  * delivered through td_retval[0]: the stored error for a finished
  * request, EINPROGRESS for one that is still outstanding, and EINVAL
  * when the process has no AIO state or the request is unknown (except
  * for the failed-queueing case handled at the end).
  */
 int
 aio_error(struct thread *td, struct aio_error_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct aiocblist *cb;
 	struct kaioinfo *ki;
 	int status;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
 		if (cb->uuaiocb == uap->aiocbp) {
 			if (cb->jobstate == JOBST_JOBFINISHED)
 				td->td_retval[0] =
 					cb->uaiocb._aiocb_private.error;
 			else
 				td->td_retval[0] = EINPROGRESS;
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue.
 	 */
 	/*
 	 * aio_aqueue() stores status -1 in the user's aiocb before it
 	 * queues anything; if that value is still there, report the
 	 * error stored next to it.
 	 */
 	status = fuword(&uap->aiocbp->_aiocb_private.status);
 	if (status == -1) {
 		td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error);
 		return (0);
 	}
 
 	td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 /*
  * syscall - asynchronous read from a file (REALTIME)
  *
  * Old-ABI variant: queues a LIO_READ request with oldsigev set, so
  * aio_aqueue() copies the control block in using the old layout.
  */
 int
 oaio_read(struct thread *td, struct oaio_read_args *uap)
 {
 
 	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1);
 }
 
 /*
  * syscall - asynchronous read from a file.  Queues a LIO_READ request
  * that belongs to no LIO job, using the current control block layout.
  */
 int
 aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0);
 }
 
 /*
  * syscall - asynchronous write to a file (REALTIME)
  *
  * Old-style entry point: the user's control block pointer is cast to
  * struct aiocb * and handed to aio_aqueue() with opcode LIO_WRITE, no list
  * I/O job (NULL) and a trailing argument of 1.
  * NOTE(review): do_lio_listio() passes its oldsigev flag in that trailing
  * position, so the 1 presumably selects the old sigevent layout -- confirm
  * against aio_aqueue().
  */
 int
 oaio_write(struct thread *td, struct oaio_write_args *uap)
 {
 
 	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1);
 }
 
 /*
  * Queue an asynchronous write: forwards the user's control block to
  * aio_aqueue() with opcode LIO_WRITE, no list I/O job (NULL) and a trailing
  * argument of 0 (the old-style variant oaio_write() passes 1 there).
  */
 int
 aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0);
 }
 
 /*
  * syscall - list directed I/O (REALTIME)
  *
  * Old-style entry point: reinterprets the argument structure as
  * struct lio_listio_args and calls do_lio_listio() with oldsigev set to 1.
  */
 int
 olio_listio(struct thread *td, struct olio_listio_args *uap)
 {
 	return do_lio_listio(td, (struct lio_listio_args *)uap, 1);
 }
 
 /*
  * syscall - list directed I/O (REALTIME)
  *
  * Current entry point: calls do_lio_listio() with oldsigev set to 0.
  */
 int
 lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	return do_lio_listio(td, uap, 0);
 }
 
 static int
 do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *iocb, * const *cbptr;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int nent;
 	int error;
 	int nerror;
 	int i;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL);
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal));
 		error = copyin(uap->sig, &lj->lioj_signal,
 				oldsigev ? sizeof(struct osigevent) :
 					   sizeof(struct sigevent));
 		if (error) {
 			uma_zfree(aiolio_zone, lj);
 			return (error);
 		}
 
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/* Assume only new style KEVENT */
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uap->acb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 					uma_zfree(aiolio_zone, lj);
 					return EINVAL;
 				}
 				lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			uma_zfree(aiolio_zone, lj);
 			return EINVAL;
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nerror = 0;
 	cbptr = uap->acb_list;
 	for (i = 0; i < uap->nent; i++) {
 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
 			error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev);
 			if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (uap->mode == LIO_WAIT) {
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 			    == LIOJ_SIGNAL
 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal,
 					    &lj->lioj_ksi);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	if (nerror)
 		return (EIO);
 	return (error);
 }
 
 /*
  * Called from interrupt thread for physio, we should return as fast
  * as possible, so we schedule a biohelper task.
  */
 static void
 aio_physwakeup(struct buf *bp)
 {
 	struct aiocblist *job;
 
 	/* The request was stashed in the buffer's b_caller1 field. */
 	job = (struct aiocblist *)bp->b_caller1;
 
 	/* Hand the completion work to the bio task queue. */
 	taskqueue_enqueue(taskqueue_aiod_bio, &job->biotask);
 }
 
 /*
  * Task routine to perform heavy tasks, process wakeup, and signals.
  */
 static void
 biohelper(void *context, int pending)
 {
 	struct aiocblist *aiocbe = context;
 	struct buf *bp;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	int nblks;
 
 	bp = aiocbe->bp;
 	userp = aiocbe->userproc;
 	ki = userp->p_aioinfo;
 	AIO_LOCK(ki);
 	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
 	aiocbe->uaiocb._aiocb_private.error = 0;
 	if (bp->b_ioflags & BIO_ERROR)
 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
 		aiocbe->outputcharge += nblks;
 	else
 		aiocbe->inputcharge += nblks;
 	aiocbe->bp = NULL;
 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
 	ki->kaio_buffer_count--;
 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
 	AIO_UNLOCK(ki);
 
 	/* Release mapping into kernel space. */
 	vunmapbuf(bp);
 	relpbuf(bp, NULL);
 	atomic_subtract_int(&num_buf_aio, 1);
 }
 
 /*
  * syscall - wait for the next completion of an aio request
  *
  * Stores NULL into *uap->aiocbp, then sleeps (optionally bounded by
  * uap->timeout) until the process's done queue is non-empty.  For the first
  * finished request it sets the syscall return value to the request's status,
  * charges the accumulated block I/O to the calling thread, frees the kernel
  * entry and writes the control block address, error and status back to user
  * space.
  *
  * Returns the copyin() error or EINVAL for a bad timeout, the msleep()
  * error if the wait ends without a completion, and otherwise the error code
  * recorded for the completed request.
  */
 int
 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct timespec ts;
 	struct kaioinfo *ki;
 	struct aiocblist *cb;
 	struct aiocb *uuaiocb;
 	int error, status, timo;
 
 	/* Start with "no request completed" in the caller's result slot. */
 	suword(uap->aiocbp, (long)NULL);
 
 	/* timo == 0 means sleep without a timeout. */
 	timo = 0;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	cb = NULL;
 	AIO_LOCK(ki);
 	/* Wait for something to appear on the done queue. */
 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		/* ERESTART becomes EINTR only when a timeout was given. */
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (cb != NULL) {
 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
 		/* Copy out what we need before the entry is freed. */
 		uuaiocb = cb->uuaiocb;
 		status = cb->uaiocb._aiocb_private.status;
 		error = cb->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		/* Move the job's block I/O charge to the thread's rusage. */
 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 			td->td_ru.ru_oublock += cb->outputcharge;
 			cb->outputcharge = 0;
 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 			td->td_ru.ru_inblock += cb->inputcharge;
 			cb->inputcharge = 0;
 		}
 		aio_free_entry(cb);
 		AIO_UNLOCK(ki);
 		/* User-space stores are done with the AIO lock released. */
 		suword(uap->aiocbp, (long)uuaiocb);
 		suword(&uuaiocb->_aiocb_private.error, error);
 		suword(&uuaiocb->_aiocb_private.status, status);
 	} else
 		AIO_UNLOCK(ki);
 
 	return (error);
 }
 
 /*
  * Queue an asynchronous sync request.  Only O_SYNC is accepted as the
  * operation; anything else is rejected with EINVAL.
  */
 int
 aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	/* XXX lack of O_DSYNC */
 	if (uap->op != O_SYNC)
 		return (EINVAL);
 
 	/* Create the per-process AIO state on first use. */
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0));
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct aiocblist *job;
 
 	/*
 	 * kn_sdata carries a raw kernel pointer, so only registrations
 	 * made by the kernel itself are accepted.  Those are marked with
 	 * EV_FLAG1, which the user cannot set.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_flags &= ~EV_FLAG1;
 
 	job = (struct aiocblist *)kn->kn_sdata;
 	knlist_add(&job->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct aiocblist *job;
 
 	job = (struct aiocblist *)kn->kn_sdata;
 
 	/* Nothing to unlink when the request's knote list is empty. */
 	if (knlist_empty(&job->klist))
 		return;
 	knlist_remove(&job->klist, kn, 0);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 
 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob *liojob;
 
 	/*
 	 * kn_sdata carries a raw kernel pointer, so only registrations
 	 * made by the kernel itself are accepted.  Those are marked with
 	 * EV_FLAG1, which the user cannot set.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_flags &= ~EV_FLAG1;
 
 	liojob = (struct aioliojob *)kn->kn_sdata;
 	knlist_add(&liojob->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct aioliojob *liojob;
 
 	liojob = (struct aioliojob *)kn->kn_sdata;
 
 	/* Nothing to unlink when the job's knote list is empty. */
 	if (knlist_empty(&liojob->klist))
 		return;
 	knlist_remove(&liojob->klist, kn, 0);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob *liojob;
 
 	liojob = (struct aioliojob *)kn->kn_sdata;
 
 	/* Non-zero once the job has been flagged LIOJ_KEVENT_POSTED. */
 	return (liojob->lioj_flags & LIOJ_KEVENT_POSTED);
 }
Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c	(revision 175201)
+++ head/sys/kern/vfs_bio.c	(revision 175202)
@@ -1,3953 +1,3953 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include "opt_directio.h"
 #include "opt_swap.h"
 
 /* malloc(9) type "biobuf", described as "BIO buffer". */
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 /*
  * Buffer operations vector named "buf_ops_bio"; it wires the write,
  * strategy, sync and bdflush hooks to bufwrite(), bufstrategy(), bufsync()
  * and bufbdflush() respectively.
  */
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 /*
  * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
  * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
  */
 struct buf *buf;		/* buffer header pool */
 
 /*
  * NOTE(review): presumably the process handle of the buffer daemon
  * (buf_daemon); its assignment is not visible in this part of the file.
  */
 static struct proc *bufdaemonproc;
 
 /* Forward declarations of helpers private to this file. */
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages(struct buf *bp);
 static void vfs_setdirty(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int flushbufqueues(int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 
 /*
  * Buffer cache tunables and statistics, each exported under the vfs sysctl
  * node.  Entries declared CTLFLAG_RD are read-only from user space; those
  * declared CTLFLAG_RW may be changed at run time.
  */
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 int runningbufspace;
 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 static int bufspace;
 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "KVA memory used for bufs");
 static int maxbufspace;
 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including buf_daemon)");
 static int bufmallocspace;
 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 /*
  * Note the naming mismatch: the variable is maxbufmallocspace but the
  * sysctl is published as vfs.maxmallocbufspace.
  */
 static int maxbufmallocspace;
 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
     "Maximum amount of malloced memory for buffers");
 static int lobufspace;
 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 int hibufspace;
 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding buf_daemon)");
 static int bufreusecnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
     "Number of times we have reused a buffer");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
 static int bufdefragcnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
     "Number of times we have had to repeat buffer allocation to defragment");
 static int lorunningspace;
 SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
     "Minimum preferred space used for in-progress I/O");
 static int hirunningspace;
 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "XXX Unused");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "XXX Complicatedly unused");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx bdlock;
 
 /*
  * bogus page -- for I/O to/from partially complete buffers.
  * This is a temporary solution to the problem, but it is not
  * really that bad.  It would be better to split the buffer
  * for input in the case of buffers partially already in memory,
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /*
  * This lock protects runningbufreq and synchronizes runningbufwakeup() and
  * waitrunningbufspace().
  */
 static struct mtx rbreqlock;
 
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
  * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static int needsbuffer;
 
 /*
  * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
  */
 static struct mtx nblock;
 
 /*
  * Lock that protects against bwait()/bdone()/B_DONE races.
  */
 
 static struct mtx bdonelock;
 
 /*
  * Lock initialized in bufinit() under the name "bpin lock".
  * NOTE(review): the comment previously here was a verbatim copy of the
  * bdonelock description; presumably this lock serializes buffer pin/unpin
  * waiters instead -- confirm against the users of bpinlock.
  */
 static struct mtx bpinlock;
 
 /*
  * Definitions for the buffer free lists.  The QUEUE_* values are used as
  * indices into bufqueues[] (via b_qindex), so BUFFER_QUEUES must stay one
  * greater than the largest QUEUE_* value.
  */
 #define BUFFER_QUEUES	6	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
 #define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY	5	/* empty buffer headers */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 
 /* Lock for the bufqueues */
 static struct mtx bqlock;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 /* Flag bits stored in needsbuffer. */
 #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
 #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
 #ifdef DIRECTIO
 /* Called from kern_vfs_bio_buffer_alloc() when DIRECTIO is configured. */
 extern void ffs_rawread_setup(void);
 #endif /* DIRECTIO */
 /*
  *	numdirtywakeup:
  *
  *	If someone is blocked due to there being too many dirty buffers,
  *	and numdirtybuffers is now reasonable, wake them up.
  */
 
 static __inline void
 numdirtywakeup(int level)
 {
 
 	/* Still above the requested level: leave the waiters asleep. */
 	if (numdirtybuffers > level)
 		return;
 
 	mtx_lock(&nblock);
 	if ((needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) != 0) {
 		needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
 		wakeup(&needsbuffer);
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  *	bufspacewakeup:
  *
  *	Called when buffer space is potentially available for recovery.
  *	getnewbuf() will block on this flag when it is unable to free 
  *	sufficient buffer space.  Buffer space becomes recoverable when 
  *	bp's get placed back in the queues.
  */
 
 static __inline void
 bufspacewakeup(void)
 {
 	int waiting;
 
 	/*
 	 * Wake anyone waiting for buffer space.  The KVA has not actually
 	 * been freed at this point, but a woken waiter will now be able
 	 * to reclaim it.
 	 */
 	mtx_lock(&nblock);
 	waiting = needsbuffer & VFS_BIO_NEED_BUFSPACE;
 	if (waiting != 0) {
 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
 		wakeup(&needsbuffer);
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  * runningbufwakeup() - in-progress I/O accounting.
  *
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 
 	/* This buffer holds no share of the in-progress I/O total. */
 	if (bp->b_runningbufspace == 0)
 		return;
 
 	/* Give the buffer's share back to the global total. */
 	atomic_subtract_int(&runningbufspace, bp->b_runningbufspace);
 	bp->b_runningbufspace = 0;
 
 	/* Wake waiters once the total is at or below the low-water mark. */
 	mtx_lock(&rbreqlock);
 	if (runningbufreq != 0 && runningbufspace <= lorunningspace) {
 		runningbufreq = 0;
 		wakeup(&runningbufreq);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	bufcountwakeup:
  *
  *	Called when a buffer has been added to one of the free queues to
  *	account for the buffer and to wakeup anyone waiting for free buffers.
  *	This typically occurs when large amounts of metadata are being handled
  *	by the buffer cache ( else buffer space runs out first, usually ).
  */
 
 static __inline void
 bufcountwakeup(void) 
 {
 	int satisfied;
 
 	atomic_add_int(&numfreebuffers, 1);
 
 	mtx_lock(&nblock);
 	if (needsbuffer != 0) {
 		/*
 		 * "Any buffer" waiters are always satisfied; "free buffer"
 		 * waiters only once the high-water mark has been reached.
 		 */
 		satisfied = VFS_BIO_NEED_ANY;
 		if (numfreebuffers >= hifreebuffers)
 			satisfied |= VFS_BIO_NEED_FREE;
 		needsbuffer &= ~satisfied;
 		wakeup(&needsbuffer);
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
  *	running.  This routine is used in async-write situations to
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
  *	Reads will adjust runningbufspace, but will not block based on it.
  *	The read load has a side effect of reducing the allowed write load.
  *
  *	This does NOT turn an async write into a sync write.  It waits  
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	for (;;) {
 		/* Proceed once in-progress I/O is within the high mark. */
 		if (runningbufspace <= hirunningspace)
 			break;
 		/* Register as a waiter, then sleep until woken. */
 		runningbufreq++;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  This function clears the B_CACHE
  *	bit if the newly extended portion of the buffer does not contain
  *	valid data.
  */
 static __inline
 void
 vfs_buf_test_cache(struct buf *bp,
 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
 		  vm_page_t m)
 {
 	int pgoff;
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* B_CACHE is already clear; there is nothing left to invalidate. */
 	if ((bp->b_flags & B_CACHE) == 0)
 		return;
 
 	/* Offset of the range within its page. */
 	pgoff = (foff + off) & PAGE_MASK;
 	if (vm_page_is_valid(m, pgoff, size) == 0)
 		bp->b_flags &= ~B_CACHE;
 }
 
 /* Wake up the buffer daemon if necessary */
 static __inline
 void
 bd_wakeup(int dirtybuflevel)
 {
 	int kick;
 
 	mtx_lock(&bdlock);
 	/*
 	 * Only post a request when none is pending and the dirty buffer
 	 * count has reached the caller's threshold.
 	 */
 	kick = (bd_request == 0 && numdirtybuffers >= dirtybuflevel);
 	if (kick) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
 	mtx_unlock(&bdlock);
 }
 
 /*
  * bd_speedup - speedup the buffer cache flushing code
  */
 
 static __inline
 void
 bd_speedup(void)
 {
 
 	/*
 	 * A threshold of 1 means bd_wakeup() posts a request as soon as
 	 * there is at least one dirty buffer.
 	 */
 	bd_wakeup(1);
 }
 
 /*
  * Calculate buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more than once.  We CANNOT write to the memory area
  * being reserved at this time.
  *
  *	v		start of the area from which the swbuf and buf
  *			header arrays are carved
  *	physmem_est	estimated physical memory, in pages
  *
  * Returns the first address past the reserved area.  Sets nbuf (only when
  * it is still 0), nswbuf, swbuf and buf as side effects.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int maxbuf;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += (physmem_est - 65536) * 2 / (factor * 5);
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 
 		/* XXX Avoid integer overflows later on with maxbufspace. */
 		maxbuf = (INT_MAX / 3) / BKVASIZE;
 		if (nbuf > maxbuf)
 			nbuf = maxbuf;
 	}
 
 #if 0
 	/*
 	 * Do not allow the buffer_map to be more than 1/2 the size of the
 	 * kernel_map.
 	 */
 	if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / 
 	    (BKVASIZE * 2)) {
 		nbuf = (kernel_map->max_offset - kernel_map->min_offset) / 
 		    (BKVASIZE * 2);
 		printf("Warning: nbufs capped at %d\n", nbuf);
 	}
 #endif
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less than 16 and no more than 256.
 	 */
 	nswbuf = max(min(nbuf/4, 256), 16);
 #ifdef NSWBUF_MIN
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 #endif
 #ifdef DIRECTIO
 	ffs_rawread_setup();
 #endif
 
 	/*
 	 * Reserve space for the buffer cache buffers: first nswbuf swbuf
 	 * headers, then nbuf buf headers.  Only pointers are computed here;
 	 * nothing is written to the area.
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /*
  * Initialize the buffer subsystem.  Called before use of any buffers.
  *
  * Sets up the file's mutexes and free queues, places all nbuf buffer
  * headers on the empty queue, derives the space and count thresholds from
  * nbuf, and allocates the wired bogus page.
  */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
 	mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;	/* we're just an empty header */
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_vflags = 0;
 		bp->b_xflags = 0;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
 	 * used by most other processes.  The differential is required to 
 	 * ensure that buf_daemon is able to run when other processes might 
 	 * be blocked waiting for buffer space.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system.
 	 */
 	maxbufspace = nbuf * BKVASIZE;
 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
 	lobufspace = hibufspace - MAXBSIZE;
 
 	/* Low and high water marks for in-progress I/O: 512KB and 1MB. */
 	lorunningspace = 512 * 1024;
 	hirunningspace = 1024 * 1024;
 
 /*
  * Limit the amount of malloc memory since it is wired permanently into
  * the kernel space.  Even though this is accounted for in the buffer
  * allocation, we don't want the malloced region to grow uncontrolled.
  * The malloc scheme improves memory utilization significantly on average
  * (small) directories.
  */
 	maxbufmallocspace = hibufspace / 20;
 
 /*
  * Reduce the chance of a deadlock occurring by limiting the number
  * of delayed-write dirty buffers we allow to stack up.
  */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
 /*
  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  * eat up all available buffer space.  This occurs when our minimum cannot
  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
  * BKVASIZE'd (8K) buffers.
  */
 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 /*
  * Try to keep the number of free buffers in the specified range,
  * and give special processes (e.g. like buf_daemon) access to an 
  * emergency reserve.
  */
 	lofreebuffers = nbuf / 18 + 5;
 	hifreebuffers = 2 * lofreebuffers;
 	numfreebuffers = nbuf;
 
 /*
  * Maximum number of async ops initiated per buf_daemon loop.  This is
  * somewhat of a hack at the moment, we really need to limit ourselves
  * based on the number of bytes of I/O in-transit that were initiated
  * from buf_daemon.
  *
  * NOTE(review): no assignment follows this comment in this function; it
  * appears to be left over from a removed tunable.
  */
 
 	/* Wired page that belongs to no VM object. */
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 }
 
 /*
  * bfreekva() - free the kva allocation for a buffer.
  *
  *	Since this call frees up buffer space, we call bufspacewakeup().
  */
 static void
 bfreekva(struct buf *bp)
 {
 
 	/* No KVA is assigned to this buffer, so there is nothing to free. */
 	if (bp->b_kvasize == 0)
 		return;
 
 	atomic_add_int(&buffreekvacnt, 1);
 	atomic_subtract_int(&bufspace, bp->b_kvasize);
 	vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
 	    (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
 	bp->b_kvasize = 0;
 
 	/* Space was released: let waiters retry. */
 	bufspacewakeup();
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list in brelse.
  *	
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(BUF_REFCNT(bp), ("bremfree: buf must be locked."));
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
 
 	/* Only flag the buffer here; the queue removal happens later. */
 	bp->b_flags |= B_REMFREE;
 
 	/*
 	 * Fixup numfreebuffers count.  A buffer counts as free unless it
 	 * is a valid delayed-write buffer (B_DELWRI set, B_INVAL clear).
 	 */
 	if ((bp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI)
 		atomic_subtract_int(&numfreebuffers, 1);
 }
 
 /*
  *	bremfreef:
  *
  *	Force an immediate removal from a free list.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	/* bremfreel() requires bqlock to be held; take it around the call. */
 	mtx_lock(&bqlock);
 	bremfreel(bp);
 	mtx_unlock(&bqlock);
 }
 
 /*
  *	bremfreel:
  *
  *	Removes a buffer from the free list, must be called with the
  *	bqlock held.
  */
 static void
 bremfreel(struct buf *bp)
 {
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(BUF_REFCNT(bp), ("bremfreel: buffer %p not locked.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
 	mtx_assert(&bqlock, MA_OWNED);
 
 	/* Unlink the buffer from whichever free queue it is on. */
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 	bp->b_qindex = QUEUE_NONE;
 
 	/*
 	 * A buffer already marked by bremfree() had its accounting done
 	 * there; all that remains is to clear the mark.
 	 */
 	if ((bp->b_flags & B_REMFREE) != 0) {
 		bp->b_flags &= ~B_REMFREE;
 		return;
 	}
 
 	/*
 	 * Fixup numfreebuffers count.  The buffer was counted as free
 	 * unless it is a valid delayed-write buffer (B_DELWRI set and
 	 * B_INVAL clear), so decrement in every other case.
 	 */
 	if ((bp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI)
 		atomic_subtract_int(&numfreebuffers, 1);
 }
 
 
 /*
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  * is set, the buffer is valid and we do not have to do anything ( see
  * getblk() ).  This is really just a special case of breadn().
  *
  * All work is delegated to breadn() with an empty read-ahead list (null
  * block/size arrays and a count of zero).  The buffer is handed back
  * through *bpp and breadn()'s return value is passed on unchanged.
  */
 int
 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
     struct buf **bpp)
 {
 
 	/* The three zeros are rablkno, rabsize and cnt: no read-ahead. */
 	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
 }
 
 /*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
  *
  * vp      - vnode the blocks belong to.
  * rablkno - array of cnt block numbers to read ahead.
  * rabsize - array of cnt sizes, one per entry in rablkno.
  * cnt     - number of entries in the two arrays (may be zero).
  * cred    - credential recorded in a buffer that has none yet.
  *
  * Nothing is returned and nothing is waited for: every read started here
  * is flagged B_ASYNC.
  */
 void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
     int cnt, struct ucred * cred)
 {
 	struct buf *rabp;
 	int i;
 
 	/* Both arrays are walked in lock step with the index. */
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		/* Skip blocks for which inmem() returns non-zero. */
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
 			/* Account the read to the current thread, idle excepted. */
 			if (!TD_IS_IDLETHREAD(curthread))
 				curthread->td_ru.ru_inblock++;
 			rabp->b_flags |= B_ASYNC;
 			rabp->b_flags &= ~B_INVAL;
 			rabp->b_ioflags &= ~BIO_ERROR;
 			rabp->b_iocmd = BIO_READ;
 			/* Take a credential reference only if none is attached. */
 			if (rabp->b_rcred == NOCRED && cred != NOCRED)
 				rabp->b_rcred = crhold(cred);
 			vfs_busy_pages(rabp, 0);
 			/*
 			 * NOTE(review): presumably hands the buffer lock to the
 			 * kernel because no thread waits for this I/O - confirm
 			 * against the BUF_KERNPROC() definition.
 			 */
 			BUF_KERNPROC(rabp);
 			/* I/O offset derived from b_blkno via dbtob(). */
 			rabp->b_iooffset = dbtob(rabp->b_blkno);
 			bstrategy(rabp);
 		} else {
 			/* Contents already valid: no I/O, just release it. */
 			brelse(rabp);
 		}
 	}
 }
 
 /*
  * Operates like bread, but also starts asynchronous I/O on
  * read-ahead blocks.
  *
  * vp, blkno, size - identify the primary block to read.
  * rablkno/rabsize - arrays of cnt read-ahead block numbers and sizes;
  *                   handed to breada() untouched.
  * cnt             - number of read-ahead entries (zero for plain bread()).
  * cred            - credential recorded in a buffer that has none yet.
  * bpp             - receives the buffer returned by getblk().
  *
  * Returns 0 when the block was already cached (B_CACHE set), otherwise
  * the result of bufwait() on the primary read.  *bpp is set in both cases.
  */
 int
 breadn(struct vnode * vp, daddr_t blkno, int size,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf **bpp)
 {
 	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		/* Account the read to the current thread, idle excepted. */
 		if (!TD_IS_IDLETHREAD(curthread))
 			curthread->td_ru.ru_inblock++;
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		/* Take a credential reference only if none is attached. */
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		/* Remember that there is a read to wait for below. */
 		++readwait;
 	}
 
 	/*
 	 * The read-aheads are started before waiting so that they are
 	 * issued while the primary read is still outstanding.
 	 */
 	breada(vp, rablkno, rabsize, cnt, cred);
 
 	if (readwait) {
 		rv = bufwait(bp);
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here 
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  *
  * Return value: 0 for a B_INVAL buffer (which is simply released) and
  * for an asynchronous write; for a synchronous write, the result of
  * bufwait() after the buffer has been released with brelse().
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	/*
 	 * Snapshot the flags: B_ASYNC is tested from this copy after
 	 * bstrategy() has been called.
 	 */
 	oldflags = bp->b_flags;
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bufwrite: buffer is not busy???");
 
 	/* Wait out any pins on the buffer before starting the write. */
 	if (bp->b_pin_count > 0)
 		bunpin_wait(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	/*
 	 * Record whether the vnode carries VV_MD now; the value is used
 	 * after the I/O has been issued to decide whether to throttle.
 	 */
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/* Mark the buffer clean */
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
 	/* Account the write to the current thread, idle excepted. */
 	if (!TD_IS_IDLETHREAD(curthread))
 		curthread->td_ru.ru_oublock++;
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		/* Synchronous: wait, release, and report the I/O status. */
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 /*
  *	bufbdflush:
  *
  *	Push back on the number of dirty buffers attached to a buffer
  *	object before bp is added to them.
  *
  *	- More than dirtybufthresh + 10 dirty buffers: issue a non-blocking
  *	  VOP_FSYNC() on bp's vnode and count it in altbufferflushes.
  *	- More than dirtybufthresh: scan the object's dirty list for one
  *	  other buffer that can be locked without sleeping, has no
  *	  background write in progress and for which buf_countdeps()
  *	  returns zero, and write it asynchronously; counted in
  *	  dirtybufferflushes.
  *
  *	NOTE(review): presumably reached from bdwrite() through
  *	BO_BDFLUSH() - confirm against the bufobj ops.
  */
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			/* Skip buffers that are busy or being background-written. */
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			/* bp is locked by our caller, so it cannot be nbp. */
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				/* Re-take the lock before resuming the list walk. */
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			/* The bo lock was already dropped above on this path. */
 			break;
 		}
 		/*
 		 * nbp is NULL only when the walk ran off the end of the list,
 		 * in which case the bo lock is still held.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  *
  * The buffer must be locked by the caller and have a bufobj attached.
  * It is released (brelse() if B_INVAL, otherwise bqrelse()) before this
  * function returns.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy"));
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		/* TDP_INBDFLUSH marks this thread while the flush hook runs. */
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 *
 	 * b_lblkno == b_blkno is used as the "not yet mapped" test; the
 	 * result of VOP_BMAP() is not checked.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
 	 */
 	vfs_setdirty(bp);
 
 	/*
 	 * We need to do this here to satisfy the vnode_pager and the
 	 * pageout daemon, so that it thinks that the pages have been
 	 * "cleaned".  Note that since the pages are in a delayed write
 	 * buffer -- the VFS layer "will" see that the pages get written
 	 * out on the next sync, or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages(bp);
 	bqrelse(bp);
 
 	/*
 	 * Wakeup the buffer flushing daemon if we have a lot of dirty
 	 * buffers (midpoint between our recovery point and our stall
 	 * point).
 	 */
 	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn a locked, off-queue buffer into a delayed-write request.
  *	B_RELBUF is cleared and the I/O command becomes BIO_WRITE in every
  *	case.  If B_DELWRI was not set yet, it is set now, the buffer is
  *	passed to reassignbuf(), numdirtybuffers is incremented and
  *	bd_wakeup() is called with the midpoint of lodirtybuffers and
  *	hidirtybuffers.
  *
  *	B_DONE is not set here; it appears only in a commented-out XXX.
  *
  *	Since the buffer is not on a queue, we do not update the
  *	numfreebuffers count.
  *
  *	The buffer must be on QUEUE_NONE or be marked B_REMFREE.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(BUF_REFCNT(bp) == 1, ("bdirty: bp %p not locked",bp));
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 
 	bp->b_iocmd = BIO_WRITE;
 	bp->b_flags &= ~B_RELBUF;
 
 	/* Already a delayed write: the accounting has been done before. */
 	if ((bp->b_flags & B_DELWRI) != 0)
 		return;
 
 	bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 	reassignbuf(bp);
 	atomic_add_int(&numdirtybuffers, 1);
 	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI for buffer.
  *
  *	When B_DELWRI was set, the buffer is also passed to reassignbuf(),
  *	numdirtybuffers is decremented and numdirtywakeup() is called.
  *	B_DEFERRED is cleared unconditionally.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *	
  *	The buffer must be on QUEUE_NONE (or be marked B_REMFREE) and must
  *	be locked exactly once.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	KASSERT(BUF_REFCNT(bp) == 1, ("bundirty: bp %p not locked",bp));
 
 	if (bp->b_flags & B_DELWRI) {
 		/* The flag is cleared before reassignbuf() looks at the buffer. */
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		atomic_subtract_int(&numdirtybuffers, 1);
 		numdirtywakeup(lodirtybuffers);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
 	 */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write.  Start output on a buffer, but do not wait for
  *	it to complete.  The buffer is released when the output completes.
  *
  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  *	B_INVAL buffers.  Not us.
  *
  *	Implemented by setting B_ASYNC and calling bwrite(); the return
  *	value of bwrite() is discarded.
  */
 void
 bawrite(struct buf *bp)
 {
 
 	/* B_ASYNC must be set before bwrite() so the write does not wait. */
 	bp->b_flags |= B_ASYNC;
 	(void) bwrite(bp);
 }
 
 /*
  *	bwillwrite:
  *
  *	Throttle a would-be writer before it locks any vnode.  While
  *	numdirtybuffers is at or above hidirtybuffers the caller wakes the
  *	buf daemon, records VFS_BIO_NEED_DIRTYFLUSH in needsbuffer and
  *	sleeps on it under nblock.  Blocking ahead of the vnode locks
  *	attempts to avoid the situation where a locked vnode prevents the
  *	system daemons from flushing related buffers.
  */
 
 void
 bwillwrite(void)
 {
 
 	/* Fast path without taking nblock: nothing to wait for. */
 	if (numdirtybuffers < hidirtybuffers)
 		return;
 
 	mtx_lock(&nblock);
 	for (;;) {
 		/* Re-evaluated under the lock and after every wakeup. */
 		if (numdirtybuffers < hidirtybuffers)
 			break;
 		bd_wakeup(1);
 		needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
 		msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0);
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  * Return true if we have too many dirty buffers.
  *
  * "Too many" means numdirtybuffers >= hidirtybuffers, the same test that
  * bwillwrite() uses to decide whether to sleep.  The counters are read
  * without taking any lock.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	return(numdirtybuffers >= hidirtybuffers);
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  *
  *	Outline:
  *	  1. B_MANAGED buffers are handed to bqrelse() and nothing else
  *	     happens here.
  *	  2. A write that failed with EIO is re-dirtied; other failures,
  *	     B_NOCACHE, B_INVAL or an empty buffer invalidate it.
  *	  3. B_RELBUF is cleared for delayed writes and may be set when
  *	     vm_page_count_severe() reports memory pressure.
  *	  4. VMIO pages are fixed up / invalidated and possibly released.
  *	  5. If the lock is held recursively the buffer is only unlocked.
  *	  6. Otherwise the buffer is queued on EMPTY, EMPTYKVA, CLEAN,
  *	     DIRTY or DIRTY_GIANT, waiters are woken and the lock dropped.
  *
  *	The buffer must be locked by the caller and must not be a
  *	B_CLUSTER or B_PAGING buffer.
  */
 void
 brelse(struct buf *bp)
 {
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
 		 * pages from being scrapped.  If the error is anything
 		 * other than an I/O error (EIO), assume that retrying
 		 * is futile.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed I/O or we were asked to free or not
 		 * cache the buffer.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI) {
 			atomic_subtract_int(&numdirtybuffers, 1);
 			numdirtywakeup(lodirtybuffers);
 		}
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
 			if (bp->b_bufsize)
 				allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 * 
 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
 	 * if B_DELWRI is set.
 	 *
 	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 	 * on pages to return pages to the VM page queues.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 	else if (vm_page_count_severe()) {
 		/*
 		 * XXX This lock may not be necessary since BKGRDINPROG
 		 * cannot be set while we hold the buf lock, it can only be
 		 * cleared if it is already pending.
 		 */
 		if (bp->b_vp) {
 			BO_LOCK(bp->b_bufobj);
 			if (!(bp->b_vflags & BV_BKGRDINPROG))
 				bp->b_flags |= B_RELBUF;
 			BO_UNLOCK(bp->b_bufobj);
 		} else
 			bp->b_flags |= B_RELBUF;
 	}
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 	if ((bp->b_flags & B_VMIO)
 	    && !(bp->b_vp->v_mount != NULL &&
 		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 		 !vn_isdisk(bp->b_vp, NULL) &&
 		 (bp->b_flags & B_DELWRI))
 	    ) {
 
 		int i, j, resid;
 		vm_page_t m;
 		off_t foff;
 		vm_pindex_t poff;
 		vm_object_t obj;
 
 		obj = bp->b_bufobj->bo_object;
 
 		/*
 		 * Get the base offset and length of the buffer.  Note that 
 		 * in the VMIO case if the buffer block size is not
 		 * page-aligned then b_data pointer may not be page-aligned.
 		 * But our b_pages[] array *IS* page aligned.
 		 *
 		 * block sizes less than DEV_BSIZE (usually 512) are not 
 		 * supported due to the page granularity bits (m->valid,
 		 * m->dirty, etc...). 
 		 *
 		 * See man buf(9) for more information
 		 */
 		resid = bp->b_bufsize;
 		foff = bp->b_offset;
 		VM_OBJECT_LOCK(obj);
 		for (i = 0; i < bp->b_npages; i++) {
 			int had_bogus = 0;
 
 			m = bp->b_pages[i];
 
 			/*
 			 * If we hit a bogus page, fixup *all* the bogus pages
 			 * now.
 			 */
 			if (m == bogus_page) {
 				poff = OFF_TO_IDX(bp->b_offset);
 				had_bogus = 1;
 
 				for (j = i; j < bp->b_npages; j++) {
 					vm_page_t mtmp;
 					mtmp = bp->b_pages[j];
 					if (mtmp == bogus_page) {
 						mtmp = vm_page_lookup(obj, poff + j);
 						if (!mtmp) {
 							panic("brelse: page missing\n");
 						}
 						bp->b_pages[j] = mtmp;
 					}
 				}
 
 				/* Re-enter the real pages into the buffer's KVA. */
 				if ((bp->b_flags & B_INVAL) == 0) {
 					pmap_qenter(
 					    trunc_page((vm_offset_t)bp->b_data),
 					    bp->b_pages, bp->b_npages);
 				}
 				m = bp->b_pages[i];
 			}
 			if ((bp->b_flags & B_NOCACHE) ||
 			    (bp->b_ioflags & BIO_ERROR)) {
 				/* Portion of this page covered by the buffer. */
 				int poffset = foff & PAGE_MASK;
 				int presid = resid > (PAGE_SIZE - poffset) ?
 					(PAGE_SIZE - poffset) : resid;
 
 				KASSERT(presid >= 0, ("brelse: extra page"));
 				vm_page_lock_queues();
 				vm_page_set_invalid(m, poffset, presid);
 				vm_page_unlock_queues();
 				if (had_bogus)
 					printf("avoided corruption bug in bogus_page/brelse code\n");
 			}
 			/* Advance to the start of the next page. */
 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		}
 		VM_OBJECT_UNLOCK(obj);
 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 			vfs_vmio_release(bp);
 
 	} else if (bp->b_flags & B_VMIO) {
 
 		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
 			vfs_vmio_release(bp);
 		}
 
 	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
 		if (bp->b_bufsize != 0)
 			allocbuf(bp, 0);
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 			
 	if (BUF_REFCNT(bp) > 1) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	/* enqueue */
 	mtx_lock(&bqlock);
 	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE)
 		bremfreel(bp);
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("brelse: free buffer onto another queue???");
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 1");
 		if (bp->b_kvasize) {
 			bp->b_qindex = QUEUE_EMPTYKVA;
 		} else {
 			bp->b_qindex = QUEUE_EMPTY;
 		}
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 	/* buffers with junk contents */
 	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		bp->b_qindex = QUEUE_CLEAN;
 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 	/* remaining buffers */
 	} else {
 		/*
 		 * The three cases are mutually exclusive.  Without the
 		 * "else" on the second test a B_DELWRI|B_NEEDSGIANT buffer
 		 * would have its QUEUE_DIRTY_GIANT choice overwritten with
 		 * QUEUE_DIRTY, unlike the equivalent selection in bqrelse().
 		 */
 		if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
 		    (B_DELWRI|B_NEEDSGIANT))
 			bp->b_qindex = QUEUE_DIRTY_GIANT;
 		else if (bp->b_flags & B_DELWRI)
 			bp->b_qindex = QUEUE_DIRTY;
 		else
 			bp->b_qindex = QUEUE_CLEAN;
 		/* B_AGE buffers go to the head so they are reused first. */
 		if (bp->b_flags & B_AGE)
 			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 		else
 			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 	}
 	mtx_unlock(&bqlock);
 
 	/*
 	 * If B_INVAL and B_DELWRI is set, clear B_DELWRI.  We have already
 	 * placed the buffer on the correct queue.  We must also disassociate
 	 * the device and vnode for a B_INVAL buffer so gbincore() doesn't
 	 * find it.
 	 */
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	/*
 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
 	 * if B_INVAL is set ).
 	 */
 
 	if (!(bp->b_flags & B_DELWRI))
 		bufcountwakeup();
 
 	/*
 	 * Something we can maybe free or reuse
 	 */
 	if (bp->b_bufsize || bp->b_kvasize)
 		bufspacewakeup();
 
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * Paths through this function:
  *  - lock held recursively: only one lock reference is dropped;
  *  - B_MANAGED: a pending bremfree() is completed, the transient flags
  *    are cleared and the buffer is unlocked without being queued;
  *  - B_DELWRI: queued at the tail of QUEUE_DIRTY or QUEUE_DIRTY_GIANT;
  *  - clean: queued at the tail of QUEUE_CLEAN, unless memory is severely
  *    low and no background write is in progress, in which case the
  *    buffer is handed to brelse() instead.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	if (BUF_REFCNT(bp) > 1) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	if (bp->b_flags & B_MANAGED) {
 		/* Complete a delayed bremfree(); bremfreel() needs bqlock. */
 		if (bp->b_flags & B_REMFREE) {
 			mtx_lock(&bqlock);
 			bremfreel(bp);
 			mtx_unlock(&bqlock);
 		}
 		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	mtx_lock(&bqlock);
 	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE)
 		bremfreel(bp);
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("bqrelse: free buffer onto another queue???");
 	/* buffers with stale but valid contents */
 	if (bp->b_flags & B_DELWRI) {
 		if (bp->b_flags & B_NEEDSGIANT)
 			bp->b_qindex = QUEUE_DIRTY_GIANT;
 		else
 			bp->b_qindex = QUEUE_DIRTY;
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 	} else {
 		/*
 		 * XXX This lock may not be necessary since BKGRDINPROG
 		 * cannot be set while we hold the buf lock, it can only be
 		 * cleared if it is already pending.
 		 */
 		BO_LOCK(bp->b_bufobj);
 		if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) {
 			BO_UNLOCK(bp->b_bufobj);
 			bp->b_qindex = QUEUE_CLEAN;
 			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
 			    b_freelist);
 		} else {
 			/*
 			 * We are too low on memory, we have to try to free
 			 * the buffer (most importantly: the wired pages
 			 * making up its backing store) *now*.
 			 *
 			 * Both locks are dropped first because brelse()
 			 * takes bqlock itself.
 			 */
 			BO_UNLOCK(bp->b_bufobj);
 			mtx_unlock(&bqlock);
 			brelse(bp);
 			return;
 		}
 	}
 	mtx_unlock(&bqlock);
 
 	/* The buffer now counts as free unless it is a valid delayed write. */
 	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
 		bufcountwakeup();
 
 	/*
 	 * Something we can maybe free or reuse.
 	 */
 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 		bufspacewakeup();
 
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("bqrelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Give pages used by the bp back to the VM system (where possible).
  *
  * Every page in b_pages[] is unwired and its slot cleared.  Pages that
  * are not busy and have dropped to a wire count of zero may additionally
  * be freed or cached, depending on the buffer flags and on memory
  * pressure.  Afterwards the buffer's KVA mappings are removed, b_bufsize
  * and b_npages are zeroed, B_VMIO is cleared and the buffer is detached
  * from its vnode, if it has one.
  */
 static void
 vfs_vmio_release(struct buf *bp)
 {
 	int i;
 	vm_page_t m;
 
 	/* The object lock is taken first, then the page queues lock. */
 	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		bp->b_pages[i] = NULL;
 		/*
 		 * In order to keep page LRU ordering consistent, put
 		 * everything on the inactive queue.
 		 */
 		vm_page_unwire(m, 0);
 		/*
 		 * We don't mess with busy pages, it is
 		 * the responsibility of the process that
 		 * busied the pages to deal with them.
 		 */
 		if ((m->oflags & VPO_BUSY) || (m->busy != 0))
 			continue;
 			
 		if (m->wire_count == 0) {
 			/*
 			 * Might as well free the page if we can and it has
 			 * no valid data.  We also free the page if the
 			 * buffer was used for direct I/O
 			 *
 			 * The first case is skipped for B_ASYNC buffers.
 			 */
 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
 			    m->hold_count == 0) {
 				vm_page_free(m);
 			} else if (bp->b_flags & B_DIRECT) {
 				vm_page_try_to_free(m);
 			} else if (vm_page_count_severe()) {
 				vm_page_try_to_cache(m);
 			}
 		}
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 	/* b_npages is still the old count here; it is reset below. */
 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 	
 	if (bp->b_bufsize) {
 		bufspacewakeup();
 		bp->b_bufsize = 0;
 	}
 	bp->b_npages = 0;
 	bp->b_flags &= ~B_VMIO;
 	if (bp->b_vp)
 		brelvp(bp);
 }
 
 /*
  * Decide whether the buffer at logical block lblkno of vp can join a
  * clustered write.
  *
  * Returns 1 only when the buffer is in core, can be locked without
  * sleeping, is a valid clusterable delayed write (B_DELWRI and
  * B_CLUSTEROK set, B_INVAL clear), has a b_bufsize equal to size, has
  * been mapped (b_blkno differs from b_lblkno) and sits at disk block
  * blkno.  Returns 0 otherwise.  The buffer is unlocked again before
  * returning.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *cand;
 	int usable;
 
 	/* Not in core: nothing to cluster with. */
 	cand = gbincore(&vp->v_bufobj, lblkno);
 	if (cand == NULL)
 		return (0);
 
 	/* Busy: do not wait for it. */
 	if (BUF_LOCK(cand, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	usable = ((cand->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 	    (B_DELWRI | B_CLUSTEROK)) &&
 	    (cand->b_bufsize == size) &&
 	    (cand->b_blkno != cand->b_lblkno) &&
 	    (cand->b_blkno == blkno);
 
 	BUF_UNLOCK(cand);
 	return (usable);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better than the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  *
  *	Returns the value of cluster_wbuild() when a cluster of more than
  *	one block was found, otherwise bp->b_bufsize for the single block
  *	that is written asynchronously.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather than at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		VI_LOCK(vp);
 		/* Forward scan: i ends as 1 (bp itself) + following matches. */
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		/*
 		 * Backward scan, bounded by the cluster limit and by block 0.
 		 * j ends one past the number of preceding matches.
 		 */
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		VI_UNLOCK(vp);
 		/* Undo the overshoot so j is the count of preceding blocks. */
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			/* bp is unlocked before cluster_wbuild() is called. */
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 			return nwritten;
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 *
 	 * The size is sampled before bwrite() is called.
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return nwritten;
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers 
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
  *	buffer away, the caller must set B_INVAL prior to calling brelse().
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_map is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
  *	Instead we ask the buf daemon to do it for us.  We attempt to
  *	avoid piecemeal wakeups of the pageout daemon.
  */
 
 static struct buf *
 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 {
 	struct buf *bp;
 	struct buf *nbp;
 	int defrag = 0;
 	int nqindex;
 	static int flushingbufs;
 
 	/*
 	 * We can't afford to block since we might be holding a vnode lock,
 	 * which may prevent system daemons from running.  We deal with
 	 * low-memory situations by proactively returning memory and running
 	 * async I/O rather than sync I/O.
 	 */
 
 	atomic_add_int(&getnewbufcalls, 1);
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
 	atomic_add_int(&getnewbufrestarts, 1);
 
 	/*
 	 * Setup for scan.  If we do not have enough free buffers,
 	 * we setup a degenerate case that immediately fails.  Note
 	 * that if we are specially marked process, we are allowed to
 	 * dip into our reserves.
 	 *
 	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 	 *
 	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
 	 * However, there are a number of cases (defragging, reusing, ...)
 	 * where we cannot backup.
 	 */
 	mtx_lock(&bqlock);
 	nqindex = QUEUE_EMPTYKVA;
 	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 
 	if (nbp == NULL) {
 		/*
 		 * If no EMPTYKVA buffers and we are either
 		 * defragging or reusing, locate a CLEAN buffer
 		 * to free or reuse.  If bufspace usage is low
 		 * skip this step so we can allocate a new buffer.
 		 */
 		if (defrag || bufspace >= lobufspace) {
 			nqindex = QUEUE_CLEAN;
 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 		}
 
 		/*
 		 * If we could not find or were not allowed to reuse a
 		 * CLEAN buffer, check to see if it is ok to use an EMPTY
 		 * buffer.  We can only use an EMPTY buffer if allocating
 		 * its KVA would not otherwise run us out of buffer space.
 		 */
 		if (nbp == NULL && defrag == 0 &&
 		    bufspace + maxsize < hibufspace) {
 			nqindex = QUEUE_EMPTY;
 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 		}
 	}
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 
 	while ((bp = nbp) != NULL) {
 		int qindex = nqindex;
 
 		/*
 		 * Calculate next bp ( we can only use it if we do not block
 		 * or do other fancy things ).
 		 */
 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 			switch(qindex) {
 			case QUEUE_EMPTY:
 				nqindex = QUEUE_EMPTYKVA;
 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_EMPTYKVA:
 				nqindex = QUEUE_CLEAN;
 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_CLEAN:
 				/*
 				 * nbp is NULL. 
 				 */
 				break;
 			}
 		}
 		/*
 		 * If we are defragging then we need a buffer with 
 		 * b_kvasize != 0.  XXX this situation should no longer
 		 * occur, if defrag is non-zero the buffer's b_kvasize
 		 * should also be non-zero at this point.  XXX
 		 */
 		if (defrag && bp->b_kvasize == 0) {
 			printf("Warning: defrag empty buffer %p\n", bp);
 			continue;
 		}
 
 		/*
 		 * Start freeing the bp.  This is somewhat involved.  nbp
 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 		if (bp->b_vp) {
 			BO_LOCK(bp->b_bufobj);
 			if (bp->b_vflags & BV_BKGRDINPROG) {
 				BO_UNLOCK(bp->b_bufobj);
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			BO_UNLOCK(bp->b_bufobj);
 		}
 		CTR6(KTR_BUF,
 		    "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
 		    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
 		    bp->b_kvasize, bp->b_bufsize, qindex);
 
 		/*
 		 * Sanity Checks
 		 */
 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
 
 		/*
 		 * Note: we no longer distinguish between VMIO and non-VMIO
 		 * buffers.
 		 */
 
 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
 
 		bremfreel(bp);
 		mtx_unlock(&bqlock);
 
 		if (qindex == QUEUE_CLEAN) {
 			if (bp->b_flags & B_VMIO) {
 				bp->b_flags &= ~B_ASYNC;
 				vfs_vmio_release(bp);
 			}
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 *
 		 * Get the rest of the buffer freed up.  b_kva* is still
 		 * valid after this operation.
 		 */
 
 		if (bp->b_rcred != NOCRED) {
 			crfree(bp->b_rcred);
 			bp->b_rcred = NOCRED;
 		}
 		if (bp->b_wcred != NOCRED) {
 			crfree(bp->b_wcred);
 			bp->b_wcred = NOCRED;
 		}
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 3");
 		KASSERT(bp->b_vp == NULL,
 		    ("bp: %p still has vnode %p.  qindex: %d",
 		    bp, bp->b_vp, qindex));
 		KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 		   ("bp: %p still on a buffer list. xflags %X",
 		    bp, bp->b_xflags));
 
 		if (bp->b_bufsize)
 			allocbuf(bp, 0);
 
 		bp->b_flags = 0;
 		bp->b_ioflags = 0;
 		bp->b_xflags = 0;
 		bp->b_vflags = 0;
 		bp->b_vp = NULL;
 		bp->b_blkno = bp->b_lblkno = 0;
 		bp->b_offset = NOOFFSET;
 		bp->b_iodone = 0;
 		bp->b_error = 0;
 		bp->b_resid = 0;
 		bp->b_bcount = 0;
 		bp->b_npages = 0;
 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 		bp->b_bufobj = NULL;
 		bp->b_pin_count = 0;
 		bp->b_fsprivate1 = NULL;
 		bp->b_fsprivate2 = NULL;
 		bp->b_fsprivate3 = NULL;
 
 		LIST_INIT(&bp->b_dep);
 
 		/*
 		 * If we are defragging then free the buffer.
 		 */
 		if (defrag) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			defrag = 0;
 			goto restart;
 		}
 
 		/*
 		 * Notify any waiters for the buffer lock about
 		 * identity change by freeing the buffer.
 		 */
 		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			goto restart;
 		}
 
 		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
 		 * processes are blocked in getnewbuf() or allocbuf().
 		 */
 		if (bufspace >= hibufspace)
 			flushingbufs = 1;
 		if (flushingbufs && bp->b_kvasize != 0) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			goto restart;
 		}
 		if (bufspace < lobufspace)
 			flushingbufs = 0;
 		break;
 	}
 
 	/*
 	 * If we exhausted our list, sleep as appropriate.  We may have to
 	 * wakeup various daemons and write out some dirty buffers.
 	 *
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
 
 	if (bp == NULL) {
 		int flags;
 		char *waitmsg;
 
 		if (defrag) {
 			flags = VFS_BIO_NEED_BUFSPACE;
 			waitmsg = "nbufkv";
 		} else if (bufspace >= hibufspace) {
 			waitmsg = "nbufbs";
 			flags = VFS_BIO_NEED_BUFSPACE;
 		} else {
 			waitmsg = "newbuf";
 			flags = VFS_BIO_NEED_ANY;
 		}
 		mtx_lock(&nblock);
 		needsbuffer |= flags;
 		mtx_unlock(&nblock);
 		mtx_unlock(&bqlock);
 
 		bd_speedup();	/* heeeelp */
 
 		mtx_lock(&nblock);
 		while (needsbuffer & flags) {
 			if (msleep(&needsbuffer, &nblock,
 			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
 				mtx_unlock(&nblock);
 				return (NULL);
 			}
 		}
 		mtx_unlock(&nblock);
 	} else {
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
 		 * to keep fragmentation sane we only allocate kva in
 		 * BKVASIZE chunks.
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 		if (maxsize != bp->b_kvasize) {
 			vm_offset_t addr = 0;
 
 			bfreekva(bp);
 
 			vm_map_lock(buffer_map);
 			if (vm_map_findspace(buffer_map,
 				vm_map_min(buffer_map), maxsize, &addr)) {
 				/*
 				 * Uh oh.  Buffer map is to fragmented.  We
 				 * must defragment the map.
 				 */
 				atomic_add_int(&bufdefragcnt, 1);
 				vm_map_unlock(buffer_map);
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
 				brelse(bp);
 				goto restart;
 			}
 			if (addr) {
 				vm_map_insert(buffer_map, NULL, 0,
 					addr, addr + maxsize,
 					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 
 				bp->b_kvabase = (caddr_t) addr;
 				bp->b_kvasize = maxsize;
 				atomic_add_int(&bufspace, bp->b_kvasize);
 				atomic_add_int(&bufreusecnt, 1);
 			}
 			vm_map_unlock(buffer_map);
 		}
 		bp->b_saveaddr = bp->b_kvabase;
 		bp->b_data = bp->b_saveaddr;
 	}
 	return(bp);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: the daemon
  * is named "bufdaemon" and its entry point is buf_daemon().
  * NOTE(review): bufdaemonproc is presumably the slot that receives the
  * created process pointer -- confirm against struct kproc_desc.
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
 
 /*
  * Body of the buffer flushing daemon.  Never returns: each pass clears
  * bd_request, flushes dirty buffers through flushbufqueues() until
  * numdirtybuffers has dropped to lodirtybuffers or nothing more could
  * be flushed, and then sleeps on &bd_request with bdlock held.
  */
 static void
 buf_daemon()
 {
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
 	    SHUTDOWN_PRI_LAST);
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF;
 	mtx_lock(&bdlock);
 	for (;;) {
 		/* bdlock is held at the top of every iteration. */
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
 
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.  Wakeup any waiting processes before we
 		 * normally would so they can run in parallel with our drain.
 		 */
 		while (numdirtybuffers > lodirtybuffers) {
 			int flushed;
 
 			/* First pass: skip buffers with rollback dependencies. */
 			flushed = flushbufqueues(QUEUE_DIRTY, 0);
 			/* The list empty check here is slightly racy */
 			if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
 				/* The Giant queue is flushed with Giant held. */
 				mtx_lock(&Giant);
 				flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
 				mtx_unlock(&Giant);
 			}
 			if (flushed == 0) {
 				/*
 				 * Could not find any buffers without rollback
 				 * dependencies, so just write the first one
 				 * in the hopes of eventually making progress.
 				 */
 				flushbufqueues(QUEUE_DIRTY, 1);
 				if (!TAILQ_EMPTY(
 				    &bufqueues[QUEUE_DIRTY_GIANT])) {
 					mtx_lock(&Giant);
 					flushbufqueues(QUEUE_DIRTY_GIANT, 1);
 					mtx_unlock(&Giant);
 				}
 				break;
 			}
 			uio_yield();
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second (hz ticks)
 		 * and then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep a tenth of a second
 		 * (hz / 10 ticks) before trying again.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush a buffer in the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of writing them, which NFS is
  *	particularly sensitive to.
  */
 /*
  * Running count of buffers that flushbufqueues() wrote even though
  * buf_countdeps() reported dependencies on them.  Exported read/write
  * under the _vfs sysctl node.
  */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependencies that require rollbacks");
 
 /*
  * Scan bufqueues[queue] once and start writes on (or release) dirty
  * buffers.
  *
  *	queue		index into bufqueues[] of the queue to scan.
  *	flushdeps	when zero, buffers for which buf_countdeps() reports
  *			dependencies are skipped; when non-zero they are
  *			written anyway and the target is halved (if > 2).
  *
  * Returns the number of buffers released or written.  bqlock is taken
  * on entry and dropped before returning; it is also dropped around
  * each brelse()/vfs_bio_awrite() call.
  */
 static int
 flushbufqueues(int queue, int flushdeps)
 {
 	struct thread *td = curthread;
 	struct buf sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int target;
 
 	target = numdirtybuffers - lodirtybuffers;
 	if (flushdeps && target > 2)
 		target /= 2;
 	flushed = 0;
 	bp = NULL;
 	mtx_lock(&bqlock);
 	/*
 	 * The on-stack sentinel marks the original end of the queue.
 	 * Every buffer examined is rotated to the tail (behind the
 	 * sentinel), so reaching the sentinel at the head means each
 	 * buffer that was queued on entry has been looked at once.
 	 */
 	TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
 	while (flushed != target) {
 		bp = TAILQ_FIRST(&bufqueues[queue]);
 		if (bp == &sentinel)
 			break;
 		TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
 		TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
 
 		/* Never sleep for a buffer lock while holding bqlock. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 		/* Pinned buffers are left alone. */
 		if (bp->b_pin_count > 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_LOCK(bp->b_bufobj);
 		/* Skip buffers with a background write in progress or not dirty. */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BO_UNLOCK(bp->b_bufobj);
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_UNLOCK(bp->b_bufobj);
 		/* Invalid buffers are released rather than written. */
 		if (bp->b_flags & B_INVAL) {
 			bremfreel(bp);
 			mtx_unlock(&bqlock);
 			brelse(bp);
 			flushed++;
 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 			mtx_lock(&bqlock);
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
+		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 			mtx_unlock(&bqlock);
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			vfs_bio_awrite(bp);
 			vn_finished_write(mp);
 			VOP_UNLOCK(vp, 0, td);
 			flushwithdeps += hasdeps;
 			flushed++;
 			waitrunningbufspace();
 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 			mtx_lock(&bqlock);
 			continue;
 		}
 		/* Could not get the vnode lock without sleeping; back out. */
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
 	mtx_unlock(&bqlock);
 	return (flushed);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *found;
 
 	/* gbincore() is called with the bufobj lock held. */
 	BO_LOCK(bo);
 	found = gbincore(bo, blkno);
 	BO_UNLOCK(bo);
 
 	return (found);
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 
 static int
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_ooffset_t base;
 	vm_offset_t pos, step, chunk;
 	vm_object_t object;
 	vm_page_t pg;
 	int resident;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	/* A buffer already in core answers the question immediately. */
 	if (incore(&vp->v_bufobj, blkno))
 		return (1);
 	if (vp->v_mount == NULL)
 		return (0);
 	if ((object = vp->v_object) == NULL)
 		return (0);
 
 	/* Examine the block in pieces of at most one page. */
 	chunk = PAGE_SIZE;
 	if (chunk > vp->v_mount->mnt_stat.f_iosize)
 		chunk = vp->v_mount->mnt_stat.f_iosize;
 	base = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	/*
 	 * The block is resident only if every piece is backed by a page
 	 * that is valid over that piece's range.
 	 */
 	resident = 1;
 	VM_OBJECT_LOCK(object);
 	pos = 0;
 	while (pos < vp->v_mount->mnt_stat.f_iosize) {
 		pg = vm_page_lookup(object, OFF_TO_IDX(base + pos));
 		if (pg == NULL) {
 			resident = 0;
 			break;
 		}
 		/* Never let a piece run past the end of its page. */
 		step = chunk;
 		if (step > PAGE_SIZE - ((pos + base) & PAGE_MASK))
 			step = PAGE_SIZE - ((pos + base) & PAGE_MASK);
 		if (vm_page_is_valid(pg,
 		    (vm_offset_t) ((pos + base) & PAGE_MASK), step) == 0) {
 			resident = 0;
 			break;
 		}
 		pos += step;
 	}
 	VM_OBJECT_UNLOCK(object);
 
 	return (resident);
 }
 
 /*
  *	vfs_setdirty:
  *
  *	Sets the dirty range for a buffer based on the status of the dirty
  *	bits in the pages comprising the buffer.
  *
  *	The range is limited to the size of the buffer.
  *
  *	This routine is primarily used by NFS, but is generalized for the
  *	B_VMIO case.
  */
 static void
 vfs_setdirty(struct buf *bp) 
 {
 
 	/*
 	 * Nothing to do for an empty buffer, or for one that is not
 	 * backed by a VM object (non-VMIO).
 	 */
 	if (bp->b_bufsize != 0 && (bp->b_flags & B_VMIO) != 0) {
 		/* The worker expects the VM object lock to be held. */
 		VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 		vfs_setdirty_locked_object(bp);
 		VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 	}
 }
 
 /*
  * Worker for vfs_setdirty(); the caller holds the VM object lock.
  * Widens bp's b_dirtyoff/b_dirtyend to cover the span between the
  * first and last dirty pages of the buffer.
  */
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_offset_t first, last;
 	vm_object_t object;
 	int idx;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Only scan when the object may hold modified pages. */
 	if ((object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) == 0)
 		return;
 
 	vm_page_lock_queues();
 
 	/*
 	 * Refresh each page's dirty state in case users modified the
 	 * pages directly through the VM system.
 	 */
 	for (idx = 0; idx < bp->b_npages; idx++)
 		vm_page_test_dirty(bp->b_pages[idx]);
 
 	/* Byte offset, relative to the buffer, of the first dirty page. */
 	idx = 0;
 	while (idx < bp->b_npages && !bp->b_pages[idx]->dirty)
 		idx++;
 	first = (idx << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 	/* Byte offset just past the last dirty page. */
 	idx = bp->b_npages - 1;
 	while (idx >= 0 && !bp->b_pages[idx]->dirty)
 		idx--;
 	last = ((idx + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 	vm_page_unlock_queues();
 
 	/* Clip the range to the buffer. */
 	if (last > bp->b_bcount)
 		last = bp->b_bcount;
 
 	/* A non-empty range is merged into the existing dirty range. */
 	if (first < last) {
 		if (first < bp->b_dirtyoff)
 			bp->b_dirtyoff = first;
 		if (last > bp->b_dirtyend)
 			bp->b_dirtyend = last;
 	}
 }
 
 /*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *	
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
 /*
  * Arguments:
  *	vp		vnode whose v_bufobj is searched; must be locked.
  *	blkno		logical block number to look up or create.
  *	size		requested byte count; panics if > MAXBSIZE.
  *	slpflag,
  *	slptimeo	passed through to BUF_TIMELOCK() and getnewbuf().
  *	flags		GB_LOCK_NOWAIT adds LK_NOWAIT to the buffer lock
  *			request; GB_NOCREAT returns NULL instead of
  *			creating a buffer that is not already in core.
  *
  * Returns the locked buffer, or NULL when: the lock attempt fails with
  * an error other than ENOLCK, GB_NOCREAT is set and the block is not
  * in core, getnewbuf() fails while slpflag or slptimeo is non-zero,
  * the idle thread finds no free buffers, or a pinned delayed-write
  * buffer of the wrong size is found with GB_LOCK_NOWAIT set.
  */
 struct buf *
 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	int error;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
 
 	bo = &vp->v_bufobj;
 loop:
 	/*
 	 * Block if we are low on buffers.   Certain processes are allowed
 	 * to completely exhaust the buffer cache.
 	 *
 	 * If this check ever becomes a bottleneck it may be better to
 	 * move it into the else, when gbincore() fails.  At the moment
 	 * it isn't a problem.
 	 *
 	 * XXX remove if 0 sections (clean this up after its proven)
 	 */
 	if (numfreebuffers == 0) {
 		if (TD_IS_IDLETHREAD(curthread))
 			return NULL;
 		/* Only records the need; this path does not sleep here. */
 		mtx_lock(&nblock);
 		needsbuffer |= VFS_BIO_NEED_ANY;
 		mtx_unlock(&nblock);
 	}
 
 	BO_LOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy, it must
 		 * be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if (flags & GB_LOCK_NOWAIT)
 			lockflags |= LK_NOWAIT;
 
 		/*
 		 * LK_INTERLOCK hands VI_MTX(vp) to the lock call.
 		 * NOTE(review): this presumably releases the mutex taken
 		 * by BO_LOCK(bo) above -- confirm BO_LOCK's definition.
 		 */
 		error = BUF_TIMELOCK(bp, lockflags,
 		    VI_MTX(vp), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		bremfree(bp);
 
 		/*
 		 * Check for size inconsistencies in the non-VMIO case (or
 		 * a VMIO buffer whose KVA is too small): such a buffer
 		 * cannot be resized in place, so it is written or released
 		 * and the lookup is restarted.
 		 */
 
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					/*
 					 * If the buffer is pinned and the
 					 * caller does not want to sleep
 					 * waiting for it to be unpinned,
 					 * bail out.
 					 */
 					if (bp->b_pin_count > 0) {
 						if (flags & GB_LOCK_NOWAIT) {
 							bqrelse(bp);
 							return (NULL);
 						} else {
 							bunpin_wait(bp);
 						}
 					}
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 
 		if (bp->b_bcount != size)
 			allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due 
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		int bsize, maxsize, vmio;
 		off_t offset;
 
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_UNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
 		bsize = bo->bo_bsize;
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		/*
 		 * A VMIO buffer needs room for the leading part of the
 		 * first page when the block does not start page-aligned.
 		 */
 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(slpflag, slptimeo, size, maxsize);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 #if defined(VFS_BIO_DEBUG)
 			if (vn_canvmio(vp) != TRUE)
 				printf("getblk: VMIO on vnode type %d\n",
 					vp->v_type);
 #endif
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 		}
 
 		allocbuf(bp, size);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp));
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
 }
 
 /*
  * Get an empty, disassociated buffer of given size.  The buffer is initially
  * set to B_INVAL.
  */
 struct buf *
 geteblk(int size)
 {
 	struct buf *bp;
 	int maxsize;
 
 	/* Round the KVA request up to a multiple of BKVAMASK + 1. */
 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 
 	/* Retry until getnewbuf() hands back a buffer. */
 	do {
 		bp = getnewbuf(0, 0, size, maxsize);
 	} while (bp == 0);
 
 	allocbuf(bp, size);
 	/* b_dep was already cleared by getnewbuf(). */
 	bp->b_flags |= B_INVAL;
 	KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp));
 	return (bp);
 }
 
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!! 
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  */
 
 /*
  * Resize the data area of bp to hold `size' bytes.
  *
  *	bp	buffer to resize; panics if BUF_REFCNT(bp) is zero or
  *		if bp->b_kvasize is smaller than size.
  *	size	new requested byte count (stored in b_bcount).
  *
  * Always returns 1.  On return b_bcount is size; b_bufsize is the
  * rounded allocation size, except on the two early-return B_MALLOC
  * paths, which set these fields themselves.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize, mbsize;
 	int i;
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("allocbuf: buffer not busy");
 
 	if (bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	if ((bp->b_flags & B_VMIO) == 0) {
 		caddr_t origbuf;
 		int origbufsize;
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		/* malloc'ed buffers round to DEV_BSIZE, others to a page. */
 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		if (bp->b_flags & B_MALLOC)
 			newbsize = mbsize;
 		else
 			newbsize = round_page(size);
 
 		if (newbsize < bp->b_bufsize) {
 			/*
 			 * malloced buffers are not shrunk: either only
 			 * b_bcount is lowered, or (new size zero) the
 			 * allocation is freed outright.
 			 */
 			if (bp->b_flags & B_MALLOC) {
 				if (newbsize) {
 					bp->b_bcount = size;
 				} else {
 					free(bp->b_data, M_BIOBUF);
 					if (bp->b_bufsize) {
 						atomic_subtract_int(
 						    &bufmallocspace,
 						    bp->b_bufsize);
 						bufspacewakeup();
 						bp->b_bufsize = 0;
 					}
 					bp->b_saveaddr = bp->b_kvabase;
 					bp->b_data = bp->b_saveaddr;
 					bp->b_bcount = 0;
 					bp->b_flags &= ~B_MALLOC;
 				}
 				return 1;
 			}		
 			vm_hold_free_pages(
 			    bp,
 			    (vm_offset_t) bp->b_data + newbsize,
 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
 		} else if (newbsize > bp->b_bufsize) {
 			/*
 			 * We only use malloced memory on the first allocation.
 			 * and revert to page-allocated memory when the buffer
 			 * grows.
 			 */
 			/*
 			 * There is a potential smp race here that could lead
 			 * to bufmallocspace slightly passing the max.  It
 			 * is probably extremely rare and not worth worrying
 			 * over.
 			 */
 			if ( (bufmallocspace < maxbufmallocspace) &&
 				(bp->b_bufsize == 0) &&
 				(mbsize <= PAGE_SIZE/2)) {
 
 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 				bp->b_bufsize = mbsize;
 				bp->b_bcount = size;
 				bp->b_flags |= B_MALLOC;
 				atomic_add_int(&bufmallocspace, mbsize);
 				return 1;
 			}
 			origbuf = NULL;
 			origbufsize = 0;
 			/*
 			 * If the buffer is growing on its other-than-first allocation,
 			 * then we revert to the page-allocation scheme.
 			 */
 			if (bp->b_flags & B_MALLOC) {
 				/* Keep the old data so it can be copied below. */
 				origbuf = bp->b_data;
 				origbufsize = bp->b_bufsize;
 				bp->b_data = bp->b_kvabase;
 				if (bp->b_bufsize) {
 					atomic_subtract_int(&bufmallocspace,
 					    bp->b_bufsize);
 					bufspacewakeup();
 					bp->b_bufsize = 0;
 				}
 				bp->b_flags &= ~B_MALLOC;
 				newbsize = round_page(newbsize);
 			}
 			vm_hold_load_pages(
 			    bp,
 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 			    (vm_offset_t) bp->b_data + newbsize);
 			if (origbuf) {
 				bcopy(origbuf, bp->b_data, origbufsize);
 				free(origbuf, M_BIOBUF);
 			}
 		}
 	} else {
 		int desiredpages;
 
 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		/* Pages needed, counting the offset into the first page. */
 		desiredpages = (size == 0) ? 0 :
 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize) {
 			/*
 			 * DEV_BSIZE aligned new buffer size is less than the
 			 * DEV_BSIZE aligned existing buffer size.  Figure out
 			 * if we have to remove any pages.
 			 */
 			if (desiredpages < bp->b_npages) {
 				vm_page_t m;
 
 				VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 				vm_page_lock_queues();
 				for (i = desiredpages; i < bp->b_npages; i++) {
 					/*
 					 * the page is not freed here -- it
 					 * is the responsibility of 
 					 * vnode_pager_setsize
 					 */
 					m = bp->b_pages[i];
 					KASSERT(m != bogus_page,
 					    ("allocbuf: bogus page found"));
 					/*
 					 * The page queues lock is re-taken
 					 * after each sleep on a busy page.
 					 */
 					while (vm_page_sleep_if_busy(m, TRUE, "biodep"))
 						vm_page_lock_queues();
 
 					bp->b_pages[i] = NULL;
 					vm_page_unwire(m, 0);
 				}
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 				/* Drop the KVA mappings of the removed pages. */
 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 				bp->b_npages = desiredpages;
 			}
 		} else if (size > bp->b_bcount) {
 			/*
 			 * We are growing the buffer, possibly in a 
 			 * byte-granular fashion.
 			 */
 			struct vnode *vp;
 			vm_object_t obj;
 			vm_offset_t toff;
 			vm_offset_t tinc;
 
 			/*
 			 * Step 1, bring in the VM pages from the object, 
 			 * allocating them if necessary.  We must clear
 			 * B_CACHE if these pages are not valid for the 
 			 * range covered by the buffer.
 			 */
 
 			vp = bp->b_vp;
 			obj = bp->b_bufobj->bo_object;
 
 			VM_OBJECT_LOCK(obj);
 			while (bp->b_npages < desiredpages) {
 				vm_page_t m;
 				vm_pindex_t pi;
 
 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
 					/*
 					 * note: must allocate system pages
 					 * since blocking here could interfere
 					 * with paging I/O, no matter which
 					 * process we are.
 					 */
 					m = vm_page_alloc(obj, pi,
 					    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
 					    VM_ALLOC_WIRED);
 					if (m == NULL) {
 						/*
 						 * No page available: record
 						 * the shortfall, wait with
 						 * the object unlocked, retry.
 						 */
 						atomic_add_int(&vm_pageout_deficit,
 						    desiredpages - bp->b_npages);
 						VM_OBJECT_UNLOCK(obj);
 						VM_WAIT;
 						VM_OBJECT_LOCK(obj);
 					} else {
 						if (m->valid == 0)
 							bp->b_flags &= ~B_CACHE;
 						bp->b_pages[bp->b_npages] = m;
 						++bp->b_npages;
 					}
 					continue;
 				}
 
 				/*
 				 * We found a page.  If we have to sleep on it,
 				 * retry because it might have gotten freed out
 				 * from under us.
 				 *
 				 * We can only test VPO_BUSY here.  Blocking on
 				 * m->busy might lead to a deadlock:
 				 *
 				 *  vm_fault->getpages->cluster_read->allocbuf
 				 *
 				 */
 				if (vm_page_sleep_if_busy(m, FALSE, "pgtblk"))
 					continue;
 
 				/*
 				 * We have a good page.
 				 */
 				vm_page_lock_queues();
 				vm_page_wire(m);
 				vm_page_unlock_queues();
 				bp->b_pages[bp->b_npages] = m;
 				++bp->b_npages;
 			}
 
 			/*
 			 * Step 2.  We've loaded the pages into the buffer,
 			 * we have to figure out if we can still have B_CACHE
 			 * set.  Note that B_CACHE is set according to the
 			 * byte-granular range ( bcount and size ), not the
 			 * aligned range ( newbsize ).
 			 *
 			 * The VM test is against m->valid, which is DEV_BSIZE
 			 * aligned.  Needless to say, the validity of the data
 			 * needs to also be DEV_BSIZE aligned.  Note that this
 			 * fails with NFS if the server or some other client
 			 * extends the file's EOF.  If our buffer is resized, 
 			 * B_CACHE may remain set! XXX
 			 */
 
 			toff = bp->b_bcount;
 			/* First step only runs to the end of the current page. */
 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 
 			while ((bp->b_flags & B_CACHE) && toff < size) {
 				vm_pindex_t pi;
 
 				if (tinc > (size - toff))
 					tinc = size - toff;
 
 				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 				    PAGE_SHIFT;
 
 				vfs_buf_test_cache(
 				    bp, 
 				    bp->b_offset,
 				    toff, 
 				    tinc, 
 				    bp->b_pages[pi]
 				);
 				toff += tinc;
 				tinc = PAGE_SIZE;
 			}
 			VM_OBJECT_UNLOCK(obj);
 
 			/*
 			 * Step 3, fixup the KVM pmap.  Remember that
 			 * bp->b_data is relative to bp->b_offset, but 
 			 * bp->b_offset may be offset into the first page.
 			 */
 
 			bp->b_data = (caddr_t)
 			    trunc_page((vm_offset_t)bp->b_data);
 			pmap_qenter(
 			    (vm_offset_t)bp->b_data,
 			    bp->b_pages, 
 			    bp->b_npages
 			);
 			
 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 		}
 	}
 	/* The buffer shrank; call bufspacewakeup(). */
 	if (newbsize < bp->b_bufsize)
 		bufspacewakeup();
 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
 	bp->b_bcount = size;		/* requested buffer size	*/
 	return 1;
 }
 
 /*
  * Mark a bio as done.  If it has a completion callback, run it after
  * dropping bdonelock; otherwise wake up any thread sleeping on the bio.
  */
 void
 biodone(struct bio *bp)
 {
 	void (*callback)(struct bio *);
 
 	mtx_lock(&bdonelock);
 	bp->bio_flags |= BIO_DONE;
 	callback = bp->bio_done;
 	if (callback == NULL) {
 		/* No callback: wake sleepers while still holding the lock. */
 		wakeup(bp);
 		mtx_unlock(&bdonelock);
 		return;
 	}
 	mtx_unlock(&bdonelock);
 
 	/* The callback runs without bdonelock held. */
 	callback(bp);
 }
 
 /*
  * Wait for a BIO to finish.
  *
  * XXX: resort to a timeout for now.  The optimal locking (if any) for this
  * case is not yet clear.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 
 	/* Poll for BIO_DONE, sleeping at most hz / 10 ticks per round. */
 	mtx_lock(&bdonelock);
 	for (;;) {
 		if ((bp->bio_flags & BIO_DONE) != 0)
 			break;
 		msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10);
 	}
 	mtx_unlock(&bdonelock);
 
 	/* A specific error code wins; BIO_ERROR alone maps to EIO. */
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	return ((bp->bio_flags & BIO_ERROR) ? EIO : 0);
 }
 
 /*
  * Complete a bio: record a non-zero error on it, end the devstat
  * transaction when a stat structure is supplied, then call biodone().
  */
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error != 0) {
 		bp->bio_flags |= BIO_ERROR;
 		bp->bio_error = error;
 	}
 
 	if (stat)
 		devstat_end_transaction_bio(stat, bp);
 
 	biodone(bp);
 }
 
 /*
  *	bufwait:
  *
  *	Wait for buffer I/O completion, returning error status.  The buffer
  *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
  *	error and cleared.  Otherwise, when BIO_ERROR is set, b_error is
  *	returned (or EIO if b_error is zero); with no error flagged the
  *	result is 0.
  */
 int
 bufwait(struct buf *bp)
 {
 	const char *wmesg;
 
 	/* The wait message distinguishes reads from everything else. */
 	wmesg = (bp->b_iocmd == BIO_READ) ? "biord" : "biowr";
 	bwait(bp, PRIBIO, wmesg);
 
 	if ((bp->b_flags & B_EINTR) != 0) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		return (0);
 	return (bp->b_error ? bp->b_error : EIO);
 }
 
  /*
   * Completion callback that carries the result of a struct bio back up
   * to the struct buf it was issued for (stored in bio_caller2): the
   * residual count, flags and error are copied over, bufdone() is run
   * on the buf, and the bio is destroyed.
   */
 static void
 bufdonebio(struct bio *bip)
 {
 	struct buf *owner;
 
 	owner = bip->bio_caller2;
 
 	/* The first store is immediately superseded by the second one. */
 	owner->b_resid = owner->b_bcount - bip->bio_completed;
 	owner->b_resid = bip->bio_resid;	/* XXX: remove */
 
 	owner->b_ioflags = bip->bio_flags;
 	owner->b_error = bip->bio_error;
 	if (owner->b_error != 0)
 		owner->b_ioflags |= BIO_ERROR;
 
 	bufdone(owner);
 	g_destroy_bio(bip);
 }
 
 /*
  *	dev_strategy:
  *
  *	Issue the I/O described by a struct buf to a character device by
  *	wrapping it in a freshly allocated struct bio and calling the
  *	device's d_strategy routine.  bufdonebio() is installed as the bio
  *	completion callback.  If the device's cdevsw cannot be referenced,
  *	the buf is completed immediately with ENXIO.
  */
 void
 dev_strategy(struct cdev *dev, struct buf *bp)
 {
 	struct bio *bip;
 	struct cdevsw *csw;
 
 	/* b_iocmd must be non-zero with exactly one bit set. */
 	if (bp->b_iocmd == 0 || (bp->b_iocmd & (bp->b_iocmd - 1)) != 0)
 		panic("b_iocmd botch");
 
 	/* If no bio is available, nap briefly and try again later. */
 	while ((bip = g_new_bio()) == NULL)
 		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
 
 	bip->bio_cmd = bp->b_iocmd;
 	bip->bio_offset = bp->b_iooffset;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
 	bip->bio_data = bp->b_data;
 	bip->bio_done = bufdonebio;
 	bip->bio_caller2 = bp;
 	bip->bio_dev = dev;
 	KASSERT(dev->si_refcount > 0,
 	    ("dev_strategy on un-referenced struct cdev *(%s)",
 	    devtoname(dev)));
 
 	csw = dev_refthread(dev);
 	if (csw != NULL) {
 		(*csw->d_strategy)(bip);
 		dev_relthread(dev);
 		return;
 	}
 
 	/* No cdevsw could be obtained: fail the request right here. */
 	g_destroy_bio(bip);
 	bp->b_error = ENXIO;
 	bp->b_ioflags = BIO_ERROR;
 	bufdone(bp);
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	If the buffer has a b_iodone callback, the callback is cleared from
  *	the buffer and invoked, and bufdone_finish() is NOT run here.
  *	Otherwise bufdone_finish() performs the standard completion work.
  *	For writes, the write reference on the buffer's bufobj is dropped
  *	afterwards in either case.
  *
  *	The completion path is also responsible for setting B_CACHE in a
  *	B_VMIO bp.  In a non-VMIO bp, B_CACHE will be set on the next
  *	getblk() assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	The completion path does not mess with B_INVAL, allowing the I/O
  *	routine or the initiator to leave B_INVAL set to brelse the buffer
  *	out of existence in the completion routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	void (*iodone_fn)(struct buf *);
 	struct bufobj *dropobj;
 
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 
 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
 	    BUF_REFCNT(bp)));
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 
 	runningbufwakeup(bp);
 
 	/* Remember the bufobj now; only writes hold a reference to drop. */
 	dropobj = (bp->b_iocmd == BIO_WRITE) ? bp->b_bufobj : NULL;
 
 	/* call optional completion function if requested */
 	iodone_fn = bp->b_iodone;
 	if (iodone_fn != NULL) {
 		bp->b_iodone = NULL;
 		(*iodone_fn)(bp);
 	} else
 		bufdone_finish(bp);
 
 	if (dropobj != NULL)
 		bufobj_wdrop(dropobj);
 }
 
 /*
  *	bufdone_finish:
  *
  *	Standard completion work for a buffer.  Runs buf_complete() when the
  *	buffer has entries on b_dep.  For B_VMIO buffers, walks b_pages[]
  *	with the VM object locked: bogus pages are replaced by the real page
  *	looked up in the object, pages are marked valid after reads, and for
  *	every page the page I/O is finished and the object's paging-in-
  *	progress count is dropped by one.  Finally, B_ASYNC buffers are
  *	released with brelse() or bqrelse(); all others are marked done via
  *	bdone().
  */
 void
 bufdone_finish(struct buf *bp)
 {
 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
 	    BUF_REFCNT(bp)));
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if (bp->b_flags & B_VMIO) {
 		int i;
 		vm_ooffset_t foff;
 		vm_page_t m;
 		vm_object_t obj;
 		int iosize;
 		struct vnode *vp = bp->b_vp;
 		boolean_t are_queues_locked;
 
 		obj = bp->b_bufobj->bo_object;
 
 #if defined(VFS_BIO_DEBUG)
 		mp_fixme("usecount and vflag accessed without locks.");
 		if (vp->v_usecount == 0) {
 			panic("biodone: zero vnode ref count");
 		}
 
 		KASSERT(vp->v_object != NULL,
 			("biodone: vnode %p has no vm_object", vp));
 #endif
 
 		foff = bp->b_offset;
 		KASSERT(bp->b_offset != NOOFFSET,
 		    ("biodone: no buffer offset"));
 
 		VM_OBJECT_LOCK(obj);
 #if defined(VFS_BIO_DEBUG)
 		if (obj->paging_in_progress < bp->b_npages) {
 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 			    obj->paging_in_progress, bp->b_npages);
 		}
 #endif
 
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
 		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		/* iosize: number of bytes actually transferred. */
 		iosize = bp->b_bcount - bp->b_resid;
 		if (bp->b_iocmd == BIO_READ &&
 		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
 		    !(bp->b_ioflags & BIO_ERROR)) {
 			bp->b_flags |= B_CACHE;
 		}
 		/*
 		 * The page queues lock is only taken for reads; that is the
 		 * only case in which vfs_page_set_valid() (which asserts the
 		 * lock is owned) is called below.
 		 */
 		if (bp->b_iocmd == BIO_READ) {
 			vm_page_lock_queues();
 			are_queues_locked = TRUE;
 		} else
 			are_queues_locked = FALSE;
 		for (i = 0; i < bp->b_npages; i++) {
 			int bogusflag = 0;
 			int resid;
 
 			/*
 			 * resid: bytes from foff to the next page boundary,
 			 * limited to what is left of the transfer.
 			 */
 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 			if (resid > iosize)
 				resid = iosize;
 
 			/*
 			 * cleanup bogus pages, restoring the originals
 			 */
 			m = bp->b_pages[i];
 			if (m == bogus_page) {
 				bogusflag = 1;
 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 				if (m == NULL)
 					panic("biodone: page disappeared!");
 				bp->b_pages[i] = m;
 				/* Re-enter the mapping so it shows the real page. */
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			}
 #if defined(VFS_BIO_DEBUG)
 			if (OFF_TO_IDX(foff) != m->pindex) {
 				printf(
 "biodone: foff(%jd)/m->pindex(%ju) mismatch\n",
 				    (intmax_t)foff, (uintmax_t)m->pindex);
 			}
 #endif
 
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we 
 			 * only need to do this here in the read case.
 			 */
 			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
 				vfs_page_set_valid(bp, foff, m);
 			}
 
 			/*
 			 * when debugging new filesystems or buffer I/O methods, this
 			 * is the most common error that pops up.  if you see this, you
 			 * have not set the page busy flag correctly!!!
 			 */
 			/*
 			 * NOTE(review): the messages below say "busy < 0" although
 			 * the condition tested is m->busy == 0.  In the argument
 			 * "(int) foff & 0xffffffff" the cast binds tighter than the
 			 * mask, so foff is converted to int before being masked.
 			 */
 			if (m->busy == 0) {
 				printf("biodone: page busy < 0, "
 				    "pindex: %d, foff: 0x(%x,%x), "
 				    "resid: %d, index: %d\n",
 				    (int) m->pindex, (int)(foff >> 32),
 						(int) foff & 0xffffffff, resid, i);
 				if (!vn_isdisk(vp, NULL))
 					printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n",
 					    (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize,
 					    (intmax_t) bp->b_lblkno,
 					    bp->b_flags, bp->b_npages);
 				else
 					printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n",
 					    (intmax_t) bp->b_lblkno,
 					    bp->b_flags, bp->b_npages);
 				printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n",
 				    (u_long)m->valid, (u_long)m->dirty,
 				    m->wire_count);
 				panic("biodone: page busy < 0\n");
 			}
 			vm_page_io_finish(m);
 			vm_object_pip_subtract(obj, 1);
 			/* Advance to the start of the next page. */
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 			iosize -= resid;
 		}
 		if (are_queues_locked)
 			vm_page_unlock_queues();
 		vm_object_pip_wakeupn(obj, 0);
 		VM_OBJECT_UNLOCK(obj);
 	}
 
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
 	 * here in the async case. The sync case always needs to do a wakeup.
 	 */
 
 	if (bp->b_flags & B_ASYNC) {
 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
 			brelse(bp);
 		else
 			bqrelse(bp);
 	} else
 		bdone(bp);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistant.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_LOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 			    bp->b_pages, bp->b_npages);
 		}
 		vm_object_pip_subtract(obj, 1);
 		vm_page_io_finish(m);
 	}
 	vm_object_pip_wakeupn(obj, 0);
 	VM_OBJECT_UNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Set the valid bits in a page based on the supplied offset.   The
  *	range is restricted to the buffer's size.
  *
  *	The range starts at 'off' and runs to the next page boundary, or to
  *	b_offset + b_bcount if that comes first.  Nothing is done when the
  *	resulting range is empty.  The page queues mutex must be held.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t span_end;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * The span may neither cross a page boundary nor run past the end
 	 * of the buffer.  The end of the buffer, in this case, is our file
 	 * EOF (b_bcount), not the allocation size of the buffer.
 	 */
 	span_end = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	if (span_end > bp->b_offset + bp->b_bcount)
 		span_end = bp->b_offset + bp->b_bcount;
 
 	/* An empty span leaves the page untouched. */
 	if (span_end <= off)
 		return;
 
 	/* Typically this covers the entire buffer and thus the entire page. */
 	vm_page_set_validclean(m, (vm_offset_t) (off & PAGE_MASK),
 	    (vm_offset_t) (span_end - off));
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being VPO_BUSY.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  *
  * bp:		buffer about to be handed to the strategy routine; nothing
  *		is done unless B_VMIO is set.
  * clear_modify:	non-zero to mark the buffer's pages valid via
  *		vfs_page_set_valid(); zero (the read case) to substitute
  *		bogus_page for fully valid pages when B_CACHE is clear.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	int i, bogus;
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_LOCK(obj);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	/*
 	 * Rescan from the first page whenever vm_page_sleep_if_busy()
 	 * reports that it slept.
 	 */
 retry:
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		if (vm_page_sleep_if_busy(m, FALSE, "vbpage"))
 			goto retry;
 	}
 	/* bogus counts the b_pages[] slots replaced with bogus_page. */
 	bogus = 0;
 	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		/* Per-page I/O accounting is skipped for B_CLUSTER buffers. */
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_io_start(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in 
 		 * partially instantiated buffers.  Partially 
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		pmap_remove_all(m);
 		if (clear_modify)
 			vfs_page_set_valid(bp, foff, m);
 		else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus++;
 		}
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(obj);
 	/* Remap the buffer only if at least one page was substituted. */
 	if (bogus)
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 }
 
 /*
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intervention.
  *
  * Each page of a B_VMIO buffer is passed to vfs_page_set_valid(), which
  * marks the range from the current offset up to the next page boundary
  * (limited to b_offset + b_bcount) valid and clean.  Buffers without
  * B_VMIO are left untouched.
  */
 static void
 vfs_clean_pages(struct buf *bp)
 {
 	int i;
 	vm_ooffset_t foff;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages: no buffer offset"));
 	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 	/* vfs_page_set_valid() asserts that the page queues lock is held. */
 	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		vfs_page_set_valid(bp, foff, m);
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_set_validclean:
  *
  *	Set the range within the buffer to valid and clean.  The range is 
  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
  *	itself may be offset from the beginning of the first page.
  *
  *	Does nothing for buffers without B_VMIO.  The walk stops when 'size'
  *	bytes have been covered or the buffer's pages run out.
  */
 
 void
 vfs_bio_set_validclean(struct buf *bp, int base, int size)
 {
 	vm_page_t pg;
 	int chunk, idx;
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
 	/*
 	 * Make base relative to the start of the first page, and let the
 	 * first chunk be whatever remains of the page base falls in.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 
 	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 	vm_page_lock_queues();
 	idx = base / PAGE_SIZE;
 	while (size > 0 && idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		if (chunk > size)
 			chunk = size;
 		vm_page_set_validclean(pg, base & PAGE_MASK, chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first is covered from its start. */
 		chunk = PAGE_SIZE;
 		++idx;
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	clear a buffer.  This routine essentially fakes an I/O, so we need
  *	to clear BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  *
  *	Buffers that are not purely B_VMIO (B_VMIO clear, or B_MALLOC set)
  *	are handed to clrbuf() instead.  For VMIO buffers only the
  *	DEV_BSIZE-sized chunks whose valid bit is clear are zeroed, and the
  *	corresponding valid bits are then set.  b_resid is reset to 0.
  */
 
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask = 0;
 	caddr_t sa, ea;
 
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
 	/*
 	 * Special case: a single, page-aligned buffer smaller than a page.
 	 * If it cannot be settled here, control falls through to the
 	 * general loop below.
 	 */
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		/* One mask bit per DEV_BSIZE chunk covered by the buffer. */
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED);
 		/* Everything already valid: nothing to clear. */
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		/* Nothing valid and not a PG_ZERO page: zero it all at once. */
 		if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
 		    ((bp->b_pages[0]->valid & mask) == 0)) {
 			bzero(bp->b_data, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 	}
 	/*
 	 * General case: [sa, ea) is the slice of the buffer's mapping that
 	 * lies in page i; ea is clipped to b_data + b_bufsize.
 	 */
 	ea = sa = bp->b_data;
 	for(i = 0; i < bp->b_npages; i++, sa = ea) {
 		ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 		ea = (caddr_t)(vm_offset_t)ulmin(
 		    (u_long)(vm_offset_t)ea,
 		    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		/* j: index of the first DEV_BSIZE chunk of the slice in its page. */
 		j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0) {
 			if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
 				bzero(sa, ea - sa);
 		} else {
 			/* Partially valid: zero only the chunks not yet valid. */
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 				    (bp->b_pages[i]->valid & (1 << j)) == 0)
 					bzero(sa, DEV_BSIZE);
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  *
  * vm_hold_load_pages allocates a wired page from kernel_object for every
  * page in [from, to) (both rounded up to a page boundary), maps each one
  * at its address with pmap_qenter(), and records it in bp->b_pages[].
  * On return bp->b_npages is the index one past the last slot filled.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	int index;
 
 	to = round_page(to);
 	from = round_page(from);
 	/* Slot in b_pages[] corresponding to 'from', relative to b_data's page. */
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	VM_OBJECT_LOCK(kernel_object);
 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 tryagain:
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.
 		 */
 		p = vm_page_alloc(kernel_object,
 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
 		if (!p) {
 			/*
 			 * Allocation failed: add the number of pages still
 			 * needed to vm_pageout_deficit, drop the object lock
 			 * around VM_WAIT, and retry the same page.
 			 */
 			atomic_add_int(&vm_pageout_deficit,
 			    (to - pg) >> PAGE_SHIFT);
 			VM_OBJECT_UNLOCK(kernel_object);
 			VM_WAIT;
 			VM_OBJECT_LOCK(kernel_object);
 			goto tryagain;
 		}
 		p->valid = VM_PAGE_BITS_ALL;
 		pmap_qenter(pg, &p, 1);
 		bp->b_pages[index] = p;
 	}
 	VM_OBJECT_UNLOCK(kernel_object);
 	bp->b_npages = index;
 }
 
 /*
  * Return pages associated with this buf to the vm system.
  *
  * For every page in [from, to) (both rounded up to a page boundary) that
  * has a non-NULL slot in bp->b_pages[] below b_npages, the slot is
  * cleared, the mapping is removed, and the page is unwired and freed.
  * On return bp->b_npages is the slot index corresponding to 'from'.
  */
 static void
 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	int index, newnpages;
 
 	from = round_page(from);
 	to = round_page(to);
 	/* Slot of 'from' in b_pages[]; also the page count left afterwards. */
 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	VM_OBJECT_LOCK(kernel_object);
 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 		/*
 		 * NOTE(review): b_pages[index] is read before index is
 		 * compared against b_npages; presumably the array is large
 		 * enough for any index reached here -- confirm.
 		 */
 		p = bp->b_pages[index];
 		if (p && (index < bp->b_npages)) {
 			/* A busy page is only reported; it is still freed below. */
 			if (p->busy) {
 				printf(
 			    "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 			}
 			bp->b_pages[index] = NULL;
 			pmap_qremove(pg, 1);
 			vm_page_lock_queues();
 			vm_page_unwire(p, 0);
 			vm_page_free(p);
 			vm_page_unlock_queues();
 		}
 	}
 	VM_OBJECT_UNLOCK(kernel_object);
 	bp->b_npages = newnpages;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * All requests are (re)mapped into kernel VA space.
  * Notice that we use b_bufsize for the size of the buffer
  * to be mapped.  b_bcount might be modified by the driver.
  *
  * Note that even if the caller determines that the address space should
  * be valid, a race or a smaller-file mapped into a larger space may
  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
  * check the return value.
  *
  * Returns 0 on success.  Returns -1 if b_bufsize is negative or if a
  * page cannot be faulted in; in the latter case every page held so far
  * is released again.  On success the pages are mapped at the address
  * previously stored in b_saveaddr, b_data is pointed into that mapping
  * (keeping its offset within the page), and the original b_data is
  * saved in b_saveaddr.
  */
 int
 vmapbuf(struct buf *bp)
 {
 	caddr_t addr, kva;
 	vm_prot_t prot;
 	int pidx, i;
 	struct vm_page *m;
 	struct pmap *pmap = &curproc->p_vmspace->vm_pmap;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 	prot = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0;
 	     addr < bp->b_data + bp->b_bufsize;
 	     addr += PAGE_SIZE, pidx++) {
 		/*
 		 * Do the vm_fault if needed; do the copy-on-write thing
 		 * when reading stuff off device into memory.
 		 *
 		 * NOTE! Must use pmap_extract() because addr may be in
 		 * the userland address space, and kextract is only guaranteed
 		 * to work for the kernel address space (see: sparc64 port).
 		 */
 retry:
 		/*
 		 * The first iteration's addr is page-truncated and may lie
 		 * below b_data; fault on b_data itself in that case.
 		 */
 		if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data,
 		    prot) < 0) {
 			/* Undo: drop the hold on every page collected so far. */
 			vm_page_lock_queues();
 			for (i = 0; i < pidx; ++i) {
 				vm_page_unhold(bp->b_pages[i]);
 				bp->b_pages[i] = NULL;
 			}
 			vm_page_unlock_queues();
 			return(-1);
 		}
 		m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot);
 		/* No page could be extracted and held: fault again. */
 		if (m == NULL)
 			goto retry;
 		bp->b_pages[pidx] = m;
 	}
 	/*
 	 * NOTE(review): this bound is only checked after the loop has
 	 * already stored into b_pages[pidx]; presumably callers limit
 	 * b_bufsize so the array cannot be overrun -- confirm.
 	 */
 	if (pidx > btoc(MAXPHYS))
 		panic("vmapbuf: mapped more than MAXPHYS");
 	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
 	
 	kva = bp->b_saveaddr;
 	bp->b_npages = pidx;
 	bp->b_saveaddr = bp->b_data;
 	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
 	return(0);
 }
 
 /*
  * Free the io map PTEs associated with this IO operation.
  * We also invalidate the TLB entries and restore the original b_addr.
  *
  * The b_npages pages recorded in b_pages[] are unmapped and unheld,
  * and b_data is reset from b_saveaddr.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int count, i;
 
 	count = bp->b_npages;
 	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), count);
 
 	vm_page_lock_queues();
 	i = 0;
 	while (i < count) {
 		vm_page_unhold(bp->b_pages[i]);
 		i++;
 	}
 	vm_page_unlock_queues();
 
 	bp->b_data = bp->b_saveaddr;
 }
 
 /*
  *	bdone:
  *
  *	Flag a buffer as B_DONE and wake any thread sleeping on it, both
  *	under bdonelock.
  */
 void
 bdone(struct buf *bp)
 {
 
 	mtx_lock(&bdonelock);
 	bp->b_flags = bp->b_flags | B_DONE;
 	wakeup(bp);
 	mtx_unlock(&bdonelock);
 }
 
 /*
  *	bwait:
  *
  *	Sleep on a buffer until B_DONE is set.  'pri' and 'wchan' are
  *	passed to msleep() as the priority and wait message; no timeout
  *	is used.
  */
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 
 	mtx_lock(&bdonelock);
 	for (;;) {
 		if ((bp->b_flags & B_DONE) != 0)
 			break;
 		msleep(bp, &bdonelock, pri, wchan, 0);
 	}
 	mtx_unlock(&bdonelock);
 }
 
 int
 bufsync(struct bufobj *bo, int waitfor, struct thread *td)
 {
 
 	return (VOP_FSYNC(bo->__bo_vnode, waitfor, td));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i = 0;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  *	bufobj_wrefl:
  *
  *	Count one more write in progress on a bufobj (bo_numoutput).  The
  *	caller must already hold the bufobj lock; bufobj_wref() is the
  *	variant that takes the lock itself.
  */
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	/* Name this function (not bufobj_wref) so a panic points here. */
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
 	ASSERT_BO_LOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 /*
  *	bufobj_wref:
  *
  *	Count one more write in progress on a bufobj (bo_numoutput),
  *	taking and releasing the bufobj lock around the update.
  */
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 
 	BO_LOCK(bo);
 	bo->bo_numoutput += 1;
 	BO_UNLOCK(bo);
 }
 
 /*
  *	bufobj_wdrop:
  *
  *	Drop one write-in-progress count from a bufobj.  When the count
  *	reaches zero and BO_WWAIT is set, the flag is cleared and threads
  *	sleeping on &bo->bo_numoutput are woken.
  */
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	bo->bo_numoutput--;
 	if (bo->bo_numoutput == 0 && (bo->bo_flag & BO_WWAIT) != 0) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_LOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_MTX(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  *	bpin:
  *
  *	Take a pin on a buffer by incrementing b_pin_count under bpinlock.
  */
 void
 bpin(struct buf *bp)
 {
 
 	mtx_lock(&bpinlock);
 	bp->b_pin_count += 1;
 	mtx_unlock(&bpinlock);
 }
 
 /*
  *	bunpin:
  *
  *	Release a pin on a buffer.  When the pin count drops to zero,
  *	threads sleeping on the buffer are woken.
  */
 void
 bunpin(struct buf *bp)
 {
 
 	mtx_lock(&bpinlock);
 	bp->b_pin_count--;
 	if (bp->b_pin_count == 0)
 		wakeup(bp);
 	mtx_unlock(&bpinlock);
 }
 
 /*
  *	bunpin_wait:
  *
  *	Sleep on a buffer until its pin count is no longer positive.
  */
 void
 bunpin_wait(struct buf *bp)
 {
 
 	mtx_lock(&bpinlock);
 	for (;;) {
 		if (bp->b_pin_count <= 0)
 			break;
 		msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
 	}
 	mtx_unlock(&bpinlock);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /* DDB command to show buffer data */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno);
 	if (bp->b_npages) {
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 	lockmgr_printinfo(&bp->b_lock);
 }
 
 /*
  * DDB command to show every locked buffer.
  *
  * Walks all nbuf entries of the buf[] array and runs the "show buffer"
  * command body on each one whose lock count is non-zero.
  */
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int idx;
 
 	for (idx = 0; idx < nbuf; idx++) {
 		bp = buf + idx;
 		if (lockcount(&bp->b_lock) == 0)
 			continue;
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 #endif /* DDB */
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 175201)
+++ head/sys/kern/vfs_cache.c	(revision 175202)
@@ -1,842 +1,842 @@
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 
 #include <vm/uma.h>
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 
 struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain (bucket of nchashtbl) */
 	LIST_ENTRY(namecache) nc_src;	/* link on nc_dvp's v_cache_src list */
 	/*
 	 * Link on nc_vp's v_cache_dst list for positive entries, or on the
 	 * global ncneg queue when nc_vp is NULL (negative entry).
 	 */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to, or NULL */
 	u_char	nc_flag;		/* flag bits (NCF_*) */
 	u_char	nc_nlen;		/* length of name */
 	/*
 	 * nc_nlen bytes of name.  The storage lives past the end of the
 	 * structure; the allocation zones are sized to leave room for it.
 	 */
 	char	nc_name[0];		/* segment name */
 };
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (vp, name) where vp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
  */
 
 /*
  * Structures associated with name caching.
  */
 /* Map a hash value to its bucket; nchash is used as a bit mask. */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
 static TAILQ_HEAD(, namecache) ncneg;	/* LRU queue of negative entries */
 static u_long	nchash;			/* hash mask (highest bucket index) */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
 static u_long	numneg;			/* number of negative entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
 static u_long	numcache;		/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
 static u_long	numcachehv;		/* number of parent vnodes held by the cache */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
 #if 0
 static u_long	numcachepl;		/* number of cache purge for leaf entries */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
 #endif
 struct	nchstats nchstats;		/* cache effectiveness statistics */
 
 static struct mtx cache_lock;
 MTX_SYSINIT(vfscache, &cache_lock, "Name Cache", MTX_DEF);
 
 #define	CACHE_LOCK()	mtx_lock(&cache_lock)
 #define	CACHE_UNLOCK()	mtx_unlock(&cache_lock)
 
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  */
 static uma_zone_t cache_zone_small;
 static uma_zone_t cache_zone_large;
 
 /* Names of up to CACHE_PATH_CUTOFF bytes come from the small zone. */
 #define	CACHE_PATH_CUTOFF	32
 #define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF)
 #define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX)
 
 /* Allocate an entry able to hold a name of len bytes; may sleep. */
 #define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
 	cache_zone_small : cache_zone_large, M_WAITOK)
 /*
  * Return an entry to the zone it came from, chosen by its nc_nlen.
  * A NULL argument is ignored.  The argument is parenthesised at every
  * use so that any pointer expression can be passed safely.
  */
 #define cache_free(ncp)		do { \
 	if ((ncp) != NULL) \
 		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
 		    cache_zone_small : cache_zone_large, (ncp)); \
 } while (0)
 
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
 	sizeof(struct namecache), "");
 
 /*
  * The new name cache statistics
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 #define STATNODE(mode, name, var) \
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
 STATNODE(CTLFLAG_RD, numneg, &numneg);
 STATNODE(CTLFLAG_RD, numcache, &numcache);
 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
 	sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
 
 
 
 static void cache_zap(struct namecache *ncp);
 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	1
 
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
  */
 SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
 
 /*
  * debug.hashstat.rawnchash: export the chain length of every name cache
  * hash bucket as a sequence of ints, one per bucket.
  *
  * When the caller only asks for the size (no old buffer), report room
  * for nchash + 1 ints.  Each chain is counted with cache_lock held, since
  * entries are inserted and removed under that lock; the lock is dropped
  * again before SYSCTL_OUT() copies the count out.
  */
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count;
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		CACHE_LOCK();
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		CACHE_UNLOCK();
 		error = SYSCTL_OUT(req, &count, sizeof(count));
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
 	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
 
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	n_nchash = nchash + 1;
 	pct = (used * 100 * 100) / n_nchash;
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
 	if (error)
 		return (error);
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
 	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
 
 /*
  * cache_zap():
  *
  *   Unlink a namecache entry from the hash chain, from its parent's source
  *   list and from either its target vnode's destination list (positive
  *   entry) or the negative queue (nc_vp == NULL), then free it.  If the
  *   parent's source list becomes empty, the parent vnode is vdrop()ed.
  *   Must be called with cache_lock held.
  */
 static void
 cache_zap(struct namecache *ncp)
 {
 	struct vnode *dropvp;
 
 	mtx_assert(&cache_lock, MA_OWNED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 
 	LIST_REMOVE(ncp, nc_hash);
 	LIST_REMOVE(ncp, nc_src);
 
 	/* Remember the parent if this was its last outgoing entry. */
 	dropvp = NULL;
 	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 		dropvp = ncp->nc_dvp;
 		numcachehv--;
 	}
 
 	if (ncp->nc_vp == NULL) {
 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 		numneg--;
 	} else {
 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 		ncp->nc_vp->v_dd = NULL;
 	}
 	numcache--;
 	cache_free(ncp);
 
 	if (dropvp != NULL)
 		vdrop(dropvp);
 }
 
 /*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative caching), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
  * not recursively acquired.
  */
 
 int
 cache_lookup(dvp, vpp, cnp)
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 {
 	struct namecache *ncp;
 	struct thread *td;
 	u_int32_t hash;
 	int error, ltype;
 
 	/* Cache disabled: clear MAKEENTRY and report a miss. */
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 	td = cnp->cn_thread;
 retry:
 	CACHE_LOCK();
 	numcalls++;
 
 	/* "." and ".." are answered without consulting the hash table. */
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			*vpp = dvp;
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			dothits++;
 			goto success;
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			dotdothits++;
 			/*
 			 * ".." is served from the v_dd back pointer; miss if
 			 * it is unset or the caller did not ask for caching.
 			 */
 			if (dvp->v_dd == NULL ||
 			    (cnp->cn_flags & MAKEENTRY) == 0) {
 				CACHE_UNLOCK();
 				return (0);
 			}
 			*vpp = dvp->v_dd;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
 			goto success;
 		}
 	}
 
 	/*
 	 * The key hashes the name first and then folds in the directory
 	 * vnode pointer; cache_enter() builds it the same way.
 	 */
 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		numchecks++;
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == 0) {
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			nummisszap++;
 		} else {
 			nummiss++;
 		}
 		nchstats.ncs_miss++;
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		numposzaps++;
 		nchstats.ncs_badhits++;
 		cache_zap(ncp);
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	/* We found a "positive" match, return the vnode */
 	if (ncp->nc_vp) {
 		numposhits++;
 		nchstats.ncs_goodhits++;
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
 		goto success;
 	}
 
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		numnegzaps++;
 		nchstats.ncs_badhits++;
 		cache_zap(ncp);
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	numneghits++;
 	/*
 	 * We found a "negative" match, so we shift it to the end of
 	 * the "negative" cache entries queue to satisfy LRU.  Also,
 	 * check to see if the entry is a whiteout; indicate this to
 	 * the componentname, if so.
 	 */
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	nchstats.ncs_neghits++;
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	CACHE_UNLOCK();
 	return (ENOENT);
 
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	if (dvp == *vpp) {   /* lookup on "." */
 		VREF(*vpp);
 		CACHE_UNLOCK();
 		/*
 		 * When we lookup "." we still can be asked to lock it
 		 * differently...
 		 */
 		ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
 		if (ltype == VOP_ISLOCKED(*vpp, td))
 			return (-1);
 		else if (ltype == LK_EXCLUSIVE)
-			vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 		return (-1);
 	}
 	ltype = 0;	/* silence gcc warning */
 	/*
 	 * For "..", remember how dvp is locked and unlock it before locking
 	 * the result; it is relocked with the same type after vget().
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		ltype = VOP_ISLOCKED(dvp, td);
 		VOP_UNLOCK(dvp, 0, td);
 	}
 	/*
 	 * Take the vnode interlock before dropping cache_lock and hand it to
 	 * vget() through LK_INTERLOCK.
 	 */
 	VI_LOCK(*vpp);
 	CACHE_UNLOCK();
 	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
 	if (cnp->cn_flags & ISDOTDOT)
-		vn_lock(dvp, ltype | LK_RETRY, td);
+		vn_lock(dvp, ltype | LK_RETRY);
 	if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	/* vget() failed: discard the result and redo the whole lookup. */
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	return (-1);
 }
 
 /*
  * Add an entry to the cache.
  *
  * dvp is the directory, cnp names the component, and vp is the vnode the
  * name resolves to; a NULL vp creates a negative entry.  "." is never
  * cached, and ".." only updates dvp->v_dd.  Does nothing when the cache
  * is disabled.
  */
 void
 cache_enter(dvp, vp, cnp)
 	struct vnode *dvp;
 	struct vnode *vp;
 	struct componentname *cnp;
 {
 	struct namecache *ncp;
 	struct nchashhead *ncpp;
 	u_int32_t hash;
 	int hold;
 	int zap;
 	int len;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("cahe_enter: Adding a doomed vnode"));
 
 	if (!doingcache)
 		return;
 
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			return;
 		}
 		/*
 		 * For dotdot lookups only cache the v_dd pointer if the
 		 * directory has a link back to its parent via v_cache_dst.
 		 * Without this an unlinked directory would keep a soft
 		 * reference to its parent which could not be NULLd at
 		 * cache_purge() time.
 		 */
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			CACHE_LOCK();
 			if (!TAILQ_EMPTY(&dvp->v_cache_dst))
 				dvp->v_dd = vp;
 			CACHE_UNLOCK();
 			return;
 		}
 	}
 
 	hold = 0;
 	zap = 0;
 	/* Allocate before taking cache_lock; cache_alloc() uses M_WAITOK. */
 	ncp = cache_alloc(cnp->cn_namelen);
 	CACHE_LOCK();
 	numcache++;
 	if (!vp) {
 		/* Negative entry: record whether it is a whiteout. */
 		numneg++;
 		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
 	} else if (vp->v_type == VDIR) {
 		vp->v_dd = dvp;
 	} else {
 		vp->v_dd = NULL;
 	}
 
 	/*
 	 * Set the rest of the namecache entry elements, calculate its
 	 * hash key and insert it into the appropriate chain within
 	 * the cache entries table.
 	 */
 	ncp->nc_vp = vp;
 	ncp->nc_dvp = dvp;
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
 	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	ncpp = NCHHASH(hash);
 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	/* First entry under dvp: the directory gets a hold (vhold below). */
 	if (LIST_EMPTY(&dvp->v_cache_src)) {
 		hold = 1;
 		numcachehv++;
 	}
 	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 	} else {
 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	}
 	/*
 	 * Too many negative entries relative to the total: pick the entry
 	 * at the head of the negative queue for eviction.  Note that ncp is
 	 * reused here and no longer refers to the entry just inserted.
 	 */
 	if (numneg * ncnegfactor > numcache) {
 		ncp = TAILQ_FIRST(&ncneg);
 		zap = 1;
 	}
 	if (hold)
 		vhold(dvp);
 	if (zap)
 		cache_zap(ncp);
 	CACHE_UNLOCK();
 }
 
 /*
  * Boot-time set-up of the name cache, run through SYSINIT: create the two
  * UMA zones that back namecache entries, initialise the negative-entry
  * queue and allocate the hash table.
  */
 static void
 nchinit(void *dummy __unused)
 {
 
 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	TAILQ_INIT(&ncneg);
 
 	/* hashinit() stores the resulting hash mask in nchash. */
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
 
 
 /*
  * Drop every name cache entry that involves vp, whether vp is the
  * directory (v_cache_src) or the target (v_cache_dst), and clear its
  * cached parent pointer.
  */
 void
 cache_purge(struct vnode *vp)
 {
 	struct namecache *ncp;
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
 	CACHE_LOCK();
 	while ((ncp = LIST_FIRST(&vp->v_cache_src)) != NULL)
 		cache_zap(ncp);
 	while ((ncp = TAILQ_FIRST(&vp->v_cache_dst)) != NULL)
 		cache_zap(ncp);
 	vp->v_dd = NULL;
 	CACHE_UNLOCK();
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  */
 void
 cache_purgevfs(mp)
 	struct mount *mp;
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
 
 	/* Scan hash tables for applicable entries */
 	CACHE_LOCK();
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
 		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
 			if (ncp->nc_dvp->v_mount == mp)
 				cache_zap(ncp);
 		}
 	}
 	CACHE_UNLOCK();
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  */
 
 int
 vfs_cache_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *dvp;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 	if (error)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == ENOENT)
 		return (error);
 	return (0);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct  __getcwd_args {
 	u_char	*buf;
 	u_int	buflen;
 };
 #endif
 
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
 static int disablecwd;
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
 /* Implementation of the getcwd syscall. */
 int
 __getcwd(td, uap)
 	struct thread *td;
 	struct __getcwd_args *uap;
 {
 
 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
 }
 
 /*
  * Resolve the calling process's current directory to a path, relative to
  * its root directory, and copy the NUL-terminated result into buf.
  *
  * bufseg selects a kernel (bcopy) or user (copyout) destination.  buflen
  * must be at least 2 and is clamped to MAXPATHLEN.  Returns ENODEV when
  * the debug.disablecwd sysctl is set, EINVAL for a too-small buffer, or
  * whatever vn_fullpath1()/copyout() report.
  */
 int
 kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
 {
 	struct filedesc *fdp;
 	char *path, *scratch;
 	size_t pathsz;
 	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
 	if (buflen > MAXPATHLEN)
 		buflen = MAXPATHLEN;
 
 	scratch = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, scratch,
 	    &path, buflen);
 	FILEDESC_SUNLOCK(fdp);
 
 	if (error == 0) {
 		/* path points inside scratch; include the terminating NUL. */
 		pathsz = strlen(path) + 1;
 		if (bufseg == UIO_SYSSPACE)
 			bcopy(path, buf, pathsz);
 		else
 			error = copyout(path, buf, pathsz);
 	}
 	free(scratch, M_TEMP);
 	return (error);
 }
 
 /*
  * Thus begins the fullpath magic.
  */
 
 #undef STATNODE
 #define STATNODE(name)							\
 	static u_int name;						\
 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
 
 static int disablefullpath;
 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
 	"Disable the vn_fullpath function");
 
 /* These count for kern___getcwd(), too. */
 STATNODE(numfullpathcalls);
 STATNODE(numfullpathfail1);
 STATNODE(numfullpathfail2);
 STATNODE(numfullpathfail4);
 STATNODE(numfullpathfound);
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	char *buf;
 	struct filedesc *fdp;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN);
 	FILEDESC_SUNLOCK(fdp);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * The magic behind kern___getcwd() and vn_fullpath().
  *
  * Builds the path of vp backwards, starting from the last byte of buf,
  * by following name cache entries from vp up to rdir or rootvnode.  On
  * success *retbuf points at the first character of the path inside buf.
  * Returns ENOENT when a needed cache entry is missing, ENOTDIR when a
  * directory has no cached parent, ENOMEM when buf is too small, and
  * EBADF when a doomed filesystem root is encountered.
  */
 static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
 	char *bp;
 	int error, i, slash_prefixed;
 	struct namecache *ncp;
 
 	/* Start at the end of the buffer with the terminating NUL. */
 	bp = buf + buflen - 1;
 	*bp = '\0';
 	error = 0;
 	slash_prefixed = 0;
 
 	CACHE_LOCK();
 	numfullpathcalls++;
 	/*
 	 * A non-directory contributes its own name first, taken from the
 	 * first entry on its v_cache_dst list; the walk then continues from
 	 * that entry's parent directory.
 	 */
 	if (vp->v_type != VDIR) {
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!ncp) {
 			numfullpathfail2++;
 			CACHE_UNLOCK();
 			return (ENOENT);
 		}
 		for (i = ncp->nc_nlen - 1; i >= 0 && bp > buf; i--)
 			*--bp = ncp->nc_name[i];
 		/* No room left for the leading '/'. */
 		if (bp == buf) {
 			numfullpathfail4++;
 			CACHE_UNLOCK();
 			return (ENOMEM);
 		}
 		*--bp = '/';
 		slash_prefixed = 1;
 		vp = ncp->nc_dvp;
 	}
 	while (vp != rdir && vp != rootvnode) {
 		/* At a filesystem root, step to the vnode it is mounted on. */
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				error = EBADF;
 				break;
 			}
 			vp = vp->v_mount->mnt_vnodecovered;
 			continue;
 		}
 		if (vp->v_dd == NULL) {
 			numfullpathfail1++;
 			error = ENOTDIR;
 			break;
 		}
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!ncp) {
 			numfullpathfail2++;
 			error = ENOENT;
 			break;
 		}
 		MPASS(ncp->nc_dvp == vp->v_dd);
 		/* Prepend this component, copying its name back to front. */
 		for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
 			*--bp = ncp->nc_name[i];
 		if (bp == buf) {
 			numfullpathfail4++;
 			error = ENOMEM;
 			break;
 		}
 		*--bp = '/';
 		slash_prefixed = 1;
 		vp = ncp->nc_dvp;
 	}
 	if (error) {
 		CACHE_UNLOCK();
 		return (error);
 	}
 	/* No component was emitted (vp was already the root): return "/". */
 	if (!slash_prefixed) {
 		if (bp == buf) {
 			numfullpathfail4++;
 			CACHE_UNLOCK();
 			return (ENOMEM);
 		} else {
 			*--bp = '/';
 		}
 	}
 	numfullpathfound++;
 	CACHE_UNLOCK();
 
 	*retbuf = bp;
 	return (0);
 }
Index: head/sys/kern/vfs_extattr.c
===================================================================
--- head/sys/kern/vfs_extattr.c	(revision 175201)
+++ head/sys/kern/vfs_extattr.c	(revision 175202)
@@ -1,785 +1,785 @@
 /*-
  * Copyright (c) 1999-2001 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert Watson for the TrustedBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/limits.h>
 #include <sys/vnode.h>
 #include <sys/proc.h>
 #include <sys/extattr.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 /*
  * Syscall to push extended attribute configuration information into the VFS.
  * Accepts a path, which it converts to a mountpoint, as well as a command
  * (int cmd), and attribute name and misc data.
  *
  * Currently this is used only by UFS1 extended attributes.
  */
 /*
  * extattrctl system call.
  *
  * Resolves uap->path to a mount point and hands the command, the optional
  * backing-file vnode and the optional attribute name to VFS_EXTATTRCTL().
  * Returns 0 on success or an errno value from copyinstr(), namei(),
  * vn_start_write() or the filesystem.
  */
 int
 extattrctl(td, uap)
 	struct thread *td;
 	struct extattrctl_args /* {
 		const char *path;
 		int cmd;
 		const char *filename;
 		int attrnamespace;
 		const char *attrname;
 	} */ *uap;
 {
 	struct vnode *filename_vp;
 	struct nameidata nd;
 	struct mount *mp, *mp_writable;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, fnvfslocked, error;
 
 	AUDIT_ARG(cmd, uap->cmd);
 	AUDIT_ARG(value, uap->attrnamespace);
 	/*
 	 * uap->attrname is not always defined.  We check again later when we
 	 * invoke the VFS call so as to pass in NULL there if needed.
 	 *
 	 * The name is only audited once it has been copied in; otherwise
 	 * the local buffer is uninitialised stack memory.
 	 */
 	if (uap->attrname != NULL) {
 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
 		    NULL);
 		if (error)
 			return (error);
 		AUDIT_ARG(text, attrname);
 	}
 
 	vfslocked = fnvfslocked = 0;
 	/*
 	 * uap->filename is not always defined.  If it is, grab a vnode lock,
 	 * which VFS_EXTATTRCTL() will later release.
 	 */
 	filename_vp = NULL;
 	if (uap->filename != NULL) {
 		NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF |
 		    AUDITVNODE2, UIO_USERSPACE, uap->filename, td);
 		error = namei(&nd);
 		if (error)
 			return (error);
 		fnvfslocked = NDHASGIANT(&nd);
 		filename_vp = nd.ni_vp;
 		/* Keep the vnode referenced and locked; free the rest. */
 		NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
 	}
 
 	/* uap->path is always defined. */
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error) {
 		if (filename_vp != NULL)
 			vput(filename_vp);
 		goto out;
 	}
 	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
 	NDFREE(&nd, 0);
 	if (error) {
 		if (filename_vp != NULL)
 			vput(filename_vp);
 		goto out;
 	}
 
 	error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
 	    uap->attrname != NULL ? attrname : NULL, td);
 
 	vn_finished_write(mp_writable);
 	/*
 	 * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
 	 * so vrele it if it is defined.
 	 */
 	if (filename_vp != NULL)
 		vrele(filename_vp);
 out:
 	VFS_UNLOCK_GIANT(fnvfslocked);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*-
  * Set a named extended attribute on a file or directory
  *
  * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
  *            kernelspace string pointer "attrname", userspace buffer
  *            pointer "data", buffer length "nbytes", thread "td".
  * Returns: 0 on success, an error number otherwise
  * Locks: none
  * References: vp must be a valid reference for the duration of the call
  */
 static int
 extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
     void *data, size_t nbytes, struct thread *td)
 {
 	struct mount *mp;
 	struct uio auio;
 	struct iovec aiov;
 	ssize_t cnt;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/* Describe the user buffer as a single-segment write uio. */
 	aiov.iov_base = data;
 	aiov.iov_len = nbytes;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = 0;
 	/* Requests larger than INT_MAX bytes are rejected. */
 	if (nbytes > INT_MAX) {
 		error = EINVAL;
 		goto done;
 	}
 	auio.uio_resid = nbytes;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	cnt = nbytes;
 
 #ifdef MAC
 	error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
 	    attrname, &auio);
 	if (error)
 		goto done;
 #endif
 
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
 	    td->td_ucred, td);
 	/* Report the number of bytes consumed from the user buffer. */
 	cnt -= auio.uio_resid;
 	td->td_retval[0] = cnt;
 
 done:
 	/* Undo the vnode lock and the vn_start_write() taken above. */
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 int
 extattr_set_fd(td, uap)
 	struct thread *td;
 	struct extattr_set_fd_args /* {
 		int fd;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, uap->data, uap->nbytes, td);
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	return (error);
 }
 
 int
 extattr_set_file(td, uap)
 	struct thread *td;
 	struct extattr_set_file_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_set_link(td, uap)
 	struct thread *td;
 	struct extattr_set_link_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*-
  * Get a named extended attribute on a file or directory
  *
  * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
  *            kernelspace string pointer "attrname", userspace buffer
  *            pointer "data", buffer length "nbytes", thread "td".
  * Returns: 0 on success, an error number otherwise
  * Locks: none
  * References: vp must be a valid reference for the duration of the call
  *
  * The vnode is locked here and unlocked again at "done".  Once
  * VOP_GETEXTATTR() has been called, td->td_retval[0] is set to the number
  * of bytes consumed from the uio (nbytes minus the remaining uio_resid)
  * or, when "data" is NULL, to the value left in "size".
  */
 static int
 extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
     void *data, size_t nbytes, struct thread *td)
 {
 	struct uio auio, *auiop;
 	struct iovec aiov;
 	ssize_t cnt;
 	size_t size, *sizep;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Slightly unusual semantics: if the user provides a NULL data
 	 * pointer, they don't want to receive the data, just the maximum
 	 * read length.
 	 */
 	/*
 	 * Exactly one of auiop / sizep ends up non-NULL: auiop when a buffer
 	 * was supplied, sizep when only the length is wanted.
 	 */
 	auiop = NULL;
 	sizep = NULL;
 	cnt = 0;
 	if (data != NULL) {
 		/* Single-segment read uio over the caller's userspace buffer. */
 		aiov.iov_base = data;
 		aiov.iov_len = nbytes;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		/* Reject lengths above INT_MAX before storing into uio_resid. */
 		if (nbytes > INT_MAX) {
 			error = EINVAL;
 			goto done;
 		}
 		auio.uio_resid = nbytes;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_USERSPACE;
 		auio.uio_td = td;
 		auiop = &auio;
 		cnt = nbytes;
 	} else
 		sizep = &size;
 
 #ifdef MAC
 	/*
 	 * NOTE(review): &auio is passed here rather than auiop, so on the
 	 * data == NULL path the MAC hook receives a uio that was never
 	 * initialised above -- confirm the hook does not read it.
 	 */
 	error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
 	    attrname, &auio);
 	if (error)
 		goto done;
 #endif
 
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
 	    td->td_ucred, td);
 
 	/*
 	 * The result is stored whether or not the VOP succeeded.
 	 * NOTE(review): on the sizep path "size" has no initialiser in this
 	 * function, so its value depends on the VOP having written it --
 	 * confirm for the error case.
 	 */
 	if (auiop != NULL) {
 		cnt -= auio.uio_resid;
 		td->td_retval[0] = cnt;
 	} else
 		td->td_retval[0] = size;
 
 done:
 	VOP_UNLOCK(vp, 0, td);
 	return (error);
 }
 
 int
 extattr_get_fd(td, uap)
 	struct thread *td;
 	struct extattr_get_fd_args /* {
 		int fd;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, uap->data, uap->nbytes, td);
 
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_get_file(td, uap)
 	struct thread *td;
 	struct extattr_get_file_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_get_link(td, uap)
 	struct thread *td;
 	struct extattr_get_link_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * extattr_delete_vp(): Delete a named extended attribute on a file or
  *                      directory
  *
  * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
  *            kernelspace string pointer "attrname", thread "td"
  * Returns: 0 on success, an error number otherwise
  * Locks: none
  * References: vp must be a valid reference for the duration of the call
  *
  * The operation is bracketed by vn_start_write()/vn_finished_write() and
  * the vnode is locked here and unlocked again before returning.
  */
 static int
 extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
     struct thread *td)
 {
 	struct mount *mp;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	/* A failure here returns before any lock has been taken. */
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 #ifdef MAC
 	error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
 	    attrname);
 	if (error)
 		goto done;
 #endif
 
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
 	    td);
 	/*
 	 * If the filesystem does not implement the delete VOP, retry as a
 	 * set with a NULL uio.
 	 */
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    td->td_ucred, td);
 	/* The label only has a user when the MAC check above is compiled in. */
 #ifdef MAC
 done:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 int
 extattr_delete_fd(td, uap)
 	struct thread *td;
 	struct extattr_delete_fd_args /* {
 		int fd;
 		int attrnamespace;
 		const char *attrname;
 	} */ *uap;
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return (error);
 	AUDIT_ARG(text, attrname);
 
 	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, td);
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_delete_file(td, uap)
 	struct thread *td;
 	struct extattr_delete_file_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return(error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return(error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 int
 extattr_delete_link(td, uap)
 	struct thread *td;
 	struct extattr_delete_link_args /* {
 		const char *path;
 		int attrnamespace;
 		const char *attrname;
 	} */ *uap;
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
 	if (error)
 		return(error);
 	AUDIT_ARG(text, attrname);
 
 	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return(error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*-
  * Retrieve a list of extended attributes on a file or directory.
  *
  * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
  *            userspace buffer pointer "data", buffer length "nbytes",
  *            thread "td".
  * Returns: 0 on success, an error number otherwise
  * Locks: none
  * References: vp must be a valid reference for the duration of the call
  *
  * The vnode is locked here and unlocked again at "done".  Once
  * VOP_LISTEXTATTR() has been called, td->td_retval[0] is set to the number
  * of bytes consumed from the uio (nbytes minus the remaining uio_resid)
  * or, when "data" is NULL, to the value left in "size".
  */
 static int
 extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
     size_t nbytes, struct thread *td)
 {
 	struct uio auio, *auiop;
 	size_t size, *sizep;
 	struct iovec aiov;
 	ssize_t cnt;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Exactly one of auiop / sizep ends up non-NULL: auiop when a buffer
 	 * was supplied, sizep when only the length is wanted.
 	 */
 	auiop = NULL;
 	sizep = NULL;
 	cnt = 0;
 	if (data != NULL) {
 		/* Single-segment read uio over the caller's userspace buffer. */
 		aiov.iov_base = data;
 		aiov.iov_len = nbytes;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		/* Reject lengths above INT_MAX before storing into uio_resid. */
 		if (nbytes > INT_MAX) {
 			error = EINVAL;
 			goto done;
 		}
 		auio.uio_resid = nbytes;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_USERSPACE;
 		auio.uio_td = td;
 		auiop = &auio;
 		cnt = nbytes;
 	} else
 		sizep = &size;
 
 #ifdef MAC
 	error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
 	if (error)
 		goto done;
 #endif
 
 	error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
 	    td->td_ucred, td);
 
 	/*
 	 * The result is stored whether or not the VOP succeeded.
 	 * NOTE(review): on the sizep path "size" has no initialiser in this
 	 * function, so its value depends on the VOP having written it --
 	 * confirm for the error case.
 	 */
 	if (auiop != NULL) {
 		cnt -= auio.uio_resid;
 		td->td_retval[0] = cnt;
 	} else
 		td->td_retval[0] = size;
 
 done:
 	VOP_UNLOCK(vp, 0, td);
 	return (error);
 }
 
 
 int
 extattr_list_fd(td, uap)
 	struct thread *td;
 	struct extattr_list_fd_args /* {
 		int fd;
 		int attrnamespace;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(value, uap->attrnamespace);
 	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_list_file(td, uap)
 	struct thread*td;
 	struct extattr_list_file_args /* {
 		const char *path;
 		int attrnamespace;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 int
 extattr_list_link(td, uap)
 	struct thread*td;
 	struct extattr_list_link_args /* {
 		const char *path;
 		int attrnamespace;
 		void *data;
 		size_t nbytes;
 	} */ *uap;
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	AUDIT_ARG(value, uap->attrnamespace);
 	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vfslocked = NDHASGIANT(&nd);
 	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
Index: head/sys/kern/vfs_lookup.c
===================================================================
--- head/sys/kern/vfs_lookup.c	(revision 175201)
+++ head/sys/kern/vfs_lookup.c	(revision 175202)
@@ -1,1106 +1,1111 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 #include "opt_vfs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #define	NAMEI_DIAGNOSTIC 1
 #undef NAMEI_DIAGNOSTIC
 
 /*
  * UMA zone that pathname buffers (MAXPATHLEN bytes each) are allocated
  * from by namei().
  */
 uma_zone_t namei_zone;
 /*
  * Placeholder vnode installed as ni_dvp while lookup() crosses a mount
  * point.
  */
 static struct vnode *vp_crossmp;
 
 /*
  * Boot-time initialisation, registered through SYSINIT below: create the
  * pathname zone and the placeholder vnode, then clear LK_NOSHARE on the
  * placeholder's lock.  Panics if the vnode cannot be allocated.
  */
 static void
 nameiinit(void *dummy __unused)
 {
 
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	if (getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp) != 0)
 		panic("nameiinit: getnewvnode");
 	vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
 
 /*
  * Switch for shared vnode locking during pathname translation.  The
  * initial value is 1 when LOOKUP_SHARED is defined at compile time and 0
  * otherwise; it is exported read/write through the sysctl declared below.
  * When it is zero, namei() strips LOCKSHARED from the request and
  * lookup() starts with LK_EXCLUSIVE instead of LK_SHARED.
  */
 #ifdef LOOKUP_SHARED
 static int lookup_shared = 1;
 #else
 static int lookup_shared = 0;
 #endif
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
     "Enables/Disables shared locks for path name translation");
 
 /*
  * Convert a pathname into a pointer to a locked vnode.
  *
  * The FOLLOW flag is set when symbolic links are to be followed
  * when they occur at the end of the name translation process.
  * Symbolic links are always followed for all other pathname
  * components other than the last.
  *
  * The segflg defines whether the name is to be copied from user
  * space or kernel space.
  *
  * Overall outline of namei:
  *
  *	copy in name
  *	get starting directory
  *	while (!done && !error) {
  *		call lookup to search path.
  *		if symbolic link, massage name in buffer and continue
  *	}
  *
  * Returns 0 on success or an errno value.  On the early failure path
  * (pathname copy failed or the name was empty) ndp->ni_vp is set to NULL.
  * On success the pathname buffer is kept, and HASBUF is set, only if the
  * caller asked for SAVENAME or SAVESTART; otherwise it is freed here.
  * For MPSAFE callers the Giant state is reported back through the
  * GIANTHELD flag in ndp->ni_cnd.cn_flags rather than being dropped.
  */
 int
 namei(struct nameidata *ndp)
 {
 	struct filedesc *fdp;	/* pointer to file descriptor state */
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
 	struct uio auio;
 	int error, linklen;
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct thread *td = cnp->cn_thread;
 	struct proc *p = td->td_proc;
 	int vfslocked;
 
 	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
 	    ("NOT MPSAFE and Giant not held"));
 	/* The credential is always taken from the thread in the request. */
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
 	    ("namei: nameiop contaminated with flags"));
 	KASSERT((cnp->cn_flags & OPMASK) == 0,
 	    ("namei: flags contaminated with nameiops"));
 	if (!lookup_shared)
 		cnp->cn_flags &= ~LOCKSHARED;
 	fdp = p->p_fd;
 
 	/*
 	 * Get a buffer for the name to be translated, and copy the
 	 * name into the buffer.
 	 */
 	if ((cnp->cn_flags & HASBUF) == 0)
 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	if (ndp->ni_segflg == UIO_SYSSPACE)
 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
 	else
 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
 
 	/*
 	 * NOTE(review): the audit calls below run before "error" from the
 	 * copy is examined, so cn_pnbuf may not hold a complete string at
 	 * this point -- confirm the audit code tolerates that.
 	 */
 	/* If we are auditing the kernel pathname, save the user pathname. */
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
 	if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
 
 	/*
 	 * Don't allow empty pathnames.
 	 */
 	if (!error && *cnp->cn_pnbuf == '\0')
 		error = ENOENT;
 
 	if (error) {
 		uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 		cnp->cn_pnbuf = NULL;
 		cnp->cn_nameptr = NULL;
 #endif
 		ndp->ni_vp = NULL;
 		return (error);
 	}
 	ndp->ni_loopcnt = 0;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_NAMEI)) {
 		KASSERT(cnp->cn_thread == curthread,
 		    ("namei not using curthread"));
 		ktrnamei(cnp->cn_pnbuf);
 	}
 #endif
 
 	/*
 	 * Get starting point for the translation.
 	 */
 	/*
 	 * The root, jail and current directories are sampled, and the
 	 * current directory referenced, while the filedesc lock is held
 	 * shared.
 	 */
 	FILEDESC_SLOCK(fdp);
 	ndp->ni_rootdir = fdp->fd_rdir;
 	ndp->ni_topdir = fdp->fd_jdir;
 
 	dp = fdp->fd_cdir;
 	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 	VREF(dp);
 	FILEDESC_SUNLOCK(fdp);
 	/* One iteration per lookup() call; it repeats only for symlinks. */
 	for (;;) {
 		/*
 		 * Check if root directory should replace current directory.
 		 * Done at start of translation and after symbolic link.
 		 */
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		if (*(cnp->cn_nameptr) == '/') {
 			/* Swap the start directory (and its Giant state) for root. */
 			vrele(dp);
 			VFS_UNLOCK_GIANT(vfslocked);
 			while (*(cnp->cn_nameptr) == '/') {
 				cnp->cn_nameptr++;
 				ndp->ni_pathlen--;
 			}
 			dp = ndp->ni_rootdir;
 			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 		}
 		/* GIANTHELD carries the Giant state into lookup() and back. */
 		if (vfslocked)
 			ndp->ni_cnd.cn_flags |= GIANTHELD;
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
 		if (error) {
 			/*
 			 * NOTE(review): this path does not touch vfslocked or
 			 * any vnode; presumably lookup() has already released
 			 * them on failure -- confirm in lookup().
 			 */
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 			cnp->cn_pnbuf = NULL;
 			cnp->cn_nameptr = NULL;
 #endif
 			return (error);
 		}
 		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
 		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 		/*
 		 * Check for symbolic link
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			/* Not a symlink: translation is complete. */
 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
 				uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 				cnp->cn_pnbuf = NULL;
 				cnp->cn_nameptr = NULL;
 #endif
 			} else
 				cnp->cn_flags |= HASBUF;
 
 			/*
 			 * Callers that did not pass MPSAFE get Giant dropped
 			 * here; MPSAFE callers are told about it via GIANTHELD.
 			 */
 			if ((cnp->cn_flags & MPSAFE) == 0) {
 				VFS_UNLOCK_GIANT(vfslocked);
 			} else if (vfslocked)
 				ndp->ni_cnd.cn_flags |= GIANTHELD;
 			return (0);
 		}
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			break;
 		}
 #ifdef MAC
 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
 			error = mac_vnode_check_readlink(td->td_ucred,
 			    ndp->ni_vp);
 			if (error)
 				break;
 		}
 #endif
 		/*
 		 * When pathname remains after the link (ni_pathlen > 1) the
 		 * link text is read into a fresh buffer so the remainder can
 		 * be appended to it; otherwise the current buffer is reused.
 		 */
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = (struct thread *)0;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			break;
 		}
 		/* Bytes actually read: requested size minus what is left. */
 		linklen = MAXPATHLEN - auio.uio_resid;
 		if (linklen == 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENOENT;
 			break;
 		}
 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENAMETOOLONG;
 			break;
 		}
 		if (ndp->ni_pathlen > 1) {
 			/*
 			 * Append the unparsed remainder after the link text and
 			 * switch to the new buffer.  Presumably ni_pathlen
 			 * counts the terminating NUL so it is copied too --
 			 * TODO confirm.
 			 */
 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 		/* Done with the link vnode; restart from its parent directory. */
 		vput(ndp->ni_vp);
 		dp = ndp->ni_dvp;
 	}
 	/*
 	 * Reached only through the "break" statements above, i.e. with an
 	 * error set while a symlink was being processed.
 	 */
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_pnbuf = NULL;
 	cnp->cn_nameptr = NULL;
 #endif
 	vput(ndp->ni_vp);
 	ndp->ni_vp = NULL;
 	vrele(ndp->ni_dvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Adjust the lock type requested for a lookup step to what the mount
  * permits.  The flags are returned unchanged when mp is non-NULL and
  * either LK_SHARED is not requested or mp has MNTK_LOOKUP_SHARED set.
  * In every other case (including a NULL mp) LK_SHARED is cleared and
  * LK_EXCLUSIVE is set; the remaining bits pass through untouched.
  */
 static int
 compute_cn_lkflags(struct mount *mp, int lkflags)
 {
 	int keep;
 
 	keep = mp != NULL &&
 	    ((lkflags & LK_SHARED) == 0 ||
 	    (mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) != 0);
 	if (keep)
 		return (lkflags);
 	return ((lkflags & ~LK_SHARED) | LK_EXCLUSIVE);
 }
 
 /*
  * Search a pathname.
  * This is a very central and rather complicated routine.
  *
  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
  * The starting directory is taken from ni_startdir. The pathname is
  * descended until done, or a symbolic link is encountered. The variable
  * ni_more is clear if the path is completed; it is set to one if a
  * symbolic link needing interpretation is encountered.
  *
  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
  * whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
  * returned unlocked. Otherwise the parent directory is not returned. If
  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
  * the target is returned locked, otherwise it is returned unlocked.
  * When creating or renaming and LOCKPARENT is specified, the target may not
  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
  *
  * Overall outline of lookup:
  *
  * dirloop:
  *	identify next component of name at ndp->ni_ptr
  *	handle degenerate case where name is null string
  *	if .. and crossing mount points and on mounted filesys, find parent
  *	call VOP_LOOKUP routine for next component name
  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
  *	    component vnode returned in ni_vp (if it exists), locked.
  *	if result vnode is mounted on and crossing mount points,
  *	    find mounted on vnode
  *	if more components of name, do next level at dirloop
  *	return the answer in ni_vp, locked if LOCKLEAF set
  *	    if LOCKPARENT set, return locked parent in ni_dvp
  *	    if WANTPARENT set, return unlocked parent in ni_dvp
  */
 int
 lookup(struct nameidata *ndp)
 {
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp = 0;	/* the directory we are searching */
 	struct vnode *tdp;		/* saved dp */
 	struct mount *mp;		/* mount table entry */
 	int docache;			/* == 0 do not cache last component */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int trailing_slash;
 	int error = 0;
 	int dpunlocked = 0;		/* dp has already been unlocked */
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct thread *td = cnp->cn_thread;
 	int vfslocked;			/* VFS Giant state for child */
 	int dvfslocked;			/* VFS Giant state for parent */
 	int tvfslocked;
 	int lkflags_save;
 	
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
 	vfslocked = 0;
 	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 	if (cnp->cn_nameiop == DELETE ||
 	    (wantparent && cnp->cn_nameiop != CREATE &&
 	     cnp->cn_nameiop != LOOKUP))
 		docache = 0;
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	ndp->ni_dvp = NULL;
 	/*
 	 * We use shared locks until we hit the parent of the last cn then
 	 * we adjust based on the requesting flags.
 	 */
 	if (lookup_shared)
 		cnp->cn_lkflags = LK_SHARED;
 	else
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 	dp = ndp->ni_startdir;
 	ndp->ni_startdir = NULLVP;
-	vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
+	vn_lock(dp,
+	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));
 
 dirloop:
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 	cnp->cn_consume = 0;
 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
 		continue;
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	if (cnp->cn_namelen > NAME_MAX) {
 		error = ENAMETOOLONG;
 		goto bad;
 	}
 #ifdef NAMEI_DIAGNOSTIC
 	{ char c = *cp;
 	*cp = '\0';
 	printf("{%s}: ", cnp->cn_nameptr);
 	*cp = c; }
 #endif
 	ndp->ni_pathlen -= cnp->cn_namelen;
 	ndp->ni_next = cp;
 
 	/*
 	 * Replace multiple slashes by a single slash and trailing slashes
 	 * by a null.  This must be done before VOP_LOOKUP() because some
 	 * fs's don't know about trailing slashes.  Remember if there were
 	 * trailing slashes to handle symlinks, existing non-directories
 	 * and non-existing files that won't be directories specially later.
 	 */
 	trailing_slash = 0;
 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
 		cp++;
 		ndp->ni_pathlen--;
 		if (*cp == '\0') {
 			trailing_slash = 1;
 			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
 		}
 	}
 	ndp->ni_next = cp;
 
 	cnp->cn_flags |= MAKEENTRY;
 	if (*cp == '\0' && docache == 0)
 		cnp->cn_flags &= ~MAKEENTRY;
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 		cnp->cn_flags |= ISDOTDOT;
 	else
 		cnp->cn_flags &= ~ISDOTDOT;
 	if (*ndp->ni_next == 0)
 		cnp->cn_flags |= ISLASTCN;
 	else
 		cnp->cn_flags &= ~ISLASTCN;
 
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (dp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto bad;
 		}
 		if (cnp->cn_nameiop != LOOKUP) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (wantparent) {
 			ndp->ni_dvp = dp;
 			VREF(dp);
 		}
 		ndp->ni_vp = dp;
 
 		if (cnp->cn_flags & AUDITVNODE1)
 			AUDIT_ARG(vnode, dp, ARG_VNODE1);
 		else if (cnp->cn_flags & AUDITVNODE2)
 			AUDIT_ARG(vnode, dp, ARG_VNODE2);
 
 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
 			VOP_UNLOCK(dp, 0, td);
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		goto success;
 	}
 
 	/*
 	 * Handle "..": four special cases.
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
 	 *    or at absolute root directory
 	 *    then ignore it so can't get out.
 	 * 3. If this vnode is the root of a mounted
 	 *    filesystem, then replace it with the
 	 *    vnode which was mounted on so we take the
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 			error = EINVAL;
 			goto bad;
 		}
 		for (;;) {
 			if (dp == ndp->ni_rootdir || 
 			    dp == ndp->ni_topdir || 
 			    dp == rootvnode ||
 			    ((dp->v_vflag & VV_ROOT) != 0 &&
 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
 				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 				VREF(dp);
 				goto nextname;
 			}
 			if ((dp->v_vflag & VV_ROOT) == 0)
 				break;
 			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				error = EBADF;
 				goto bad;
 			}
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			tvfslocked = dvfslocked;
 			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 			vput(tdp);
 			VFS_UNLOCK_GIANT(tvfslocked);
-			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
+			vn_lock(dp,
+			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+			    LK_RETRY));
 		}
 	}
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 unionlookup:
 #ifdef MAC
 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
 		error = mac_vnode_check_lookup(td->td_ucred, dp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 	ndp->ni_dvp = dp;
 	ndp->ni_vp = NULL;
 	ASSERT_VOP_LOCKED(dp, "lookup");
 	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
 	/*
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
 	 */
 	if (dp != vp_crossmp &&
 	    VOP_ISLOCKED(dp, td) == LK_SHARED &&
 	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
-		vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
+		vn_lock(dp, LK_UPGRADE|LK_RETRY);
 	/*
 	 * If we're looking up the last component and we need an exclusive
 	 * lock, adjust our lkflags.
 	 */
 	if ((cnp->cn_flags & (ISLASTCN|LOCKSHARED|LOCKLEAF)) ==
 	    (ISLASTCN|LOCKLEAF))
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 #ifdef NAMEI_DIAGNOSTIC
 	vprint("lookup in", dp);
 #endif
 	lkflags_save = cnp->cn_lkflags;
 	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
 	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
 		cnp->cn_lkflags = lkflags_save;
 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
 #ifdef NAMEI_DIAGNOSTIC
 		printf("not found\n");
 #endif
 		if ((error == ENOENT) &&
 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			tvfslocked = dvfslocked;
 			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 			vput(tdp);
 			VFS_UNLOCK_GIANT(tvfslocked);
-			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
+			vn_lock(dp,
+			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+			    LK_RETRY));
 			goto unionlookup;
 		}
 
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * If creating and at end of pathname, then can consider
 		 * allowing file to be created.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		if (*cp == '\0' && trailing_slash &&
 		     !(cnp->cn_flags & WILLBEDIR)) {
 			error = ENOENT;
 			goto bad;
 		}
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0, td);
 		/*
 		 * This is a temporary assert to make sure I know what the
 		 * behavior here was.
 		 */
 		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
 		   ("lookup: Unhandled case."));
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir = ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
 		goto success;
 	} else
 		cnp->cn_lkflags = lkflags_save;
 #ifdef NAMEI_DIAGNOSTIC
 	printf("found\n");
 #endif
 	/*
 	 * Take into account any additional components consumed by
 	 * the underlying filesystem.
 	 */
 	if (cnp->cn_consume > 0) {
 		cnp->cn_nameptr += cnp->cn_consume;
 		ndp->ni_next += cnp->cn_consume;
 		ndp->ni_pathlen -= cnp->cn_consume;
 		cnp->cn_consume = 0;
 	}
 
 	dp = ndp->ni_vp;
 	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 
 	/*
 	 * Check to see if the vnode has been mounted on;
 	 * if so find the root of the mounted filesystem.
 	 */
 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
 		if (vfs_busy(mp, 0, 0, td))
 			continue;
 		vput(dp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if (dp != ndp->ni_dvp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		VFS_UNLOCK_GIANT(dvfslocked);
 		dvfslocked = 0;
 		vref(vp_crossmp);
 		ndp->ni_dvp = vp_crossmp;
 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
 		vfs_unbusy(mp, td);
-		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td))
+		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
 			panic("vp_crossmp exclusively locked or reclaimed");
 		if (error) {
 			dpunlocked = 1;
 			goto bad2;
 		}
 		ndp->ni_vp = dp = tdp;
 	}
 
 	/*
 	 * Check for symbolic link
 	 */
 	if ((dp->v_type == VLNK) &&
 	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
 	     *ndp->ni_next == '/')) {
 		cnp->cn_flags |= ISSYMLINK;
 		if (dp->v_iflag & VI_DOOMED) {
 			/* We can't know whether the directory was mounted with
 			 * NOSYMFOLLOW, so we can't follow safely. */
 			error = EBADF;
 			goto bad2;
 		}
 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
 			error = EACCES;
 			goto bad2;
 		}
 		/*
 		 * Symlink code always expects an unlocked dvp.
 		 */
 		if (ndp->ni_dvp != ndp->ni_vp)
 			VOP_UNLOCK(ndp->ni_dvp, 0, td);
 		goto success;
 	}
 
 	/*
 	 * Check for bogus trailing slashes.
 	 */
 	if (trailing_slash && dp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto bad2;
 	}
 
 nextname:
 	/*
 	 * Not a symbolic link.  If more pathname,
 	 * continue at next component, else return.
 	 */
 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
 	    ("lookup: invalid path state."));
 	if (*ndp->ni_next == '/') {
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
 			cnp->cn_nameptr++;
 			ndp->ni_pathlen--;
 		}
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		VFS_UNLOCK_GIANT(dvfslocked);
 		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
 		vfslocked = 0;
 		goto dirloop;
 	}
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad2;
 	}
 	if (cnp->cn_flags & SAVESTART) {
 		ndp->ni_startdir = ndp->ni_dvp;
 		VREF(ndp->ni_startdir);
 	}
 	if (!wantparent) {
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		VFS_UNLOCK_GIANT(dvfslocked);
 		dvfslocked = 0;
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
 		VOP_UNLOCK(ndp->ni_dvp, 0, td);
 
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG(vnode, dp, ARG_VNODE1);
 	else if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG(vnode, dp, ARG_VNODE2);
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0, td);
 success:
 	/*
 	 * Because of lookup_shared we may have the vnode shared locked, but
 	 * the caller may want it to be exclusively locked.
 	 */
 	if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) ==
 	    (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp, td) != LK_EXCLUSIVE) {
-		vn_lock(dp, LK_UPGRADE | LK_RETRY, td);
+		vn_lock(dp, LK_UPGRADE | LK_RETRY);
 	}
 	if (vfslocked && dvfslocked)
 		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
 	if (vfslocked || dvfslocked)
 		ndp->ni_cnd.cn_flags |= GIANTHELD;
 	return (0);
 
 bad2:
 	if (dp != ndp->ni_dvp)
 		vput(ndp->ni_dvp);
 	else
 		vrele(ndp->ni_dvp);
 bad:
 	if (!dpunlocked)
 		vput(dp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	VFS_UNLOCK_GIANT(dvfslocked);
 	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * relookup - lookup a path name component
  *    Used by lookup to re-acquire things.
  *
  * Looks up the single component described by cnp in the directory dvp.
  * The caller must be asking about the last component (ISLASTCN) and must
  * want the parent back (LOCKPARENT or WANTPARENT); both are asserted.
  * dvp is locked exclusively here before the lookup is attempted.
  *
  * Returns 0 with *vpp set to the vnode that was found, 0 with *vpp
  * expected to be NULL (asserted) when VOP_LOOKUP() answers EJUSTRETURN,
  * or an errno value with *vpp set to NULL.
  */
 int
 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct thread *td = cnp->cn_thread;
 	struct vnode *dp = 0;		/* the directory we are searching */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 
 	KASSERT(cnp->cn_flags & ISLASTCN,
 	    ("relookup: Not given last component."));
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
 	KASSERT(wantparent, ("relookup: parent not wanted."));
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	dp = dvp;
 	/* The directory is always taken exclusively, whatever cnp asked for. */
 	cnp->cn_lkflags = LK_EXCLUSIVE;
-	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	printf("{%s}: ", cnp->cn_nameptr);
 #endif
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (cnp->cn_nameiop != LOOKUP || wantparent) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (dp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto bad;
 		}
 		if (!(cnp->cn_flags & LOCKLEAF))
 			VOP_UNLOCK(dp, 0, td);
 		*vpp = dp;
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		return (0);
 	}
 
 	if (cnp->cn_flags & ISDOTDOT)
 		panic ("relookup: lookup on dot-dot");
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	vprint("search in:", dp);
 #endif
 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
 		KASSERT(*vpp == NULL, ("leaf should be empty"));
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * If creating and at end of pathname, then can consider
 		 * allowing file to be created.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* ASSERT(dvp == ndp->ni_startdir) */
 		if (cnp->cn_flags & SAVESTART)
 			VREF(dvp);
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0, td);
 		/*
 		 * This is a temporary assert to make sure I know what the
 		 * behavior here was.
 		 */
 		KASSERT((cnp->cn_flags & (WANTPARENT|LOCKPARENT)) != 0,
 		   ("relookup: Unhandled case."));
 		/*
 		 * We return with *vpp NULL to indicate that the entry
 		 * doesn't currently exist; the directory vnode dvp stays
 		 * with the caller, still locked only if LOCKPARENT was set.
 		 */
 		return (0);
 	}
 
 	/* From here on dp is the vnode that was found, not the directory. */
 	dp = *vpp;
 
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 * The parent is dropped here; the leaf is released at "bad".
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		if (dvp == dp)
 			vrele(dvp);
 		else
 			vput(dvp);
 		error = EROFS;
 		goto bad;
 	}
 	/*
 	 * Set the parent lock/ref state to the requested state.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
 		if (wantparent)
 			VOP_UNLOCK(dvp, 0, td);
 		else
 			vput(dvp);
 	} else if (!wantparent)
 		vrele(dvp);
 	/*
 	 * Check for symbolic link
 	 */
 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
 	    ("relookup: symlink found.\n"));
 
 	/* ASSERT(dvp == ndp->ni_startdir) */
 	if (cnp->cn_flags & SAVESTART)
 		VREF(dvp);
 	
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0, td);
 	return (0);
 bad:
 	/*
 	 * dp is dvp on the paths taken before a successful VOP_LOOKUP() and
 	 * the looked-up vnode afterwards; either way it is unlocked and
 	 * released here.
 	 */
 	vput(dp);
 	*vpp = NULL;
 	return (error);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  */
 void
 NDFREE(struct nameidata *ndp, const u_int flags)
 {
 	int unlock_dvp;
 	int unlock_vp;
 
 	unlock_dvp = 0;
 	unlock_vp = 0;
 
 	if (!(flags & NDF_NO_FREE_PNBUF) &&
 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 	if (!(flags & NDF_NO_VP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
 		unlock_vp = 1;
 	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
 		if (unlock_vp) {
 			vput(ndp->ni_vp);
 			unlock_vp = 0;
 		} else
 			vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 	if (unlock_vp)
 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
 	    ndp->ni_dvp != ndp->ni_vp)
 		unlock_dvp = 1;
 	if (!(flags & NDF_NO_DVP_RELE) &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
 		if (unlock_dvp) {
 			vput(ndp->ni_dvp);
 			unlock_dvp = 0;
 		} else
 			vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 	if (unlock_dvp)
 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Determine if there is a suitable alternate filename under the specified
  * prefix for the specified path.  If the create flag is set, then the
  * alternate prefix will be used so long as the parent directory exists.
  * This is used by the various compatiblity ABIs so that Linux binaries prefer
  * files under /compat/linux for example.  The chosen path (whether under
  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
  * to by pathbuf.  The caller is responsible for free'ing the buffer from
  * the M_TEMP bucket if one is returned.
  */
 int
 kern_alternate_path(struct thread *td, const char *prefix, char *path,
     enum uio_seg pathseg, char **pathbuf, int create)
 {
 	struct nameidata nd, ndroot;
 	char *ptr, *buf, *cp;
 	size_t len, sz;
 	int error;
 
 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	*pathbuf = buf;
 
 	/* Copy the prefix into the new pathname as a starting point. */
 	len = strlcpy(buf, prefix, MAXPATHLEN);
 	if (len >= MAXPATHLEN) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (EINVAL);
 	}
 	sz = MAXPATHLEN - len;
 	ptr = buf + len;
 
 	/* Append the filename to the prefix. */
 	if (pathseg == UIO_SYSSPACE)
 		error = copystr(path, ptr, sz, &len);
 	else
 		error = copyinstr(path, ptr, sz, &len);
 
 	if (error) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (error);
 	}
 
 	/* Only use a prefix with absolute pathnames. */
 	if (*ptr != '/') {
 		error = EINVAL;
 		goto keeporig;
 	}
 
 	/*
 	 * We know that there is a / somewhere in this pathname.
 	 * Search backwards for it, to find the file's parent dir
 	 * to see if it exists in the alternate tree. If it does,
 	 * and we want to create a file (cflag is set). We don't
 	 * need to worry about the root comparison in this case.
 	 */
 
 	if (create) {
 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
 		*cp = '\0';
 
 		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
 		error = namei(&nd);
 		*cp = '/';
 		if (error != 0)
 			goto keeporig;
 	} else {
 		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
 
 		error = namei(&nd);
 		if (error != 0)
 			goto keeporig;
 
 		/*
 		 * We now compare the vnode of the prefix to the one
 		 * vnode asked. If they resolve to be the same, then we
 		 * ignore the match so that the real root gets used.
 		 * This avoids the problem of traversing "../.." to find the
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
 		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
 		    td);
 
 		/* We shouldn't ever get an error from this namei(). */
 		error = namei(&ndroot);
 		if (error == 0) {
 			if (nd.ni_vp == ndroot.ni_vp)
 				error = ENOENT;
 
 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
 			vrele(ndroot.ni_vp);
 			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
 		}
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
 
 keeporig:
 	/* If there was an error, use the original path name. */
 	if (error)
 		bcopy(ptr, buf, len);
 	return (error);
 }
Index: head/sys/kern/vfs_mount.c
===================================================================
--- head/sys/kern/vfs_mount.c	(revision 175201)
+++ head/sys/kern/vfs_mount.c	(revision 175202)
@@ -1,2309 +1,2309 @@
 /*-
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/clock.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include "opt_rootdevname.h"
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	ROOTNAME		"root_device"
 /*
  * Upper bound, in bytes, that vfs_buildopts() enforces on each option
  * name, each option value, and the running total for one option list.
  */
 #define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
 
 static int	vfs_domount(struct thread *td, const char *fstype,
 		    char *fspath, int fsflags, void *fsdata);
 static int	vfs_mountroot_ask(void);
 static int	vfs_mountroot_try(const char *mountfrom);
 static int	vfs_donmount(struct thread *td, int fsflags,
 		    struct uio *fsoptions);
 static void	free_mntarg(struct mntarg *ma);
 static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
 
 /* vfs.usermount sysctl knob; read-write, off by default. */
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 /* Zone that struct mount is allocated from (see vfs_mount_alloc()). */
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 /*
  * One mount option.  Options are kept on a vfsoptlist tail queue and all
  * three allocations (the entry, name and value) come from M_MOUNT.
  */
 TAILQ_HEAD(vfsoptlist, vfsopt);
 struct vfsopt {
 	TAILQ_ENTRY(vfsopt) link;	/* linkage on the owning vfsoptlist */
 	char	*name;			/* option name */
 	void	*value;			/* option value, NULL when len is 0 */
 	int	len;			/* size of value in bytes */
 };
 
 /*
  * The vnode of the system's root (/ in the filesystem, without chroot
  * active.)
  */
 struct vnode	*rootvnode;
 
 /*
  * The root filesystem is detailed in the kernel environment variable
  * vfs.root.mountfrom, which is expected to be in the general format
  *
  * <vfsname>:[<path>]
  * vfsname   := the name of a VFS known to the kernel and capable
  *              of being mounted as root
  * path      := disk device name or other data used by the filesystem
  *              to locate its physical store
  */
 
 /*
  * Global opts, taken by all filesystems
  * (NULL-terminated list of option names).
  */
 static const char *global_opts[] = {
 	"errmsg",
 	"fstype",
 	"fspath",
 	"ro",
 	"rw",
 	"nosuid",
 	"noexec",
 	"update",
 	NULL
 };
 
 /*
  * The root specifiers we will try if RB_CDROM is specified.
  * Each entry uses the <vfsname>:<path> format described above; the list
  * is NULL-terminated.
  */
 static char *cdrom_rootdevnames[] = {
 	"cd9660:cd0",
 	"cd9660:acd0",
 	NULL
 };
 
 /* legacy find-root code */
 char		*rootdevnames[2] = {NULL, NULL};
 /*
  * Root device name fixed at compile time through ROOTDEVNAME (which
  * opt_rootdevname.h may define); NULL when it is not defined.
  */
 #ifndef ROOTDEVNAME
 #  define ROOTDEVNAME NULL
 #endif
 static const char	*ctrootdevname = ROOTDEVNAME;
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
  */
 
 /*
  * Unlink one mount option from its list and free the entry together
  * with its name and, when present, its value.
  */
 static void
 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 {
 
 	TAILQ_REMOVE(opts, opt, link);
 	free(opt->name, M_MOUNT);
 #ifdef INVARIANTS
 	/* A value-less option must also claim a length of zero. */
 	if (opt->value == NULL && opt->len != 0)
 		panic("%s: mount option with NULL value but length != 0",
 		    __func__);
 #endif
 	if (opt->value != NULL)
 		free(opt->value, M_MOUNT);
 	free(opt, M_MOUNT);
 }
 
 /* Release all resources related to the mount options. */
 void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	while (!TAILQ_EMPTY(opts)) {
 		opt = TAILQ_FIRST(opts);
 		vfs_freeopt(opts, opt);
 	}
 	free(opts, M_MOUNT);
 }
 
 void
 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt, *temp;
 
 	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 		if (strcmp(opt->name, name) == 0)
 			vfs_freeopt(opts, opt);
 	}
 }
 
 /*
  * Tell whether two option names denote the same option, treating a
  * leading "no" on either side as a negation of the other name.
  * Returns 1 when they match, 0 otherwise.
  */
 static int
 vfs_equalopts(const char *opt1, const char *opt2)
 {
 	const int neg1 = (strncmp(opt1, "no", 2) == 0);
 	const int neg2 = (strncmp(opt2, "no", 2) == 0);
 
 	return (strcmp(opt1, opt2) == 0 ||		/* identical names */
 	    (neg1 && strcmp(opt1 + 2, opt2) == 0) ||	/* "noopt" vs. "opt" */
 	    (neg2 && strcmp(opt1, opt2 + 2) == 0));	/* "opt" vs. "noopt" */
 }
 
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
  * the last occurrence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *keep, *scan, *earlier;
 
 	/* Walk from the tail; everything before "keep" that matches goes. */
 	TAILQ_FOREACH_REVERSE(keep, opts, vfsoptlist, link) {
 		for (scan = TAILQ_PREV(keep, vfsoptlist, link); scan != NULL;
 		    scan = earlier) {
 			/* Grab the predecessor before scan can be freed. */
 			earlier = TAILQ_PREV(scan, vfsoptlist, link);
 			if (vfs_equalopts(keep->name, scan->name))
 				vfs_freeopt(opts, scan);
 		}
 	}
 }
 
 /*
  * Build a linked list of mount options from a struct uio.
  *
  * The iovecs are consumed in pairs: an even-indexed iovec holds an
  * option name, the following one its value (which may be empty).  The
  * data is copied from kernel or user space according to uio_segflg.
  *
  * On success 0 is returned and *options points at the new, sanitized
  * list, which the caller owns.  On failure an errno value is returned,
  * everything allocated here is freed and *options is left untouched.
  */
 static int
 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	size_t memused;
 	unsigned int i, iovcnt;
 	int error, namelen, optlen;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	memused = 0;
 	iovcnt = auio->uio_iovcnt;
 
 	/* A name without a value slot would make us read past uio_iov[]. */
 	if (iovcnt & 1) {
 		error = EINVAL;
 		goto bad;
 	}
 
 	for (i = 0; i < iovcnt; i += 2) {
 		/*
 		 * Bound the raw lengths while they are still size_t, so
 		 * that narrowing them to int below can neither truncate
 		 * nor turn them negative.
 		 */
 		if (auio->uio_iov[i].iov_len > VFS_MOUNTARG_SIZE_MAX ||
 		    auio->uio_iov[i + 1].iov_len > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 		namelen = auio->uio_iov[i].iov_len;
 		optlen = auio->uio_iov[i + 1].iov_len;
 
 		/*
 		 * Avoid consuming too much memory.  This is checked before
 		 * anything is allocated for the option.
 		 */
 		memused += sizeof(struct vfsopt) + optlen + namelen;
 		if (memused > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		/*
 		 * A name needs room for at least its terminating NUL;
 		 * without this the termination check below would read
 		 * name[-1].
 		 */
 		if (namelen == 0) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 		opt->value = NULL;
 		opt->len = 0;
 
 		/*
 		 * Do this early, so jumps to "bad" will free the current
 		 * option.
 		 */
 		TAILQ_INSERT_TAIL(opts, opt, link);
 
 		if (auio->uio_segflg == UIO_SYSSPACE) {
 			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 		} else {
 			error = copyin(auio->uio_iov[i].iov_base, opt->name,
 			    namelen);
 			if (error)
 				goto bad;
 		}
 		/* Ensure names are null-terminated strings. */
 		if (opt->name[namelen - 1] != '\0') {
 			error = EINVAL;
 			goto bad;
 		}
 		if (optlen != 0) {
 			opt->len = optlen;
 			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 			if (auio->uio_segflg == UIO_SYSSPACE) {
 				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 				    optlen);
 			} else {
 				error = copyin(auio->uio_iov[i + 1].iov_base,
 				    opt->value, optlen);
 				if (error)
 					goto bad;
 			}
 		}
 	}
 	vfs_sanitizeopts(opts);
 	*options = opts;
 	return (0);
 bad:
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Merge the old mount options with the new ones passed
  * in the MNT_UPDATE case.
  *
  * Every option in "opts" that "toopts" neither names again nor cancels
  * with a "no"-prefixed entry is duplicated onto the tail of "toopts".
  * A cancelling "no" entry is itself removed from "toopts".
  */
 static void
 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
 {
 	struct vfsopt *old, *cur, *copy;
 	int skip;
 
 	TAILQ_FOREACH(old, opts, link) {
 		/*
 		 * Check that this option hasn't been redefined
 		 * nor cancelled with a "no" mount option.
 		 */
 		skip = 0;
 		TAILQ_FOREACH(cur, toopts, link) {
 			if (strcmp(cur->name, old->name) == 0) {
 				skip = 1;
 				break;
 			}
 			if (strncmp(cur->name, "no", 2) == 0 &&
 			    strcmp(cur->name + 2, old->name) == 0) {
 				/* Leave the loop at once: cur is gone. */
 				vfs_freeopt(toopts, cur);
 				skip = 1;
 				break;
 			}
 		}
 		if (skip)
 			continue;
 
 		/* We want this option, duplicate it. */
 		copy = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		copy->name = malloc(strlen(old->name) + 1, M_MOUNT, M_WAITOK);
 		strcpy(copy->name, old->name);
 		copy->len = old->len;
 		copy->value = NULL;
 		if (old->len != 0) {
 			copy->value = malloc(old->len, M_MOUNT, M_WAITOK);
 			bcopy(old->value, copy->value, old->len);
 		}
 		TAILQ_INSERT_TAIL(toopts, copy, link);
 	}
 }
 
 /*
  * Mount a filesystem.
  *
  * System call entry point: copies in the option iovec array, rejects
  * malformed requests with EINVAL and passes the rest to vfs_donmount().
  */
 int
 nmount(td, uap)
 	struct thread *td;
 	struct nmount_args /* {
 		struct iovec *iovp;
 		unsigned int iovcnt;
 		int flags;
 	} */ *uap;
 {
 	struct uio *auio;
 	u_int iovcnt, idx;
 	int error;
 
 	AUDIT_ARG(fflags, uap->flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
 	 */
 	uap->flags &= ~MNT_ROOTFS;
 
 	/*
 	 * Options come as name/value pairs, so the count must be even,
 	 * and at least two options (four iovecs) are required.
 	 */
 	iovcnt = uap->iovcnt;
 	if (iovcnt < 4 || (iovcnt % 2) != 0)
 		return (EINVAL);
 
 	error = copyinuio(uap->iovp, iovcnt, &auio);
 	if (error)
 		return (error);
 
 	/* No single name or value may exceed MMAXOPTIONLEN. */
 	for (idx = 0; idx < iovcnt; idx++) {
 		if (auio->uio_iov[idx].iov_len > MMAXOPTIONLEN) {
 			error = EINVAL;
 			break;
 		}
 	}
 	if (error == 0)
 		error = vfs_donmount(td, uap->flags, auio);
 
 	free(auio, M_IOV);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Various utility functions
  */
 
 /*
  * Apply MNT_REF() to the mount point while holding its interlock
  * (MNT_ILOCK/MNT_IUNLOCK).
  */
 void
 vfs_ref(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Apply MNT_REL() to the mount point while holding its interlock
  * (MNT_ILOCK/MNT_IUNLOCK); the counterpart of vfs_ref().
  */
 void
 vfs_rel(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Set up the mutex and the lock embedded in a struct mount.  "size" and
  * "flags" are not used.  Always returns 0.
  * NOTE(review): presumably the init callback of mount_zone; the zone
  * creation is not visible here.
  */
 static int
 mount_init(void *mem, int size, int flags)
 {
 	struct mount *mp = mem;
 
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
 	return (0);
 }
 
 /*
  * Tear down what mount_init() set up, in the reverse order: first the
  * lock, then the mutex.  "size" is not used.
  */
 static void
 mount_fini(void *mem, int size)
 {
 	struct mount *mp = mem;
 
 	lockdestroy(&mp->mnt_lock);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 /*
  * Allocate and initialize the mount point struct.
  *
  * vp is recorded as the covered vnode, vfsp supplies the filesystem
  * operations and type name, fspath becomes the mounted-on name and the
  * credentials of td become the mount's owner.  The allocation uses
  * M_WAITOK.  The new mount has been passed through vfs_busy() before it
  * is returned, and vfsp's reference count has been incremented.
  */
 struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
     const char *fspath, struct thread *td)
 {
 	struct mount *mp;
 
 	mp = uma_zalloc(mount_zone, M_WAITOK);
 	/* Only the mnt_startzero..mnt_endzero range is cleared. */
 	bzero(&mp->mnt_startzero,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	mp->mnt_ref = 0;
 	/* The result is deliberately ignored. */
 	(void) vfs_busy(mp, LK_NOWAIT, 0, td);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	vfsp->vfc_refcount++;	/* XXX Unlocked */
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	/* Incremented, not assigned: mnt_gen is not reset by this function. */
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	/* The mount keeps its own copy of the caller's credentials. */
 	mp->mnt_cred = crdup(td->td_ucred);
 	mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
 	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
 	mp->mnt_iosize_max = DFLTPHYS;
 #ifdef MAC
 	mac_mount_init(mp);
 	mac_mount_create(td->td_ucred, mp);
 #endif
 	/* Random per-mount hash seed. */
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  *
  * Waits (with a bounded wait for plain references, unbounded waits for
  * holds and in-flight writes) until the mount is quiescent, panics if
  * vnodes are still attached or a counter is inconsistent, then releases
  * the options, the credentials and the structure itself.
  */
 void
 vfs_mount_destroy(struct mount *mp)
 {
 	int i;
 
 	MNT_ILOCK(mp);
 	/* Give references up to three sleeps of hz ticks each to drain. */
 	for (i = 0; mp->mnt_ref && i < 3; i++)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
 	/*
 	 * This will always cause a 3 second delay in rebooting due to
 	 * refs on the root mountpoint that never go away.  Most of these
 	 * are held by init which never exits.
 	 */
 	if (i == 3 && (!rebooting || bootverbose))
 		printf("Mount point %s had %d dangling refs\n",
 		    mp->mnt_stat.f_mntonname, mp->mnt_ref);
 	/* Holds are waited for without a timeout. */
 	if (mp->mnt_holdcnt != 0) {
 		printf("Waiting for mount point to be unheld\n");
 		while (mp->mnt_holdcnt != 0) {
 			mp->mnt_holdcntwaiters++;
 			msleep(&mp->mnt_holdcnt, MNT_MTX(mp),
 			       PZERO, "mntdestroy", 0);
 			mp->mnt_holdcntwaiters--;
 		}
 		printf("mount point unheld\n");
 	}
 	/* MNTK_SUSPEND is set again before every sleep on the write counts. */
 	if (mp->mnt_writeopcount > 0) {
 		printf("Waiting for mount point write ops\n");
 		while (mp->mnt_writeopcount > 0) {
 			mp->mnt_kern_flag |= MNTK_SUSPEND;
 			msleep(&mp->mnt_writeopcount,
 			       MNT_MTX(mp),
 			       PZERO, "mntdestroy2", 0);
 		}
 		printf("mount point write ops completed\n");
 	}
 	if (mp->mnt_secondary_writes > 0) {
 		printf("Waiting for mount point secondary write ops\n");
 		while (mp->mnt_secondary_writes > 0) {
 			mp->mnt_kern_flag |= MNTK_SUSPEND;
 			msleep(&mp->mnt_secondary_writes,
 			       MNT_MTX(mp),
 			       PZERO, "mntdestroy3", 0);
 		}
 		printf("mount point secondary write ops completed\n");
 	}
 	MNT_IUNLOCK(mp);
 	/* Decremented without the interlock, like the increment at alloc. */
 	mp->mnt_vfc->vfc_refcount--;
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
 		/* Print every leftover vnode before giving up. */
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
 			vprint("", vp);
 		panic("unmount: dangling vnode");
 	}
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_MWAIT)
 		wakeup(mp);
 	if (mp->mnt_writeopcount != 0)
 		panic("vfs_mount_destroy: nonzero writeopcount");
 	if (mp->mnt_secondary_writes != 0)
 		panic("vfs_mount_destroy: nonzero secondary_writes");
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
 	/*
 	 * Leave conspicuous negative values in the counters.
 	 * NOTE(review): presumably to make use-after-destroy stand out.
 	 */
 	mp->mnt_writeopcount = -1000;
 	mp->mnt_nvnodelistsize = -1000;
 	mp->mnt_secondary_writes = -1000;
 	MNT_IUNLOCK(mp);
 #ifdef MAC
 	mac_mount_destroy(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	crfree(mp->mnt_cred);
 	uma_zfree(mount_zone, mp);
 }
 
 /*
  * Back end of nmount(): turn the option iovecs in fsoptions into an
  * option list, fold the generic options into fsflags and call
  * vfs_domount() with Giant held.
  *
  * "fstype" and "fspath" are mandatory and must be non-empty,
  * NUL-terminated strings.  If the caller supplied an "errmsg" option,
  * its buffer is copied back to the caller's iovec on the way out.
  * Returns 0 or an errno value; the option list is freed here only when
  * an error is returned.
  */
 static int
 vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
 	struct vfsopt *opt, *noro_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 	int has_rw, has_noro;
 
 	errmsg = NULL;
 	errmsg_len = 0;
 	errmsg_pos = -1;
 	has_rw = 0;
 	has_noro = 0;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
 	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
 	 * Ensure they are NUL terminated as well.
 	 *
 	 * An option that is present but has an empty value comes back with
 	 * a NULL pointer and a length of 0, so the length must be checked
 	 * before the last byte is inspected.
 	 */
 	fstypelen = 0;
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fstype", errmsg_len);
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fspath", errmsg_len);
 		goto bail;
 	}
 
 	/*
 	 * We need to see if we have the "update" option
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
 	 *
 	 * Positive forms of negatable flags ("atime", "exec", ...) are
 	 * renamed to "nono<flag>" rather than changing fsflags.
 	 */
 	TAILQ_FOREACH(opt, optlist, link) {
 		if (strcmp(opt->name, "update") == 0)
 			fsflags |= MNT_UPDATE;
 		else if (strcmp(opt->name, "async") == 0)
 			fsflags |= MNT_ASYNC;
 		else if (strcmp(opt->name, "force") == 0)
 			fsflags |= MNT_FORCE;
 		else if (strcmp(opt->name, "multilabel") == 0)
 			fsflags |= MNT_MULTILABEL;
 		else if (strcmp(opt->name, "noasync") == 0)
 			fsflags &= ~MNT_ASYNC;
 		else if (strcmp(opt->name, "noatime") == 0)
 			fsflags |= MNT_NOATIME;
 		else if (strcmp(opt->name, "atime") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoatime", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterr") == 0)
 			fsflags |= MNT_NOCLUSTERR;
 		else if (strcmp(opt->name, "clusterr") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterr", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterw") == 0)
 			fsflags |= MNT_NOCLUSTERW;
 		else if (strcmp(opt->name, "clusterw") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterw", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noexec") == 0)
 			fsflags |= MNT_NOEXEC;
 		else if (strcmp(opt->name, "exec") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoexec", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosuid") == 0)
 			fsflags |= MNT_NOSUID;
 		else if (strcmp(opt->name, "suid") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosuid", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "symfollow") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosymfollow", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noro") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			has_noro = 1;
 		}
 		else if (strcmp(opt->name, "rw") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			has_rw = 1;
 		}
 		else if (strcmp(opt->name, "ro") == 0)
 			fsflags |= MNT_RDONLY;
 		else if (strcmp(opt->name, "rdonly") == 0) {
 			/* "rdonly" is an alias; store it as "ro". */
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("ro", M_MOUNT);
 			fsflags |= MNT_RDONLY;
 		}
 		else if (strcmp(opt->name, "snapshot") == 0)
 			fsflags |= MNT_SNAPSHOT;
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
 	}
 
 	/*
 	 * If "rw" was specified as a mount option, and we
 	 * are trying to update a mount-point from "ro" to "rw",
 	 * we need a mount option "noro", since in vfs_mergeopts(),
 	 * "noro" will cancel "ro", but "rw" will not do anything.
 	 */
 	if (has_rw && !has_noro) {
 		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		noro_opt->name = strdup("noro", M_MOUNT);
 		noro_opt->value = NULL;
 		noro_opt->len = 0;
 		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
 	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	mtx_lock(&Giant);
 	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
 	mtx_unlock(&Giant);
 bail:
 	/*
 	 * copyout the errmsg: option number errmsg_pos has its value in
 	 * iovec 2 * errmsg_pos + 1.
 	 */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
 	    && errmsg_len > 0 && errmsg != NULL) {
 		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
 			bcopy(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} else {
 			copyout(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		}
 	}
 
 	if (error != 0)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
  * Old mount API.
  *
  * mount(2) entry point taking a filesystem type name, a mount path, a
  * flags word and a filesystem-specific data pointer.  The type name is
  * copied in and used to look up the filesystem; the common arguments are
  * then packed into a mntarg list and handed, together with the raw data
  * pointer, to the filesystem's vfs_cmount method.
  *
  * Returns the copyinstr() error if the type name cannot be copied in,
  * ENOENT if no filesystem of that name is found, EOPNOTSUPP if the
  * filesystem has no vfs_cmount method, otherwise whatever vfs_cmount
  * returns.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mount_args {
 	char	*type;
 	char	*path;
 	int	flags;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 mount(td, uap)
 	struct thread *td;
 	struct mount_args /* {
 		char *type;
 		char *path;
 		int flags;
 		caddr_t data;
 	} */ *uap;
 {
 	char *fstype;
 	struct vfsconf *vfsp = NULL;
 	struct mntarg *ma = NULL;
 	int error;
 
 	AUDIT_ARG(fflags, uap->flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
 	 */
 	uap->flags &= ~MNT_ROOTFS;
 
 	/* Copy the type name in; it is only needed for the lookup below. */
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
 	if (error) {
 		free(fstype, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG(text, fstype);
 	mtx_lock(&Giant);
 	/*
 	 * NOTE(review): the error stored by vfs_byname_kld() is not
 	 * propagated; a failed lookup is always reported as ENOENT.
 	 */
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
 	if (vfsp == NULL) {
 		mtx_unlock(&Giant);
 		return (ENOENT);
 	}
 	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
 		mtx_unlock(&Giant);
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Build the common arguments.  mount_argb() drops the leading "no"
 	 * from the name when its flag argument is true, so these add
 	 * "ro"/"noro", "suid"/"nosuid" and "exec"/"noexec" respectively.
 	 */
 	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
 	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
 
 	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 
 /*
  * vfs_domount(): actually attempt a filesystem mount.
  *
  * Must be called with Giant held.  Handles both a fresh mount and an
  * update of an existing mount (MNT_UPDATE set in fsflags).  fsdata is
  * stored in mp->mnt_optnew for the duration of the VFS_MOUNT() call and
  * becomes mp->mnt_opt on success.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 vfs_domount(
 	struct thread *td,	/* Calling thread. */
 	const char *fstype,	/* Filesystem type. */
 	char *fspath,		/* Mount path. */
 	int fsflags,		/* Flags common to all filesystems. */
 	void *fsdata		/* Options local to the filesystem. */
 	)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vfsconf *vfsp;
 	struct export_args export;
 	int error, flag = 0;
 	struct vattr va;
 	struct nameidata nd;
 
 	mtx_assert(&Giant, MA_OWNED);
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	/* Jailed callers, or any caller when usermount is off, need privilege. */
 	if (jailed(td->td_ucred) || usermount == 0) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
 	if (fsflags & MNT_EXPORTED) {
 		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
 		if (error)
 			return (error);
 	}
 	if (fsflags & MNT_SUIDDIR) {
 		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
 	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
 		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
 			fsflags |= MNT_NOSUID | MNT_USER;
 	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	vfsp = NULL;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* Don't try to load KLDs if we're mounting the root. */
 		if (fsflags & MNT_ROOTFS)
 			vfsp = vfs_byname(fstype);
 		else
 			vfsp = vfs_byname_kld(fstype, td, &error);
 		if (vfsp == NULL)
 			return (ENODEV);
 		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
 			return (EPERM);
 	}
 	/*
 	 * Get vnode to be covered
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
 	    fspath, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (fsflags & MNT_UPDATE) {
 		/* An update must name the root vnode of a mounted filesystem. */
 		if ((vp->v_vflag & VV_ROOT) == 0) {
 			vput(vp);
 			return (EINVAL);
 		}
 		mp = vp->v_mount;
 		MNT_ILOCK(mp);
 		/* Remember the current flags so a failed update can restore them. */
 		flag = mp->mnt_flag;
 		/*
 		 * We only allow the filesystem to be reloaded if it
 		 * is currently mounted read-only.
 		 */
 		if ((fsflags & MNT_RELOAD) &&
 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 			MNT_IUNLOCK(mp);
 			vput(vp);
 			return (EOPNOTSUPP);	/* Needs translation */
 		}
 		MNT_IUNLOCK(mp);
 		/*
 		 * Only privileged root, or (if MNT_USER is set) the user that
 		 * did the original mount is permitted to update it.
 		 */
 		error = vfs_suser(mp, td);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 		if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
 			vput(vp);
 			return (EBUSY);
 		}
 		/* Fail if a mount is already in progress or stacked on vp. */
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) != 0 ||
 		    vp->v_mountedhere != NULL) {
 			VI_UNLOCK(vp);
 			vfs_unbusy(mp, td);
 			vput(vp);
 			return (EBUSY);
 		}
 		vp->v_iflag |= VI_MOUNT;
 		VI_UNLOCK(vp);
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= fsflags &
 		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
 		MNT_IUNLOCK(mp);
 		VOP_UNLOCK(vp, 0, td);
 		mp->mnt_optnew = fsdata;
 		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
 	} else {
 		/*
 		 * If the user is not root, ensure that they own the directory
 		 * onto which we are attempting to mount.
 		 */
 		error = VOP_GETATTR(vp, &va, td->td_ucred, td);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 		if (va.va_uid != td->td_ucred->cr_uid) {
 			error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
 			    0);
 			if (error) {
 				vput(vp);
 				return (error);
 			}
 		}
 		error = vinvalbuf(vp, V_SAVE, td, 0, 0);
 		if (error != 0) {
 			vput(vp);
 			return (error);
 		}
 		if (vp->v_type != VDIR) {
 			vput(vp);
 			return (ENOTDIR);
 		}
 		/* Fail if a mount is already in progress or present on vp. */
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) != 0 ||
 		    vp->v_mountedhere != NULL) {
 			VI_UNLOCK(vp);
 			vput(vp);
 			return (EBUSY);
 		}
 		vp->v_iflag |= VI_MOUNT;
 		VI_UNLOCK(vp);
 
 		/*
 		 * Allocate and initialize the filesystem.
 		 */
 		mp = vfs_mount_alloc(vp, vfsp, fspath, td);
 		VOP_UNLOCK(vp, 0, td);
 
 		/* XXXMAC: pass to vfs_mount_alloc? */
 		mp->mnt_optnew = fsdata;
 	}
 
 	/*
 	 * Set the mount level flags.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
 		(fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
 			    MNT_RDONLY));
 	if ((mp->mnt_flag & MNT_ASYNC) == 0)
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
         error = VFS_MOUNT(mp, td);
 
 	/*
 	 * Process the export option only if we are
 	 * updating mount options.
 	 */
 	if (!error && (fsflags & MNT_UPDATE)) {
 		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
 		    sizeof(export)) == 0)
 			error = vfs_export(mp, &export);
 	}
 
 	/* On success the new option list replaces the old one. */
 	if (!error) {
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
 		mp->mnt_opt = mp->mnt_optnew;
 		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 	}
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	*/
 	mp->mnt_optnew = NULL;
 	if (mp->mnt_flag & MNT_UPDATE) {
 		MNT_ILOCK(mp);
 		/*
 		 * A failed update restores the saved flags, keeping only the
 		 * current MNT_QUOTA bit; a successful one clears the
 		 * transient update flags.
 		 */
 		if (error)
 			mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
 				(flag & ~MNT_QUOTA);
 		else
 			mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD |
 					  MNT_FORCE | MNT_SNAPSHOT);
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		else
 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
 		MNT_IUNLOCK(mp);
 		/* Keep a syncer vnode only while mounted read-write. */
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			if (mp->mnt_syncer == NULL)
 				error = vfs_allocate_syncvnode(mp);
 		} else {
 			if (mp->mnt_syncer != NULL)
 				vrele(mp->mnt_syncer);
 			mp->mnt_syncer = NULL;
 		}
 		vfs_unbusy(mp, td);
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		/* vp was unlocked above, so only the reference is dropped. */
 		vrele(vp);
 		return (error);
 	}
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * Put the new filesystem on the mount list after root.
 	 */
 	cache_purge(vp);
 	if (!error) {
 		struct vnode *newdp;
 
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vp->v_mountedhere = mp;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 		mtx_unlock(&mountlist_mtx);
 		vfs_event_signal(NULL, VQ_MOUNT, 0);
 		if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td))
 			panic("mount: lost mount");
 		mountcheckdirs(vp, newdp);
 		vput(newdp);
 		VOP_UNLOCK(vp, 0, td);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
 			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, td);
 		if (error)
 			vrele(vp);
 	} else {
 		/* The mount failed: undo the VI_MOUNT mark and free mp. */
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vfs_unbusy(mp, td);
 		vfs_mount_destroy(mp);
 		vput(vp);
 	}
 	return (error);
 }
 
 /*
  * Unmount a filesystem.
  *
  * Note: unmount takes a path to the vnode mounted on as argument, not
  * special file (as before).
  *
  * With MNT_BYFSID set in the flags, the path string is instead parsed as
  * "FSID:<val0>:<val1>" and matched against each mount's f_fsid.  The
  * mount list is searched from the tail towards the head, and the first
  * match is handed to dounmount().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
 	char	*path;
 	int	flags;
 };
 #endif
 /* ARGSUSED */
 int
 unmount(td, uap)
 	struct thread *td;
 	register struct unmount_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	struct mount *mp;
 	char *pathbuf;
 	int error, id0, id1;
 
 	/* Jailed callers, or any caller when usermount is off, need privilege. */
 	if (jailed(td->td_ucred) || usermount == 0) {
 		error = priv_check(td, PRIV_VFS_UNMOUNT);
 		if (error)
 			return (error);
 	}
 
 	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
 	if (error) {
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
 	AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
 	mtx_lock(&Giant);
 	if (uap->flags & MNT_BYFSID) {
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
 			mtx_unlock(&Giant);
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		/* mp is left NULL by the loop when nothing matches. */
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
 			    mp->mnt_stat.f_fsid.val[1] == id1)
 				break;
 		}
 		mtx_unlock(&mountlist_mtx);
 	} else {
 		/* Match the recorded mount-point name exactly. */
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
 				break;
 		}
 		mtx_unlock(&mountlist_mtx);
 	}
 	free(pathbuf, M_TEMP);
 	if (mp == NULL) {
 		/*
 		 * Previously we returned ENOENT for a nonexistent path and
 		 * EINVAL for a non-mountpoint.  We cannot tell these apart
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
 		mtx_unlock(&Giant);
 		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
 	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		mtx_unlock(&Giant);
 		return (EINVAL);
 	}
 	error = dounmount(mp, uap->flags, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Do the actual filesystem unmount.
  *
  * Must be called with Giant held.  On success the mount is removed from
  * the mount list and destroyed, and 0 is returned.  On failure the mount
  * is put back into a usable state and an errno value is returned.
  */
 int
 dounmount(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *coveredvp, *fsrootvp;
 	int error;
 	int async_flag;
 	int mnt_gen_r;
 
 	mtx_assert(&Giant, MA_OWNED);
 
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		/* Record the generation to detect a racing unmount below. */
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
-		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td);
+		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
 		vdrop(coveredvp);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
 		 */
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp, 0, td);
 			return (EBUSY);
 		}
 	}
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 */
 	error = vfs_suser(mp, td);
 	if (error) {
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 
 	MNT_ILOCK(mp);
 	/* Another unmount of this filesystem is already in progress. */
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE)
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 	/*
 	 * Drain the mount lock; LK_NOWAIT is requested unless the unmount
 	 * is forced.  LK_INTERLOCK passes the mount interlock to lockmgr().
 	 */
 	error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
 	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td);
 	if (error) {
 		/* Could not drain: clear the unmount marks and wake waiters. */
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
 		    MNTK_UNMOUNTF);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 	vn_start_write(NULL, &mp, V_WAIT);
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	vfs_msync(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	/* Save the async setting so it can be restored if the unmount fails. */
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
 	if (mp->mnt_syncer != NULL)
 		vrele(mp->mnt_syncer);
 	/*
 	 * For forced unmounts, move process cdir/rdir refs on the fs root
 	 * vnode to the covered vnode.  For non-forced unmounts we want
 	 * such references to cause an EBUSY error.
 	 */
 	if ((flags & MNT_FORCE) &&
 	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
 		if (mp->mnt_vnodecovered != NULL)
 			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
 		if (fsrootvp == rootvnode) {
 			vrele(rootvnode);
 			rootvnode = NULL;
 		}
 		vput(fsrootvp);
 	}
 	/*
 	 * VFS_UNMOUNT() is attempted when the filesystem is read-only, when
 	 * VFS_SYNC() succeeds, or when the unmount is forced.  Otherwise
 	 * error holds the VFS_SYNC() failure.
 	 */
 	if (((mp->mnt_flag & MNT_RDONLY) ||
 	     (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
 	    (flags & MNT_FORCE)) {
 		error = VFS_UNMOUNT(mp, flags, td);
 	}
 	vn_finished_write(mp);
 	/*
 	 * If we failed to flush the dirty blocks for this mount point,
 	 * undo all the cdir/rdir and rootvnode changes we made above.
 	 * Unless we failed to do so because the device is reporting that
 	 * it doesn't exist anymore.
 	 */
 	if (error && error != ENXIO) {
 		if ((flags & MNT_FORCE) &&
 		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
 			if (mp->mnt_vnodecovered != NULL)
 				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
 			if (rootvnode == NULL) {
 				rootvnode = fsrootvp;
 				vref(rootvnode);
 			}
 			vput(fsrootvp);
 		}
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
 		/* The interlock is dropped around the syncer allocation. */
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
 			MNT_IUNLOCK(mp);
 			(void) vfs_allocate_syncvnode(mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 	/* Success (or ENXIO): unlink the mount and tear it down. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
 		vput(coveredvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 	vfs_mount_destroy(mp);
 	return (0);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Mounting of root filesystem
  *
  */
 
 /* One outstanding request to delay mounting of the root filesystem. */
 struct root_hold_token {
 	const char			*who;	/* Holder's name, printed while waiting. */
 	LIST_ENTRY(root_hold_token)	list;	/* Linkage on root_holds. */
 };
 
 /* All outstanding holds; manipulated with mountlist_mtx held. */
 static LIST_HEAD(, root_hold_token)	root_holds =
     LIST_HEAD_INITIALIZER(&root_holds);
 
 /* Set to 1 by root_mount_done() once the root filesystem is mounted. */
 static int root_mount_complete;
 
 /*
  * Hold root mount.
  *
  * Allocates a token recording the given identifier, links it at the head
  * of the root_holds list and returns it; the caller passes the token to
  * root_mount_rel() to release the hold.
  */
 struct root_hold_token *
 root_mount_hold(const char *identifier)
 {
 	struct root_hold_token *token;
 
 	token = malloc(sizeof(*token), M_DEVBUF, M_ZERO | M_WAITOK);
 	token->who = identifier;
 
 	mtx_lock(&mountlist_mtx);
 	LIST_INSERT_HEAD(&root_holds, token, list);
 	mtx_unlock(&mountlist_mtx);
 
 	return (token);
 }
 
 /*
  * Release root mount.
  *
  * Unlinks the token from the root_holds list, wakes any thread sleeping
  * on &root_holds (root_mount_prepare() sleeps there), and frees the token.
  */
 void
 root_mount_rel(struct root_hold_token *h)
 {
 
 	mtx_lock(&mountlist_mtx);
 	LIST_REMOVE(h, list);
 	wakeup(&root_holds);
 	mtx_unlock(&mountlist_mtx);
 	free(h, M_DEVBUF);
 }
 
 /*
  * Wait for all subsystems to release root mount.
  *
  * Loops until the root_holds list is empty.  Each pass first waits for
  * g_waitidle() with Giant dropped, then, if holds remain, prints the
  * names of the holders and sleeps on &root_holds with a timeout of hz
  * ticks.
  */
 static void
 root_mount_prepare(void)
 {
 	struct root_hold_token *h;
 
 	for (;;) {
 		DROP_GIANT();
 		g_waitidle();
 		PICKUP_GIANT();
 		mtx_lock(&mountlist_mtx);
 		if (LIST_EMPTY(&root_holds)) {
 			mtx_unlock(&mountlist_mtx);
 			break;
 		}
 		printf("Root mount waiting for:");
 		LIST_FOREACH(h, &root_holds, list)
 			printf(" %s", h->who);
 		printf("\n");
 		/* PDROP: mountlist_mtx is not re-acquired when msleep() returns. */
 		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
 		    hz);
 	}
 }
 
 /*
  * Root was mounted, share the good news.
  *
  * Sets root_mount_complete and wakes threads sleeping on it in
  * root_mount_wait().
  */
 static void
 root_mount_done(void)
 {
 
 	/*
 	 * Use a mutex to prevent the wakeup being missed and waiting for
 	 * an extra 1 second sleep.
 	 */
 	mtx_lock(&mountlist_mtx);
 	root_mount_complete = 1;
 	wakeup(&root_mount_complete);
 	mtx_unlock(&mountlist_mtx);
 }
 
 /*
  * Return true if root is already mounted.
  *
  * This is a non-blocking query of root_mount_complete; use
  * root_mount_wait() to sleep until the root filesystem is mounted.
  */
 int
 root_mounted(void)
 {
 
 	/* No mutex is acquired here because int stores are atomic. */
 	return (root_mount_complete);
 }
 
 /*
  * Wait until root is mounted.
  *
  * Sleeps on &root_mount_complete, re-checking the flag after every wakeup
  * or after hz ticks at the latest, and returns once the flag is set.
  */
 void
 root_mount_wait(void)
 {
 
 	/*
 	 * The thread running SYSINIT (pid 0) is the one that mounts root;
 	 * waiting from it would deadlock, so assert against that.
 	 */
 	KASSERT(curthread->td_proc->p_pid != 0,
 	    ("root_mount_wait: cannot be called from the swapper thread"));
 
 	mtx_lock(&mountlist_mtx);
 	for (;;) {
 		if (root_mount_complete)
 			break;
 		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
 		    hz);
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 
 /*
  * Make the root vnode of the first filesystem on the mount list the
  * global rootvnode, and point the calling thread's process at it as both
  * current directory and root directory.  Panics if VFS_ROOT() fails.
  */
 static void
 set_rootvnode(struct thread *td)
 {
 	struct proc *p;
 
 	/* VFS_ROOT() is asked for the vnode with LK_EXCLUSIVE. */
 	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
 		panic("Cannot find root vnode");
 
 	p = td->td_proc;
 	/*
 	 * NOTE(review): fd_cdir and fd_rdir are written below while holding
 	 * FILEDESC_SLOCK, presumably the shared form of the lock -- confirm
 	 * that an exclusive lock is not required here.
 	 */
 	FILEDESC_SLOCK(p->p_fd);
 
 	/* Each of cdir and rdir takes its own reference on rootvnode. */
 	if (p->p_fd->fd_cdir != NULL)
 		vrele(p->p_fd->fd_cdir);
 	p->p_fd->fd_cdir = rootvnode;
 	VREF(rootvnode);
 
 	if (p->p_fd->fd_rdir != NULL)
 		vrele(p->p_fd->fd_rdir);
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(rootvnode);
 
 	FILEDESC_SUNLOCK(p->p_fd);
 
 	VOP_UNLOCK(rootvnode, 0, td);
 }
 
 /*
  * Mount devfs as our temporary root filesystem and create a "dev" -> "/"
  * symlink in it, so that absolute pathnames beginning with /dev resolve
  * while devfs itself is the root.  The mount is placed at the head of the
  * mount list and devfs_fixup() later moves it onto /dev.
  */
 
 static void
 devfs_first(void)
 {
 	struct thread *td = curthread;
 	struct vfsoptlist *opts;
 	struct vfsconf *vfsp;
 	struct mount *mp = NULL;
 	int error;
 
 	vfsp = vfs_byname("devfs");
 	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
 	/* Explicit check as well as the KASSERT above. */
 	if (vfsp == NULL)
 		return;
 
 	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td);
 
 	error = VFS_MOUNT(mp, td);
 	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
 	if (error)
 		return;
 
 	/* Give the mount an empty option list. */
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	mp->mnt_opt = opts;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 
 	/* devfs is now first on the mount list, so it becomes the root. */
 	set_rootvnode(td);
 
 	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
 	if (error)
 		printf("kern_symlink /dev -> / returns %d\n", error);
 }
 
 /*
  * Surgically move our devfs to be mounted on /dev.
  */
 
 static void
 devfs_fixup(struct thread *td)
 {
 	struct nameidata nd;
 	int error;
 	struct vnode *vp, *dvp;
 	struct mount *mp;
 
 	/* Remove our devfs mount from the mountlist and purge the cache */
 	mtx_lock(&mountlist_mtx);
 	mp = TAILQ_FIRST(&mountlist);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	cache_purgevfs(mp);
 
 	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
 	VI_LOCK(dvp);
 	dvp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(dvp);
 	dvp->v_mountedhere = NULL;
 
 	/* Set up the real rootvnode, and purge the cache */
 	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
 	set_rootvnode(td);
 	cache_purgevfs(rootvnode->v_mount);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
 	error = namei(&nd);
 	if (error) {
 		printf("Lookup of /dev for devfs, error: %d\n", error);
 		return;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		vput(vp);
 	}
 	error = vinvalbuf(vp, V_SAVE, td, 0, 0);
 	if (error) {
 		vput(vp);
 	}
 	cache_purge(vp);
 	mp->mnt_vnodecovered = vp;
 	vp->v_mountedhere = mp;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	VOP_UNLOCK(vp, 0, td);
 	vput(dvp);
 	vfs_unbusy(mp, td);
 
 	/* Unlink the no longer needed /dev/dev -> / symlink */
 	kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
 }
 
 /*
  * Report errors during filesystem mounting.
  */
 void
 vfs_mount_error(struct mount *mp, const char *fmt, ...)
 {
 	struct vfsoptlist *moptlist = mp->mnt_optnew;
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * Find and mount the root filesystem
  *
  * After waiting for outstanding root holds and mounting devfs as a
  * temporary root, candidate root specifications are tried in a fixed
  * order until one mounts.  vfs_mountroot_try() and vfs_mountroot_ask()
  * return 0 on success, hence the negated tests below.  Panics if every
  * candidate fails.
  */
 void
 vfs_mountroot(void)
 {
 	char *cp;
 	int error, i, asked = 0;
 
 	root_mount_prepare();
 
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
 	    NULL, NULL, mount_init, mount_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	devfs_first();
 
 	/*
 	 * We are booted with instructions to prompt for the root filesystem.
 	 */
 	if (boothowto & RB_ASKNAME) {
 		if (!vfs_mountroot_ask())
 			goto mounted;
 		/* Remember that we prompted, so we do not prompt again below. */
 		asked = 1;
 	}
 
 	/*
 	 * The root filesystem information is compiled in, and we are
 	 * booted with instructions to use it.
 	 */
 	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
 		if (!vfs_mountroot_try(ctrootdevname))
 			goto mounted;
 		/* It failed; clear it so it is not retried further down. */
 		ctrootdevname = NULL;
 	}
 
 	/*
 	 * We've been given the generic "use CDROM as root" flag.  This is
 	 * necessary because one media may be used in many different
 	 * devices, so we need to search for them.
 	 */
 	if (boothowto & RB_CDROM) {
 		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
 			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
 				goto mounted;
 		}
 	}
 
 	/*
 	 * Try to use the value read by the loader from /etc/fstab, or
 	 * supplied via some other means.  This is the preferred
 	 * mechanism.
 	 */
 	cp = getenv("vfs.root.mountfrom");
 	if (cp != NULL) {
 		error = vfs_mountroot_try(cp);
 		freeenv(cp);
 		if (!error)
 			goto mounted;
 	}
 
 	/*
 	 * Try values that may have been computed by code during boot
 	 */
 	if (!vfs_mountroot_try(rootdevnames[0]))
 		goto mounted;
 	if (!vfs_mountroot_try(rootdevnames[1]))
 		goto mounted;
 
 	/*
 	 * If we (still) have a compiled-in default, try it.
 	 */
 	if (ctrootdevname != NULL)
 		if (!vfs_mountroot_try(ctrootdevname))
 			goto mounted;
 	/*
 	 * Everything so far has failed, prompt on the console if we haven't
 	 * already tried that.
 	 */
 	if (!asked)
 		if (!vfs_mountroot_ask())
 			goto mounted;
 
 	panic("Root mount failed, startup aborted.");
 
 mounted:
 	root_mount_done();
 }
 
 /*
  * Mount (mountfrom) as the root filesystem.
  *
  * mountfrom is parsed as "<fstype>:<path>", where <fstype> consists of
  * the characters [a-z0-9]; when no path is given ROOTNAME is used.  The
  * filesystem is mounted read-only on "/" with MNT_ROOTFS set.  On success
  * the clock is initialized from the newest mnt_time on the mount list
  * and devfs is moved onto /dev.
  *
  * Returns 0 on success, EINVAL if mountfrom is NULL or cannot be parsed,
  * otherwise the error from kernel_vmount().
  */
 static int
 vfs_mountroot_try(const char *mountfrom)
 {
 	struct mount	*mp;
 	char		*vfsname, *path;
 	time_t		timebase;
 	int		error;
 	char		patt[32];
 
 	vfsname = NULL;
 	path    = NULL;
 	mp      = NULL;
 	error   = EINVAL;
 
 	if (mountfrom == NULL)
 		return (error);		/* don't complain */
 	printf("Trying to mount root from %s\n", mountfrom);
 
 	/* parse vfs name and path */
 	vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
 	path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
 	vfsname[0] = path[0] = 0;
 	/*
 	 * A scanf field width limits the number of characters stored; the
 	 * terminating NUL is written after them.  Use one less than each
 	 * buffer size so the terminator always lands inside the buffer.
 	 */
 	snprintf(patt, sizeof(patt), "%%%d[a-z0-9]:%%%ds",
 	    MFSNAMELEN - 1, MNAMELEN - 1);
 	if (sscanf(mountfrom, patt, vfsname, path) < 1)
 		goto out;
 
 	if (path[0] == '\0')
 		strlcpy(path, ROOTNAME, MNAMELEN);
 
 	error = kernel_vmount(
 	    MNT_RDONLY | MNT_ROOTFS,
 	    "fstype", vfsname,
 	    "fspath", "/",
 	    "from", path,
 	    NULL);
 	if (error == 0) {
 		/*
 		 * We mount devfs prior to mounting the / FS, so the first
 		 * entry will typically be devfs.
 		 */
 		mp = TAILQ_FIRST(&mountlist);
 		KASSERT(mp != NULL, ("%s: mountlist is empty", __func__));
 
 		/*
 		 * Iterate over all currently mounted file systems and use
 		 * the time stamp found to check and/or initialize the RTC.
 		 * Typically devfs has no time stamp and the only other FS
 		 * is the actual / FS.
 		 * Call inittodr() only once and pass it the largest of the
 		 * timestamps we encounter.
 		 */
 		timebase = 0;
 		do {
 			if (mp->mnt_time > timebase)
 				timebase = mp->mnt_time;
 			mp = TAILQ_NEXT(mp, mnt_list);
 		} while (mp != NULL);
 		inittodr(timebase);
 
 		devfs_fixup(curthread);
 	}
 out:
 	free(path, M_MOUNT);
 	free(vfsname, M_MOUNT);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Interactive root filesystem selection code.
  */
 
 /*
  * Prompt on the console for a root filesystem specification until one
  * mounts successfully or the operator enters an empty line.
  *
  * Returns 0 once a specification has been mounted, 1 if input was
  * aborted with an empty line.
  */
 static int
 vfs_mountroot_ask(void)
 {
 	char answer[128];
 
 	for (;;) {
 		printf("\nManual root filesystem specification:\n");
 		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
 #if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
 		printf("                       eg. ufs:da0s1a\n");
 #else
 		printf("                       eg. ufs:/dev/da0a\n");
 #endif
 		printf("  ?                  List valid disk boot devices\n");
 		printf("  <empty line>       Abort manual input\n");
 		printf("\nmountroot> ");
 		gets(answer, sizeof(answer), 1);
 
 		switch (answer[0]) {
 		case '\0':
 			/* Empty line: give up. */
 			return (1);
 		case '?':
 			printf("\nList of GEOM managed disk devices:\n  ");
 			g_dev_print();
 			/* Back to the prompt. */
 			continue;
 		default:
 			break;
 		}
 		if (vfs_mountroot_try(answer) == 0)
 			return (0);
 	}
 }
 
 /*
  * ---------------------------------------------------------------------
  * Functions for querying mount options/arguments from filesystems.
  */
 
 /*
  * Check that no unknown options are given
  *
  * Every option in opts must appear either in global_opts or in the
  * NULL-terminated array legal.  An option whose name starts with "no" is
  * also accepted when the remainder of its name appears in one of those
  * lists.  Each unknown option is reported on the console; the message for
  * the last one is also copied into the buffer of any "errmsg" option.
  *
  * Returns 0 if all options are known, EINVAL otherwise.
  */
 int
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
 	char errmsg[255];
 	const char **t, *p, *q;
 	int ret = 0;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
 		/* q is the name with a leading "no" stripped, if it has one. */
 		q = NULL;
 		if (p[0] == 'n' && p[1] == 'o')
 			q = p + 2;
 		for(t = global_opts; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		for(t = legal; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		/*
 		 * The option name comes from the caller and is of arbitrary
 		 * length, so the message must be bounded to the buffer.
 		 */
 		snprintf(errmsg, sizeof(errmsg),
 		    "mount option <%s> is unknown", p);
 		printf("%s\n", errmsg);
 		ret = EINVAL;
 	}
 	if (ret != 0) {
 		TAILQ_FOREACH(opt, opts, link) {
 			if (strcmp(opt->name, "errmsg") != 0)
 				continue;
 			/*
 			 * Skip an errmsg option without a usable buffer, and
 			 * always leave the copied message NUL-terminated.
 			 */
 			if (opt->value != NULL && opt->len > 0)
 				strlcpy((char *)opt->value, errmsg,
 				    (size_t)opt->len);
 		}
 	}
 	return (ret);
 }
 
 /*
  * Look up a mount option by name.
  *
  * Returns 0 when an option called name exists in opts and ENOENT when it
  * does not.  On a match, the option's length is stored through len and
  * the address of its value through buf; either pointer may be NULL to
  * skip that output.
  */
 int
 vfs_getopt(opts, name, buf, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void **buf;
 	int *len;
 {
 	struct vfsopt *cur;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(cur, opts, link) {
 		if (strcmp(name, cur->name) != 0)
 			continue;
 		/* First match wins. */
 		if (len != NULL)
 			*len = cur->len;
 		if (buf != NULL)
 			*buf = cur->value;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 static int
 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt;
 	int i;
 
 	if (opts == NULL)
 		return (-1);
 
 	i = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0)
 			return (i);
 		++i;
 	}
 	return (-1);
 }
 
 /*
  * Get a mount option's value as a string.
  *
  * Returns the value of the first option called name and sets *error to
  * 0.  If the option has no value, a zero length, or a value that is not
  * NUL-terminated within its length, NULL is returned and *error is set
  * to EINVAL.  If there is no such option, NULL is returned and *error is
  * set to ENOENT.
  */
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
 	struct vfsopt *opt;
 
 	*error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		/*
 		 * Options may carry no value at all (NULL, length 0), so
 		 * check before looking at the last byte.
 		 */
 		if (opt->len == 0 || opt->value == NULL ||
 		    ((char *)opt->value)[opt->len - 1] != '\0') {
 			*error = EINVAL;
 			return (NULL);
 		}
 		return (opt->value);
 	}
 	*error = ENOENT;
 	return (NULL);
 }
 
 /*
  * Test for the presence of a mount option and mirror it into a flag word.
  *
  * Returns 1 if an option called name exists in opts, 0 otherwise.  When w
  * is not NULL, the bits in val are set in *w if the option exists and
  * cleared if it does not.
  */
 int
 vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
 {
 	struct vfsopt *cur;
 	int present;
 
 	present = 0;
 	TAILQ_FOREACH(cur, opts, link) {
 		if (strcmp(name, cur->name) == 0) {
 			present = 1;
 			break;
 		}
 	}
 
 	if (w != NULL) {
 		if (present)
 			*w |= val;
 		else
 			*w &= ~val;
 	}
 	return (present);
 }
 
 /*
  * Parse the value of a mount option with a scanf-style format.
  *
  * Looks up the first option called name and runs vsscanf() over its
  * value with fmt and the trailing arguments.  Returns the vsscanf()
  * result, or 0 if the option does not exist, has no value, or its value
  * is not NUL-terminated within its length.
  */
 int
 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct vfsopt *opt;
 	int ret;
 
 	KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		if (opt->len == 0 || opt->value == NULL)
 			return (0);
 		/* Refuse to scan a value that is not a proper C string. */
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
 		ret = vsscanf(opt->value, fmt, ap);
 		va_end(ap);
 		return (ret);
 	}
 	return (0);
 }
 
 /*
  * Copy the value of a mount option into a caller-supplied buffer.
  *
  * len is the size of the buffer at dest and must equal the length of the
  * option exactly; otherwise EINVAL is returned and nothing is copied.
  * Returns 0 after a successful copy and ENOENT when no option called name
  * exists.
  */
 int
 vfs_copyopt(opts, name, dest, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void *dest;
 	int len;
 {
 	struct vfsopt *cur;
 
 	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(cur, opts, link) {
 		if (strcmp(name, cur->name) != 0)
 			continue;
 		/* Only the first option with this name is considered. */
 		if (cur->len != len)
 			return (EINVAL);
 		bcopy(cur->value, dest, cur->len);
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * This is a helper function for filesystems to traverse their
  * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
  *
  * Advance the iteration: return the next non-marker vnode after the
  * marker *mvp on mp's vnode list and move the marker to sit right after
  * it.  When the end of the list is reached the marker is freed, *mvp is
  * set to NULL and NULL is returned.  The mount interlock must be held.
  */
 
 struct vnode *
 __mnt_vnode_next(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	/* Skip over markers (VMARKER entries), including other iterators'. */
 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
 	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		__mnt_vnode_markerfree(mvp, mp);
 		return (NULL);
 	}
 	/* Re-position our marker immediately after the vnode we return. */
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	return (vp);
 }
 
 /*
  * Start an iteration over mp's vnode list: allocate a marker vnode,
  * insert it after the first non-marker vnode and return that vnode.
  * Returns NULL with *mvp set to NULL if the list holds no such vnode.
  * The mount interlock must be held on entry; it is dropped and re-taken
  * around the marker allocation and is held again on return.
  */
 struct vnode *
 __mnt_vnode_first(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		*mvp = NULL;
 		return (NULL);
 	}
 	/*
 	 * Bump the hold count before dropping the interlock for the
 	 * M_WAITOK allocation below.
 	 */
 	mp->mnt_holdcnt++;
 	MNT_IUNLOCK(mp);
 	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
 				       M_VNODE_MARKER,
 				       M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	(*mvp)->v_type = VMARKER;
 
 	/* The list may have changed while unlocked, so search again. */
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		MNT_IUNLOCK(mp);
 		free(*mvp, M_VNODE_MARKER);
 		MNT_ILOCK(mp);
 		*mvp = NULL;
 		mp->mnt_holdcnt--;
 		if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
 			wakeup(&mp->mnt_holdcnt);
 		return (NULL);
 	}
 	mp->mnt_markercnt++;
 	(*mvp)->v_mount = mp;
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	return (vp);
 }
 
 
 /*
  * End an iteration early: unlink and free the marker *mvp, set *mvp to
  * NULL and drop the marker and hold counts taken by __mnt_vnode_first().
  * Does nothing if *mvp is already NULL.  The mount interlock must be
  * held; it is dropped and re-taken around the free().
  */
 void
 __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL)
 		return;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	free(*mvp, M_VNODE_MARKER);
 	MNT_ILOCK(mp);
 	*mvp = NULL;
 
 	mp->mnt_markercnt--;
 	mp->mnt_holdcnt--;
 	/* Wake anyone waiting for the hold count to reach zero. */
 	if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
 		wakeup(&mp->mnt_holdcnt);
 }
 
 
 /*
  * Refresh the statistics cached in mp->mnt_stat through the filesystem's
  * vfs_statfs operation and, when the caller supplied a different buffer,
  * copy the cached structure into it.  The copy is made regardless of the
  * operation's result, which is returned unchanged.
  */
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 	struct statfs *cached;
 	int rc;
 
 	cached = &mp->mnt_stat;
 	rc = mp->mnt_op->vfs_statfs(mp, cached, td);
 	if (sbp != cached)
 		*sbp = *cached;
 	return (rc);
 }
 
 void
 vfs_mountedfrom(struct mount *mp, const char *from)
 {
 
 	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
 	strlcpy(mp->mnt_stat.f_mntfromname, from,
 	    sizeof mp->mnt_stat.f_mntfromname);
 }
 
 /*
  * ---------------------------------------------------------------------
  * This is the api for building mount args and mounting filesystems from
  * inside the kernel.
  *
  * The API works by accumulation of individual args.  First error is
  * latched.
  *
  * XXX: should be documented in new manpage kernel_mount(9)
  */
 
 /*
  * A memory allocation which must be freed when we are done.
  *
  * This is only the list linkage; mount_argf() and mount_argsu() allocate
  * it with extra room and store the argument's bytes directly behind the
  * header (at "maa + 1").
  */
 struct mntaarg {
 	SLIST_ENTRY(mntaarg)	next;	/* link on mntarg.list */
 };
 
 /* The header for the mount arguments */
 struct mntarg {
 	struct iovec *v;		/* name/value iovecs, two per argument */
 	int len;			/* number of iovecs in use in v[] */
 	int error;			/* latched error; once set, no more args are added */
 	SLIST_HEAD(, mntaarg)	list;	/* allocations to free in free_mntarg() */
 };
 
 /*
  * Append a boolean mount option.
  *
  * "name" has to be spelled with a leading "no" (asserted).  When flag is
  * non-zero the "no" prefix is skipped; otherwise the name is passed on
  * unchanged.  The option is added without a value.
  */
 struct mntarg *
 mount_argb(struct mntarg *ma, int flag, const char *name)
 {
 	const char *optname;
 
 	KASSERT(name[0] == 'n' && name[1] == 'o',
 	    ("mount_argb(...,%s): name must start with 'no'", name));
 
 	if (flag)
 		optname = name + 2;
 	else
 		optname = name;
 	return (mount_arg(ma, optname, NULL, 0));
 }
 
 /*
  * Add an argument printf style
  */
 struct mntarg *
 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct mntaarg *maa;
 	struct sbuf *sb;
 	int len;
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	len = sbuf_len(sb) + 1;
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	bcopy(sbuf_data(sb), maa + 1, len);
 	sbuf_delete(sb);
 
 	ma->v[ma->len].iov_base = maa + 1;
 	ma->v[ma->len].iov_len = len;
 	ma->len++;
 
 	return (ma);
 }
 
 /*
  * Add an argument which is a userland string.
  */
 struct mntarg *
 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
 {
 	struct mntaarg *maa;
 	char *tbuf;
 
 	if (val == NULL)
 		return (ma);
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	tbuf = (void *)(maa + 1);
 	ma->error = copyinstr(val, tbuf, len, NULL);
 	return (mount_arg(ma, name, tbuf, -1));
 }
 
 /*
  * Plain argument.
  *
  * If length is -1, use printf.
  */
 struct mntarg *
 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
 {
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
 	if (len < 0)
 		ma->v[ma->len].iov_len = strlen(val) + 1;
 	else
 		ma->v[ma->len].iov_len = len;
 	ma->len++;
 	return (ma);
 }
 
 /*
  * Free a mntarg structure
  */
 static void
 free_mntarg(struct mntarg *ma)
 {
 	struct mntaarg *maa;
 
 	while (!SLIST_EMPTY(&ma->list)) {
 		maa = SLIST_FIRST(&ma->list);
 		SLIST_REMOVE_HEAD(&ma->list, next);
 		free(maa, M_MOUNT);
 	}
 	free(ma->v, M_MOUNT);
 	free(ma, M_MOUNT);
 }
 
 /*
  * Mount a filesystem from an accumulated argument set.
  *
  * If an error was latched while the arguments were collected it is
  * returned and no mount is attempted; otherwise the arguments are passed
  * to vfs_donmount() as a kernel-space uio.  The argument set is consumed
  * (freed) in either case.
  */
 int
 kernel_mount(struct mntarg *ma, int flags)
 {
 	struct uio auio;
 	int error;
 
 	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
 	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
 	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
 
 	error = ma->error;
 	if (error == 0) {
 		auio.uio_iov = ma->v;
 		auio.uio_iovcnt = ma->len;
 		auio.uio_segflg = UIO_SYSSPACE;
 		error = vfs_donmount(curthread, flags, &auio);
 	}
 	free_mntarg(ma);
 	return (error);
 }
 
 /*
  * Mount a filesystem from a variadic list of options.
  *
  * After "flags" the caller passes (const char *name, const void *value)
  * pairs, ended by a NULL name.  Every value is added with length -1,
  * i.e. as a NUL-terminated string.
  */
 int
 kernel_vmount(int flags, ...)
 {
 	struct mntarg *ma;
 	const char *optname;
 	const void *optval;
 	va_list ap;
 
 	ma = NULL;
 	va_start(ap, flags);
 	while ((optname = va_arg(ap, const char *)) != NULL) {
 		optval = va_arg(ap, const void *);
 		ma = mount_arg(ma, optname, optval, -1);
 	}
 	va_end(ap);
 
 	return (kernel_mount(ma, flags));
 }
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c	(revision 175201)
+++ head/sys/kern/vfs_subr.c	(revision 175202)
@@ -1,4031 +1,4031 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/sleepqueue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /* Malloc type for export host address structures. */
 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
 
 /* Forward declarations of the file-local helpers. */
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vbusy(struct vnode *vp);
 static void	vinactive(struct vnode *, struct thread *);
 static void	v_incr_usecount(struct vnode *);
 static void	v_decr_usecount(struct vnode *);
 static void	v_decr_useonly(struct vnode *);
 static void	v_upgrade_usecount(struct vnode *);
 static void	vfree(struct vnode *);
 static void	vnlru_free(int);
 static void	vdestroy(struct vnode *);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static int	vfs_knllocked(void *arg);
 
 
 /*
  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
  * build.  Without mpsafevm the buffer cache can not run Giant free.
  *
  * Settable only as the loader tunable debug.mpsafevfs; the sysctl of the
  * same name is read-only.
  */
 int mpsafe_vfs = 1;
 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
     "MPSAFE VFS");
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed
  * vnode.  Exported read-only as vfs.numvnodes.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.  iftovt_tab[] holds vnode types (16 entries); vttoif_tab[]
  * holds S_IF* file-format values (10 entries).
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[10] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * Free vnode target.  Free vnodes may simply be files which have been stat'd
  * but not read.  This is somewhat common, and a small cache of such files
  * should be kept to avoid recreation costs.
  */
 static u_long wantfreevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 /* Number of vnodes in the free list. */
 static u_long freevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  * Starts out as -1.
  */
 int	nfs_mount_type = -1;
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 /* Zone for struct vpollinfo allocations (created in vntblinit()). */
 static uma_zone_t vnodepoll_zone;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, writes for mounted block
  * devices (presumably the filesystem metadata) are delayed only about
  * half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  * NOTE(review): the default delays defined below (filedelay 30,
  * dirdelay 29, metadelay 28) do not match the "half" and "third" ratios
  * described above.
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 
 /*
  * Requested number of queue slots; vntblinit() resets syncer_maxdelay to
  * syncer_mask + 1 once the table has been allocated by hashinit().
  */
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  (It is normally not used in
  * getnewvnode(), as wantfreevnodes is normally nonzero.)
  *
  * XXX desiredvnodes is historical cruft and should not exist.
  *
  * NOTE(review): kern.minvnodes below is declared with SYSCTL_INT but is
  * bound to wantfreevnodes, which is a u_long (and is also exported with
  * SYSCTL_LONG as vfs.wantfreevnodes).  Confirm that the int-sized access
  * is intended on platforms where long is wider than int.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
     &desiredvnodes, 0, "Maximum number of vnodes");
 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 /* Incremented by vnlru_proc() each time a full pass reclaims nothing. */
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 /*
  * Macros to control when a vnode is freed and recycled.  All require
  * the vnode interlock.
  *
  *	VCANRECYCLE: marked VI_FREE and not held.
  *	VSHOULDFREE: not marked VI_FREE and not held.
  *	VSHOULDBUSY: marked VI_FREE but held.
  */
 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
 
 
 /*
  * Initialize the vnode management data structures.
  *
  * Registered below with SYSINIT at SI_SUB_VFS / SI_ORDER_FIRST.  Sizes
  * desiredvnodes and wantfreevnodes, initializes the mntid, free-list and
  * syncer mutexes and the free list itself, creates the vnode and
  * vpollinfo UMA zones, and allocates the syncer work queue table.
  *
  * MAXVNODES_MAX is the upper bound applied to the computed desiredvnodes;
  * it can be predefined to override the default of 100000.
  */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	100000
 #endif
 static void
 vntblinit(void *dummy __unused)
 {
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and
 	 * the kernel's heap size.  Specifically, desiredvnodes scales
 	 * in proportion to the physical memory size until two fifths
 	 * of the kernel's heap size is consumed by vnodes and vm
 	 * objects.
 	 */
 	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %d -> %d\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	/* Aim to keep a quarter of the desired vnodes on the free list. */
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 		&syncer_mask);
 	/* Use the table size that hashinit() reported through the mask. */
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting.
  *
  * On entry MNT_REF() is applied to mp.  If no unmount is in progress a
  * shared lock on mp->mnt_lock is taken and 0 is returned; vfs_unbusy()
  * is the matching release.  If MNTK_UNMOUNT is set the call fails with
  * ENOENT and the reference is dropped again with MNT_REL().
  *
  * interlkp, when non-NULL, is a mutex held by the caller:
  *  - on success it has been released and is NOT re-acquired;
  *  - on failure with LK_NOWAIT it is left untouched;
  *  - on failure without LK_NOWAIT it is released while the thread
  *    sleeps on mp and is re-acquired before returning.
  * Either way the interlock is held when a failure is returned.
  */
 int
 vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
     struct thread *td)
 {
 	int lkflags;
 
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			return (ENOENT);
 		}
 		if (interlkp)
 			mtx_unlock(interlkp);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		if (interlkp)
 			mtx_lock(interlkp);
 		return (ENOENT);
 	}
 	if (interlkp)
 		mtx_unlock(interlkp);
 	/*
 	 * LK_INTERLOCK hands the mount mutex to lockmgr(); presumably it is
 	 * released there, as it is not unlocked explicitly on this path.
 	 */
 	lkflags = LK_SHARED | LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
 		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
  * Undo a successful vfs_busy(): release the lock held on mp->mnt_lock
  * and then call vfs_rel() on the mount point.
  */
 void
 vfs_unbusy(struct mount *mp, struct thread *td)
 {
 
 	(void)lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 	vfs_rel(mp);
 }
 
 /*
  * Find the mount point with the given filesystem identifier.
  *
  * The mount list is walked under mountlist_mtx.  On a match (both words
  * of the fsid equal) vfs_ref() is applied to the mount before it is
  * returned; NULL is returned when nothing matches.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_stat.f_fsid.val[0] != fsid->val[0])
 			continue;
 		if (mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
 			continue;
 		vfs_ref(mp);
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* mp is NULL here when the walk ran off the end of the list. */
 	return (mp);
 }
 
 /*
  * Check if a user can access privileged mount options.
  *
  * Returns 0 when access is allowed, EPERM when a jail rule forbids it,
  * or the error from priv_check(PRIV_VFS_MOUNT_OWNER) when the caller is
  * not the owner of a user mount.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	if (jailed(td->td_ucred)) {
 		/* The file system type must be jail-friendly. */
 		if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
 			return (EPERM);
 
 		/* A mount made outside any jail is off limits. */
 		if (!jailed(mp->mnt_cred))
 			return (EPERM);
 
 		/* So is a mount made inside a different jail than ours. */
 		if (mp->mnt_cred->cr_prison != td->td_ucred->cr_prison)
 			return (EPERM);
 	}
 
 	/* The owner of a user mount needs no further privilege. */
 	if ((mp->mnt_flag & MNT_USER) != 0 &&
 	    mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 
 	return (priv_check(td, PRIV_VFS_MOUNT_OWNER));
 }
 
 /*
  * Assign a new, unused fsid to a mount point.
  *
  * val[1] is the filesystem type number.  val[0] is a fake device number
  * (major 255) built from the low byte of the type number and a 16-bit
  * counter; it is meant to be unique because it is used to create fake
  * device numbers for stat(), and the way the counter bits are spread also
  * tries to keep it unique mod 2^16 for emulators that only support 16-bit
  * device numbers.
  *
  * Several mounts may be running in parallel, so the whole search runs
  * under mntid_mtx, and the counter is static: each search starts one
  * past where the previous one stopped.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
 	struct mount *clash;
 	fsid_t candidate;
 	int typenum, typebits;
 
 	mtx_lock(&mntid_mtx);
 	typenum = mp->mnt_vfc->vfc_typenum;
 	candidate.val[1] = typenum;
 	typebits = (typenum & 0xFF) << 24;
 	do {
 		candidate.val[0] = makedev(255,
 		    typebits | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 		/* A hit means the id is taken; give the reference back. */
 		clash = vfs_getvfs(&candidate);
 		if (clash != NULL)
 			vfs_rel(clash);
 	} while (clash != NULL);
 	mp->mnt_stat.f_fsid.val[0] = candidate.val[0];
 	mp->mnt_stat.f_fsid.val[1] = candidate.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  *
  * The enumerators below name the values 0..3 in that order; the knob is
  * exported read-write as vfs.timestamp_precision and defaults to TSP_SEC.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Store the current time in *tsp, at the resolution selected by
  * timestamp_precision.  Any value other than TSP_SEC, TSP_HZ or TSP_USEC
  * is treated like TSP_NSEC.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 	int precision;
 
 	precision = timestamp_precision;
 	if (precision == TSP_SEC) {
 		/* Whole seconds only. */
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 	} else if (precision == TSP_HZ) {
 		getnanotime(tsp);
 	} else if (precision == TSP_USEC) {
 		/* Microsecond clock, converted to a timespec. */
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 	} else {
 		/* TSP_NSEC and everything out of range. */
 		nanotime(tsp);
 	}
 }
 
 /*
  * Reset a vattr to its "nothing specified" state: the type becomes VNON,
  * va_vaflags is cleared, and every other field touched here is set to
  * VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* The two fields that do not use VNOVAL. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Identity, ownership and flags. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_flags = VNOVAL;
 	vap->va_gen = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free vnodes from the given mount point and will potentially free
  * vnodes that still have VM backing store (VM backing store is typically
  * the cause of a vnode blowout so we want to do this).  Therefore, this
  * operation is not considered cheap.
  *
  * At most one tenth of the mount's vnode list, plus one, is examined per
  * call.  Each examined vnode is rotated to the tail of the list.  The
  * return value is the number of vnodes actually reclaimed.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
 vlrureclaim(struct mount *mp)
 {
 	struct thread *td;
 	struct vnode *vp;
 	int done;
 	int trigger;
 	int usevnodes;
 	int count;
 
 	/*
 	 * Calculate the trigger point, don't allow user
 	 * screwups to blow us up.   This prevents us from
 	 * recycling vnodes with lots of resident pages.  We
 	 * aren't trying to free memory, we are trying to
 	 * free vnodes.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes <= 0)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 	done = 0;
 	td = curthread;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	count = mp->mnt_nvnodelistsize / 10 + 1;
 	while (count != 0) {
 		/* Take the first real (non-marker) vnode on the list. */
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
 		/* Rotate it to the tail so the next pass sees another one. */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
 		if (!VI_TRYLOCK(vp))
 			goto next_iter;
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
 		 */
 		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 		MNT_IUNLOCK(mp);
 		vholdl(vp);
 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) {
 			vdrop(vp);
 			goto next_iter_mntunlocked;
 		}
 		VI_LOCK(vp);
 		/*
 		 * v_usecount may have been bumped after VOP_LOCK() dropped
 		 * the vnode interlock and before it was locked again.
 		 *
 		 * It is not necessary to recheck VI_DOOMED because it can
 		 * only be set by another thread that holds both the vnode
 		 * lock and vnode interlock.  If another thread has the
 		 * vnode lock before we get to VOP_LOCK() and obtains the
 		 * vnode interlock after VOP_LOCK() drops the vnode
 		 * interlock, the other thread will be unable to drop the
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK, td);
 			goto next_iter_mntunlocked;
 		}
 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
 		vgonel(vp);
 		VOP_UNLOCK(vp, 0, td);
 		vdropl(vp);
 		done++;
 		/*
 		 * Loop tail.  Two entry points, depending on whether the
 		 * mount interlock is currently held; both yield the CPU
 		 * whenever the remaining count is a multiple of 256 and
 		 * both end with the mount interlock held again.
 		 */
 next_iter_mntunlocked:
 		/* Mount interlock is NOT held here. */
 		if ((count % 256) != 0)
 			goto relock_mnt;
 		goto yield;
 next_iter:
 		/* Mount interlock IS held here. */
 		if ((count % 256) != 0)
 			continue;
 		MNT_IUNLOCK(mp);
 yield:
 		uio_yield();
 relock_mnt:
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	return done;
 }
 
 /*
  * Attempt to keep the free list at wantfreevnodes length.
  *
  * Tries to recycle up to "count" vnodes taken from the head of
  * vnode_free_list.  Must be called with vnode_free_list_mtx held
  * (asserted below); the mutex is dropped around each recycle attempt and
  * is held again on return.
  */
 static void
 vnlru_free(int count)
 {
 	struct vnode *vp;
 	int vfslocked;
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	for (; count > 0; count--) {
 		vp = TAILQ_FIRST(&vnode_free_list);
 		/*
 		 * The list can be modified while the free_list_mtx
 		 * has been dropped and vp could be NULL here.
 		 */
 		if (!vp)
 			break;
 		VNASSERT(vp->v_op != NULL, vp,
 		    ("vnlru_free: vnode already reclaimed."));
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		/*
 		 * Don't recycle if we can't get the interlock.
 		 * The vnode goes back to the tail of the list; the attempt
 		 * still counts against "count".
 		 */
 		if (!VI_TRYLOCK(vp)) {
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			continue;
 		}
 		VNASSERT(VCANRECYCLE(vp), vp,
 		    ("vp inconsistent on freelist"));
 		/* Off the free list: fix the accounting and take a hold. */
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
 		vholdl(vp);
 		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vtryrecycle(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		/*
 		 * If the recycle succeeded this vdrop will actually free
 		 * the vnode.  If not it will simply place it back on
 		 * the free list.
 		 */
 		vdrop(vp);
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  *
  * vnlruproc is the process running vnlru_proc() (filled in through
  * vnlru_kp below).  vnlruproc_sig is set by getnewvnode() when it wakes
  * this process and is cleared here, followed by a wakeup on its address,
  * once numvnodes is at or below nine tenths of desiredvnodes.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Main loop of the vnlru kernel process.  Never returns.
  *
  * Each pass trims the free list down to wantfreevnodes and then, if the
  * system is still above 90% of desiredvnodes, walks the mount list and
  * calls vlrureclaim() on every mount point that can be busied without
  * waiting.
  */
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	int done;
 	struct proc *p = vnlruproc;
 	struct thread *td = curthread;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
 	mtx_lock(&Giant);
 
 	for (;;) {
 		kproc_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
 		if (freevnodes > wantfreevnodes)
 			vnlru_free(freevnodes - wantfreevnodes);
 		if (numvnodes <= desiredvnodes * 9 / 10) {
 			/*
 			 * Nothing to do: release anyone waiting in
 			 * getnewvnode() and sleep for up to a second.
 			 * PDROP: the mutex is not held when msleep()
 			 * returns; it is taken again at the top of the loop.
 			 */
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			int vfsunlocked;
 			/* On failure mountlist_mtx is still held. */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			/*
 			 * vfs_busy() succeeded and released mountlist_mtx.
 			 * Giant is dropped for filesystems that do not
 			 * need it.
 			 */
 			if (!VFS_NEEDSGIANT(mp)) {
 				mtx_unlock(&Giant);
 				vfsunlocked = 1;
 			} else
 				vfsunlocked = 0;
 			done += vlrureclaim(mp);
 			if (vfsunlocked)
 				mtx_lock(&Giant);
 			/* Pick up the successor before the mount is unbusied. */
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
 			/* No progress: notify low-vnode handlers and back off. */
 			EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
 				printf("vnlru process getting nowhere..\n");
 			else if (vnlru_nowhere == 5)
 				printf("vnlru process messages stopped.\n");
 #endif
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
 			uio_yield();
 	}
 }
 
 /*
  * Descriptor for the "vnlru" kernel process: name, entry point, and the
  * address of vnlruproc.  It is handed to kproc_start() by the SYSINIT
  * below (SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST).
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Release all resources of a fully cleaned vnode and return it to
  * vnode_zone.
  *
  * numvnodes is decremented under vnode_free_list_mtx.  The assertions
  * check that the vnode is off the free list, has no private data, no
  * hold, use or write counts, no buffers or pending I/O, and no name
  * cache entries.  The VI_UNLOCK() below implies that the caller enters
  * with the vnode interlock held.
  */
 static void
 vdestroy(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	CTR1(KTR_VFS, "vdestroy vp %p", vp);
 	mtx_lock(&vnode_free_list_mtx);
 	numvnodes--;
 	mtx_unlock(&vnode_free_list_mtx);
 	bo = &vp->v_bufobj;
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("cleaned vnode still on the free list."));
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
 	/* Poll/kqueue state exists only if v_pollinfo was ever allocated. */
 	if (vp->v_pollinfo != NULL) {
 		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
 		mtx_destroy(&vp->v_pollinfo->vpi_lock);
 		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
 	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	uma_zfree(vnode_zone, vp);
 }
 
 /*
  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
  * before we actually vgone().  This function must be called with the vnode
  * held to prevent the vnode from being returned to the free list midway
  * through vgone().
  *
  * Returns 0 on success, EWOULDBLOCK if the vnode lock could not be taken
  * without waiting, or EBUSY if vn_start_write() fails or the vnode has
  * gained a use reference.  The vnode lock is not held on return.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct thread *td = curthread;
 	struct mount *vnmp;
 
 	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
 	/*
 	 * This vnode may be found and locked via some other list, if so we
 	 * can't recycle it yet.
 	 */
 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
 		return (EWOULDBLOCK);
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0, td);
 		return (EBUSY);
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with DOOMED via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount) {
 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
 		vn_finished_write(vnmp);
 		return (EBUSY);
 	}
 	/* A vnode that is already doomed is left alone; still reports 0. */
 	if ((vp->v_iflag & VI_DOOMED) == 0)
 		vgonel(vp);
 	VOP_UNLOCK(vp, LK_INTERLOCK, td);
 	vn_finished_write(vnmp);
 	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
 	return (0);
 }
 
 /*
  * Return the next vnode from the free list.
  */
 int
 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp = NULL;
 	struct bufobj *bo;
 
 	mtx_lock(&vnode_free_list_mtx);
 	/*
 	 * Lend our context to reclaim vnodes if they've exceeded the max.
 	 */
 	if (freevnodes > wantfreevnodes)
 		vnlru_free(1);
 	/*
 	 * Wait for available vnodes.
 	 */
 	if (numvnodes > desiredvnodes) {
 		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
 			/*
 			 * File system is beeing suspended, we cannot risk a
 			 * deadlock here, so allocate new vnode anyway.
 			 */
 			if (freevnodes > wantfreevnodes)
 				vnlru_free(freevnodes - wantfreevnodes);
 			goto alloc;
 		}
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 		if (numvnodes > desiredvnodes) {
 			mtx_unlock(&vnode_free_list_mtx);
 			return (ENFILE);
 		}
 #endif
 	}
 alloc:
 	numvnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 	/*
 	 * Setup locks.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/*
 	 * By default, don't allow shared locks unless filesystems
 	 * opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
 	/*
 	 * Initialize bufobj.
 	 */
 	bo = &vp->v_bufobj;
 	bo->__bo_vnode = vp;
 	bo->bo_mtx = &vp->v_interlock;
 	bo->bo_ops = &buf_ops_bio;
 	bo->bo_private = vp;
 	TAILQ_INIT(&bo->bo_clean.bv_hd);
 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
 	/*
 	 * Initialize namecache.
 	 */
 	LIST_INIT(&vp->v_cache_src);
 	TAILQ_INIT(&vp->v_cache_dst);
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp);
 	vp->v_data = 0;
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_vnode_associate_singlelabel(mp, vp);
 	else if (mp == NULL)
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
 		bo->bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
 
 	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Delete from old mount point vnode list, if on one.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *mp;
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	vp->v_mount = NULL;
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 		("bad mount point vnode list size"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	mp->mnt_nvnodelistsize--;
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 static void
 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
 {
 	struct thread *td;
 
 	td = curthread; /* XXX ? */
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	/* XXX non mp-safe fs may still call insmntque with vnode
 	   unlocked */
 	if (!VOP_ISLOCKED(vp, td))
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Insert into list of vnodes for the new mount point, if available.
  */
 int
 insmntque1(struct vnode *vp, struct mount *mp,
 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
 	    mp->mnt_nvnodelistsize == 0) {
 		MNT_IUNLOCK(mp);
 		if (dtr != NULL)
 			dtr(vp, dtr_arg);
 		return (EBUSY);
 	}
 	vp->v_mount = mp;
 	MNT_REF(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 int
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
     int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
 			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0)
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		bufobj_wwait(bo, 0, 0);
 		BO_UNLOCK(bo);
 		if (bo->bo_object != NULL) {
 			VM_OBJECT_LOCK(bo->bo_object);
 			vm_object_pip_wait(bo->bo_object, "bovlbx");
 			VM_OBJECT_UNLOCK(bo->bo_object);
 		}
 		BO_LOCK(bo);
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL) {
 		VM_OBJECT_LOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 		VM_OBJECT_UNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
     int slptimeo)
 {
 
 	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  */
 static int
 flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
 	ASSERT_BO_LOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 			continue;
 		}
 		lblkno = 0;
 		xflags = 0;
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
 			xflags = nbp->b_xflags &
 				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
 			BUF_UNLOCK(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);
 		}
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp != NULL &&
 		    (nbp->b_bufobj != bo ||
 		     nbp->b_lblkno != lblkno ||
 		     (nbp->b_xflags &
 		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
 			break;			/* nbp invalid */
 	}
 	return (retval);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  */
 int
 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
     off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
 	int trunclbn;
 	struct bufobj *bo;
 
 	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 restart:
 	VI_LOCK(vp);
 	bo = &vp->v_bufobj;
 	anyfreed = 1;
 	for (;anyfreed;) {
 		anyfreed = 0;
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI))) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0)) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK) {
 				goto restart;
 			}
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			VI_LOCK(vp);
 			goto restartsync;
 		}
 	}
 
 	bufobj_wwait(bo, 0, 0);
 	VI_UNLOCK(vp);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * buf_splay() - splay tree core for the clean/dirty list of buffers in
  * 		 a vnode.
  *
  *	NOTE: We have to deal with the special case of a background bitmap
  *	buffer, a situation where two buffers will have the same logical
  *	block offset.  We want (1) only the foreground buffer to be accessed
  *	in a lookup and (2) must differentiate between the foreground and
  *	background buffer in the splay tree algorithm because the splay
  *	tree cannot normally handle multiple entities with the same 'index'.
  *	We accomplish this by adding differentiating flags to the splay tree's
  *	numerical domain.
  */
 static
 struct buf *
 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 {
 	struct buf dummy;
 	struct buf *lefttreemax, *righttreemin, *y;
 
 	if (root == NULL)
 		return (NULL);
 	lefttreemax = righttreemin = &dummy;
 	for (;;) {
 		if (lblkno < root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_left) == NULL)
 				break;
 			if (lblkno < y->b_lblkno) {
 				/* Rotate right. */
 				root->b_left = y->b_right;
 				y->b_right = root;
 				root = y;
 				if ((y = root->b_left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->b_left = root;
 			righttreemin = root;
 		} else if (lblkno > root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_right) == NULL)
 				break;
 			if (lblkno > y->b_lblkno) {
 				/* Rotate left. */
 				root->b_right = y->b_left;
 				y->b_left = root;
 				root = y;
 				if ((y = root->b_right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->b_right = root;
 			lefttreemax = root;
 		} else {
 			break;
 		}
 		root = y;
 	}
 	/* Assemble the new root. */
 	lefttreemax->b_right = root->b_left;
 	righttreemin->b_left = root->b_right;
 	root->b_left = dummy.b_right;
 	root->b_right = dummy.b_left;
 	return (root);
 }
 
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_LOCKED(bp->b_bufobj);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 	    (BX_VNDIRTY|BX_VNCLEAN),
 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
 	if (bp->b_xflags & BX_VNDIRTY)
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
 	if (bp != bv->bv_root) {
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 		KASSERT(root == bp, ("splay lookup failed in remove"));
 	}
 	if (bp->b_left == NULL) {
 		root = bp->b_right;
 	} else {
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 		root->b_right = bp->b_right;
 	}
 	bv->bv_root = root;
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list using a
  * splay tree algorithm.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	ASSERT_BO_LOCKED(bo);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY)
 		bv = &bo->bo_dirty;
 	else
 		bv = &bo->bo_clean;
 
 	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 	if (root == NULL) {
 		bp->b_left = NULL;
 		bp->b_right = NULL;
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 	} else if (bp->b_lblkno < root->b_lblkno ||
 	    (bp->b_lblkno == root->b_lblkno &&
 	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 		bp->b_left = root->b_left;
 		bp->b_right = root;
 		root->b_left = NULL;
 		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
 	} else {
 		bp->b_right = root->b_right;
 		bp->b_left = root;
 		root->b_right = NULL;
 		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
 	}
 	bv->bv_cnt++;
 	bv->bv_root = bp;
 }
 
 /*
  * Lookup a buffer using the splay tree.  Note that we specifically avoid
  * shadow buffers used in background bitmap writes.
  *
  * This code isn't quite efficient as it could be because we are maintaining
  * two sorted lists and do not know which list the block resides in.
  *
  * During a "make buildworld" the desired buffer is found at one of
  * the roots more than 60% of the time.  Thus, checking both roots
  * before performing either splay eliminates unnecessary splays on the
  * first tree splayed.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *bp;
 
 	ASSERT_BO_LOCKED(bo);
 	if ((bp = bo->bo_clean.bv_root) != NULL &&
 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 		return (bp);
 	if ((bp = bo->bo_dirty.bv_root) != NULL &&
 	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 		return (bp);
 	if ((bp = bo->bo_clean.bv_root) != NULL) {
 		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 			return (bp);
 	}
 	if ((bp = bo->bo_dirty.bv_root) != NULL) {
 		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
 		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 			return (bp);
 	}
 	return (NULL);
 }
 
 /*
  * Associate a buffer with a vnode.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	ASSERT_VI_LOCKED(vp, "bgetvp");
 	vholdl(vp);
 	if (VFS_NEEDSGIANT(vp->v_mount) ||
 	    vp->v_bufobj.bo_flag & BO_NEEDSGIANT)
 		bp->b_flags |= B_NEEDSGIANT;
 	bp->b_vp = vp;
 	bp->b_bufobj = &vp->v_bufobj;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("brelvp: Buffer %p not on queue.", bp);
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	bp->b_flags &= ~B_NEEDSGIANT;
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	vdropl(vp);
 }
 
 /*
  * Add an item to the syncer work queue.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int slot;
 
 	ASSERT_BO_LOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if (bo->bo_flag & BO_ONWORKLST)
 		LIST_REMOVE(bo, bo_synclist);
 	else {
 		bo->bo_flag |= BO_ONWORKLST;
 		syncer_worklist_len++;
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int error, len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 	error = SYSCTL_OUT(req, &len, sizeof(len));
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 static int
 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int vfslocked;
 
 	vfslocked = 0;
 restart:
 	*bo = LIST_FIRST(slp);
 	if (*bo == NULL) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (0);
 	}
 	vp = (*bo)->__bo_vnode;	/* XXX */
 	if (VFS_NEEDSGIANT(vp->v_mount)) {
 		if (!vfslocked) {
 			vfslocked = 1;
 			if (mtx_trylock(&Giant) == 0) {
 				mtx_unlock(&sync_mtx);
 				mtx_lock(&Giant);
 				mtx_lock(&sync_mtx);
 				goto restart;
 			}
 		}
 	} else {
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 	}
 	if (VOP_ISLOCKED(vp, NULL) != 0) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (1);
 	}
 	if (VI_TRYLOCK(vp) == 0) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (1);
 	}
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		mtx_lock(&sync_mtx);
 		return (1);
 	}
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VI_LOCK(vp);
 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
 	vdropl(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 /*
  * System filesystem synchronizer daemon.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next;
 	struct synclist *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
 	static int dummychan;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int first_printf;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	mtx_lock(&sync_mtx);
 	for (;;) {
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kproc_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since the
 			 * it was emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		while (!LIST_EMPTY(slp)) {
 			error = sync_vnode(slp, &bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING)
 			msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl",
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
 			msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Request the syncer daemon to speed up its work.
  * We never push it to speed up more than half of its
  * normal turn time, otherwise it could take over the cpu.
  */
 int
 speedup_syncer(void)
 {
 	struct thread *td;
 	int ret = 0;
 
 	td = FIRST_THREAD_IN_PROC(updateproc);
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob += 1;
 		stat_rush_requests += 1;
 		ret = 1;
 	}
 	mtx_unlock(&sync_mtx);
 	sleepq_remove(td, &lbolt);
 	return (ret);
 }
 
 /*
  * Tell the syncer to speed up its work and run though its work
  * list several times, then tell it to shut down.
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 
 	if (howto & RB_NOSYNC)
 		return;
 	td = FIRST_THREAD_IN_PROC(updateproc);
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
 	sleepq_remove(td, &lbolt);
 	kproc_shutdown(arg, howto);
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 #ifdef INVARIANTS
 	struct bufv *bv;
 #endif
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	++reassignbufcalls;
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	VI_LOCK(vp);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("reassignbuf: Buffer %p not on queue.", bp);
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 #ifdef INVARIANTS
 	bv = &bo->bo_clean;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bv = &bo->bo_dirty;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	VI_UNLOCK(vp);
 }
 
 /*
  * Increment the use and hold counts on the vnode, taking care to reference
  * the driver's usecount if this is a chardev.  The vholdl() will remove
  * the vnode from the free list if it is presently free.  Requires the
  * vnode interlock and returns with it held.
  */
 static void
 v_incr_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	vp->v_usecount++;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount++;
 		dev_unlock();
 	}
 	vholdl(vp);
 }
 
 /*
  * Turn a holdcnt into a use+holdcnt such that only one call to
  * v_decr_usecount is needed.
  */
 static void
 v_upgrade_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	vp->v_usecount++;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount++;
 		dev_unlock();
 	}
 }
 
 /*
  * Decrement the vnode use and hold count along with the driver's usecount
  * if this is a chardev.  The vdropl() below releases the vnode interlock
  * as it may free the vnode.
  */
 static void
 v_decr_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_usecount: negative usecount"));
 	vp->v_usecount--;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount--;
 		dev_unlock();
 	}
 	vdropl(vp);
 }
 
 /*
  * Decrement only the use count and driver use count.  This is intended to
  * be paired with a follow on vdropl() to release the remaining hold count.
  * In this way we may vgone() a vnode with a 0 usecount without risk of
  * having it end up on a free list because the hold count is kept above 0.
  */
 static void
 v_decr_useonly(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_useonly: negative usecount"));
 	vp->v_usecount--;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount--;
 		dev_unlock();
 	}
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it. The vnode lock bit is set if the
  * vnode is being eliminated in vgone. The process is awakened
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new filesystem type).
  */
 int
 vget(struct vnode *vp, int flags, struct thread *td)
 {
 	int oweinact;
 	int oldflags;
 	int error;
 
 	error = 0;
 	oldflags = flags;
 	oweinact = 0;
 	VFS_ASSERT_GIANT(vp->v_mount);
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	/*
 	 * If the inactive call was deferred because vput() was called
 	 * with a shared lock, we have to do it here before another thread
 	 * gets a reference to data that should be dead.
 	 */
 	if (vp->v_iflag & VI_OWEINACT) {
 		if (flags & LK_NOWAIT) {
 			VI_UNLOCK(vp);
 			return (EBUSY);
 		}
 		flags &= ~LK_TYPE_MASK;
 		flags |= LK_EXCLUSIVE;
 		oweinact = 1;
 	}
 	vholdl(vp);
-	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
+	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
 		vdrop(vp);
 		return (error);
 	}
 	VI_LOCK(vp);
 	/* Upgrade our holdcnt to a usecount. */
 	v_upgrade_usecount(vp);
 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
 		panic("vget: vn_lock failed to return ENOENT\n");
 	if (oweinact) {
 		if (vp->v_iflag & VI_OWEINACT)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 		if ((oldflags & LK_TYPE_MASK) == 0)
 			VOP_UNLOCK(vp, 0, td);
 	} else
 		VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Increase the reference count of a vnode.
  */
 void
 vref(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	v_incr_usecount(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Report a vnode's current use count.
  *
  * The value is sampled under the vnode interlock, but it is only
  * meaningful if something other than the VI lock keeps other processes
  * from gaining references (for instance, the caller holds the only
  * reference), or if the caller tolerates stale data and accounts for
  * races by some other means.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 	int snapshot;
 
 	VI_LOCK(vp);
 	snapshot = vp->v_usecount;
 	VI_UNLOCK(vp);
 	return (snapshot);
 }
 
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  *
  * The vnode is expected to be unlocked on entry: when the last use
  * reference goes away the vnode lock is taken here so that the inactive
  * routine can be run.
  */
 void
 vrele(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	VI_LOCK(vp);
 
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vrele: missed vn_close"));
 
 	/*
 	 * Fast path: other references remain, or this is the last one but
 	 * an inactive call is already in progress.
 	 * NOTE(review): v_decr_usecount() is presumably responsible for
 	 * releasing the interlock on this path -- confirm.
 	 */
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		v_decr_usecount(vp);
 		return;
 	}
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
 		VI_UNLOCK(vp);
 		panic("vrele: negative ref cnt");
 	}
 	/*
 	 * We want to hold the vnode until the inactive finishes to
 	 * prevent vgone() races.  We drop the use count here and the
 	 * hold count below when we're done.
 	 */
 	v_decr_useonly(vp);
 	/*
 	 * We must call VOP_INACTIVE with the node locked.  Flag the vnode
 	 * VI_OWEINACT before sleeping on the lock; vinactive() itself sets
 	 * VI_DOINGINACT to avoid recursion.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
-	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
+	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
 		VI_LOCK(vp);
 		/* Somebody re-referenced the vnode meanwhile: nothing owed. */
 		if (vp->v_usecount > 0)
 			vp->v_iflag &= ~VI_OWEINACT;
 		if (vp->v_iflag & VI_OWEINACT)
 			vinactive(vp, td);
 		VOP_UNLOCK(vp, 0, td);
 	} else {
 		/* Could not lock; leave VI_OWEINACT set unless re-referenced. */
 		VI_LOCK(vp);
 		if (vp->v_usecount > 0)
 			vp->v_iflag &= ~VI_OWEINACT;
 	}
 	/* Drop the hold reference; vdropl() releases the interlock. */
 	vdropl(vp);
 }
 
 /*
  * Release an already locked vnode.  This gives the same effect as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-acquiring the lock (as vrele() acquires the lock internally.)
  *
  * The vnode must be locked on entry (asserted) and is unlocked on return,
  * except when a shared-to-exclusive upgrade fails, in which case the
  * function jumps straight to dropping the hold reference.
  */
 void
 vput(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 	int error;
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 	ASSERT_VOP_LOCKED(vp, "vput");
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VI_LOCK(vp);
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vput: missed vn_close"));
 	error = 0;
 
 	/*
 	 * Fast path: other references remain, or this is the last one but
 	 * an inactive call is already in progress.
 	 */
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		VOP_UNLOCK(vp, 0, td);
 		v_decr_usecount(vp);
 		return;
 	}
 
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 	/*
 	 * We want to hold the vnode until the inactive finishes to
 	 * prevent vgone() races.  We drop the use count here and the
 	 * hold count below when we're done.
 	 */
 	v_decr_useonly(vp);
 	vp->v_iflag |= VI_OWEINACT;
 	/*
 	 * The inactive call is made with an exclusive lock.  If we do not
 	 * hold one, try a non-blocking upgrade; on failure VI_OWEINACT stays
 	 * set (unless the vnode was re-referenced) so that the call can be
 	 * made later, e.g. from vget().
 	 */
 	if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
 		error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
 		VI_LOCK(vp);
 		if (error) {
 			if (vp->v_usecount > 0)
 				vp->v_iflag &= ~VI_OWEINACT;
 			goto done;
 		}
 	}
 	if (vp->v_usecount > 0)
 		vp->v_iflag &= ~VI_OWEINACT;
 	if (vp->v_iflag & VI_OWEINACT)
 		vinactive(vp, td);
 	VOP_UNLOCK(vp, 0, td);
 done:
 	/* Drop the hold reference; vdropl() releases the interlock. */
 	vdropl(vp);
 }
 
 /*
  * Somebody doesn't want the vnode recycled.
  *
  * Interlocked wrapper around vholdl(): takes the vnode interlock, adds a
  * hold reference and releases the interlock.
  */
 void
 vhold(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Add a hold reference to a vnode.  The in-file callers hold the vnode
  * interlock when calling this, and vbusy() asserts that it is held.
  */
 void
 vholdl(struct vnode *vp)
 {
 
 	vp->v_holdcnt++;
 	/* Take the vnode off the free list if VSHOULDBUSY() says so. */
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 }
 
 /*
  * Note that there is one less who cares about this vnode.  vdrop() is the
  * opposite of vhold().
  *
  * Acquires the vnode interlock and hands it to vdropl(), which is
  * responsible for releasing it; there is deliberately no VI_UNLOCK here.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Drop the hold count of the vnode.  If this is the last reference to
  * the vnode we will free it if it has been vgone'd otherwise it is
  * placed on the free list.
  *
  * Must be called with the vnode interlock held (asserted).  The interlock
  * is released before returning, except on the vdestroy() path, where the
  * function returns immediately after destroying the vnode.
  */
 void
 vdropl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, "vdropl");
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
 	vp->v_holdcnt--;
 	if (vp->v_holdcnt == 0) {
 		if (vp->v_iflag & VI_DOOMED) {
 			/* Already vgone'd: nothing left but to destroy it. */
 			vdestroy(vp);
 			return;
 		} else
 			vfree(vp);
 	}
 	VI_UNLOCK(vp);
 }
 
 /*
  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
  * OWEINACT tracks whether a vnode missed a call to inactive due to a
  * failed lock upgrade.
  *
  * Must be called with both the vnode lock and the vnode interlock held
  * (asserted).  The interlock is dropped around VOP_INACTIVE() and is held
  * again when the function returns.
  */
 static void
 vinactive(struct vnode *vp, struct thread *td)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	vp->v_iflag |= VI_DOINGINACT;
 	/* The owed inactive call is being made now. */
 	vp->v_iflag &= ~VI_OWEINACT;
 	VI_UNLOCK(vp);
 	VOP_INACTIVE(vp, td);
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~VI_DOINGINACT;
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  *
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error from
  * VFS_ROOT() if the root vnode could not be obtained.
  */
 #ifdef DIAGNOSTIC
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 int
 vflush( struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	CTR1(KTR_VFS, "vflush: mp %p", mp);
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
 			return (error);
 		vput(rootvp);
 
 	}
 	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 
 		/*
 		 * Hold the vnode, then trade the mount interlock for the
 		 * vnode lock.  The mount interlock is re-taken before every
 		 * continue / next iteration.
 		 */
 		VI_LOCK(vp);
 		vholdl(vp);
 		MNT_IUNLOCK(mp);
-		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
+		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 		if (error) {
 			/* Lock failed: abort this traversal and start over. */
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
 		/*
 		 * Skip over vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0, td);
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 			VI_LOCK(vp);
 
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, 0, td);
 				vdropl(vp);
 				MNT_ILOCK(mp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 *
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 			VNASSERT(vp->v_usecount == 0 ||
 			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
 			    ("device VNODE %p is FORCECLOSED", vp));
 			vgonel(vp);
 		} else {
 			busy++;
 #ifdef DIAGNOSTIC
 			if (busyprt)
 				vprint("vflush: busy vnode", vp);
 #endif
 		}
 		VOP_UNLOCK(vp, 0, td);
 		vdropl(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp, 0, td);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Release the caller's `rootrefs' references on the root vnode. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  *
  * The vnode must be locked (asserted).  If it has no users it is vgone'd.
  * Returns 1 if the vnode was recycled, 0 otherwise.  The `td' argument is
  * not used by this function.
  */
 int
 vrecycle(struct vnode *vp, struct thread *td)
 {
 	int recycled;
 
 	ASSERT_VOP_LOCKED(vp, "vrecycle");
 	recycled = 0;
 	VI_LOCK(vp);
 	if (vp->v_usecount == 0) {
 		recycled = 1;
 		/* vgonel() returns with the interlock held. */
 		vgonel(vp);
 	}
 	VI_UNLOCK(vp);
 	return (recycled);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  *
  * Interlocked wrapper around vgonel(): takes the vnode interlock, calls
  * vgonel() (which returns with the interlock held) and releases it.
  */
 void
 vgone(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vgonel(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * The vnode must be locked, interlocked and held (all asserted).  The
  * interlock is dropped while the vnode is cleaned, closed, deactivated and
  * reclaimed, and is held again on return.  If the vnode is already
  * VI_DOOMED the function returns immediately without touching it.
  */
 void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
 	int oweinact;
 	int active;
 	struct mount *mp;
 
 	CTR1(KTR_VFS, "vgonel: vp %p", vp);
 	ASSERT_VOP_LOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
 	td = curthread;
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	if (vp->v_iflag & VI_DOOMED)
 		return;
 	vp->v_iflag |= VI_DOOMED;
 	/*
 	 * Check to see if the vnode is in use.  If so, we have to call
 	 * VOP_CLOSE() and VOP_INACTIVE().
 	 */
 	active = vp->v_usecount;
 	oweinact = (vp->v_iflag & VI_OWEINACT);
 	VI_UNLOCK(vp);
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	mp = NULL;
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
 	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
 		vinvalbuf(vp, 0, td, 0, 0);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (oweinact || active) {
 		VI_LOCK(vp);
 		/* Skip if an inactive call is already in progress. */
 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vgone: cannot reclaim");
 	/* Pairs with vn_start_secondary_write() above, if it set mp. */
 	if (mp != NULL)
 		vn_finished_secondary_write(mp);
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge(vp);
 	/*
 	 * Done with purge, reset to the standard lock and invalidate
 	 * the vnode.
 	 */
 	VI_LOCK(vp);
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_tag = "none";
 	vp->v_type = VBAD;
 }
 
 /*
  * Return the use count of the special device behind a vnode.
  *
  * The counter is read from vp->v_rdev under the device lock; v_rdev is
  * dereferenced without a NULL check.
  */
 int
 vcount(struct vnode *vp)
 {
 	int devrefs;
 
 	dev_lock();
 	devrefs = vp->v_rdev->si_usecount;
 	dev_unlock();
 	return (devrefs);
 }
 
 /*
  * Same as vcount(), but taking the struct cdev * directly as argument.
  * The use count is read under the device lock.
  */
 int
 count_dev(struct cdev *dev)
 {
 	int devrefs;
 
 	dev_lock();
 	devrefs = dev->si_usecount;
 	dev_unlock();
 	return (devrefs);
 }
 
 /*
  * Print out a description of a vnode.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
  "VMARKER"};
 
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_vflag & VV_ROOT)
 		strlcat(buf, "|VV_ROOT", sizeof(buf));
 	if (vp->v_vflag & VV_ISTTY)
 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
 	if (vp->v_vflag & VV_NOSYNC)
 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 	if (vp->v_vflag & VV_CACHEDLABEL)
 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 	if (vp->v_vflag & VV_TEXT)
 		strlcat(buf, "|VV_TEXT", sizeof(buf));
 	if (vp->v_vflag & VV_COPYONWRITE)
 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 	if (vp->v_vflag & VV_PROCDEP)
 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 	if (vp->v_vflag & VV_NOKNOTE)
 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
 	if (vp->v_vflag & VV_DELETED)
 		strlcat(buf, "|VV_DELETED", sizeof(buf));
 	if (vp->v_vflag & VV_MD)
 		strlcat(buf, "|VV_MD", sizeof(buf));
 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
 	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
 	    VV_NOKNOTE | VV_DELETED | VV_MD);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
 	if (vp->v_iflag & VI_AGE)
 		strlcat(buf, "|VI_AGE", sizeof(buf));
 	if (vp->v_iflag & VI_DOOMED)
 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
 	if (vp->v_iflag & VI_FREE)
 		strlcat(buf, "|VI_FREE", sizeof(buf));
 	if (vp->v_iflag & VI_OBJDIRTY)
 		strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
 	if (vp->v_iflag & VI_DOINGINACT)
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
 	    VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	printf("    flags (%s)\n", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL)
 		printf("    v_object %p ref %d pages %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	printf("\n");
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 /*
  * DDB "show lockedvnods": print every vnode, on every mounted filesystem,
  * whose vnode lock is currently held.  Marker vnodes are skipped.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * Note: because this is DDB, we can't obey the locking semantics
 	 * for these structures, which means we could catch an inconsistent
 	 * state and dereference a nasty pointer.  Not much to be done
 	 * about that.
 	 */
 	db_printf("Locked vnodes\n");
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (vp->v_type == VMARKER)
 				continue;
 			if (VOP_ISLOCKED(vp, NULL))
 				vprint("", vp);
 		}
 	}
 }
 
 /*
  * DDB "show vnode <addr>": describe the vnode at the given address.
  * Does nothing when no address was supplied.
  */
 DB_SHOW_COMMAND(vnode, db_show_vnode)
 {
 
 	if (have_addr)
 		vn_printf((struct vnode *)addr, "vnode ");
 }
 #endif	/* DDB */
 
 /*
  * Export a kernel struct vfsconf into the userland-visible struct
  * xvfsconf.  The name is copied with strcpy(), so the destination name
  * buffer must be at least as large as the source string.
  */
 static void
 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 {
 
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp->vfc_next = NULL;
 	xvfsp->vfc_vfsops = NULL;
 
 	xvfsp->vfc_flags = vfsp->vfc_flags;
 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 }
 
 /*
  * Top level filesystem related information gathering.
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 	int error;
 
 	error = 0;
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 
 	printf("WARNING: userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 	}
 	return (EOPNOTSUPP);
 }
 
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
 	vfs_sysctl, "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  *
  * Copies out an array of struct xvnode, one entry per vnode visited on
  * each mount point that could be busied without waiting.  The array is
  * sized from numvnodes plus KINFO_VNODESLOP entries; when no old buffer
  * is supplied only that size estimate is returned.
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct thread *td = req->td;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, n, nmax;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	/*
 	 * nmax is the capacity of xvn[] in entries, len the same in bytes.
 	 * The fill loop must be bounded by nmax: n is an entry index, so
 	 * comparing it against the byte count would let it run past the
 	 * end of the allocation.
 	 */
 	nmax = numvnodes + KINFO_VNODESLOP;
 	len = nmax * sizeof *xvn;
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (n == nmax)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 			xvn[n].xv_id = 0;	/* XXX compat */
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				break;
 			case VBLK:
 			case VCHR:
 				/* Slot n is reused for the next vnode. */
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp, td);
 		if (n == nmax)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,xvnode", "");
 #endif
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  *
  * Each filesystem is force-unmounted from the tail of the mount list.
  * A filesystem that fails to unmount is removed from the list by hand
  * so the loop always makes progress.
  */
 void
 vfs_unmountall(void)
 {
 	struct thread *td;
 	struct mount *mp;
 	int error;
 
 	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
 	td = curthread;
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while (!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		error = dounmount(mp, MNT_FORCE, td);
 		/* The unmount has removed mp from the mountlist */
 		if (error == 0)
 			continue;
 
 		TAILQ_REMOVE(&mountlist, mp, mnt_list);
 		/*
 		 * XXX: Due to the way in which we mount the root
 		 * file system off of devfs, devfs will generate a
 		 * "busy" warning when we try to unmount it before
 		 * the root.  Don't print a warning as a result in
 		 * order to avoid false positive errors that may
 		 * cause needless upset.
 		 */
 		if (strcmp(mp->mnt_vfc->vfc_name, "devfs") == 0)
 			continue;
 
 		printf("unmount of %s failed (",
 		    mp->mnt_stat.f_mntonname);
 		if (error == EBUSY)
 			printf("BUSY)\n");
 		else
 			printf("%d)\n", error);
 	}
 }
 
 /*
  * Perform msync on all vnodes under a mount point.
  * The mount point must be locked.
  *
  * Only vnodes flagged VI_OBJDIRTY are considered.  With flags == MNT_WAIT
  * every such vnode is processed and its pages cleaned synchronously;
  * otherwise vnodes that are currently locked are skipped and the pages
  * are cleaned with OBJPC_NOSYNC.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_OBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/*
 			 * Drop the mount interlock while working on the
 			 * vnode; it is re-taken before the next iteration.
 			 * The vnode interlock is passed to vget().
 			 */
 			MNT_IUNLOCK(mp);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					MNT_ILOCK(mp);
 					continue;
 				}
 
 				obj = vp->v_object;
 				if (obj != NULL) {
 					VM_OBJECT_LOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 					VM_OBJECT_UNLOCK(obj);
 				}
 				vput(vp);
 			}
 			MNT_ILOCK(mp);
 		} else
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Mark a vnode as free, putting it up for recycling.
  *
  * Must be called with the vnode interlock held (asserted); the free list
  * mutex is taken inside it.  Vnodes flagged VI_AGE are inserted at the
  * head of the free list, all others at the tail.  VI_AGE is cleared and
  * VI_FREE set.
  */
 static void
 vfree(struct vnode *vp)
 {
 
 	CTR1(KTR_VFS, "vfree vp %p", vp);
 	ASSERT_VI_LOCKED(vp, "vfree");
 	mtx_lock(&vnode_free_list_mtx);
 	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
 	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
 	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("vfree: Freeing doomed vnode"));
 	if (vp->v_iflag & VI_AGE) {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	} else {
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	}
 	freevnodes++;
 	vp->v_iflag &= ~VI_AGE;
 	vp->v_iflag |= VI_FREE;
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * Opposite of vfree() - mark a vnode as in use.
  *
  * Must be called with the vnode interlock held (asserted) on a vnode
  * that is currently flagged VI_FREE.  Removes the vnode from the free
  * list under the free list mutex and clears VI_FREE and VI_AGE.
  */
 static void
 vbusy(struct vnode *vp)
 {
 	CTR1(KTR_VFS, "vbusy vp %p", vp);
 	ASSERT_VI_LOCKED(vp, "vbusy");
 	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
 	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
 
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	freevnodes--;
 	vp->v_iflag &= ~(VI_FREE|VI_AGE);
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * Initialize per-vnode helper structure to hold poll-related state.
  *
  * The structure is allocated (M_WAITOK) before v_pollinfo is checked;
  * if the vnode already has one, the fresh allocation is simply freed.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *vi;
 
 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
 	if (vp->v_pollinfo != NULL) {
 		/* Already set up: discard ours. */
 		uma_zfree(vnodepoll_zone, vi);
 		return;
 	}
 	vp->v_pollinfo = vi;
 	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
 	    vfs_knlunlock, vfs_knllocked);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  *
  * Returns the subset of `events' that is already pending (and consumes
  * those), or 0 after recording the interest for later notification.
  */
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return events;
 	}
 	/* Nothing pending yet: remember the interest and register td. */
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return 0;
 }
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
 /* Closing a syncer vnode is a no-op: nullop cast to the close signature. */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /*
  * Vnode operations vector for syncer vnodes.  Anything not listed here
  * goes through vop_bypass, which is VOP_EOPNOTSUPP.
  */
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock1 =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  *
  * On success the vnode is stored in mp->mnt_syncer and 0 is returned.
  * If getnewvnode() fails, mp->mnt_syncer is set to NULL and its error is
  * returned.  A failure of insmntque() is fatal (panic).
  */
 int
 vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	/* Persist across calls to spread syncer vnodes over the worklist. */
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	error = insmntque(vp, mp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: insmntque failed");
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		/* Ran off the end: halve the stride and start again. */
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	VI_LOCK(vp);
 	vn_syncer_add_to_worklist(&vp->v_bufobj,
 	    syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * This is the vop_fsync of the syncer vnode.  Only MNT_LAZY requests do
  * any work: the syncer vnode is re-queued on the worklist and, if the
  * mount point can be busied and a write started without waiting, the
  * filesystem is msync'ed and VFS_SYNC'ed with async writes temporarily
  * disabled.  Returns 0 when nothing was done, otherwise the result of
  * VFS_SYNC().
  */
 static int
 sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct thread *td = ap->a_td;
 	int error;
 	struct bufobj *bo;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp, td);
 		return (0);
 	}
 	/* Suppress async writes for the duration of the sync. */
 	MNT_ILOCK(mp);
 	mp->mnt_noasync++;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY, td);
 	/* Re-enable async only if mounted async and nobody else blocks it. */
 	MNT_ILOCK(mp);
 	mp->mnt_noasync--;
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced.
  *
  * vop_inactive of the syncer vnode: the vnode is vgone'd right away.
  * Always returns 0.
  */
 static int
 sync_inactive(struct vop_inactive_args *ap)
 {
 
 	vgone(ap->a_vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  *
  * Clears the mount point's mnt_syncer pointer and, if the vnode's buffer
  * object is still on the syncer worklist, removes it and adjusts the
  * worklist and syncer vnode counters.  Always returns 0.
  */
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 
 	VI_LOCK(vp);
 	bo = &vp->v_bufobj;
 	vp->v_mount->mnt_syncer = NULL;
 	if (bo->bo_flag & BO_ONWORKLST) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		sync_vnode_count--;
 		mtx_unlock(&sync_mtx);
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	VI_UNLOCK(vp);
 
 	return (0);
 }
 
 /*
  * Check if vnode represents a disk device.
  *
  * Returns non-zero if it does and zero otherwise.  When errp is not NULL
  * it receives 0 on success, ENOTBLK if the vnode is not a character
  * device or the device is not flagged D_DISK, and ENXIO if the vnode has
  * no device or the device has no devsw.  The checks are made under the
  * device lock.
  */
 int
 vn_isdisk(struct vnode *vp, int *errp)
 {
 	int status;
 
 	dev_lock();
 	if (vp->v_type != VCHR)
 		status = ENOTBLK;
 	else if (vp->v_rdev == NULL || vp->v_rdev->si_devsw == NULL)
 		status = ENXIO;
 	else if ((vp->v_rdev->si_devsw->d_flags & D_DISK) == 0)
 		status = ENOTBLK;
 	else
 		status = 0;
 	dev_unlock();
 
 	if (errp != NULL)
 		*errp = status;
 	return (status == 0);
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, credentials,
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
  *
  * Exactly one permission class is consulted: owner if the credential's
  * uid matches, else group if the credential is a member of file_gid,
  * else other.  If that class does not grant everything requested, the
  * missing rights may be supplied by privilege.  On failure the result is
  * EPERM when VADMIN was requested and EACCES otherwise.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
     mode_t acc_mode, struct ucred *cred, int *privused)
 {
 	mode_t dac_granted;
 	mode_t priv_granted;
 	int exec_priv;
 
 	if (privused != NULL)
 		*privused = 0;
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 */
 	dac_granted = 0;
 	if (cred->cr_uid == file_uid) {
 		/* Owner class; the owner is always granted VADMIN. */
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 	} else if (groupmember(file_gid, cred)) {
 		/* Group class (first match). */
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 	} else {
 		/* Everyone else. */
 		if (file_mode & S_IXOTH)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IROTH)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWOTH)
 			dac_granted |= (VWRITE | VAPPEND);
 	}
 	if ((acc_mode & dac_granted) == acc_mode)
 		return (0);
 
 	/*
 	 * Build a privilege mask to determine if the set of privileges
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.  For each privilege, if the privilege is required,
 	 * bitwise or the request type onto the priv_granted mask.
 	 */
 	priv_granted = 0;
 
 	/*
 	 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 	 * requests, instead of PRIV_VFS_EXEC.
 	 */
 	exec_priv = (type == VDIR) ? PRIV_VFS_LOOKUP : PRIV_VFS_EXEC;
 	if ((acc_mode & VEXEC) != 0 && (dac_granted & VEXEC) == 0 &&
 	    priv_check_cred(cred, exec_priv, 0) == 0)
 		priv_granted |= VEXEC;
 
 	if ((acc_mode & VREAD) != 0 && (dac_granted & VREAD) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_READ, 0) == 0)
 		priv_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) != 0 && (dac_granted & VWRITE) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_WRITE, 0) == 0)
 		priv_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) != 0 && (dac_granted & VADMIN) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_ADMIN, 0) == 0)
 		priv_granted |= VADMIN;
 
 	if ((acc_mode & (priv_granted | dac_granted)) != acc_mode)
 		return ((acc_mode & VADMIN) ? EPERM : EACCES);
 
 	/* XXX audit: privilege used */
 	if (privused != NULL)
 		*privused = 1;
 	return (0);
 }
 
 /*
  * Credential check based on process requesting service, and per-attribute
  * permissions.
  *
  * A NOCRED credential is treated as a kernel-invoked request and always
  * succeeds.  Otherwise the outcome depends on 'attrnamespace':
  *   EXTATTR_NAMESPACE_SYSTEM - result of the PRIV_VFS_EXTATTR_SYSTEM
  *                              privilege check on 'cred';
  *   EXTATTR_NAMESPACE_USER   - result of VOP_ACCESS() on 'vp' for the
  *                              requested 'access' mode;
  *   any other namespace      - EPERM.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
     struct thread *td, int access)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly manipulate
 	 * system attributes.
 	 */
 	switch (attrnamespace) {
 	case EXTATTR_NAMESPACE_SYSTEM:
 		/* Potentially should be: return (EPERM); */
 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
 	case EXTATTR_NAMESPACE_USER:
 		return (VOP_ACCESS(vp, access, cred, td));
 	default:
 		return (EPERM);
 	}
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.
  * It is no longer ok to have an unlocked VFS.
  */
 #define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 
 /*
  * Read/write knobs under the debug sysctl tree that control the vnode
  * lock assertions below.  All of them default to enabled.
  */
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
 
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
 #endif
 
 /*
  * Report a vnode locking violation.
  *
  * msg - description of the violation.
  * str - caller-supplied tag, printed before the vnode address.
  * vp  - vnode the violation was detected on.
  *
  * Depending on the vfs_badlock_* knobs this prints a backtrace (KDB
  * kernels only), prints "str: vp msg" and/or enters the debugger.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_print)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 /*
  * Report a violation if mtx_owned() says the interlock of 'vp' is not
  * held.  The check is skipped when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Report a violation if mtx_owned() says the interlock of 'vp' is held.
  * The check is skipped when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Report a violation if 'vp' is non-NULL, is not exempted by IGNORE_LOCK()
  * and VOP_ISLOCKED() returns 0 for it.  Note that the lock is queried
  * with a NULL thread argument here, unlike the other assertions.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
 		vfs_badlock("is not locked but should be", str, vp);
 }
 
 /*
  * Report a violation if 'vp' is non-NULL, is not exempted by IGNORE_LOCK()
  * and VOP_ISLOCKED(vp, curthread) returns LK_EXCLUSIVE.  Any other lock
  * state is not reported.
  */
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 /*
  * Report a violation if 'vp' is non-NULL, is not exempted by IGNORE_LOCK()
  * and VOP_ISLOCKED(vp, curthread) returns anything but LK_EXCLUSIVE.
  */
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 
 #if 0
 /*
  * Currently compiled out (#if 0).  Report a violation if 'vp' is non-NULL,
  * is not exempted by IGNORE_LOCK() and VOP_ISLOCKED(vp, curthread) returns
  * anything but LK_EXCLOTHER.
  */
 void
 assert_vop_elocked_other(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 		vfs_badlock("is not exclusive locked by another thread",
 		    str, vp);
 }
 
 /*
  * Currently compiled out (#if 0).  Report a violation if 'vp' is non-NULL,
  * is not exempted by IGNORE_LOCK() and VOP_ISLOCKED(vp, curthread) returns
  * anything but LK_SHARED.
  */
 void
 assert_vop_slocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 		vfs_badlock("is not locked shared but should be", str, vp);
 }
 #endif /* 0 */
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * Pre-operation hook for VOP_RENAME; 'ap' is a struct vop_rename_args.
  *
  * With DEBUG_VFS_LOCKS it asserts that no vnode interlock is held, that
  * the source vnodes pass ASSERT_VOP_UNLOCKED (skipped where a source vnode
  * coincides with a target vnode) and that the target vnodes are locked.
  *
  * In every configuration it then takes hold references: a_fdvp unless it
  * equals a_tdvp, a_fvp unless it equals a_tvp, a_tdvp always, and a_tvp
  * when it is non-NULL.  vop_rename_post() drops them under the same
  * conditions.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	if (a->a_tvp != a->a_fvp)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 #endif
 	if (a->a_tdvp != a->a_fdvp)
 		vhold(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vhold(a->a_fvp);
 	vhold(a->a_tdvp);
 	if (a->a_tvp)
 		vhold(a->a_tvp);
 }
 
 /*
  * Pre-operation hook for VOP_STRATEGY; 'ap' is a struct vop_strategy_args.
  *
  * With DEBUG_VFS_LOCKS it reports a violation when the buffer being
  * submitted is not a B_CLUSTER buffer and BUF_REFCNT() is below one.
  * Without DEBUG_VFS_LOCKS the function body is empty.
  */
 void
 vop_strategy_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_strategy_args *a;
 	struct buf *bp;
 
 	a = ap;
 	bp = a->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	if (BUF_REFCNT(bp) < 1) {
 		if (vfs_badlock_print)
 			printf(
 			    "VOP_STRATEGY: bp is not locked but should be\n");
 		if (vfs_badlock_ddb)
 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 	}
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_LOOKUP; 'ap' is a struct vop_lookup_args.
  * With DEBUG_VFS_LOCKS it asserts that a_dvp is locked and that its
  * interlock is not held; otherwise the function body is empty.
  */
 void
 vop_lookup_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lookup_args *a;
 	struct vnode *dvp;
 
 	a = ap;
 	dvp = a->a_dvp;
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_LOOKUP; 'rc' is the result of the VOP.
  * With DEBUG_VFS_LOCKS it repeats the checks of vop_lookup_pre() on a_dvp
  * and, when the lookup succeeded (rc == 0), also asserts that the vnode
  * returned through a_vpp is locked.
  */
 void
 vop_lookup_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lookup_args *a;
 	struct vnode *dvp;
 	struct vnode *vp;
 
 	a = ap;
 	dvp = a->a_dvp;
 	vp = *(a->a_vpp);
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 
 	if (!rc)
 		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_LOCK; 'ap' is a struct vop_lock1_args.
  * With DEBUG_VFS_LOCKS it asserts that the vnode interlock is held exactly
  * when the caller passed LK_INTERLOCK in a_flags.
  */
 void
 vop_lock_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lock1_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) == 0)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	else
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_LOCK; 'rc' is the result of the VOP.
  * With DEBUG_VFS_LOCKS it asserts that the interlock is no longer held
  * and, if the lock request succeeded (rc == 0), that the vnode is locked.
  */
 void
 vop_lock_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lock1_args *a = ap;
 
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	if (rc == 0)
 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_UNLOCK; 'ap' is a struct vop_unlock_args.
  * With DEBUG_VFS_LOCKS it asserts that the vnode is locked and, when
  * LK_INTERLOCK is set in a_flags, that the interlock is held as well.
  */
 void
 vop_unlock_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_unlock_args *a = ap;
 
 	if (a->a_flags & LK_INTERLOCK)
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
  * that the interlock has been released when LK_INTERLOCK was set in
  * a_flags.  'rc' is not examined.
  */
 void
 vop_unlock_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_unlock_args *a = ap;
 
 	if (a->a_flags & LK_INTERLOCK)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_CREATE: when the operation succeeded
  * (rc == 0), post NOTE_WRITE to the knotes attached to a_dvp.
  */
 void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 /*
  * Post-operation hook for VOP_LINK: when the operation succeeded
  * (rc == 0), post NOTE_LINK on a_vp and NOTE_WRITE on a_tdvp.
  */
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
 	}
 }
 
 /*
  * Post-operation hook for VOP_MKDIR: when the operation succeeded
  * (rc == 0), post NOTE_WRITE | NOTE_LINK on a_dvp.
  */
 void
 vop_mkdir_post(void *ap, int rc)
 {
 	struct vop_mkdir_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 }
 
 /*
  * Post-operation hook for VOP_MKNOD: when the operation succeeded
  * (rc == 0), post NOTE_WRITE on a_dvp.
  */
 void
 vop_mknod_post(void *ap, int rc)
 {
 	struct vop_mknod_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 /*
  * Post-operation hook for VOP_REMOVE: when the operation succeeded
  * (rc == 0), post NOTE_WRITE on a_dvp and NOTE_DELETE on a_vp.
  */
 void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 	}
 }
 
 /*
  * Post-operation hook for VOP_RENAME.
  *
  * When the rename succeeded (rc == 0) it posts NOTE_WRITE on a_fdvp and
  * a_tdvp, NOTE_RENAME on a_fvp and, if a_tvp is non-NULL, NOTE_DELETE on
  * a_tvp, using the VFS_KNOTE_UNLOCKED variant.
  *
  * Regardless of 'rc' it then drops the hold references taken by
  * vop_rename_pre(), using the same conditions.
  */
 void
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 	}
 	if (a->a_tdvp != a->a_fdvp)
 		vdrop(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vdrop(a->a_fvp);
 	vdrop(a->a_tdvp);
 	if (a->a_tvp)
 		vdrop(a->a_tvp);
 }
 
 /*
  * Post-operation hook for VOP_RMDIR: when the operation succeeded
  * (rc == 0), post NOTE_WRITE | NOTE_LINK on a_dvp and NOTE_DELETE on a_vp.
  */
 void
 vop_rmdir_post(void *ap, int rc)
 {
 	struct vop_rmdir_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 	}
 }
 
 /*
  * Post-operation hook for VOP_SETATTR: when the operation succeeded
  * (rc == 0), post NOTE_ATTRIB on a_vp.
  */
 void
 vop_setattr_post(void *ap, int rc)
 {
 	struct vop_setattr_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 }
 
 /*
  * Post-operation hook for VOP_SYMLINK: when the operation succeeded
  * (rc == 0), post NOTE_WRITE on a_dvp.
  */
 void
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 /* List of knotes that receive the events posted by vfs_event_signal(). */
 static struct knlist fs_knlist;
 
 /*
  * SYSINIT callback: initialize fs_knlist, passing NULL for every optional
  * knlist_init() argument.  'arg' is unused.
  */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post 'event' as the hint to every knote on fs_knlist.  The 'fsid' and
  * 'data' arguments are not used by this implementation.
  */
 void
 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 /*
  * Filter operations for filesystem events.  Knotes using these operations
  * are kept on fs_knlist (see filt_fsattach() and filt_fsdetach()).
  */
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 struct filterops fs_filtops =
 	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
 
 /*
  * Attach routine of fs_filtops: mark the knote EV_CLEAR and add it to
  * fs_knlist.  Always returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /* Detach routine of fs_filtops: remove the knote from fs_knlist. */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event routine of fs_filtops: accumulate 'hint' into kn_fflags and
  * report the knote as active (non-zero return) while any flag is set.
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Handler for the vfs.ctl sysctl: forward a request to the filesystem
  * identified by the fsid in the caller-supplied struct vfsidctl.
  *
  * Returns the SYSCTL_IN() error if the control structure cannot be read,
  * EINVAL if vc_vers is not VFS_CTL_VERS1 or if vc_fstypename is neither
  * "*" nor the type name of the mount, ENOENT if vfs_getvfs() finds no
  * mount for the fsid, and otherwise the result of VFS_SYSCTL().
  * vfs_rel() is called on every path that follows a successful
  * vfs_getvfs().
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	int error;
 	struct mount *mp;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 		vfs_rel(mp);
 		return (EINVAL);
 	}
 	VCTLTOREQ(&vc, req);
 	error = VFS_SYSCTL(mp, vc.vc_op, req);
 	vfs_rel(mp);
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
     "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  *
  * The value is derived from the current getbinuptime() reading: bt.sec is
  * placed in the upper 32 bits and the upper 32 bits of bt.frac in the
  * lower 32 bits.
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
 
 /*
  * Filter operations installed by vfs_kqfilter() for EVFILT_READ,
  * EVFILT_WRITE and EVFILT_VNODE knotes on vnodes.  None of them has an
  * attach routine; all share filt_vfsdetach().
  */
 static int	filt_vfsread(struct knote *kn, long hint);
 static int	filt_vfswrite(struct knote *kn, long hint);
 static int	filt_vfsvnode(struct knote *kn, long hint);
 static void	filt_vfsdetach(struct knote *kn);
 static struct filterops vfsread_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfsread };
 static struct filterops vfswrite_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
 static struct filterops vfsvnode_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
 
 /*
  * Take the exclusive vnode lock on the vnode passed as 'arg'.
  * NOTE(review): presumably registered as the lock callback of a vnode's
  * knlist; the registration is not visible in this part of the file.
  */
 static void
 vfs_knllock(void *arg)
 {
 	struct vnode *vp = arg;
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 }
 
 /* Release the vnode lock on the vnode passed as 'arg'. */
 static void
 vfs_knlunlock(void *arg)
 {
 	struct vnode *vp = arg;
 
 	VOP_UNLOCK(vp, 0, curthread);
 }
 
 /*
  * Return non-zero if VOP_ISLOCKED() reports LK_EXCLUSIVE for the vnode
  * passed as 'arg' when queried for curthread, zero otherwise.
  */
 static int
 vfs_knllocked(void *arg)
 {
 	struct vnode *vp = arg;
 
 	return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE);
 }
 
 /*
  * Attach a knote to a vnode.
  *
  * Selects the filter operations for EVFILT_READ, EVFILT_WRITE or
  * EVFILT_VNODE (any other filter yields EINVAL), stores the vnode in
  * kn_hook, makes sure the vnode has pollinfo (ENOMEM if it still has none
  * after v_addpollinfo()) and adds the knote to the note list kept in that
  * pollinfo.  Returns 0 on success.
  */
 int
 vfs_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
 	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &vfsread_filtops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &vfswrite_filtops;
 		break;
 	case EVFILT_VNODE:
 		kn->kn_fop = &vfsvnode_filtops;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	kn->kn_hook = (caddr_t)vp;
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	if (vp->v_pollinfo == NULL)
 		return (ENOMEM);
 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 	knlist_add(knl, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach knote from vnode.
  *
  * The vnode is recovered from kn_hook (set by vfs_kqfilter()) and the
  * knote is removed from the note list in the vnode's pollinfo, which is
  * asserted to exist.
  */
 static void
 filt_vfsdetach(struct knote *kn)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 
 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 }
 
 /*
  * EVFILT_READ event routine for vnodes.
  *
  * On NOTE_REVOKE the knote is flagged EV_EOF | EV_ONESHOT and reported as
  * active.  Otherwise kn_data is set to the file size from VOP_GETATTR()
  * minus the current offset of the knote's file, and the knote is active
  * when that difference is non-zero.  A VOP_GETATTR() failure reports the
  * knote as inactive.
  */
 /*ARGSUSED*/
 static int
 filt_vfsread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	struct vattr va;
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		return (1);
 	}
 
 	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
 		return (0);
 
 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 	return (kn->kn_data != 0);
 }
 
 /*
  * EVFILT_WRITE event routine for vnodes.  Always reports the knote as
  * active with kn_data set to 0; on NOTE_REVOKE the knote is additionally
  * flagged EV_EOF | EV_ONESHOT.
  */
 /*ARGSUSED*/
 static int
 filt_vfswrite(struct knote *kn, long hint)
 {
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE)
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
 	kn->kn_data = 0;
 	return (1);
 }
 
 /*
  * EVFILT_VNODE event routine for vnodes.
  *
  * 'hint' is recorded in kn_fflags only if it intersects the events the
  * knote subscribed to (kn_sfflags).  NOTE_REVOKE always flags the knote
  * EV_EOF and reports it as active; otherwise the knote is active while
  * kn_fflags is non-zero.
  */
 static int
 filt_vfsvnode(struct knote *kn, long hint)
 {
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Copy one directory entry out to the caller of VOP_READDIR and, when the
  * caller asked for cookies, append the entry's offset to the cookie array.
  *
  * ap  - VOP_READDIR arguments; a_uio receives the entry, a_ncookies and
  *       a_cookies describe the cookie array (a_ncookies is NULL when no
  *       cookies were requested).
  * dp  - directory entry to copy; dp->d_reclen bytes are transferred.
  * off - cookie value recorded for this entry.
  *
  * Returns 0 on success, ENAMETOOLONG if the entry does not fit in the
  * space left in a_uio, or the error from uiomove().  If uiomove() fails
  * and cookies were requested, the cookie array is released and the count
  * is reset to zero.
  */
 int
 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 {
 	int error;
 
 	if (dp->d_reclen > ap->a_uio->uio_resid)
 		return (ENAMETOOLONG);
 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
 	if (error) {
 		if (ap->a_ncookies != NULL) {
 			/*
 			 * a_cookies points at the caller's array pointer, so
 			 * the array itself is *a_cookies: release that and
 			 * clear the caller's pointer, not the argument field.
 			 */
 			if (ap->a_cookies != NULL) {
 				free(*ap->a_cookies, M_TEMP);
 				*ap->a_cookies = NULL;
 			}
 			*ap->a_ncookies = 0;
 		}
 		return (error);
 	}
 	if (ap->a_ncookies == NULL)
 		return (0);
 
 	KASSERT(ap->a_cookies,
 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 
 	/* Grow the array by one slot and store the new cookie in it. */
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
 	/* Count the cookie just stored so the next call appends after it. */
 	*ap->a_ncookies += 1;
 	return (0);
 }
 
 /*
  * Mark for update the access time of the file if the filesystem
  * supports VA_MARK_ATIME.  This functionality is used by execve
  * and mmap, so we want to avoid the synchronous I/O implied by
  * directly setting va_atime for the sake of efficiency.
  *
  * Nothing is done when the vnode's mount has MNT_NOATIME or MNT_RDONLY
  * set.  The result of VOP_SETATTR() is deliberately ignored.
  */
 void
 vfs_mark_atime(struct vnode *vp, struct thread *td)
 {
 	struct vattr atimeattr;
 
 	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
 		VATTR_NULL(&atimeattr);
 		atimeattr.va_vaflags |= VA_MARK_ATIME;
 		(void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td);
 	}
 }
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c	(revision 175201)
+++ head/sys/kern/vfs_syscalls.c	(revision 175202)
@@ -1,4264 +1,4264 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 static int setfown(struct thread *td, struct vnode *, uid_t, gid_t);
 static int setfmode(struct thread *td, struct vnode *, int);
 static int setfflags(struct thread *td, struct vnode *, int);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 
 /*
  * The module initialization routine for POSIX asynchronous I/O will
  * set this to the version of AIO that it implements.  (Zero means
  * that it is not implemented.)  This value is used here by pathconf()
  * and in kern_descrip.c by fpathconf().
  */
 int async_io_version;
 
 #ifdef DEBUG
 static int syncprt = 0;
 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 #endif
 
 /*
  * Sync each mounted filesystem.
  *
  * Walks the mount list.  A mount is processed only if it can be busied
  * without waiting (LK_NOWAIT), is not read-only and vn_start_write()
  * succeeds with V_NOWAIT.  For such a mount async mode is suspended
  * (mnt_noasync raised, MNTK_ASYNC cleared) while vfs_msync() and
  * VFS_SYNC() are issued with MNT_NOWAIT, and MNTK_ASYNC is restored
  * afterwards if the mount has MNT_ASYNC and mnt_noasync dropped to zero.
  * Always returns 0; 'uap' is unused.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sync_args {
 	int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sync(td, uap)
 	struct thread *td;
 	struct sync_args *uap;
 {
 	struct mount *mp, *nmp;
 	int vfslocked;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 			MNT_ILOCK(mp);
 			mp->mnt_noasync++;
 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
 			MNT_IUNLOCK(mp);
 			vfs_msync(mp, MNT_NOWAIT);
 			VFS_SYNC(mp, MNT_NOWAIT, td);
 			MNT_ILOCK(mp);
 			mp->mnt_noasync--;
 			if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 			    mp->mnt_noasync == 0)
 				mp->mnt_kern_flag |= MNTK_ASYNC;
 			MNT_IUNLOCK(mp);
 			vn_finished_write(mp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		/* Retake the list lock before stepping to the next mount. */
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (0);
 }
 
 /* XXX PRISON: could be per prison flag */
 static int prison_quotas;
 #if 0
 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
 #endif
 
 /*
  * Change filesystem quotas.
  *
  * Resolves uap->path, busies the mount that the resulting vnode belongs
  * to and passes cmd, uid and arg through to VFS_QUOTACTL().  Jailed
  * callers get EPERM unless prison_quotas is non-zero.  Returns 0 or an
  * error from namei(), vfs_busy() or VFS_QUOTACTL().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
 	char *path;
 	int cmd;
 	int uid;
 	caddr_t arg;
 };
 #endif
 int
 quotactl(td, uap)
 	struct thread *td;
 	register struct quotactl_args /* {
 		char *path;
 		int cmd;
 		int uid;
 		caddr_t arg;
 	} */ *uap;
 {
 	struct mount *mp;
 	int vfslocked;
 	int error;
 	struct nameidata nd;
 
 	AUDIT_ARG(cmd, uap->cmd);
 	AUDIT_ARG(uid, uap->uid);
 	if (jailed(td->td_ucred) && !prison_quotas)
 		return (EPERM);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1,
 	   UIO_USERSPACE, uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	mp = nd.ni_vp->v_mount;
 	if ((error = vfs_busy(mp, 0, NULL, td))) {
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	/* The vnode was only needed to find the mount; release it. */
 	vrele(nd.ni_vp);
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td);
 	vfs_unbusy(mp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Used by statfs conversion routines to scale the block size up if
  * necessary so that all of the block counts are <= 'max_size'.  Note
  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
  * value of 'n'.
  *
  * sf       - statistics to rescale in place: f_bsize is shifted left and
  *            f_blocks, f_bfree and f_bavail are shifted right by the same
  *            number of bits.  Nothing is changed if the largest of the
  *            three counts already fits.
  * max_size - largest block count the caller can represent.
  */
 void
 statfs_scale_blocks(struct statfs *sf, long max_size)
 {
 	uint64_t count;
 	int shift;
 
 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 
 	/*
 	 * Attempt to scale the block counts to give a more accurate
 	 * overview to userland of the ratio of free space to used
 	 * space.  To do this, find the largest block count and compute
 	 * a divisor that lets it fit into a signed integer <= max_size.
 	 */
 	if (sf->f_bavail < 0)
 		count = -sf->f_bavail;
 	else
 		count = sf->f_bavail;
 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 	if (count <= max_size)
 		return;
 
 	/*
 	 * Discard the bits that fit within max_size; the number of bits
 	 * left over is the shift needed to bring the count into range.
 	 */
 	count >>= flsl(max_size);
 	shift = 0;
 	while (count > 0) {
 		shift++;
 		count >>=1;
 	}
 
 	sf->f_bsize <<= shift;
 	sf->f_blocks >>= shift;
 	sf->f_bfree >>= shift;
 	sf->f_bavail >>= shift;
 }
 
 /*
  * Get filesystem statistics.
  *
  * System call wrapper: kern_statfs() fills a local struct statfs for the
  * user-space path uap->path, which is then copied out to uap->buf.
  * Returns the kern_statfs() error, or the result of copyout().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct statfs_args {
 	char *path;
 	struct statfs *buf;
 };
 #endif
 int
 statfs(td, uap)
 	struct thread *td;
 	register struct statfs_args /* {
 		char *path;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error == 0)
 		error = copyout(&sf, uap->buf, sizeof(sf));
 	return (error);
 }
 
 /*
  * Kernel back end of statfs(): look up 'path' (interpreted according to
  * 'pathseg') and copy the statistics of the mount the resulting vnode
  * belongs to into '*buf'.
  *
  * A reference on the mount is taken before the vnode is released and is
  * dropped again on exit.  Callers for which priv_check() with
  * PRIV_VFS_GENERATION returns non-zero receive a copy with the fsid
  * zeroed that has been passed through prison_enforce_statfs().
  *
  * Returns 0 on success, or the error from namei(), from the MAC check
  * (MAC kernels) or from VFS_STATFS().
  */
 int
 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
     struct statfs *buf)
 {
 	struct mount *mp;
 	struct statfs *sp, sb;
 	int vfslocked;
 	int error;
 	struct nameidata nd;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * NOTE(review): this looks like a leftover diagnostic for Giant
 	 * still being held on return.  'path' is formatted with %s although
 	 * statfs() passes a user-space pointer (UIO_USERSPACE) -- confirm
 	 * whether this should be removed.
 	 */
 	if (mtx_owned(&Giant))
 		printf("statfs(%d): %s: %d\n", vfslocked, path, error);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  *
  * System call wrapper: kern_fstatfs() fills a local struct statfs for the
  * descriptor uap->fd, which is then copied out to uap->buf.  Returns the
  * kern_fstatfs() error, or the result of copyout().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatfs_args {
 	int fd;
 	struct statfs *buf;
 };
 #endif
 int
 fstatfs(td, uap)
 	struct thread *td;
 	register struct fstatfs_args /* {
 		int fd;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error == 0)
 		error = copyout(&sf, uap->buf, sizeof(sf));
 	return (error);
 }
 
 /*
  * Kernel back end of fstatfs(): copy the statistics of the mount that the
  * vnode behind descriptor 'fd' belongs to into '*buf'.
  *
  * The vnode is locked only long enough to read v_mount and take a
  * reference on the mount.  EBADF is returned if the vnode is found to be
  * VI_DOOMED afterwards.  Callers for which priv_check() with
  * PRIV_VFS_GENERATION returns non-zero receive a copy with the fsid
  * zeroed that has been passed through prison_enforce_statfs().
  *
  * Returns 0 on success, or the error from getvnode(), from the MAC check
  * (MAC kernels) or from VFS_STATFS().
  */
 int
 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct statfs *sp, sb;
 	int vfslocked;
 	struct vnode *vp;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 	error = getvnode(td->td_proc->p_fd, fd, &fp);
 	if (error)
 		return (error);
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef AUDIT
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 #endif
 	mp = vp->v_mount;
 	if (mp)
 		vfs_ref(mp);
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	/*
 	 * NOTE(review): 'mp' is dereferenced below without a NULL check;
 	 * presumably a vnode with a NULL v_mount is always VI_DOOMED and is
 	 * caught here -- confirm.
 	 */
 	if (vp->v_iflag & VI_DOOMED) {
 		error = EBADF;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	if (mp)
 		vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  *
  * System call wrapper around kern_getfsstat(): the user buffer uap->buf
  * is passed by address together with UIO_USERSPACE, so the entries are
  * copied out directly by kern_getfsstat().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfsstat_args {
 	struct statfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 getfsstat(td, uap)
 	struct thread *td;
 	register struct getfsstat_args /* {
 		struct statfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
 	    uap->flags));
 }
 
 /*
  * Collect statistics for the mounted filesystems.
  *
  * td      - calling thread; td->td_retval[0] receives the entry count.
  * buf     - for UIO_USERSPACE, *buf is the user buffer to fill; for
  *           UIO_SYSSPACE with a non-zero bufsize, *buf is set to a kernel
  *           buffer allocated here.
  * bufsize - buffer size in bytes; with 0 the mounts are only counted.
  * bufseg  - address space of the buffer.
  * flags   - MNT_WAIT / MNT_NOWAIT / MNT_LAZY, selecting whether
  *           VFS_STATFS() is called to refresh each mount's statistics.
  *
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  * 	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  *
  * Mounts are skipped when prison_canseemount() or the MAC check rejects
  * them, when they cannot be busied without waiting, or when the
  * VFS_STATFS() refresh fails.  Returns 0, or the copyout() error for a
  * user-space buffer.
  */
 int
 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
     enum uio_seg bufseg, int flags)
 {
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, sb;
 	size_t count, maxcount;
 	int vfslocked;
 	int error;
 
 	maxcount = bufsize / sizeof(struct statfs);
 	if (bufsize == 0)
 		sfsp = NULL;
 	else if (bufseg == UIO_USERSPACE)
 		sfsp = *buf;
 	else /* if (bufseg == UIO_SYSSPACE) */ {
 		/*
 		 * Size the kernel buffer for at most the number of mounts
 		 * currently on the list.
 		 */
 		count = 0;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			count++;
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (maxcount > count)
 			maxcount = count;
 		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
 		    M_WAITOK);
 	}
 	count = 0;
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #ifdef MAC
 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #endif
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
 			 * Set these in case the underlying filesystem
 			 * fails to do so.
 			 */
 			sp->f_version = STATFS_VERSION;
 			sp->f_namemax = NAME_MAX;
 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 			/*
 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 			 * overrides MNT_WAIT.
 			 */
 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (flags & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp, td))) {
 				VFS_UNLOCK_GIANT(vfslocked);
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp, td);
 				continue;
 			}
 			if (priv_check(td, PRIV_VFS_GENERATION)) {
 				bcopy(sp, &sb, sizeof(sb));
 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 				prison_enforce_statfs(td->td_ucred, mp, &sb);
 				sp = &sb;
 			}
 			if (bufseg == UIO_SYSSPACE)
 				bcopy(sp, sfsp, sizeof(*sp));
 			else /* if (bufseg == UIO_USERSPACE) */ {
 				error = copyout(sp, sfsp, sizeof(*sp));
 				if (error) {
 					vfs_unbusy(mp, td);
 					VFS_UNLOCK_GIANT(vfslocked);
 					return (error);
 				}
 			}
 			sfsp++;
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		/* Counted even when the buffer is absent or already full. */
 		count++;
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
 	/* Never report more entries than were stored in the buffer. */
 	if (sfsp && count > maxcount)
 		td->td_retval[0] = maxcount;
 	else
 		td->td_retval[0] = count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Get old format filesystem statistics.
  *
  * FreeBSD 4 compatibility version of statfs(): the statistics are
  * obtained with kern_statfs(), converted by cvtstatfs() and copied out to
  * uap->buf as a struct ostatfs.
  */
 static void cvtstatfs(struct statfs *, struct ostatfs *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_statfs_args {
 	char *path;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_statfs(td, uap)
 	struct thread *td;
 	struct freebsd4_statfs_args /* {
 		char *path;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get filesystem statistics.
  *
  * FreeBSD 4 compatibility version of fstatfs(): the statistics are
  * obtained with kern_fstatfs(), converted by cvtstatfs() and copied out
  * to uap->buf as a struct ostatfs.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fstatfs_args {
 	int fd;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fstatfs_args /* {
 		int fd;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get statistics on all filesystems.
  *
  * FreeBSD 4 compatibility version of getfsstat().  The number of old
  * format entries that fit in the user buffer determines the size of the
  * kernel buffer requested from kern_getfsstat(); each returned entry is
  * then converted with cvtstatfs() and copied out one at a time.  The
  * kernel buffer is freed here.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_getfsstat_args {
 	struct ostatfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 freebsd4_getfsstat(td, uap)
 	struct thread *td;
 	register struct freebsd4_getfsstat_args /* {
 		struct ostatfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 	struct statfs *buf, *sp;
 	struct ostatfs osb;
 	size_t count, size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
 	if (size > 0) {
 		/* kern_getfsstat() left the number of entries in td_retval. */
 		count = td->td_retval[0];
 		sp = buf;
 		while (count > 0 && error == 0) {
 			cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_TEMP);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  *
  * FreeBSD 4 compatibility version: the file handle is copied in from
  * uap->u_fhp, the statistics are obtained with kern_fhstatfs(), converted
  * by cvtstatfs() and copied out to uap->buf as a struct ostatfs.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fhstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  *
  * nsp - source; its block counts are first rescaled in place by
  *       statfs_scale_blocks() so that they fit in LONG_MAX.
  * osp - destination; zeroed before being filled in.
  *
  * The f_iosize, f_files, f_ffree and read/write counters are clamped to
  * LONG_MAX, and the name strings are copied with strlcpy() using the
  * smaller of the new and old field sizes.
  */
 static void
 cvtstatfs(nsp, osp)
 	struct statfs *nsp;
 	struct ostatfs *osp;
 {
 
 	statfs_scale_blocks(nsp, LONG_MAX);
 	bzero(osp, sizeof(*osp));
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, OMFSNAMELEN));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	osp->f_fsid = nsp->f_fsid;
 }
 #endif /* COMPAT_FREEBSD4 */
 
 /*
  * Change current working directory to a given file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchdir_args {
 	int	fd;
 };
 #endif
 int
 fchdir(td, uap)
 	struct thread *td;
 	struct fchdir_args /* {
 		int fd;
 	} */ *uap;
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	struct vnode *vp, *tdp, *vpold;
 	struct mount *mp;
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	/*
 	 * Resolve the descriptor to its vnode, take our own reference on
 	 * the vnode, and then let go of the file itself.
 	 */
 	if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
 	VREF(vp);
 	fdrop(fp, td);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	/* change_dir() wants the vnode locked, which it now is. */
 	error = change_dir(vp, td);
 	/*
 	 * While something is mounted on the current vnode, replace it
 	 * with the root vnode of the mounted filesystem.  The loop
 	 * repeats because the new root may itself be mounted on.
 	 */
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		int tvfslocked;
 		/* vfs_busy() failed: re-read v_mountedhere and try again. */
 		if (vfs_busy(mp, 0, 0, td))
 			continue;
 		tvfslocked = VFS_LOCK_GIANT(mp);
 		error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td);
 		vfs_unbusy(mp, td);
 		if (error) {
 			VFS_UNLOCK_GIANT(tvfslocked);
 			break;
 		}
 		/* Trade the covered vnode (and its Giant state) for the root. */
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vp = tdp;
 		vfslocked = tvfslocked;
 	}
 	if (error) {
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	/* Unlock but keep the reference; it is handed to fd_cdir below. */
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	FILEDESC_XLOCK(fdp);
 	vpold = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
 	/* Release the previous working directory outside the filedesc lock. */
 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
 	vrele(vpold);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
  * Change current working directory (``.'').
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chdir_args {
 	char	*path;
 };
 #endif
 int
 chdir(td, uap)
 	struct thread *td;
 	struct chdir_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
 }
 
 /*
  * Set the working directory of td's process to the directory named by
  * `path'; `pathseg' tells namei() which address space `path' lives in.
  * Returns 0 on success or an errno value from the lookup or from
  * change_dir().
  */
 int
 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 	struct nameidata nd;
 	struct vnode *vp;
 	int vfslocked;
 
 	/* Look the path up (FOLLOW) and ask for the leaf locked (LOCKLEAF). */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	/* change_dir() verifies it is a directory the caller may enter. */
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		return (error);
 	}
 	/* Unlock but keep the reference; it is handed to fd_cdir below. */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/* Swap in the new directory; vp remembers the old one. */
 	FILEDESC_XLOCK(fdp);
 	vp = fdp->fd_cdir;
 	fdp->fd_cdir = nd.ni_vp;
 	FILEDESC_XUNLOCK(fdp);
 	/* Release the previous working directory outside the filedesc lock. */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(fdp)
 	struct filedesc *fdp;
 {
 	struct file *fp;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	/*
 	 * Walk every slot of the descriptor table; the first vnode-backed
 	 * descriptor that refers to a directory makes us answer EPERM.
 	 */
 	for (fd = 0; fd < fdp->fd_nfiles; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL || fp->f_type != DTYPE_VNODE)
 			continue;
 		if (fp->f_vnode->v_type == VDIR)
 			return (EPERM);
 	}
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 /* Policy knob read by change_root(); default is 1. */
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
      &chroot_allow_open_directories, 0,
      "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Change notion of root (``/'') directory.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chroot_args {
 	char	*path;
 };
 #endif
 int
 chroot(td, uap)
 	struct thread *td;
 	struct chroot_args /* {
 		char *path;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	/* The privilege check comes before any path lookup. */
 	error = priv_check(td, PRIV_VFS_CHROOT);
 	if (error)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vfslocked = NDHASGIANT(&nd);
 	/* Same directory/search checks as chdir, then the MAC chroot hook. */
 	if ((error = change_dir(nd.ni_vp, td)) != 0)
 		goto e_vunlock;
 #ifdef MAC
 	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
 		goto e_vunlock;
 #endif
 	/*
 	 * change_root() is called with the vnode unlocked.  It takes its
 	 * own references with VREF(), so the lookup's reference is
 	 * released here whether or not it succeeded.
 	 */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	error = change_root(nd.ni_vp, td);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 	/* Failure after a successful lookup: vnode is still locked. */
 e_vunlock:
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Failure of the lookup itself: no vnode to release. */
 error:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Common routine for chroot and chdir.  Callers must provide a locked vnode
  * instance.
  */
 int
 change_dir(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 #ifdef MAC
 	int error;
 #endif
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 
 	/* Only directories qualify. */
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 #ifdef MAC
 	/* Give the MAC framework a chance to veto first. */
 	error = mac_vnode_check_chdir(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	/* Finally, the thread's credential needs VEXEC on the directory. */
 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 }
 
 /*
  * Common routine for chroot() and jail_attach().  The caller is
  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
  * authorize this operation.
  */
 int
 change_root(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int vfslocked;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	/*
 	 * Policy 0 always scans for open directories; policy 1 scans
 	 * only when the current root is not the system root vnode.
 	 * Any other value skips the scan.
 	 */
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
 	/* Install vp as the new root, with a reference of its own. */
 	oldvp = fdp->fd_rdir;
 	fdp->fd_rdir = vp;
 	VREF(fdp->fd_rdir);
 	/* fd_jdir is only set if it was not set before (separate reference). */
 	if (!fdp->fd_jdir) {
 		fdp->fd_jdir = vp;
 		VREF(fdp->fd_jdir);
 	}
 	FILEDESC_XUNLOCK(fdp);
 	/* Release the previous root outside the filedesc lock. */
 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
 	vrele(oldvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
  * Check permissions, allocate an open file structure, and call the device
  * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
 	char	*path;
 	int	flags;
 	int	mode;
 };
 #endif
 int
 open(td, uap)
 	struct thread *td;
 	register struct open_args /* {
 		char *path;
 		int flags;
 		int mode;
 	} */ *uap;
 {
 
 	return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
 }
 
 /*
  * Open (and possibly create/truncate) the file named by `path' and
  * install it in td's descriptor table.  On success the new descriptor
  * index is stored in td->td_retval[0] and 0 is returned; otherwise an
  * errno value is returned.
  */
 int
 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
     int mode)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vat;
 	struct mount *mp;
 	int cmode;
 	struct file *nfp;
 	int type, indx, error;
 	struct flock lf;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, flags);
 	AUDIT_ARG(mode, mode);
 	/* An access mode with every O_ACCMODE bit set is rejected. */
 	if ((flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 	flags = FFLAGS(flags);
 	error = falloc(td, &nfp, &indx);
 	if (error)
 		return (error);
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	/* Mask the mode with fd_cmask, keep ALLPERMS bits, clear S_ISTXT. */
 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
  	error = vn_open(&nd, &flags, cmode, fp);
 	if (error) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wonderous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
 		if (error == ENXIO && fp->f_ops != &badfileops) {
 			fdrop(fp, td);
 			td->td_retval[0] = indx;
 			return (0);
 		}
 
 		/*
 		 * handle special fdopen() case.  bleh.  dupfdopen() is
 		 * responsible for dropping the old contents of ofiles[indx]
 		 * if it succeeds.
 		 */
 		if ((error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0 &&		/* XXX from fdopen */
 		    (error =
 			dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
 			td->td_retval[0] = indx;
 			fdrop(fp, td);
 			return (0);
 		}
 		/*
 		 * Clean up the descriptor, but only if another thread hadn't
 		 * replaced or closed it.
 		 */
 		fdclose(fdp, fp, indx, td);
 		fdrop(fp, td);
 
 		/* ERESTART is reported to the caller as EINTR. */
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 	td->td_dupfd = 0;
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	fp->f_vnode = vp;	/* XXX Does devfs need this? */
 	/*
 	 * If the file wasn't claimed by devfs bind it to the normal
 	 * vnode operations here.
 	 */
 	if (fp->f_ops == &badfileops) {
 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
 		fp->f_seqcount = 1;
 		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * O_EXLOCK/O_SHLOCK: request an F_FLOCK-type lock with start 0 and
 	 * length 0 (write lock for O_EXLOCK, read lock otherwise).
 	 * F_WAIT is added unless FNONBLOCK is set.
 	 */
 	if (flags & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (flags & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((flags & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 			    type)) != 0)
 			goto bad;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
 	/* O_TRUNC: set the file size to zero through VOP_SETATTR(). */
 	if (flags & O_TRUNC) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 			goto bad;
 		VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 		VATTR_NULL(&vat);
 		vat.va_size = 0;
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_SETATTR(vp, &vat, td->td_ucred, td);
 		VOP_UNLOCK(vp, 0, td);
 		vn_finished_write(mp);
 		if (error)
 			goto bad;
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * Release our private reference, leaving the one associated with
 	 * the descriptor table intact.
 	 */
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (0);
 	/* Failure after vn_open() succeeded: close the slot, drop our ref. */
 bad:
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdclose(fdp, fp, indx, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Create a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 ocreat(td, uap)
 	struct thread *td;
 	register struct ocreat_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_open(td, uap->path, UIO_USERSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Create a special file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mknod_args {
 	char	*path;
 	int	mode;
 	int	dev;
 };
 #endif
 int
 mknod(td, uap)
 	struct thread *td;
 	register struct mknod_args /* {
 		char *path;
 		int mode;
 		int dev;
 	} */ *uap;
 {
 
 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
 }
 
 /*
  * Create the special node named by `path'.  The file-type bits of
  * `mode' select a character device, block device, "bad" node or
  * whiteout; any other type yields EINVAL.  Returns 0 or an errno value.
  */
 int
 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
     int dev)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 	int whiteout = 0;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 	AUDIT_ARG(dev, dev);
 	/* Each supported node type has its own privilege. */
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 		break;
 	case S_IFMT:
 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
 		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error)
 		return (error);
 	/* Re-entered from below when vn_start_write() could not proceed. */
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		/* The name already exists: release everything, fail. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
 		/* Read fd_cmask under the shared filedesc lock. */
 		FILEDESC_SLOCK(td->td_proc->p_fd);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_fd->fd_cmask;
 		FILEDESC_SUNLOCK(td->td_proc->p_fd);
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
 		case S_IFMT:	/* used by badsect to flag bad sectors */
 			vattr.va_type = VBAD;
 			break;
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
 		case S_IFBLK:
 			vattr.va_type = VBLK;
 			break;
 		case S_IFWHT:
 			whiteout = 1;
 			break;
 		default:
 			/* Unreachable: other types returned EINVAL above. */
 			panic("kern_mknod: invalid mode");
 		}
 	}
 	/*
 	 * Non-blocking attempt to start a write.  If it fails, release
 	 * the lookup state, call vn_start_write() again with V_XSLEEP
 	 * (presumably sleeping until writes are allowed -- confirm
 	 * against vn_start_write()), and redo the lookup from scratch.
 	 */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 #ifdef MAC
 	/* The MAC create check is skipped for whiteouts. */
 	if (error == 0 && !whiteout)
 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
 	if (!error) {
 		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 						&nd.ni_cnd, &vattr);
 			/* The new vnode is not needed by the caller. */
 			if (error == 0)
 				vput(nd.ni_vp);
 		}
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Create a named pipe.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifo_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 mkfifo(td, uap)
 	struct thread *td;
 	register struct mkfifo_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 /*
  * Create a FIFO named by `path' with permissions `mode' masked by the
  * process's fd_cmask.  Returns 0, EEXIST if the name already exists,
  * or another errno value.
  */
 int
 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 	/* Re-entered from below when vn_start_write() could not proceed. */
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp != NULL) {
 		/* The name already exists: release everything, fail. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	/* If a write cannot start now, back out, wait, and look up again. */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
 	/* Read fd_cmask under the shared filedesc lock. */
 	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
 	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	/* The new vnode is not needed by the caller. */
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out:
 #endif
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Make a hard file link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct link_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 link(td, uap)
 	struct thread *td;
 	register struct link_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 
 	/* System call entry point: both paths are UIO_USERSPACE. */
 	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
 }
 
 /*
  * Hard link policy knobs consulted by can_hardlink().  Both default to
  * 0, in which case can_hardlink() performs no ownership checks at all.
  */
 static int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "users");
 static int hardlink_check_gid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
     &hardlink_check_gid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "groups");
 
 static int
 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
 	error = VOP_GETATTR(vp, &va, cred, td);
 	if (error != 0)
 		return (error);
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
 		if (error)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Create the hard link `link' referring to the existing file `path'.
  * `segflg' applies to both path arguments.  Directories are refused
  * with EPERM and an existing target name with EEXIST.
  */
 int
 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct nameidata nd;
 	int vfslocked;
 	int lvfslocked;
 	int error;
 
 	bwillwrite();
 	/* First lookup: the existing file (referenced, not locked). */
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR) {
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EPERM);		/* POSIX */
 	}
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	/* Second lookup: the new name, with its parent directory locked. */
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
 	    segflg, link, td);
 	if ((error = namei(&nd)) == 0) {
 		lvfslocked = NDHASGIANT(&nd);
 		if (nd.ni_vp != NULL) {
 			/* The new name already exists. */
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
 			error = EEXIST;
-		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td))
+		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
 		    == 0) {
 			VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 			VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 			/*
 			 * Checks are chained: ownership policy, then (with
 			 * MAC) the MAC hook, and only then VOP_LINK().
 			 */
 			error = can_hardlink(vp, td, td->td_ucred);
 			if (error == 0)
 #ifdef MAC
 				error = mac_vnode_check_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
 			if (error == 0)
 #endif
 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_UNLOCK(vp, 0, td);
 			vput(nd.ni_dvp);
 		}
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		VFS_UNLOCK_GIANT(lvfslocked);
 	}
 	/* Common exit: drop the source vnode reference and write hold. */
 	vrele(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Make a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct symlink_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 symlink(td, uap)
 	struct thread *td;
 	register struct symlink_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 
 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
 }
 
 /*
  * Create the symbolic link `link' whose contents are the string `path'.
  * When `segflg' is not UIO_SYSSPACE, `path' is first copied into a
  * buffer from namei_zone, which is freed again on every exit path.
  */
 int
 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	char *syspath;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	/* Obtain a kernel-space copy of the link contents. */
 	if (segflg == UIO_SYSSPACE) {
 		syspath = path;
 	} else {
 		syspath = uma_zalloc(namei_zone, M_WAITOK);
 		if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 	}
 	AUDIT_ARG(text, syspath);
 	/* Re-entered from below when vn_start_write() could not proceed. */
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    segflg, link, td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp) {
 		/* The link name already exists: release everything, fail. */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		error = EEXIST;
 		goto out;
 	}
 	/* If a write cannot start now, back out, wait, and look up again. */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	/* Read fd_cmask under the shared filedesc lock. */
 	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
 	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	vattr.va_type = VLNK;
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out2;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 	/* The new vnode is not needed by the caller. */
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out2:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Free the copied-in string, if one was allocated above. */
 out:
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, syspath);
 	return (error);
 }
 
 /*
  * Delete a whiteout from the filesystem.
  */
 int
 undelete(td, uap)
 	struct thread *td;
 	register struct undelete_args /* {
 		char *path;
 	} */ *uap;
 {
 	int error;
 	struct mount *mp;
 	struct nameidata nd;
 	int vfslocked;
 
 	/* Re-entered from below when vn_start_write() could not proceed. */
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 
 	/*
 	 * Proceed only if the lookup found no vnode and flagged the name
 	 * as a whiteout (ISWHITEOUT); anything else is reported as EEXIST.
 	 */
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	/* If a write cannot start now, back out, wait, and look up again. */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	/* Ask the filesystem to remove the whiteout entry. */
 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Delete a name from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unlink_args {
 	char	*path;
 };
 #endif
 int
 unlink(td, uap)
 	struct thread *td;
 	struct unlink_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	/* System call entry point: the path is passed as UIO_USERSPACE. */
 	return (kern_unlink(td, uap->path, UIO_USERSPACE));
 }
 
 /*
  * Remove the name `path'.  Directories are refused with EPERM and the
  * root of a mounted filesystem with EBUSY.  An EINVAL from the lookup
  * is reported to the caller as EPERM.
  */
 int
 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	/* Re-entered from below when vn_start_write() could not proceed. */
 restart:
 	bwillwrite();
 	/* Both the parent directory and the leaf come back locked. */
 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error == EINVAL ? EPERM : error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR)
 		error = EPERM;		/* POSIX */
 	else {
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 *
 		 * XXX: can this only be a VDIR case?
 		 */
 		if (vp->v_vflag & VV_ROOT)
 			error = EBUSY;
 	}
 	if (error == 0) {
 		/* If a write cannot start now, back out, wait, and retry. */
 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			if (vp == nd.ni_dvp)
 				vrele(vp);
 			else
 				vput(vp);
 			VFS_UNLOCK_GIANT(vfslocked);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0)
 				return (error);
 			goto restart;
 		}
 #ifdef MAC
 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error)
 			goto out;
 #endif
 		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
 #endif
 		vn_finished_write(mp);
 	}
 	/*
 	 * Common release path.  When leaf and parent are the same vnode
 	 * the leaf only gets a vrele(), since vput() on the parent has
 	 * already unlocked it.
 	 */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lseek_args {
 	int	fd;
 	int	pad;
 	off_t	offset;
 	int	whence;
 };
 #endif
 int
 lseek(td, uap)
 	struct thread *td;
 	register struct lseek_args /* {
 		int fd;
 		int pad;
 		off_t offset;
 		int whence;
 	} */ *uap;
 {
 	struct ucred *cred = td->td_ucred;
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t offset;
 	int error, noneg;
 	int vfslocked;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	/* Descriptors whose ops lack DFLAG_SEEKABLE cannot seek. */
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
 		fdrop(fp, td);
 		return (ESPIPE);
 	}
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/* Negative offsets are only tolerated on character devices. */
 	noneg = (vp->v_type != VCHR);
 	offset = uap->offset;
 	switch (uap->whence) {
 	case L_INCR:
 		/* Relative to the current offset; guard against overflow. */
 		if (noneg &&
 		    (fp->f_offset < 0 ||
 		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += fp->f_offset;
 		break;
 	case L_XTND:
 		/* Relative to the file size, fetched with VOP_GETATTR(). */
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		VOP_UNLOCK(vp, 0, td);
 		if (error)
 			break;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		/* Absolute: the requested offset is used as given. */
 		break;
 	case SEEK_DATA:
 		/* Delegated to the file's ioctl; it updates `offset'. */
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		break;
 	case SEEK_HOLE:
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	/* Commit the new offset and return it as an off_t in td_retval. */
 	fp->f_offset = offset;
 	*(off_t *)(td->td_retval) = fp->f_offset;
 drop:
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olseek_args {
 	int	fd;
 	long	offset;
 	int	whence;
 };
 #endif
 int
 olseek(td, uap)
 	struct thread *td;
 	register struct olseek_args /* {
 		int fd;
 		long offset;
 		int whence;
 	} */ *uap;
 {
 	struct lseek_args /* {
 		int fd;
 		int pad;
 		off_t offset;
 		int whence;
 	} */ nuap;
 
 	nuap.fd = uap->fd;
 	nuap.offset = uap->offset;
 	nuap.whence = uap->whence;
 	return (lseek(td, &nuap));
 }
 #endif /* COMPAT_43 */
 
 /* Version with the 'pad' argument */
 int
 freebsd6_lseek(td, uap)
 	struct thread *td;
 	register struct freebsd6_lseek_args *uap;
 {
 	struct lseek_args ouap;
 
 	ouap.fd = uap->fd;
 	ouap.offset = uap->offset;
 	ouap.whence = uap->whence;
 	return (lseek(td, &ouap));
 }
 
 /*
  * Check access permissions using passed credentials.
  */
 static int
 vn_access(vp, user_flags, cred, td)
 	struct vnode	*vp;
 	int		user_flags;
 	struct ucred	*cred;
 	struct thread	*td;
 {
 	int error, flags;
 
 	/* Flags == 0 means only check for existence. */
 	error = 0;
 	if (user_flags) {
 		flags = 0;
 		if (user_flags & R_OK)
 			flags |= VREAD;
 		if (user_flags & W_OK)
 			flags |= VWRITE;
 		if (user_flags & X_OK)
 			flags |= VEXEC;
 #ifdef MAC
 		error = mac_vnode_check_access(cred, vp, flags);
 		if (error)
 			return (error);
 #endif
 		if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 			error = VOP_ACCESS(vp, flags, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "real" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct access_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 access(td, uap)
 	struct thread *td;
 	register struct access_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
 /*
  * Check whether the file named by `path' is accessible with the access
  * mask `flags', judged with the real uid and gid substituted into a
  * private copy of the thread's credential.
  */
 int
 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags)
 {
 	struct ucred *cred, *tmpcred;
 	register struct vnode *vp;
 	struct nameidata nd;
 	int vfslocked;
 	int error;
 
 	/*
 	 * Create and modify a temporary credential instead of one that
 	 * is potentially shared.  This could also mess up socket
 	 * buffer accounting which can run in an interrupt context.
 	 */
 	cred = td->td_ucred;
 	tmpcred = crdup(cred);
 	tmpcred->cr_uid = cred->cr_ruid;
 	tmpcred->cr_groups[0] = cred->cr_rgid;
 	/* Install the copy on the thread for the duration of the lookup. */
 	td->td_ucred = tmpcred;
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		goto out1;
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, flags, tmpcred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Always restore the original credential and free the copy. */
 out1:
 	td->td_ucred = cred;
 	crfree(tmpcred);
 	return (error);
 }
 
 /*
  * Check access permissions using "effective" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct eaccess_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 eaccess(td, uap)
 	struct thread *td;
 	register struct eaccess_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
 int
 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
 {
 	struct nameidata nd;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	vfslocked = NDHASGIANT(&nd);
 	error = vn_access(vp, flags, td->td_ucred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ostat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 ostat(td, uap)
 	struct thread *td;
 	register struct ostat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtstat(&sb, &osb);
 	error = copyout(&osb, uap->ub, sizeof (osb));
 	return (error);
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olstat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 olstat(td, uap)
 	struct thread *td;
 	register struct olstat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtstat(&sb, &osb);
 	error = copyout(&osb, uap->ub, sizeof (osb));
 	return (error);
 }
 
 /*
  * Convert from a new to an old stat structure.
  */
 void
 cvtstat(st, ost)
 	struct stat *st;
 	struct ostat *ost;
 {
 
 	ost->st_dev = st->st_dev;
 	ost->st_ino = st->st_ino;
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	if (st->st_size < (quad_t)1 << 32)
 		ost->st_size = st->st_size;
 	else
 		ost->st_size = -2;
 	ost->st_atime = st->st_atime;
 	ost->st_mtime = st->st_mtime;
 	ost->st_ctime = st->st_ctime;
 	ost->st_blksize = st->st_blksize;
 	ost->st_blocks = st->st_blocks;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 }
 #endif /* COMPAT_43 */
 
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct stat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 stat(td, uap)
 	struct thread *td;
 	register struct stat_args /* {
 		char *path;
 		struct stat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 
 	/* Hand the result back to the caller's buffer. */
 	return (copyout(&sb, uap->ub, sizeof (sb)));
 }
 
 int
 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 	struct nameidata nd;
 	struct stat sb;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP,
 	    FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (mtx_owned(&Giant))
 		printf("stat(%d): %s\n", vfslocked, path);
 	if (error)
 		return (error);
 	*sbp = sb;
 	return (0);
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lstat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 lstat(td, uap)
 	struct thread *td;
 	register struct lstat_args /* {
 		char *path;
 		struct stat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 
 	/* Hand the result back to the caller's buffer. */
 	return (copyout(&sb, uap->ub, sizeof (sb)));
 }
 
 int
 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 	struct vnode *vp;
 	struct stat sb;
 	struct nameidata nd;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP,
 	    NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	*sbp = sb;
 	return (0);
 }
 
 /*
  * Implementation of the NetBSD [l]stat() functions.
  */
 void
 cvtnstat(sb, nsb)
 	struct stat *sb;
 	struct nstat *nsb;
 {
 	/* Start from all-zero so fields not assigned below stay cleared. */
 	bzero(nsb, sizeof(*nsb));
 
 	/* Identity, type and ownership. */
 	nsb->st_dev = sb->st_dev;
 	nsb->st_rdev = sb->st_rdev;
 	nsb->st_ino = sb->st_ino;
 	nsb->st_gen = sb->st_gen;
 	nsb->st_mode = sb->st_mode;
 	nsb->st_nlink = sb->st_nlink;
 	nsb->st_uid = sb->st_uid;
 	nsb->st_gid = sb->st_gid;
 	nsb->st_flags = sb->st_flags;
 
 	/* Size information. */
 	nsb->st_size = sb->st_size;
 	nsb->st_blocks = sb->st_blocks;
 	nsb->st_blksize = sb->st_blksize;
 
 	/* Timestamps. */
 	nsb->st_atimespec = sb->st_atimespec;
 	nsb->st_mtimespec = sb->st_mtimespec;
 	nsb->st_ctimespec = sb->st_ctimespec;
 	nsb->st_birthtimespec = sb->st_birthtimespec;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct nstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 nstat(td, uap)
 	struct thread *td;
 	register struct nstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 /*
  * NetBSD lstat.  Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Fallback declaration of the argument structure used by nlstat() below.
  * It must be named nlstat_args and carry a struct nstat pointer, matching
  * what the function copies out.
  */
 struct nlstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 /*
  * nlstat() system call: like lstat(), but the result is converted with
  * cvtnstat() and copied out as a struct nstat.  Returns 0 or an errno
  * value.
  */
 int
 nlstat(td, uap)
 	struct thread *td;
 	register struct nlstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 /*
  * Get configurable pathname variables.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 pathconf(td, uap)
 	struct thread *td;
 	register struct pathconf_args /* {
 		char *path;
 		int name;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name);
 	return (rc);
 }
 
 int
 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name)
 {
 	struct nameidata nd;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	/* If asynchronous I/O is available, it works for all files. */
 	if (name == _PC_ASYNC_IO)
 		td->td_retval[0] = async_io_version;
 	else
 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readlink_args {
 	char	*path;
 	char	*buf;
 	int	count;
 };
 #endif
 int
 readlink(td, uap)
 	struct thread *td;
 	register struct readlink_args /* {
 		char *path;
 		char *buf;
 		int count;
 	} */ *uap;
 {
 	int rc;
 
 	/* Both the path and the result buffer are tagged UIO_USERSPACE. */
 	rc = kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
 	    UIO_USERSPACE, uap->count);
 	return (rc);
 }
 
 /*
  * Read the target of the symbolic link named by `path' into `buf'.
  *
  * `pathseg' and `bufseg' give the address space of `path' and `buf';
  * `count' is the size of `buf'.  On success the number of bytes placed
  * in `buf' is stored in td->td_retval[0] and 0 is returned.  Returns
  * EINVAL if the looked-up vnode is not a symbolic link, or the error
  * from namei(), the MAC check or VOP_READLINK().
  */
 int
 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
     enum uio_seg bufseg, int count)
 {
 	register struct vnode *vp;
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 #ifdef MAC
 	error = mac_vnode_check_readlink(td->td_ucred, vp);
 	if (error) {
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 #endif
 	if (vp->v_type != VLNK)
 		error = EINVAL;
 	else {
 		aiov.iov_base = buf;
 		aiov.iov_len = count;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = bufseg;
 		auio.uio_td = td;
 		auio.uio_resid = count;
 		error = VOP_READLINK(vp, &auio, td->td_ucred);
 		/*
 		 * Compute the transfer length only here: on the EINVAL
 		 * path auio is never initialized, so its uio_resid must
 		 * not be read.
 		 */
 		td->td_retval[0] = count - auio.uio_resid;
 	}
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Common implementation code for chflags() and fchflags().
  */
 /*
  * Set the file flags of `vp' to `flags' on behalf of thread `td'.
  * Returns 0 on success or an errno value from the privilege check,
  * vn_start_write(), the MAC check (when compiled in) or VOP_SETATTR().
  */
 static int
 setfflags(td, vp, flags)
 	struct thread *td;
 	struct vnode *vp;
 	int flags;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	/*
 	 * Prevent non-root users from setting flags on devices.  When
 	 * a device is reused, users can retain ownership of the device
 	 * if they are allowed to set flags and programs assume that
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 		if (error)
 			return (error);
 	}
 
 	/* Paired with vn_finished_write(mp) at the end of the function. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* After VATTR_NULL() only va_flags is assigned explicitly. */
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
 #ifdef MAC
 	/* With MAC, VOP_SETATTR() runs only if the policy check passes. */
 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 chflags(td, uap)
 	struct thread *td;
 	register struct chflags_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, uap->flags);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vfslocked = NDHASGIANT(&nd);
 	error = setfflags(td, nd.ni_vp, uap->flags);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
 int
 lchflags(td, uap)
 	struct thread *td;
 	register struct lchflags_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, uap->flags);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfflags(td, nd.ni_vp, uap->flags);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change flags of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
 	int	flags;
 };
 #endif
 /*
  * fchflags() system call: apply uap->flags, via setfflags(), to the
  * vnode behind descriptor uap->fd.  Returns 0 or an errno value.
  */
 int
 fchflags(td, uap)
 	struct thread *td;
 	register struct fchflags_args /* {
 		int fd;
 		int flags;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(fflags, uap->flags);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	/* The vnode is locked only while the audit argument is recorded. */
-	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release fp, which getvnode() handed back above. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for chmod(), lchmod() and fchmod().
  */
 /*
  * Set the permission bits of `vp' to `mode' (masked with ALLPERMS) on
  * behalf of thread `td'.  Returns 0 on success or an errno value from
  * vn_start_write(), the MAC check (when compiled in) or VOP_SETATTR().
  */
 static int
 setfmode(td, vp, mode)
 	struct thread *td;
 	struct vnode *vp;
 	int mode;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	/* Paired with vn_finished_write(mp) at the end of the function. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* After VATTR_NULL() only va_mode is assigned explicitly. */
 	VATTR_NULL(&vattr);
 	vattr.va_mode = mode & ALLPERMS;
 #ifdef MAC
 	/* With MAC, VOP_SETATTR() runs only if the policy check passes. */
 	error = mac_vnode_check_setmode(td->td_ucred, vp, vattr.va_mode);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 chmod(td, uap)
 	struct thread *td;
 	register struct chmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode);
 	return (rc);
 }
 
 int
 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfmode(td, nd.ni_vp, mode);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name (don't follow links.)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 lchmod(td, uap)
 	struct thread *td;
 	register struct lchmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, (mode_t)uap->mode);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfmode(td, nd.ni_vp, uap->mode);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change mode of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchmod_args {
 	int	fd;
 	int	mode;
 };
 #endif
 /*
  * fchmod() system call: apply uap->mode, via setfmode(), to the vnode
  * behind descriptor uap->fd.  Returns 0 or an errno value.
  */
 int
 fchmod(td, uap)
 	struct thread *td;
 	register struct fchmod_args /* {
 		int fd;
 		int mode;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(mode, uap->mode);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	/* The vnode is locked only while the audit argument is recorded. */
-	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfmode(td, fp->f_vnode, uap->mode);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release fp, which getvnode() handed back above. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation for chown(), lchown(), and fchown()
  */
 /*
  * Set the owner and group of `vp' to `uid' and `gid' on behalf of thread
  * `td'.  Returns 0 on success or an errno value from vn_start_write(),
  * the MAC check (when compiled in) or VOP_SETATTR().
  */
 static int
 setfown(td, vp, uid, gid)
 	struct thread *td;
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	/* Paired with vn_finished_write(mp) at the end of the function. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* After VATTR_NULL() only va_uid and va_gid are assigned. */
 	VATTR_NULL(&vattr);
 	vattr.va_uid = uid;
 	vattr.va_gid = gid;
 #ifdef MAC
 	/* With MAC, VOP_SETATTR() runs only if the policy check passes. */
 	error = mac_vnode_check_setowner(td->td_ucred, vp, vattr.va_uid,
 	    vattr.va_gid);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 chown(td, uap)
 	struct thread *td;
 	register struct chown_args /* {
 		char *path;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid);
 	return (rc);
 }
 
 int
 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(owner, uid, gid);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set ownership given a path name, do not cross symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 lchown(td, uap)
 	struct thread *td;
 	register struct lchown_args /* {
 		char *path;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid);
 	return (rc);
 }
 
 int
 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(owner, uid, gid);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set ownership given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchown_args {
 	int	fd;
 	int	uid;
 	int	gid;
 };
 #endif
 /*
  * fchown() system call: apply uap->uid and uap->gid, via setfown(), to
  * the vnode behind descriptor uap->fd.  Returns 0 or an errno value.
  */
 int
 fchown(td, uap)
 	struct thread *td;
 	register struct fchown_args /* {
 		int fd;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(owner, uap->uid, uap->gid);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	/* The vnode is locked only while the audit argument is recorded. */
-	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfown(td, fp->f_vnode, uap->uid, uap->gid);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release fp, which getvnode() handed back above. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 getutimes(usrtvp, tvpseg, tsp)
 	const struct timeval *usrtvp;
 	enum uio_seg tvpseg;
 	struct timespec *tsp;
 {
 	struct timeval tv[2];
 	const struct timeval *tvp;
 	int error;
 
 	if (usrtvp == NULL) {
 		microtime(&tv[0]);
 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
 		tsp[1] = tsp[0];
 	} else {
 		if (tvpseg == UIO_SYSSPACE) {
 			tvp = usrtvp;
 		} else {
 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 				return (error);
 			tvp = tv;
 		}
 
 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 			return (EINVAL);
 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 	}
 	return (0);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 /*
  * Apply the timestamps in `ts' to `vp' on behalf of thread `td'.
  *
  * ts[0] is the access time and ts[1] the modification time; when
  * `numtimes' is greater than 2, ts[2] is the birth time.  A non-zero
  * `nullflag' sets VA_UTIMES_NULL in the attribute request.  Returns 0 on
  * success or an errno value.
  */
 static int
 setutimes(td, vp, ts, numtimes, nullflag)
 	struct thread *td;
 	struct vnode *vp;
 	const struct timespec *ts;
 	int numtimes;
 	int nullflag;
 {
 	int error, setbirthtime;
 	struct mount *mp;
 	struct vattr vattr;
 
 	/* Paired with vn_finished_write(mp) at the end of the function. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * With fewer than three times supplied, a new modification time
 	 * earlier than the current birth time also moves the birth time
 	 * back to it.  A failing VOP_GETATTR() simply skips this.
 	 */
 	setbirthtime = 0;
 	if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 &&
 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 		setbirthtime = 1;
 	VATTR_NULL(&vattr);
 	vattr.va_atime = ts[0];
 	vattr.va_mtime = ts[1];
 	if (setbirthtime)
 		vattr.va_birthtime = ts[1];
 	if (numtimes > 2)
 		vattr.va_birthtime = ts[2];
 	if (nullflag)
 		vattr.va_vaflags |= VA_UTIMES_NULL;
 #ifdef MAC
 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
 	    vattr.va_mtime);
 #endif
 	/* Without MAC, error is still 0 from the vn_start_write() check. */
 	if (error == 0)
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct utimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 utimes(td, uap)
 	struct thread *td;
 	register struct utimes_args /* {
 		char *path;
 		struct timeval *tptr;
 	} */ *uap;
 {
 	int rc;
 
 	/* Both the path and the timeval array are tagged UIO_USERSPACE. */
 	rc = kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE);
 	return (rc);
 }
 
 int
 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file; this version does not
  * follow symbolic links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lutimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 lutimes(td, uap)
 	struct thread *td;
 	register struct lutimes_args /* {
 		char *path;
 		struct timeval *tptr;
 	} */ *uap;
 {
 	int rc;
 
 	/* Both the path and the timeval array are tagged UIO_USERSPACE. */
 	rc = kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE);
 	return (rc);
 }
 
 int
 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct futimes_args {
 	int	fd;
 	struct	timeval *tptr;
 };
 #endif
 int
 futimes(td, uap)
 	struct thread *td;
 	register struct futimes_args /* {
 		int  fd;
 		struct timeval *tptr;
 	} */ *uap;
 {
 	int rc;
 
 	/* The timeval array is passed on tagged as UIO_USERSPACE. */
 	rc = kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE);
 	return (rc);
 }
 
 /*
  * Set the access and modification times of the vnode behind descriptor
  * `fd'.  `tptr' points to two timevals located in the address space
  * named by `tptrseg'; a NULL `tptr' makes getutimes() supply the current
  * time and is passed to setutimes() as its nullflag.  Returns 0 or an
  * errno value.
  */
 int
 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 	/* The times are fetched and validated before the descriptor lookup. */
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	/* The vnode is locked only while the audit argument is recorded. */
-	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release fp, which getvnode() handed back above. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct truncate_args {
 	char	*path;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 truncate(td, uap)
 	struct thread *td;
 	register struct truncate_args /* {
 		char *path;
 		int pad;
 		off_t length;
 	} */ *uap;
 {
 	int rc;
 
 	/* uap->pad is not consulted; only path and length are forwarded. */
 	rc = kern_truncate(td, uap->path, UIO_USERSPACE, uap->length);
 	return (rc);
 }
 
 /*
  * Set the size of the file named by `path' to `length'.
  *
  * Returns EINVAL for a negative length, EISDIR when the path resolves to
  * a directory, otherwise 0 or the error from namei(), vn_start_write(),
  * the MAC check (when compiled in), vn_writechk(), VOP_ACCESS() or
  * VOP_SETATTR().
  */
 int
 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if (length < 0)
 		return(EINVAL);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		/* NOTE(review): this path returns without calling NDFREE(). */
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	/* Empty body: a MAC failure just ends the else-if chain with error set. */
 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0 &&
 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 		/* After VATTR_NULL() only va_size is assigned explicitly. */
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	}
 	vput(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct otruncate_args {
 	char	*path;
 	long	length;
 };
 #endif
 int
 otruncate(td, uap)
 	struct thread *td;
 	register struct otruncate_args /* {
 		char *path;
 		long length;
 	} */ *uap;
 {
 	struct truncate_args /* {
 		char *path;
 		int pad;
 		off_t length;
 	} */ nuap;
 
 	nuap.path = uap->path;
 	nuap.length = uap->length;
 	return (truncate(td, &nuap));
 }
 #endif /* COMPAT_43 */
 
 /* Versions with the pad argument */
 int
 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
 {
 	struct truncate_args ouap;
 
 	ouap.path = uap->path;
 	ouap.length = uap->length;
 	return (truncate(td, &ouap));
 }
 
 int
 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
 {
 	struct ftruncate_args ouap;
 
 	ouap.fd = uap->fd;
 	ouap.length = uap->length;
 	return (ftruncate(td, &ouap));
 }
 
 /*
  * Sync an open file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fsync_args {
 	int	fd;
 };
 #endif
 /*
  * fsync() system call: flush the vnode behind descriptor uap->fd with
  * VOP_FSYNC(..., MNT_WAIT, ...).  Returns 0 or an errno value from
  * getvnode(), vn_start_write() or VOP_FSYNC().
  */
 int
 fsync(td, uap)
 	struct thread *td;
 	struct fsync_args /* {
 		int fd;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/* On failure skip to `drop': nothing is locked or started yet. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	/* If the vnode has a VM object, clean its pages before VOP_FSYNC(). */
 	if (vp->v_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release fp, which getvnode() handed back above. */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Rename files.  Source and destination must either both be directories, or
  * both not be directories.  If target is a directory, it must be empty.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
 	char	*from;
 	char	*to;
 };
 #endif
 int
 rename(td, uap)
 	struct thread *td;
 	register struct rename_args /* {
 		char *from;
 		char *to;
 	} */ *uap;
 {
 	int rc;
 
 	/* Both path names are passed on tagged as UIO_USERSPACE. */
 	rc = kern_rename(td, uap->from, uap->to, UIO_USERSPACE);
 	return (rc);
 }
 
 /*
  * Rename `from' to `to'; both path names live in the address space given
  * by `pathseg'.
  *
  * Returns 0 on success or an errno value.  ENOTDIR/EISDIR are returned
  * when exactly one of source and existing target is a directory, and
  * EINVAL when the source is the directory that would contain the target.
  * Internally, error == -1 marks "source and target are the same vnode":
  * nothing is renamed and 0 is returned to the caller.
  */
 int
 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
 {
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
 	int tvfslocked;
 	int fvfslocked;
 	int error;
 
 	bwillwrite();
 #ifdef MAC
 	NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE |
 	    AUDITVNODE1, pathseg, from, td);
 #else
 	NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
 	    AUDITVNODE1, pathseg, from, td);
 #endif
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
 	fvfslocked = NDHASGIANT(&fromnd);
 	tvfslocked = 0;
 #ifdef MAC
 	/*
 	 * The MAC lookup locked parent and leaf; run the check, then drop
 	 * the locks (only once if both are the same vnode).
 	 */
 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
 	    fromnd.ni_vp, &fromnd.ni_cnd);
 	VOP_UNLOCK(fromnd.ni_dvp, 0, td);
 	if (fromnd.ni_dvp != fromnd.ni_vp)
 		VOP_UNLOCK(fromnd.ni_vp, 0, td);
 #endif
 	fvp = fromnd.ni_vp;
 	if (error == 0)
 		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
 	if (error != 0) {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		goto out1;
 	}
 	NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART |
 	    MPSAFE | AUDITVNODE2, pathseg, to, td);
 	if (fromnd.ni_vp->v_type == VDIR)
 		tond.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&tond)) != 0) {
 		/* Translate error code for rename("dir1", "dir2/."). */
 		if (error == EISDIR && fvp->v_type == VDIR)
 			error = EINVAL;
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		vn_finished_write(mp);
 		goto out1;
 	}
 	tvfslocked = NDHASGIANT(&tond);
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		}
 	}
 	if (fvp == tdvp) {
 		/*
 		 * Go straight to cleanup so that neither the fvp == tvp
 		 * test nor the MAC check below can overwrite EINVAL.
 		 */
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * If the source is the same as the destination (that is, if they
 	 * are links to the same vnode), then there is nothing to do.
 	 */
 	if (fvp == tvp)
 		error = -1;
 #ifdef MAC
 	else
 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
 	if (!error) {
 		VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE);
 		if (fromnd.ni_dvp != tdvp) {
 			VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		}
 		if (tvp) {
 			VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE);
 		}
 		/* No vput()/vrele() of the four vnodes here after VOP_RENAME(). */
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		/* VOP_RENAME() was not called: release the vnodes here. */
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 	}
 	vrele(tond.ni_startdir);
 	vn_finished_write(mp);
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	VFS_UNLOCK_GIANT(fvfslocked);
 	VFS_UNLOCK_GIANT(tvfslocked);
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 /*
  * Make a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkdir_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 mkdir(td, uap)
 	struct thread *td;
 	register struct mkdir_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode);
 	return (rc);
 }
 
 /*
  * Create the directory named by `path' (located in address space
  * `segflg') with permissions `mode' restricted to ACCESSPERMS and
  * filtered through the process file-creation mask.
  *
  * Returns EEXIST if the name already resolves to a vnode, otherwise 0 or
  * the error from namei(), vn_start_write(), the MAC check (when compiled
  * in) or VOP_MKDIR().
  */
 int
 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    segflg, path, td);
 	nd.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	/* A non-NULL result vnode means the name is already taken. */
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/*
 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
 		 * the strange behaviour of leaving the vnode unlocked
 		 * if the target is the same vnode as the parent.
 		 */
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	/*
 	 * If the non-blocking attempt fails, release everything, make the
 	 * second vn_start_write() call with V_XSLEEP and redo the lookup
 	 * from `restart'.
 	 */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
 	/* fd_cmask is read under the shared filedesc lock. */
 	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
 	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	/* nd.ni_vp is released only when the create call succeeded. */
 	if (!error)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Remove a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rmdir_args {
 	char	*path;
 };
 #endif
 int
 rmdir(td, uap)
 	struct thread *td;
 	struct rmdir_args /* {
 		char *path;
 	} */ *uap;
 {
 	int rc;
 
 	/* The path is passed on tagged as UIO_USERSPACE. */
 	rc = kern_rmdir(td, uap->path, UIO_USERSPACE);
 	return (rc);
 }
 
 /*
  * Remove the directory named by `path' (located in address space
  * `pathseg').
  *
  * Returns ENOTDIR if the path is not a directory, EINVAL if it resolves
  * to the same vnode as its parent, EBUSY if the vnode has VV_ROOT set,
  * otherwise 0 or the error from namei(), the MAC check (when compiled
  * in), vn_start_write() or VOP_RMDIR().
  */
 int
 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT) {
 		error = EBUSY;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * If the non-blocking attempt fails, release everything, make the
 	 * second vn_start_write() call with V_XSLEEP and redo the lookup
 	 * from `restart'.
 	 */
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(vp);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
 	/* Common exit: release the leaf, then the parent (once if identical). */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	if (nd.ni_dvp == vp)
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ogetdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 /*
  * Old (COMPAT_43) getdirentries(): read up to uap->count bytes of
  * directory entries from descriptor uap->fd into uap->buf, rewriting
  * each entry's d_type/d_namlen bytes into the old layout on the way.
  *
  * On success td->td_retval[0] holds the number of bytes delivered and
  * the file offset sampled before the read is copied out to uap->basep.
  * Returns EINVAL for a count above 64 KB or a non-directory, EBADF if
  * the descriptor lacks FREAD, EIO for an entry with d_reclen of zero,
  * or the error from getvnode(), the MAC check, VOP_READDIR(), uiomove()
  * or copyout().
  */
 int
 ogetdirentries(td, uap)
 	struct thread *td;
 	register struct ogetdirentries_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 		long *basep;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio, kuio;
 	struct iovec aiov, kiov;
 	struct dirent *dp, *edp;
 	caddr_t dirbuf;
 	int error, eofflag, readcnt, vfslocked;
 	long loff;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	/* auio describes the user buffer that finally receives the data. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* Remember the starting offset; it is what gets reported via basep. */
 	loff = auio.uio_offset = fp->f_offset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 #	if (BYTE_ORDER != LITTLE_ENDIAN)
 		/* Read straight into the user buffer, with no conversion pass. */
 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 			fp->f_offset = auio.uio_offset;
 		} else
 #	endif
 	{
 		/*
 		 * Read into a temporary kernel buffer (kuio is a copy of
 		 * auio retargeted at dirbuf), convert the entries in place,
 		 * then move the result to the user buffer.
 		 */
 		kuio = auio;
 		kuio.uio_iov = &kiov;
 		kuio.uio_segflg = UIO_SYSSPACE;
 		kiov.iov_len = uap->count;
 		MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK);
 		kiov.iov_base = dirbuf;
 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 		fp->f_offset = kuio.uio_offset;
 		if (error == 0) {
 			readcnt = uap->count - kuio.uio_resid;
 			edp = (struct dirent *)&dirbuf[readcnt];
 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
 #				if (BYTE_ORDER == LITTLE_ENDIAN)
 					/*
 					 * The expected low byte of
 					 * dp->d_namlen is our dp->d_type.
 					 * The high MBZ byte of dp->d_namlen
 					 * is our dp->d_namlen.
 					 */
 					dp->d_type = dp->d_namlen;
 					dp->d_namlen = 0;
 #				else
 					/*
 					 * The dp->d_type is the high byte
 					 * of the expected dp->d_namlen,
 					 * so must be zero'ed.
 					 */
 					dp->d_type = 0;
 #				endif
 				/* A zero d_reclen would never advance dp. */
 				if (dp->d_reclen > 0) {
 					dp = (struct dirent *)
 					    ((char *)dp + dp->d_reclen);
 				} else {
 					error = EIO;
 					break;
 				}
 			}
 			/* Copy out only if the walk reached the end cleanly. */
 			if (dp >= edp)
 				error = uiomove(dirbuf, readcnt, &auio);
 		}
 		FREE(dirbuf, M_TEMP);
 	}
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
 	/*
 	 * Nothing was delivered from the root of an MNT_UNION mount:
 	 * switch the open file to the covered vnode and read again.
 	 */
 	if (uap->count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		fp->f_offset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	error = copyout(&loff, uap->basep, sizeof(long));
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  *
  * uap->fd must be open for reading and refer to a directory.  Up to
  * uap->count bytes of entries are read directly into the user buffer
  * uap->buf.  If uap->basep is not NULL the file offset the read started
  * at is copied out to it.  On success td->td_retval[0] is the number of
  * bytes transferred.
  *
  * NOTE(review): unlike ogetdirentries() no upper bound is applied to
  * uap->count before it is stored in auio.uio_resid -- confirm that this
  * is intended.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 getdirentries(td, uap)
 	struct thread *td;
 	register struct getdirentries_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 		long *basep;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	int vfslocked;
 	long loff;
 	int error, eofflag;
 
 	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		error = EINVAL;
 		goto fail;
 	}
 	/* Describe the caller's buffer as a single-segment user-space read. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
-	/* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	/* vn_lock(vp, LK_SHARED | LK_RETRY); */
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	/* Remember the starting offset; it is what gets reported via basep. */
 	loff = auio.uio_offset = fp->f_offset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	fp->f_offset = auio.uio_offset;
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto fail;
 	}
 	/*
 	 * Nothing was transferred and vp is the root of a MNT_UNION mount:
 	 * point the file at the covered vnode, reset the offset and retry
 	 * the read there.
 	 */
 	if (uap->count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		fp->f_offset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* basep is optional here; getdents() passes NULL. */
 	if (uap->basep != NULL) {
 		error = copyout(&loff, uap->basep, sizeof(long));
 	}
 	td->td_retval[0] = uap->count - auio.uio_resid;
 fail:
 	/* Common exit: drop the file reference obtained from getvnode(). */
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getdents_args {
 	int fd;
 	char *buf;
 	size_t count;
 };
 #endif
 int
 getdents(td, uap)
 	struct thread *td;
 	register struct getdents_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 	} */ *uap;
 {
 	struct getdirentries_args ap;
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (getdirentries(td, &ap));
 }
 
 /*
  * Set the mode mask for creation of filesystem nodes.
  *
  * The previous mask is returned in td->td_retval[0]; the new mask is
  * uap->newmask restricted to the ALLPERMS bits.  Both steps happen under
  * the exclusive filedesc lock.  Always returns 0.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
 	int	newmask;
 };
 #endif
 int
 umask(td, uap)
 	struct thread *td;
 	struct umask_args /* {
 		int newmask;
 	} */ *uap;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	td->td_retval[0] = fdp->fd_cmask;
 	fdp->fd_cmask = uap->newmask & ALLPERMS;
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
 
 /*
  * Void all references to file by ripping underlying filesystem away from
  * vnode.
  *
  * uap->path is looked up (following symlinks) and must name a character
  * device (VCHR), otherwise EINVAL is returned.  A caller whose uid is not
  * the owner reported by VOP_GETATTR() additionally needs PRIV_VFS_ADMIN.
  * VOP_REVOKE() is only issued when vcount(vp) is greater than one.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
 	char	*path;
 };
 #endif
 int
 revoke(td, uap)
 	struct thread *td;
 	register struct revoke_args /* {
 		char *path;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (vp->v_type != VCHR) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 	if (error)
 		goto out;
 	/* Non-owners must hold PRIV_VFS_ADMIN. */
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check(td, PRIV_VFS_ADMIN);
 		if (error)
 			goto out;
 	}
 	if (vcount(vp) > 1)
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	/* Common exit: release the vnode returned locked by namei(). */
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Convert a user file descriptor to a kernel file entry.
  * A reference on the file entry is held upon returning.
  */
 int
 getvnode(fdp, fd, fpp)
 	struct filedesc *fdp;
 	int fd;
 	struct file **fpp;
 {
 	int error;
 	struct file *fp;
 
 	fp = NULL;
 	if (fdp == NULL)
 		error = EBADF;
 	else {
 		FILEDESC_SLOCK(fdp);
 		if ((u_int)fd >= fdp->fd_nfiles ||
 		    (fp = fdp->fd_ofiles[fd]) == NULL)
 			error = EBADF;
 		else if (fp->f_vnode == NULL) {
 			fp = NULL;
 			error = EINVAL;
 		} else {
 			fhold(fp);
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (error);
 }
 
 /*
  * Common implementation of lgetfh() and getfh().
  *
  * Requires PRIV_VFS_GETFH.  Looks up the user-space path `fname'
  * (following a trailing symbolic link only when `follow' is non-zero),
  * builds a file handle from the mount's f_fsid and the result of
  * VOP_VPTOFH(), and copies it out to the user-space buffer `fhp'.
  * Returns 0 or an errno value.
  */
 static int
 getfh_common(struct thread *td, char *fname, fhandle_t *fhp, int follow)
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error)
 		return (error);
 	NDINIT(&nd, LOOKUP,
 	    (follow ? FOLLOW : NOFOLLOW) | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, fname, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	/* Zero the whole handle first so no stale stack bytes are copied out. */
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	return (copyout(&fh, fhp, sizeof (fh)));
 }
 
 /*
  * Get an (NFS) file handle.
  *
  * lgetfh() does not follow a symbolic link in the last path component.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 lgetfh(td, uap)
 	struct thread *td;
 	register struct lgetfh_args *uap;
 {
 
 	return (getfh_common(td, uap->fname, uap->fhp, 0));
 }
 
 /*
  * Get an (NFS) file handle.
  *
  * getfh() follows a symbolic link in the last path component.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 getfh(td, uap)
 	struct thread *td;
 	register struct getfh_args *uap;
 {
 
 	return (getfh_common(td, uap->fname, uap->fhp, 1));
 }
 
 /*
  * syscall for the rpc.lockd to use to translate a NFS file handle into an
  * open descriptor.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  *
  * uap->u_fhp points at the user-space file handle and uap->flags holds the
  * open flags.  The flags must include read and/or write access and must
  * not include O_CREAT, otherwise EINVAL is returned.  ESTALE is returned
  * when no mount matches the handle's fsid.  On success the index of the
  * new descriptor is stored in td->td_retval[0].
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
 	const struct fhandle *u_fhp;
 	int flags;
 };
 #endif
 int
 fhopen(td, uap)
 	struct thread *td;
 	struct fhopen_args /* {
 		const struct fhandle *u_fhp;
 		int flags;
 	} */ *uap;
 {
 	struct proc *p = td->td_proc;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	struct flock lf;
 	struct file *fp;
 	register struct filedesc *fdp = p->p_fd;
 	int fmode, mode, error, type;
 	struct file *nfp;
 	int vfslocked;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error)
 		return (error);
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
 	if (error)
 		return(error);
 	/* find the mount point */
 	mp = vfs_getvfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
 	if (error)
 		goto out;
 	/*
 	 * from now on we have to make sure not
 	 * to forget about the vnode
 	 * any error that causes an abort must vput(vp)
 	 * just set error = err and 'goto bad;'.
 	 */
 
 	/*
 	 * from vn_open
 	 */
 	if (vp->v_type == VLNK) {
 		error = EMLINK;
 		goto bad;
 	}
 	if (vp->v_type == VSOCK) {
 		error = EOPNOTSUPP;
 		goto bad;
 	}
 	/* Translate the open flags into the access mode to be checked. */
 	mode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = vn_writechk(vp);
 		if (error)
 			goto bad;
 		mode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		mode |= VREAD;
 	if (fmode & O_APPEND)
 		mode |= VAPPEND;
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, mode);
 	if (error)
 		goto bad;
 #endif
 	if (mode) {
 		error = VOP_ACCESS(vp, mode, td->td_ucred, td);
 		if (error)
 			goto bad;
 	}
 	if (fmode & O_TRUNC) {
 		/*
 		 * The vnode is unlocked around vn_start_write() and locked
 		 * again afterwards.  If vn_start_write() fails the vnode is
 		 * no longer locked, hence vrele() and `out' rather than
 		 * `bad' (which would vput()).
 		 */
 		VOP_UNLOCK(vp, 0, td);				/* XXX */
 		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
 			vrele(vp);
 			goto out;
 		}
 		VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);	/* XXX */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
 #ifdef MAC
 		/*
 		 * We don't yet have fp->f_cred, so use td->td_ucred, which
 		 * should be right.
 		 */
 		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
 		if (error == 0) {
 #endif
 			/* Truncate by setting the file size to zero. */
 			VATTR_NULL(vap);
 			vap->va_size = 0;
 			error = VOP_SETATTR(vp, vap, td->td_ucred, td);
 #ifdef MAC
 		}
 #endif
 		vn_finished_write(mp);
 		if (error)
 			goto bad;
 	}
 	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
 	if (error)
 		goto bad;
 
 	if (fmode & FWRITE)
 		vp->v_writecount++;
 
 	/*
 	 * end of vn_open code
 	 */
 
 	if ((error = falloc(td, &nfp, &indx)) != 0) {
 		/* Undo the write count taken just above. */
 		if (fmode & FWRITE)
 			vp->v_writecount--;
 		goto bad;
 	}
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 	nfp->f_vnode = vp;
 	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		/* Whole-file flock-style lock: write for O_EXLOCK, else read. */
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (fmode & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		/* The vnode lock is dropped around VOP_ADVLOCK(). */
 		VOP_UNLOCK(vp, 0, td);
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 			    type)) != 0) {
 			/*
 			 * The lock request failed.  Normally close the
 			 * descriptor but handle the case where someone might
 			 * have dup()d or close()d it when we weren't looking.
 			 */
 			fdclose(fdp, fp, indx, td);
 
 			/*
 			 * release our private reference
 			 */
 			fdrop(fp, td);
 			goto out;
 		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
 	return (0);
 
 bad:
 	/* Error with the vnode still locked: unlock and release it. */
 	vput(vp);
 out:
 	/* Error with no vnode left to release. */
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
 	struct fhandle *u_fhp;
 	struct stat *sb;
 };
 #endif
 int
 fhstat(td, uap)
 	struct thread *td;
 	register struct fhstat_args /* {
 		struct fhandle *u_fhp;
 		struct stat *sb;
 	} */ *uap;
 {
 	struct stat sb;
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error)
 		return (error);
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) {
 		vfs_rel(mp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct statfs *buf;
 };
 #endif
 int
 fhstatfs(td, uap)
 	struct thread *td;
 	struct fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
 
 /*
  * Kernel-side implementation of fhstatfs().
  *
  * Requires PRIV_VFS_FHSTATFS.  `fh' is resolved to a vnode only to
  * check that the handle is valid; the vnode is released straight away.
  * After the prison and (optional) MAC visibility checks the mount's
  * statistics are refreshed with VFS_STATFS() and copied to `buf'.
  * Returns ESTALE when no mount matches the handle's fsid, otherwise 0 or
  * the errno of the failing step.
  *
  * All paths taken after the mount has been found leave through `out',
  * so the mount reference and Giant are released in one place and in
  * the same order.
  */
 int
 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 {
 	struct statfs *sp;
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error)
 		return (error);
 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, &vp);
 	if (error)
 		goto out;
 	/* The vnode itself is not needed any further. */
 	vput(vp);
 	error = prison_canseemount(td->td_ucred, mp);
 	if (error)
 		goto out;
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error == 0)
 		*buf = *sp;
 out:
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c	(revision 175201)
+++ head/sys/kern/vfs_vnops.c	(revision 175202)
@@ -1,1312 +1,1313 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/sx.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Forward declarations of the file-table entry points implemented in this
  * file, followed by the fileops vector that exports them.  Each slot of
  * vnops is bound to the matching vn_* routine; the vector is flagged
  * DFLAG_PASSABLE | DFLAG_SEEKABLE.
  */
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_stat_t	vn_statfile;
 static fo_close_t	vn_closefile;
 
 struct 	fileops vnops = {
 	.fo_read = vn_read,
 	.fo_write = vn_write,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /*
  * Open a vnode using the credential of the thread recorded in the
  * nameidata (ndp->ni_cnd.cn_thread).  All other arguments are passed to
  * vn_open_cred() unchanged and its result is returned.
  */
 int
 vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
 	int *flagp, cmode;
 	struct file *fp;
 {
 
 	return (vn_open_cred(ndp, flagp, cmode,
 	    ndp->ni_cnd.cn_thread->td_ucred, fp));
 }
 
 /*
  * Common code for vnode open operations.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  * 
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  *
  * *flagp supplies the open flags and is rewritten on return: O_TRUNC is
  * cleared after a file was created here, O_CREAT is cleared when the
  * file already existed.  cmode is the mode given to a newly created
  * file, cred the credential used for the checks and for VOP_OPEN(), and
  * fp is handed through to VOP_OPEN().
  *
  * Returns 0 with the vnode in ndp->ni_vp asserted to be exclusively
  * locked, or an errno value.  If the caller did not set MPSAFE in the
  * nameidata flags, Giant is released again before a successful return.
  */
 int
 vn_open_cred(ndp, flagp, cmode, cred, fp)
 	struct nameidata *ndp;
 	int *flagp, cmode;
 	struct ucred *cred;
 	struct file *fp;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct thread *td = ndp->ni_cnd.cn_thread;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int mode, fmode, error;
 	int vfslocked, mpsafe;
 
 	/* Remember the caller's MPSAFE choice; cn_flags is overwritten below. */
 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
 restart:
 	vfslocked = 0;
 	fmode = *flagp;
 	if (fmode & O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
 		    MPSAFE | AUDITVNODE1;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vfslocked = NDHASGIANT(ndp);
 		if (!mpsafe)
 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
 		if (ndp->ni_vp == NULL) {
 			/* The name does not exist yet: create a regular file. */
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			/*
 			 * If write access cannot be started without
 			 * waiting, release everything obtained from the
 			 * lookup, wait with nothing held, and start over.
 			 */
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				VFS_UNLOCK_GIANT(vfslocked);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				goto restart;
 			}
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0) {
 #endif
 				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 						   &ndp->ni_cnd, vap);
 #ifdef MAC
 			}
 #endif
 			vput(ndp->ni_dvp);
 			vn_finished_write(mp);
 			if (error) {
 				VFS_UNLOCK_GIANT(vfslocked);
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			/* A freshly created file needs no truncation. */
 			fmode &= ~O_TRUNC;
 			vp = ndp->ni_vp;
 		} else {
 			/*
 			 * The name already exists.  Release the parent
 			 * directory (vrele() instead of vput() when parent
 			 * and leaf are the same vnode).
 			 */
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = ISOPEN |
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
 		    LOCKLEAF | MPSAFE | AUDITVNODE1;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (!mpsafe)
 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
 		vfslocked = NDHASGIANT(ndp);
 		vp = ndp->ni_vp;
 	}
 	if (vp->v_type == VLNK) {
 		error = EMLINK;
 		goto bad;
 	}
 	if (vp->v_type == VSOCK) {
 		error = EOPNOTSUPP;
 		goto bad;
 	}
 	/* Translate the open flags into the access mode to be checked. */
 	mode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR) {
 			error = EISDIR;
 			goto bad;
 		}
 		mode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		mode |= VREAD;
 	if (fmode & O_APPEND)
 		mode |= VAPPEND;
 #ifdef MAC
 	error = mac_vnode_check_open(cred, vp, mode);
 	if (error)
 		goto bad;
 #endif
 	/*
 	 * O_CREAT is still set only when the file was created above; the
 	 * write and access checks are skipped in that case.
 	 */
 	if ((fmode & O_CREAT) == 0) {
 		if (mode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
 				goto bad;
 		}
 		if (mode) {
 		        error = VOP_ACCESS(vp, mode, cred, td);
 			if (error)
 				goto bad;
 		}
 	}
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
 		goto bad;
 
 	if (fmode & FWRITE)
 		vp->v_writecount++;
 	*flagp = fmode;
 	ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
 	if (!mpsafe)
 		VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 bad:
 	/* Failure after the lookup: release everything and clear ni_vp. */
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * Check for write permissions on the specified vnode.
  *
  * The vnode must be locked by the caller.  A vnode flagged VV_TEXT is
  * refused with ETXTBSY; any other vnode yields 0.
  */
 int
 vn_writechk(vp)
 	register struct vnode *vp;
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	return ((vp->v_vflag & VV_TEXT) != 0 ? ETXTBSY : 0);
 }
 
 /*
  * Vnode close call
  *
  * Locks vp exclusively, drops one write count when `flags' contains
  * FWRITE, calls VOP_CLOSE() with file_cred, and then releases the vnode
  * with vput().  The caller's reference to vp is therefore consumed.
  * Returns the result of VOP_CLOSE().
  */
 int
 vn_close(vp, flags, file_cred, td)
 	register struct vnode *vp;
 	int flags;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct mount *mp;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	/* The return value of vn_start_write() is deliberately not checked. */
 	vn_start_write(vp, &mp, V_WAIT);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (flags & FWRITE) {
 		VNASSERT(vp->v_writecount > 0, vp, 
 		    ("vn_close: negative writecount"));
 		vp->v_writecount--;
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	vput(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Heuristic to detect sequential operation.
  *
  * Updates fp->f_seqcount and returns the read-ahead hint to be OR'ed into
  * the I/O flags: f_seqcount << IO_SEQSHIFT for sequential access, 0
  * otherwise.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	int sequential;
 
 	/*
 	 * An I/O continues a sequential run when it starts where the
 	 * previous one ended (f_nextoff).  Offset 0 is also accepted as
 	 * long as f_seqcount is still positive: open() sets f_seqcount to
 	 * 1 so that the first I/O is normally considered to be slightly
 	 * sequential, and seeking back to 0 only stops counting once
 	 * earlier seeks have drained f_seqcount to 0.
 	 */
 	sequential = (uio->uio_offset == 0 && fp->f_seqcount > 0) ||
 	    uio->uio_offset == fp->f_nextoff;
 
 	if (!sequential) {
 		/* Not sequential.  Quickly draw-down sequentiality. */
 		fp->f_seqcount = (fp->f_seqcount > 1) ? 1 : 0;
 		return (0);
 	}
 
 	/*
 	 * f_seqcount is kept in units of fixed-size blocks so that it
 	 * tracks the amount of sequential I/O rather than the number of
 	 * requests.  The block size of 16384 is hard-coded; it relates
 	 * more to the best I/O size for real disks than to any block size
 	 * used by software.  The count saturates at IO_SEQMAX.
 	 */
 	fp->f_seqcount += howmany(uio->uio_resid, 16384);
 	if (fp->f_seqcount > IO_SEQMAX)
 		fp->f_seqcount = IO_SEQMAX;
 	return (fp->f_seqcount << IO_SEQSHIFT);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  *
  * rw selects VOP_READ() or VOP_WRITE() for `len' bytes at `base', file
  * offset `offset', in address space `segflg'.  Unless IO_NODELOCKED is
  * set in ioflg the vnode is locked here and unlocked again before
  * returning.  The VOP is called with file_cred when it is non-NULL and
  * with active_cred otherwise.  If aresid is non-NULL the residual count
  * is stored through it; if it is NULL, a transfer that leaves a residual
  * without another error is reported as EIO.
  */
 int
 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
     aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	void *base;
 	int len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	int *aresid;
 	struct thread *td;
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			/* vn_start_write() is skipped for character devices. */
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				return (error);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		} else {
 			/*
 			 * XXX This should be LK_SHARED but I don't trust VFS
 			 * enough to leave it like that until it has been
 			 * reviewed further.
 			 */
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		}
 
 	}
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 	/* Build a single-segment uio describing the request. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		if (file_cred)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (rw == UIO_READ)
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		else
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	/* Undo the locking done at the top, under the same conditions. */
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if (rw == UIO_WRITE && vp->v_type != VCHR)
 			vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 	return (error);
 }
 
 /*
  * Chunked front end to vn_rdwr().
  *
  * The request is carried out as a series of vn_rdwr() calls of at most
  * MAXBSIZE bytes each, so that the buffer cache is not saturated while a
  * vnode may be held locked: bwillwrite() is called before each non-read
  * chunk on a regular file, and uio_yield() after each chunk gives other
  * processes a chance to lock the vnode (either other processes core'ing
  * the same binary, or unrelated processes scanning the directory).
  *
  * If aresid is non-NULL it receives the number of bytes not transferred.
  * Returns 0 or the error of the first failing chunk.
  */
 int
 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
     file_cred, aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	void *base;
 	size_t len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	size_t *aresid;
 	struct thread *td;
 {
 	int error, iaresid, chunk;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	error = 0;
 	for (;;) {
 		/*
 		 * Size the chunk so that it ends on a MAXBSIZE boundary.
 		 * Every chunk but possibly the first then starts on such a
 		 * boundary, and filesystems only see partial blocks in the
 		 * first and last chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 		if (chunk > len)
 			chunk = len;
 
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		iaresid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &iaresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 
 		/* Advance past the chunk just transferred. */
 		offset += chunk;
 		base = (char *)base + chunk;
 		uio_yield();
 		if (len == 0)
 			break;
 	}
 	if (aresid)
 		*aresid = len + iaresid;
 	return (error);
 }
 
 /*
  * File table vnode read routine.
  *
  * Reads from the vnode behind fp into uio under a shared vnode lock.
  * When FOF_OFFSET is not set in `flags' the file's own f_offset is used
  * as the starting offset and updated afterwards; concurrent users of
  * f_offset are serialized with the FOFFSET_LOCKED flag in
  * fp->f_vnread_flags.  Returns the result of the MAC check or of
  * VOP_READ().
  */
 static int
 vn_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct vnode *vp;
 	int error, ioflag;
 	struct mtx *mtxp;
 	int vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	mtxp = NULL;
 	vp = fp->f_vnode;
 	/* Derive the I/O flags from the file's open flags. */
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	if ((flags & FOF_OFFSET) == 0) {
 		/*
 		 * Wait until no other thread holds FOFFSET_LOCKED, then
 		 * take it.  The flag word is only examined and changed
 		 * with the pool mutex for fp held.
 		 */
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vnread offlock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY, td);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
 		uio->uio_offset = fp->f_offset;
 	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY, td);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ioflag |= sequential_heuristic(uio, fp);
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	if ((flags & FOF_OFFSET) == 0) {
 		/* Publish the new offset, wake any waiter, clear the flags. */
 		fp->f_offset = uio->uio_offset;
 		mtx_lock(mtxp);
 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 			wakeup(&fp->f_vnread_flags);
 		fp->f_vnread_flags = 0;
 		mtx_unlock(mtxp);
 	}
 	/* Record where this I/O ended for sequential_heuristic(). */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  *
  * Writes uio to the vnode behind fp under an exclusive vnode lock.  When
  * FOF_OFFSET is not set in `flags' the file's own f_offset is used as the
  * starting offset and updated afterwards.  Returns the error from
  * vn_start_write(), from the MAC check, or from VOP_WRITE().
  */
 static int
 vn_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int error, ioflag;
 	int vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type == VREG)
 		bwillwrite();
 	/*
 	 * Derive the I/O flags from the file's open flags; IO_SYNC is also
 	 * set when the mount is flagged MNT_SYNCHRONOUS.
 	 */
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	mp = NULL;
 	/* vn_start_write() is skipped for character devices. */
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
 	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 	ioflag |= sequential_heuristic(uio, fp);
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	/* Record where this I/O ended for sequential_heuristic(). */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * File table truncate routine.
  *
  * Sets the size of the vnode behind fp to `length' through VOP_SETATTR()
  * using the file's credential.  Directories are refused with EISDIR; a
  * vnode rejected by vn_writechk() is left untouched and that error is
  * returned.
  */
 static int
 vn_truncate(fp, length, active_cred, td)
 	struct file *fp;
 	off_t length;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	VOP_LEASE(vp, td, active_cred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error)
 		goto out;
 #endif
 	error = vn_writechk(vp);
 	if (error == 0) {
 		/* Only va_size is set; every other attribute stays "null". */
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, fp->f_cred, td);
 	}
 out:
 	/* Common exit once the vnode is locked and the write was started. */
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * File table vnode stat routine.
  */
 static int
 vn_statfile(fp, sb, active_cred, td)
 	struct file *fp;
 	struct stat *sb;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp = fp->f_vnode;
 	int vfslocked;
 	int error;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	return (error);
 }
 
 /*
  * Stat a vnode; implementation for the stat syscall
  */
 int
 vn_stat(vp, sb, active_cred, file_cred, td)
 	struct vnode *vp;
 	register struct stat *sb;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	register struct vattr *vap;
 	int error;
 	u_short mode;
 
 #ifdef MAC
 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
 	if (error)
 		return (error);
 #endif
 
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, active_cred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Zero the spare stat fields
 	 */
 	bzero(sb, sizeof *sb);
 
 	/*
 	 * Copy from vattr table
 	 */
 	if (vap->va_fsid != VNOVAL)
 		sb->st_dev = vap->va_fsid;
 	else
 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 	sb->st_ino = vap->va_fileid;
 	mode = vap->va_mode;
 	switch (vap->va_type) {
 	case VREG:
 		mode |= S_IFREG;
 		break;
 	case VDIR:
 		mode |= S_IFDIR;
 		break;
 	case VBLK:
 		mode |= S_IFBLK;
 		break;
 	case VCHR:
 		mode |= S_IFCHR;
 		break;
 	case VLNK:
 		mode |= S_IFLNK;
 		/* This is a cosmetic change, symlinks do not have a mode. */
 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
 			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
 		else
 			sb->st_mode |= ACCESSPERMS;	/* 0777 */
 		break;
 	case VSOCK:
 		mode |= S_IFSOCK;
 		break;
 	case VFIFO:
 		mode |= S_IFIFO;
 		break;
 	default:
 		return (EBADF);
 	};
 	sb->st_mode = mode;
 	sb->st_nlink = vap->va_nlink;
 	sb->st_uid = vap->va_uid;
 	sb->st_gid = vap->va_gid;
 	sb->st_rdev = vap->va_rdev;
 	if (vap->va_size > OFF_MAX)
 		return (EOVERFLOW);
 	sb->st_size = vap->va_size;
 	sb->st_atimespec = vap->va_atime;
 	sb->st_mtimespec = vap->va_mtime;
 	sb->st_ctimespec = vap->va_ctime;
 	sb->st_birthtimespec = vap->va_birthtime;
 
         /*
 	 * According to www.opengroup.org, the meaning of st_blksize is 
 	 *   "a filesystem-specific preferred I/O block size for this 
 	 *    object.  In some filesystem types, this may vary from file
 	 *    to file"
 	 * Default to PAGE_SIZE after much discussion.
 	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
 	 */
 
 	sb->st_blksize = PAGE_SIZE;
 	
 	sb->st_flags = vap->va_flags;
 	if (priv_check(td, PRIV_VFS_GENERATION))
 		sb->st_gen = 0;
 	else
 		sb->st_gen = vap->va_gen;
 
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 	return (0);
 }
 
 /*
  * File table vnode ioctl routine.
  */
 static int
 vn_ioctl(fp, com, data, active_cred, td)
 	struct file *fp;
 	u_long com;
 	void *data;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp = fp->f_vnode;
 	struct vattr vattr;
 	int vfslocked;
 	int error;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	error = ENOTTY;
 	switch (vp->v_type) {
 	case VREG:
 	case VDIR:
 		if (com == FIONREAD) {
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_GETATTR(vp, &vattr, active_cred, td);
 			VOP_UNLOCK(vp, 0, td);
 			if (!error)
 				*(int *)data = vattr.va_size - fp->f_offset;
 		}
 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
 			error = 0;
 		else
 			error = VOP_IOCTL(vp, com, data, fp->f_flag,
 			    active_cred, td);
 		break;
 
 	default:
 		break;
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * File table vnode poll routine.
  */
 static int
 vn_poll(fp, events, active_cred, td)
 	struct file *fp;
 	int events;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 #ifdef MAC
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 	VOP_UNLOCK(vp, 0, td);
 	if (!error)
 #endif
 
 	error = VOP_POLL(vp, events, fp->f_cred, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Check that the vnode is still valid, and if so
  * acquire requested lock.
  */
 int
-_vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line)
+_vn_lock(struct vnode *vp, int flags, char *file, int line)
 {
 	int error;
 
 	do {
 		if ((flags & LK_INTERLOCK) == 0)
 			VI_LOCK(vp);
 		if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
 		    vp->v_iflag & VI_DOOMED) {
 			VI_UNLOCK(vp);
 			return (ENOENT);
 		}
 		/*
 		 * Just polling to check validity.
 		 */
 		if ((flags & LK_TYPE_MASK) == 0) {
 			VI_UNLOCK(vp);
 			return (0);
 		}
 		/*
 		 * lockmgr drops interlock before it will return for
 		 * any reason.  So force the code above to relock it.
 		 */
-		error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line);
+		error = VOP_LOCK1(vp, flags | LK_INTERLOCK, curthread, file,
+		    line);
 		flags &= ~LK_INTERLOCK;
 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
 		    ("LK_RETRY set with incompatible flags %d\n", flags));
 		/*
 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
 		 * If RETRY is not set, we return ENOENT instead.
 		 */
 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
 		    (flags & LK_RETRY) == 0) {
-			VOP_UNLOCK(vp, 0, td);
+			VOP_UNLOCK(vp, 0, curthread);
 			error = ENOENT;
 			break;
 		}
 	} while (flags & LK_RETRY && error != 0);
 	return (error);
 }
 
 /*
  * File table vnode close routine.
  */
 static int
 vn_closefile(fp, td)
 	struct file *fp;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct flock lf;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 	}
 
 	fp->f_ops = &badfileops;
 
 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  */
 int
 vn_start_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
 	struct mount *mp;
 	int error;
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 	/*
 	 * Check on status of suspension.
 	 */
 	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		if (flags & V_NOWAIT) {
 			error = EWOULDBLOCK;
 			goto unlock;
 		}
 		error = msleep(&mp->mnt_flag, MNT_MTX(mp), 
 		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
 		if (error)
 			goto unlock;
 	}
 	if (flags & V_XSLEEP)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_write_suspend_wait(vp, mp, flags)
 	struct vnode *vp;
 	struct mount *mp;
 	int flags;
 {
 	int error;
 
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if (mp == NULL)
 		return (0);
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
 	vfs_rel(mp);
 	return (error);
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_start_secondary_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
 	struct mount *mp;
 	int error;
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  */
 void
 vn_finished_write(mp)
 	struct mount *mp;
 {
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	mp->mnt_writeopcount--;
 	if (mp->mnt_writeopcount < 0)
 		panic("vn_finished_write: neg cnt");
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_writeopcount <= 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 
 /*
  * Filesystem secondary write operation has completed. If we are
  * suspending and this operation is the last one, notify the suspender
  * that the suspension is now in effect.
  */
 void
 vn_finished_secondary_write(mp)
 	struct mount *mp;
 {
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	mp->mnt_secondary_writes--;
 	if (mp->mnt_secondary_writes < 0)
 		panic("vn_finished_secondary_write: neg cnt");
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_secondary_writes <= 0)
 		wakeup(&mp->mnt_secondary_writes);
 	MNT_IUNLOCK(mp);
 }
 
 
 
 /*
  * Request a filesystem to suspend write operations.
  */
 int
 vfs_write_suspend(mp)
 	struct mount *mp;
 {
 	struct thread *td = curthread;
 	int error;
 
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_SUSPEND) {
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
 		vfs_write_resume(mp);
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  */
 void
 vfs_write_resume(mp)
 	struct mount *mp;
 {
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Implement kqueues for files by translating it to vnode operation.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 	int vfslocked;
 	int error;
 
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = VOP_KQFILTER(fp->f_vnode, kn);
 	VFS_UNLOCK_GIANT(vfslocked);
 
 	return error;
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp, 0, td);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 
 	return (error);
 }
Index: head/sys/nfs4client/nfs4_vnops.c
===================================================================
--- head/sys/nfs4client/nfs4_vnops.c	(revision 175201)
+++ head/sys/nfs4client/nfs4_vnops.c	(revision 175202)
@@ -1,2878 +1,2878 @@
 /* $Id: nfs_vnops.c,v 1.45 2003/11/05 14:59:02 rees Exp $ */
 
 /*-
  * copyright (c) 2003
  * the regents of the university of michigan
  * all rights reserved
  * 
  * permission is granted to use, copy, create derivative works and redistribute
  * this software and such derivative works for any purpose, so long as the name
  * of the university of michigan is not used in any advertising or publicity
  * pertaining to the use or distribution of this software without specific,
  * written prior authorization.  if the above copyright notice or any other
  * identification of the university of michigan is included in any copy of any
  * portion of this software, then the disclaimer below must also be included.
  * 
  * this software is provided as is, without representation from the university
  * of michigan as to its fitness for any purpose, and without warranty by the
  * university of michigan of any kind, either express or implied, including
  * without limitation the implied warranties of merchantability and fitness for
  * a particular purpose. the regents of the university of michigan shall not be
  * liable for any damages, including special, indirect, incidental, or
  * consequential damages, with respect to any claim arising out of or in
  * connection with the use of the software, even if it has been or is hereafter
  * advised of the possibility of such damages.
  */
 
 /*
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2 and 3
  */
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/lockmgr.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <fs/fifofs/fifo.h>
 
 #include <rpc/rpcclnt.h>
 
 #include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfs4client/nfs4.h>
 #include <nfsclient/nfsnode.h>
 #include <nfsclient/nfsmount.h>
 #include <nfsclient/nfs_lock.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 /* NFSv4 */
 #include <nfs4client/nfs4m_subs.h>
 #include <nfs4client/nfs4_vn.h>
 
 /* Defs */
 #define	TRUE	1
 #define	FALSE	0
 
 /*
  * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
  * calls are not in getblk() and brelse() so that they would not be necessary
  * here.
  */
 #ifndef B_VMIO
 #define vfs_busy_pages(bp, f)
 #endif
 
 static int	nfs4_flush(struct vnode *, int, struct thread *,
 		    int);
 static int	nfs4_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 		    struct thread *);
 static int      nfs4_closerpc(struct vnode *, struct ucred *, struct thread *, int);
 
 static vop_lookup_t	nfs4_lookup;
 static vop_create_t	nfs4_create;
 static vop_mknod_t	nfs4_mknod;
 static vop_open_t	nfs4_open;
 static vop_close_t	nfs4_close;
 static vop_access_t	nfs4_access;
 static vop_getattr_t	nfs4_getattr;
 static vop_setattr_t	nfs4_setattr;
 static vop_read_t	nfs4_read;
 static vop_fsync_t	nfs4_fsync;
 static vop_remove_t	nfs4_remove;
 static vop_link_t	nfs4_link;
 static vop_rename_t	nfs4_rename;
 static vop_mkdir_t	nfs4_mkdir;
 static vop_rmdir_t	nfs4_rmdir;
 static vop_symlink_t	nfs4_symlink;
 static vop_readdir_t	nfs4_readdir;
 static vop_strategy_t	nfs4_strategy;
 static	int	nfs4_lookitup(struct vnode *, const char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs4_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_readlink_t	nfs4_readlink;
 static vop_print_t	nfs4_print;
 static vop_advlock_t	nfs4_advlock;
 
 /*
  * Global vfs data structures for nfs
  */
 struct vop_vector nfs4_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs4_access,
 	.vop_advlock =		nfs4_advlock,
 	.vop_close =		nfs4_close,
 	.vop_create =		nfs4_create,
 	.vop_fsync =		nfs4_fsync,
 	.vop_getattr =		nfs4_getattr,
 	.vop_getpages =		nfs_getpages,
 	.vop_putpages =		nfs_putpages,
 	.vop_inactive =		nfs_inactive,
 	.vop_lease =		VOP_NULL,
 	.vop_link =		nfs4_link,
 	.vop_lookup =		nfs4_lookup,
 	.vop_mkdir =		nfs4_mkdir,
 	.vop_mknod =		nfs4_mknod,
 	.vop_open =		nfs4_open,
 	.vop_print =		nfs4_print,
 	.vop_read =		nfs4_read,
 	.vop_readdir =		nfs4_readdir,
 	.vop_readlink =		nfs4_readlink,
 	.vop_reclaim =		nfs_reclaim,
 	.vop_remove =		nfs4_remove,
 	.vop_rename =		nfs4_rename,
 	.vop_rmdir =		nfs4_rmdir,
 	.vop_setattr =		nfs4_setattr,
 	.vop_strategy =		nfs4_strategy,
 	.vop_symlink =		nfs4_symlink,
 	.vop_write =		nfs_write,
 };
 
 static int	nfs4_removerpc(struct vnode *dvp, const char *name, int namelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs4_renamerpc(struct vnode *fdvp, const char *fnameptr,
 			      int fnamelen, struct vnode *tdvp,
 			      const char *tnameptr, int tnamelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs4_renameit(struct vnode *sdvp, struct componentname *scnp,
 			     struct sillyrename *sp);
 static int      nfs4_openrpc(struct vnode *, struct vnode **,
                             struct componentname *, int, struct vattr *);
 static int	nfs4_open_confirm(struct vnode *vp, struct nfs4_compound *cpp,
 				 struct nfs4_oparg_open *openap,
 				 struct nfs4_oparg_getfh *gfh,
 				 struct ucred *cred, struct thread *td);
 static int      nfs4_createrpc(struct vnode *, struct vnode **,
                               struct componentname *, nfstype,
                               struct vattr *, char *);
 
 /*
  * Global variables
  */
 struct nfs4_lowner nfs4_masterlowner;
 
 #define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))
 
 SYSCTL_DECL(_vfs_nfs4);
 
 static int	nfs4_access_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfs4_access_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 #if 0
 static int	nfsv3_commit_on_close = 0;
 SYSCTL_INT(_vfs_nfs4, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
 	   &nfsv3_commit_on_close, 0, "write+commit on close, else only write");
 
 SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
 
 SYSCTL_INT(_vfs_nfs4, OID_AUTO, access_cache_misses, CTLFLAG_RD,
 	   &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
 #endif
 
 #define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
 			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
 			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
 static int
 nfs4_v3_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred)
 {
 	const int v3 = 1;
 	u_int32_t *tl;
 	int error = 0, attrflag;
 
 	return (0);
 
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	caddr_t bpos, dpos;
 	u_int32_t rmode;
 	struct nfsnode *np = VTONFS(vp);
 
 	nfsstats.rpccnt[NFSPROC_ACCESS]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(wmode);
 	nfsm_request(vp, NFSPROC_ACCESS, td, cred);
 	nfsm_postop_attr(vp, attrflag);
 	if (!error) {
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		rmode = fxdr_unsigned(u_int32_t, *tl);
 		np->n_mode = rmode;
 		np->n_modeuid = cred->cr_uid;
 		np->n_modestamp = time_second;
 	}
 	m_freem(mrep);
 nfsmout:
 	return error;
 }
 
 /*
  * nfs access vnode op.
  * For nfs version 2, just return ok. File accesses may fail later.
  * For nfs version 3, use the access rpc to check accessibility. If file modes
  * are changed on the server, accesses might still fail later.
  */
 static int
 nfs4_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0;
 	u_int32_t mode, wmode;
 	int v3 = NFS_ISV3(vp);	/* v3 \in v4 */
 	struct nfsnode *np = VTONFS(vp);
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_access acc;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	/* XXX Disable this for now; needs fixing of _access_otw() */
 	if (0 && v3) {
 		if (ap->a_mode & VREAD)
 			mode = NFSV3ACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_mode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
 			if (ap->a_mode & VEXEC)
 				mode |= NFSV3ACCESS_EXECUTE;
 		} else {
 			if (ap->a_mode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
 				    NFSV3ACCESS_DELETE);
 			if (ap->a_mode & VEXEC)
 				mode |= NFSV3ACCESS_LOOKUP;
 		}
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfs4_access_cache_timeout > 0) {
 			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
 			    NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
 			    NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 */
 		if (time_second < np->n_modestamp + nfs4_access_cache_timeout &&
 		    ap->a_cred->cr_uid == np->n_modeuid &&
 		    (np->n_mode & mode) == mode) {
 			nfsstats.accesscache_hits++;
 		} else {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 */
 			nfsstats.accesscache_misses++;
 		        error = nfs4_v3_access_otw(vp, wmode, ap->a_td,
 			    ap->a_cred);
 			if (error == 0) {
 				if ((np->n_mode & mode) != mode)
 					error = EACCES;
 			}
 		}
 		return (error);
 	}
 
 	/* XXX use generic access code here? */
 	mode = ap->a_mode & VREAD ? NFSV4ACCESS_READ : 0;
 	if (vp->v_type == VDIR) {
 		if (ap->a_mode & VWRITE)
 			mode |= NFSV4ACCESS_MODIFY | NFSV4ACCESS_EXTEND | NFSV4ACCESS_DELETE;
 		if (ap->a_mode & VEXEC)
 			mode |= NFSV4ACCESS_LOOKUP;
 	} else {
 		if (ap->a_mode & VWRITE)
 			mode |= NFSV4ACCESS_MODIFY | NFSV4ACCESS_EXTEND;
 		if (ap->a_mode & VEXEC)
 			mode |= NFSV4ACCESS_EXECUTE;
 	}
 
 	nfs_v4initcompound(&cp);
 	acc.mode = mode;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	nfsm_v4build_compound(&cp, "nfs4_access()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_access(&cp, &acc);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_access(&cp, &acc);
 
 	if ((acc.rmode & mode) != mode)
 		error = EACCES;
 
  nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 static int
 nfs4_openrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     int flags, struct vattr *vap)
 {
 	struct vnode *vp = *vpp;
 	struct nfs4_oparg_getattr getattr;
 	struct nfs4_oparg_getfh getfh;
 	struct nfs4_oparg_open opena;
 	struct nfs4_compound cp;
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct ucred *cred = cnp->cn_cred;
 	struct thread *td = cnp->cn_thread;
 	struct nfs4_fctx xfc, *fcp;
 	struct nfsnode *np;
 
 	if (vp == NULL) {
 		/* Create a new file */
 		np = NULL;
 		fcp = &xfc;
 		bzero(fcp, sizeof(*fcp));
 	} else {
 		np = VTONFS(vp);
 		fcp = flags & FWRITE ? &np->n_wfc : &np->n_rfc;
 	}
 
 	/*
 	 * Since we are currently only one lockowner; we only open the
 	 * file once each for reading and writing.
 	 */
 	if (fcp->refcnt++ != 0) {
 		*vpp = vp;
 		/*printf("not opening %s\n", np->n_name != NULL ? np->n_name : "");*/
 		return (0);
 	}
 
 	fcp->lop = &nfs4_masterlowner;
 	fcp->np = np;
 
 	nfs_v4initcompound(&cp);
 	cp.nmp = VFSTONFS(dvp->v_mount);
 
 	opena.ctype = NCLNULL;
 	opena.flags = flags;
 	opena.vap = vap;
 	opena.fcp = fcp;		/* For lockowner */
 	opena.cnp = cnp;
 
 	getattr.bm = &nfsv4_getattrbm;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	nfsm_v4build_compound(&cp, "nfs4_openrpc()");
 	nfsm_v4build_putfh(&cp, dvp);
 	nfsm_v4build_open(&cp, &opena);
 	nfsm_v4build_getattr(&cp, &getattr);
 	nfsm_v4build_getfh(&cp, &getfh);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp != NULL ? vp : dvp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_open(&cp, &opena);
 	nfsm_v4dissect_getattr(&cp, &getattr);
 	nfsm_v4dissect_getfh(&cp, &getfh);
 
 	error = nfs_v4postop(&cp, error);
 
 	if (opena.rflags & NFSV4OPENRES_CONFIRM) {
 		error = nfs4_open_confirm(vp ? vp : dvp, &cp, &opena, &getfh, cred, td);
 		if (error != 0)
 			goto nfsmout;
 	}
 
 	if (vp == NULL) {
 		/* New file */
 		error = nfs_nget(dvp->v_mount, &getfh.fh_val,
 				 getfh.fh_len, &np, LK_EXCLUSIVE);
 		if (error != 0)
 			goto nfsmout;
 
 		vp = NFSTOV(np);
 		np->n_dvp = dvp;
 		np->n_namelen = cnp->cn_namelen; /* XXX memory leaks on these; track! */
 		if (np->n_name != NULL)
 			FREE(np->n_name, M_NFSREQ);
 		MALLOC(np->n_name, u_char *, np->n_namelen + 1, M_NFSREQ, M_WAITOK);
 		bcopy(cnp->cn_nameptr, np->n_name, np->n_namelen);
 		np->n_name[np->n_namelen] = '\0';
 		if (flags & FWRITE)
 			np->n_wfc = *fcp;
 		else
 			np->n_rfc = *fcp;
 
 		/*printf("opened new file %s\n", np->n_name);*/
 
 		nfs4_vnop_loadattrcache(vp, &getattr.fa, NULL);
 		*vpp = vp;
 	} else {
 		/*printf("openend \"old\" %s\n", np->n_name != NULL ? np->n_name : "");*/
 
 		if (flags & O_TRUNC && np->n_size != 0) {
 			struct vattr va;
 
 			VATTR_NULL(&va);
 			va.va_size = 0;
 			error = nfs4_setattrrpc(vp, &va,
 			    cnp->cn_cred, cnp->cn_thread);
 		}
 		np->n_attrstamp = 0;
 	}
 
  nfsmout:
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 /*
  * Send an OPEN_CONFIRM compound (PUTFH of the handle in gfh, OPEN_CONFIRM
  * with the state in openap) and return the resulting error code.
  *
  * The caller's compound structure cpp is re-initialised here and reused
  * for this request; vp is only used to find the mount and to issue the
  * request.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere; they presumably
  * operate on the locals bpos/dpos/md/mb/mreq/mrep/error and jump to
  * nfsmout on failure, since dpos and md are never referenced explicitly
  * in this function.
  */
 static int
 nfs4_open_confirm(struct vnode *vp, struct nfs4_compound *cpp,
     struct nfs4_oparg_open *openap, struct nfs4_oparg_getfh *gfh,
     struct ucred *cred, struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 
 	nfs_v4initcompound(cpp);
 	cpp->nmp = VFSTONFS(vp->v_mount);
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	/* Build the request: PUTFH(gfh), OPEN_CONFIRM. */
 	nfsm_v4build_compound(cpp, "nfs4_open_confirm()");
 	nfsm_v4build_putfh_nv(cpp, gfh);
 	nfsm_v4build_open_confirm(cpp, openap);
 	nfsm_v4build_finalize(cpp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	/* Parse the reply in the same order the operations were built. */
 	nfsm_v4dissect_compound(cpp);
 	nfsm_v4dissect_putfh(cpp);
 	nfsm_v4dissect_open_confirm(cpp, openap);
 
  nfsmout:
 	/* Both the success and the failure path run the post-op hook. */
 	error = nfs_v4postop(cpp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 
 /*
  * nfs open vnode op.
  *
  * Only regular files are opened on the server.  Directories and symbolic
  * links succeed without an RPC; every other vnode type is refused with
  * EACCES.  A node that still carries NCREATED has the flag cleared and
  * is reported as opened without a second open RPC.  Otherwise the name
  * and parent directory recorded in the nfsnode are used to issue
  * nfs4_openrpc().
  */
 /* ARGSUSED */
 static int
 nfs4_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct componentname cn;
 
 	switch (vp->v_type) {
 	case VREG:
 		break;
 	case VDIR:
 	case VLNK:
 		return (0);
 	default:
 #ifdef DIAGNOSTIC
 		printf("open eacces vtyp=%d\n", vp->v_type);
 #endif
 		return (EACCES);
 	}
 
 	if ((np->n_flag & NCREATED) != 0) {
 		np->n_flag &= ~NCREATED;
 		return (0);
 	}
 
 	/* Only these four componentname fields are filled in for the RPC. */
 	cn.cn_nameptr = np->n_name;
 	cn.cn_namelen = np->n_namelen;
 	cn.cn_cred = ap->a_cred;
 	cn.cn_thread = ap->a_td;
 
 	return (nfs4_openrpc(np->n_dvp, &vp, &cn, ap->a_mode, NULL));
 }
 
 /*
  * Drop one reference on the read or write open context of vp (selected by
  * FWRITE in flags) and, when the last reference goes away, send a
  * PUTFH/CLOSE compound to the server.
  *
  * Returns 0 without any RPC while other references remain; otherwise
  * returns the result of the request as filtered by nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_closerpc(struct vnode *vp, struct ucred *cred, struct thread *td, int flags)
 {
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_fctx *fcp;
 	struct nfs4_compound cp;
 	struct nfsnode *np = VTONFS(vp);
 
 	/* Writers and readers are tracked in separate contexts. */
 	fcp = flags & FWRITE ? &np->n_wfc : &np->n_rfc;
 
 	nfs_v4initcompound(&cp);
 
 	/* Only the final close of a context goes to the server. */
 	if (--fcp->refcnt != 0)
 		return (0);
 
 	/*printf("closing %s\n", np->n_name != NULL ? np->n_name : "");*/
 
 	cp.fcp = fcp;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	/* Build the request: PUTFH(vp), CLOSE. */
 	nfsm_v4build_compound(&cp, "nfs4_closerpc()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_close(&cp, fcp);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_close(&cp, fcp);
 
  nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 /*
  * nfs close vnode op
  * play it safe for now (see comments in v2/v3 nfs_close regarding dirty buffers)
  */
 /* ARGSUSED */
 static int
 nfs4_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 
 	if (vp->v_type != VREG)
 		return (0);
 
 	if (np->n_flag & NMODIFIED) {
 		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		np->n_attrstamp = 0;
 	}
 
 	error = nfs4_closerpc(vp, ap->a_cred, ap->a_td, ap->a_fflag);
 
 	if (!error && np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	}
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  *
  * Answers from the attribute cache when nfs_getattrcache() succeeds;
  * otherwise sends a PUTFH/GETATTR compound and loads the reply into both
  * the attribute cache and ap->a_vap via nfs4_vnop_loadattrcache().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_oparg_getattr ga;
 	struct nfs4_compound cp;
 
 	/*
 	 * Update local times for special files.
 	 */
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	/*
 	 * First look in the cache.
 	 */
 	if (nfs_getattrcache(vp, ap->a_vap) == 0)
 		return (0);
 
 	nfsstats.rpccnt[NFSPROC_GETATTR]++;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, NFSX_FH(1));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	ga.bm = &nfsv4_getattrbm;
 	nfs_v4initcompound(&cp);
 
 	/* Build the request: PUTFH(vp), GETATTR. */
 	nfsm_v4build_compound(&cp, "nfs4_getattr()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_getattr(&cp, &ga);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, ap->a_td, ap->a_cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_getattr(&cp, &ga);
 
 	/* Fill the cache and the caller's vattr from the reply. */
 	nfs4_vnop_loadattrcache(vp, &ga.fa, ap->a_vap);
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 	return (error);
 }
 
 /*
  * nfs setattr call.
  */
 static int
 nfs4_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	u_quad_t tsize;
 
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags is not supported.
 	 */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 
 			tsize = np->n_size;
 			error = nfs_meta_setsize(vp, ap->a_cred, 
 						ap->a_td, vap->va_size);
 
  			if (np->n_flag & NMODIFIED) {
  			    if (vap->va_size == 0)
  				error = nfs_vinvalbuf(vp, 0, ap->a_td, 1);
  			    else
  				error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
  			    if (error) {
 				vnode_pager_setsize(vp, np->n_size);
  				return (error);
 			    }
  			}
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in nfs_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * nfs_meta_setsize() and could modify np->n_size.
 			 */
  			np->n_vattr.va_size = np->n_size = vap->va_size;
   		};
   	} else if ((vap->va_mtime.tv_sec != VNOVAL ||
 		vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
 		vp->v_type == VREG &&
   		(error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) == EINTR)
 		return (error);
 
 	if (vap->va_size != VNOVAL && np->n_wfc.refcnt == 0) {
 		/* Have to open the file before we can truncate it */
 		struct componentname cn;
 
 		cn.cn_nameptr = np->n_name;
 		cn.cn_namelen = np->n_namelen;
 		cn.cn_cred = ap->a_cred;
 		cn.cn_thread = ap->a_td;
 		error = nfs4_openrpc(np->n_dvp, &vp, &cn, FWRITE, NULL);
 		if (error)
 			return error;
 		np->n_flag |= NTRUNCATE;
 	}
 
 	error = nfs4_setattrrpc(vp, vap, ap->a_cred, ap->a_td);
 	if (error && vap->va_size != VNOVAL) {
 		np->n_size = np->n_vattr.va_size = tsize;
 		vnode_pager_setsize(vp, np->n_size);
 	}
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  *
  * Sends PUTFH/SETATTR/GETATTR for vp and loads the returned attributes
  * into the attribute cache.  The write open context is passed to the
  * SETATTR builder only when the size is being changed.
  *
  * If the node carries NTRUNCATE (set by nfs4_setattr() after its implicit
  * open), the file is closed again here once the SETATTR succeeded.
  *
  * NOTE(review): the NTRUNCATE close is skipped on every path that jumps
  * to nfsmout, so after a failed SETATTR the flag and the extra open
  * reference stay in place until a later successful call.  Also, the
  * result of nfs4_closerpc() is subsequently fed through nfs_v4postop()
  * with this function's compound.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_getattr ga;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfs4_fctx *fcp;
 
 	nfsstats.rpccnt[NFSPROC_SETATTR]++;
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	ga.bm = &nfsv4_getattrbm;
 	/* Only a size change is sent together with the write open context. */
 	fcp = (vap->va_size != VNOVAL) ? &np->n_wfc : NULL;
 	nfs_v4initcompound(&cp);
 
 	/* Build the request: PUTFH(vp), SETATTR, GETATTR. */
 	nfsm_v4build_compound(&cp, "nfs4_setattrrpc");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_setattr(&cp, vap, fcp);
 	nfsm_v4build_getattr(&cp, &ga);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_setattr(&cp);
 	nfsm_v4dissect_getattr(&cp, &ga);
 
 	nfs4_vnop_loadattrcache(vp, &ga.fa, NULL);
 
 	/* TODO: do the settatr and close in a single compound rpc */
 	if (np->n_flag & NTRUNCATE) {
 		error = nfs4_closerpc(vp, cred, td, FWRITE);
 		np->n_flag &= ~NTRUNCATE;
 	}
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 	
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  *
  * The RPC is a single compound: PUTFH(dvp), GETATTR, then LOOKUPP for
  * "..", LOOKUP for an ordinary name or nothing for ".", followed by
  * GETATTR and GETFH of the result.  On success *vpp is the vnode found.
  * A failed CREATE or RENAME lookup of the last component with ENOENT is
  * turned into EJUSTRETURN (or EROFS on a read-only mount).
  *
  * NOTE(review): several paths after nfsm_request() return directly
  * (the RENAME case and the nfs_nget() failures) and therefore skip both
  * the m_freem(mrep) done on the normal path and nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	int isdot, flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	long len;
 	nfsfh_t *fhp;
 	struct nfsnode *np;
 	int error = 0, fhsize;
 	struct thread *td = cnp->cn_thread;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_getattr ga, dga;
 	struct nfs4_oparg_lookup l;
 	struct nfs4_oparg_getfh gfh;
 
 	*vpp = NULLVP;
 	/* Deleting or renaming the last component needs a writable mount. */
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(dvp->v_mount);
 	np = VTONFS(dvp);
 
 	isdot = cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.';
 
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	/*
 	 * Name cache hit: accept it only if the cached vnode's ctime still
 	 * matches the ctime recorded in its nfsnode, otherwise purge it
 	 * and fall through to the RPC.
 	 */
 	if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
 		struct vattr vattr;
 
 		newvp = *vpp;
 		if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, td)
 		 && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) {
 		     nfsstats.lookupcache_hits++;
 		     if (cnp->cn_nameiop != LOOKUP &&
 			 (flags & ISLASTCN))
 			     cnp->cn_flags |= SAVENAME;
 		     return (0);
 		}
 		cache_purge(newvp);
 		if (newvp != dvp)
 			vput(newvp);
 		else
 			vrele(newvp);
 	}
 
 	error = 0;
 	newvp = NULLVP;
 	nfsstats.lookupcache_misses++;
 	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 
 	len = cnp->cn_namelen;
 	mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	/* dga receives the directory's attributes, ga the target's. */
 	ga.bm = &nfsv4_getattrbm;
 	dga.bm = &nfsv4_getattrbm;
 	nfs_v4initcompound(&cp);
 
 	nfsm_v4build_compound(&cp, "nfs4_lookup()");
 	nfsm_v4build_putfh(&cp, dvp);
 	nfsm_v4build_getattr(&cp, &dga);
 	if (flags & ISDOTDOT)
 		nfsm_v4build_lookupp(&cp);
 	else if (!isdot) {
 		l.name = cnp->cn_nameptr;
 		l.namelen = len;
 		nfsm_v4build_lookup(&cp, &l);
 	}
 	nfsm_v4build_getattr(&cp, &ga);
 	nfsm_v4build_getfh(&cp, &gfh);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(dvp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred);
 	if (error != 0)
 		goto nfsmout;
 
 	/* Parse the reply in the same order the operations were built. */
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_getattr(&cp, &dga);
 	if (flags & ISDOTDOT)
 		nfsm_v4dissect_lookupp(&cp);
 	else if (!isdot)
 		nfsm_v4dissect_lookup(&cp);
 	nfsm_v4dissect_getattr(&cp, &ga);
 	nfsm_v4dissect_getfh(&cp, &gfh);
 
 	nfs4_vnop_loadattrcache(dvp, &dga.fa, NULL);
 	fhp = &gfh.fh_val;
 	fhsize = gfh.fh_len;
 
 	/*
 	 * Handle RENAME case...
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		/* The target is the directory itself. */
 		if (NFS_CMPFH(np, fhp, fhsize))
 			return (EISDIR);
 
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
 		if (error)
 			return (error);
 
 		newvp = NFSTOV(np);
 
 		nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL);
 
 		*vpp = newvp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/* The directory is unlocked around the nfs_nget() of its parent. */
 		VOP_UNLOCK(dvp, 0, td);
 
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 
 		nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL);
 	} else if (NFS_CMPFH(np, fhp, fhsize)) {
 		/* Same file handle as the directory: return dvp itself. */
 		VREF(dvp);
 		newvp = dvp;
 	} else {
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 
 		/* Fill in np used by open. */
 		np->n_dvp = dvp;
 		np->n_namelen = cnp->cn_namelen;
 		if (np->n_name != NULL)
 			FREE(np->n_name, M_NFSREQ);
 		MALLOC(np->n_name, u_char *, np->n_namelen + 1, M_NFSREQ, M_WAITOK);
 		bcopy(cnp->cn_nameptr, np->n_name, np->n_namelen);
 		np->n_name[np->n_namelen] = '\0';
 
 		nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL);
 	}
 
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
 		/* Record the ctime that the cache-hit check above compares. */
 		np->n_ctime = np->n_vattr.va_ctime.tv_sec;
 		cache_enter(dvp, newvp, cnp);
 	}
 	*vpp = newvp;
 	m_freem(mrep);
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (error) {
 		if (newvp != NULLVP) {
 			vrele(newvp);
 			*vpp = NULLVP;
 		}
 		/* A missing last component is fine for CREATE and RENAME. */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN) && error == ENOENT) {
 			if (dvp->v_mount->mnt_flag & MNT_RDONLY)
 				error = EROFS;
 			else
 				error = EJUSTRETURN;
 		}
 		if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 			cnp->cn_flags |= SAVENAME;
 	}
 
 	return (error);
 }
 
 /*
  * nfs read call.
  * Regular files are handed to nfs_bioread(); reading a directory fails
  * with EISDIR and any other vnode type with EOPNOTSUPP.
  */
 static int
 nfs4_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VREG)
 		return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 	if (vp->v_type == VDIR)
 		return (EISDIR);
 	return (EOPNOTSUPP);
 }
 
 /*
  * nfs readlink call.
  * Symbolic links are read through nfs_bioread() with no I/O flags;
  * any other vnode type is rejected with EINVAL.
  */
 static int
 nfs4_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VLNK)
 		return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
 	return (EINVAL);
 }
 
 /*
  * Do a readlink rpc.
  * Called by nfs_doio() from below the buffer cache.
  *
  * Sends PUTFH/READLINK for vp; the reply is dissected into uiop.
  * Returns the request status as filtered by nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 int
 nfs4_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 
 	nfsstats.rpccnt[NFSPROC_READLINK]++;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	nfs_v4initcompound(&cp);
 
 	/* Build the request: PUTFH(vp), READLINK. */
 	nfsm_v4build_compound(&cp, "nfs4_readlinkrpc()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_readlink(&cp);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_readlink(&cp, uiop);
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Ditto above
  *
  * Reads uiop->uio_resid bytes starting at uiop->uio_offset, issuing one
  * PUTFH/READ compound per chunk of at most nm_rsize bytes.  The loop
  * ends early when the server reports EOF or returns no data.
  *
  * Returns EFBIG if the request extends past nm_maxfilesize, 0 for an
  * empty request, otherwise the status of the last compound.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  * The value nfs_v4postop() returns at the bottom of the loop is not
  * tested before the next iteration.
  */
 int
 nfs4_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfsmount *nmp;
 	int error = 0, len, tsiz;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_read read;
 	struct nfsnode *np = VTONFS(vp);
 
 	nmp = VFSTONFS(vp->v_mount);
 	tsiz = uiop->uio_resid;
 	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
 		return (EFBIG);
 
 	if (tsiz == 0)
 		return (0);
 
 	read.uiop = uiop;
 	/* Use the read open context if one is held, else the write one. */
 	read.fcp = np->n_rfc.refcnt > 0 ? &np->n_rfc : &np->n_wfc;
 
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_READ]++;
 		/* Never ask for more than the mount's read size at once. */
 		len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
 
 		read.off = uiop->uio_offset;
 		read.maxcnt = len;
 		nfs_v4initcompound(&cp);
 
 		mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 
 		nfsm_v4build_compound(&cp, "nfs4_readrpc()");
 		nfsm_v4build_putfh(&cp, vp);
 		nfsm_v4build_read(&cp, &read);
 		nfsm_v4build_finalize(&cp);
 
 		nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred);
 		if (error != 0) {
 			error = nfs_v4postop(&cp, error);
 			goto nfsmout;
 		}
 
 		nfsm_v4dissect_compound(&cp);
 		nfsm_v4dissect_putfh(&cp);
 		nfsm_v4dissect_read(&cp, &read);
 
 		/* EOF or an empty reply ends the transfer. */
 		if (read.eof || read.retlen == 0)
 			tsiz = 0;
 		else
 			tsiz -= read.retlen;
 
 		error = nfs_v4postop(&cp, error);
 
 		/* Release this reply before building the next request. */
 		m_freem(mrep);
 		mrep = NULL;
 	}
 nfsmout:
 	if (mrep != NULL)
 		m_freem(mrep);
 	
 	return (error);
 }
 
 /*
  * nfs write call
  *
  * Writes uiop->uio_resid bytes starting at uiop->uio_offset, issuing one
  * PUTFH/WRITE compound per chunk of at most nm_wsize bytes.
  *
  * *iomode is the requested stability on entry and the weakest stability
  * level any reply reported on return.  *must_commit is set to 1 when the
  * server's write verifier differs from the one remembered in the mount.
  *
  * Returns EFBIG if the request extends past nm_maxfilesize, 0 for an
  * empty request, NFSERR_IO if the server accepted zero bytes, otherwise
  * the status of the last compound.  On error uio_resid is set to the
  * number of bytes not yet written.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label,
  * so the local names and statement order here must not change.
  */
 int
 nfs4_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     int *iomode, int *must_commit)
 {
 	int32_t backup;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, len, tsiz, wccflag = 1, rlen;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_write write;
 	nfsv4stablehow commit, committed = NSHFILESYNC;
 	caddr_t verf;
 	struct nfsnode *np = VTONFS(vp);
 
 	/* NOTE(review): guarded by #ifndef, i.e. active without DIAGNOSTIC. */
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1)
 		panic("nfs: writerpc iovcnt > 1");
 #endif
 	*must_commit = 0;
 	tsiz = uiop->uio_resid;
 	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
 		return (EFBIG);
 
 	if (tsiz == 0)
 		return (0);
 
 	write.stable = (nfsv4stablehow)*iomode;
 	write.uiop = uiop;
 	write.fcp = &np->n_wfc;
 
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_WRITE]++;
 		/* Never send more than the mount's write size at once. */
 		len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
 
 		write.off = uiop->uio_offset;
 		write.cnt = len;
 		nfs_v4initcompound(&cp);
 
 		mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 
 		nfsm_v4build_compound(&cp, "nfs4_writerpc()");
 		nfsm_v4build_putfh(&cp, vp);
 		nfsm_v4build_write(&cp, &write);
 		nfsm_v4build_finalize(&cp);
 
 		nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred);
 		if (error != 0) {
 			error = nfs_v4postop(&cp, error);
 			goto nfsmout;
 		}
 
 		nfsm_v4dissect_compound(&cp);
 		nfsm_v4dissect_putfh(&cp);
 		nfsm_v4dissect_write(&cp, &write);
 
 		rlen = write.retlen;
 		if (rlen == 0) {
 			error = NFSERR_IO;
 			break;
 		} else if (rlen < len) {
 			/*
 			 * Short write: step the uio back by the part the
 			 * server did not take so it is sent again.
 			 */
 			backup = len - rlen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base - backup;
 			uiop->uio_iov->iov_len += backup;
 			uiop->uio_offset -= backup;
 			uiop->uio_resid += backup;
 			len = rlen;
 		}
 
 		commit = write.committed;
 
 		/*
 		 * Track the weakest commitment level seen so far.  The
 		 * second test must be a comparison; the former
 		 * "committed = NSHDATASYNC && ..." assigned a boolean to
 		 * committed and so corrupted the level reported to the
 		 * caller.
 		 */
 		if (committed == NSHFILESYNC ||
 		    (committed == NSHDATASYNC && commit == NSHUNSTABLE))
 			committed = commit;
 
 		verf = (caddr_t)write.wverf;
 
 		/*
 		 * Remember the first verifier seen; a later change means
 		 * earlier unstable writes have to be committed again.
 		 * NOTE(review): the flag is tested as NFSSTA_HASWRITEVERF
 		 * but set as NFSMNT_HASWRITEVERF, both on nm_flag —
 		 * confirm these are the same bit.
 		 */
 		if ((nmp->nm_flag & NFSSTA_HASWRITEVERF) == 0) {
 			bcopy(verf, nmp->nm_verf, NFSX_V4VERF);
 			nmp->nm_flag |= NFSMNT_HASWRITEVERF;
 		} else if (bcmp(verf, nmp->nm_verf, NFSX_V4VERF)) {
 			*must_commit = 1;
 			bcopy(verf, nmp->nm_verf, NFSX_V4VERF);
 		}
 
 		/* XXX wccflag */
 		if (wccflag)
 			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
 
 		error = nfs_v4postop(&cp, error);
 
 		/* Release this reply before building the next request. */
 		m_freem(mrep);
 		mrep = NULL;
 		if (error)
 			break;
 		tsiz -= len;
 	}
 nfsmout:
 	if (mrep != NULL)
 		m_freem(mrep);
 	*iomode = committed;
 	if (error)
 		uiop->uio_resid = tsiz;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 nfs4_mknod(struct vop_mknod_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *newvp = NULL;
 	int error;
 
 	error = nfs4_createrpc(ap->a_dvp, &newvp,
 	    ap->a_cnp, (nfstype)vap->va_type, vap, NULL);
 
 	/* XXX - is this actually referenced here? */
 	if (error == 0) {
 		*ap->a_vpp = newvp;
 		vrele(newvp);
 	}
 
 	return (error);
 }
 
 /*
  * Create a non-regular object (the callers in this file pass a vnode
  * type, NFLNK or NFDIR) named by cnp in directory dvp.
  *
  * Sends PUTFH(dvp)/CREATE/GETATTR/GETFH, obtains the nfsnode for the
  * returned file handle with nfs_nget(), loads its attributes and, when
  * MAKEENTRY is set, enters it into the name cache.  The directory is
  * marked NMODIFIED and its attribute cache stamp is cleared.
  *
  * On success *vpp is set to the new vnode.  On failure *vpp is left
  * untouched and a vnode obtained along the way is released with vput().
  * linktarget is stored as the link text of the CREATE arguments.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_createrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     nfstype ftype, struct vattr *vap, char *linktarget)
 {
 	struct nfsnode *dnp = VTONFS(dvp);
 	struct nfsnode *np = NULL;
 	struct vnode *newvp = NULL;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_create c;
 	struct nfs4_oparg_getattr ga;
 	struct nfs4_oparg_getfh gfh;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	int error = 0;
 
 	nfsstats.rpccnt[NFSPROC_CREATE]++;
 
 	mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	bzero(&c, sizeof(c));
 	bzero(&ga, sizeof(ga));
 
 	c.type = ftype;
 	c.vap = vap;
 	c.linktext = linktarget;
 	c.name = cnp->cn_nameptr;
 	c.namelen = cnp->cn_namelen;
 
 	ga.bm = &nfsv4_getattrbm;
 	nfs_v4initcompound(&cp);
 
 	/* Build the request: PUTFH(dvp), CREATE, GETATTR, GETFH. */
 	nfsm_v4build_compound(&cp, "nfs4_createrpc()");
 	nfsm_v4build_putfh(&cp, dvp);
 	nfsm_v4build_create(&cp, &c);
 	nfsm_v4build_getattr(&cp, &ga);
 	nfsm_v4build_getfh(&cp, &gfh);	
 	nfsm_v4build_finalize(&cp); 
 
 	nfsm_request(dvp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_create(&cp, &c);
 	nfsm_v4dissect_getattr(&cp, &ga);
 	nfsm_v4dissect_getfh(&cp, &gfh);	
 	
 	/* Turn the returned file handle into a vnode. */
 	error = nfs_nget(dvp->v_mount, &gfh.fh_val, gfh.fh_len, &np, LK_EXCLUSIVE);
 	if (error != 0)
 		goto nfsmout;
 
 	newvp = NFSTOV(np);
 	nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL);
 
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, newvp, cnp);
 
 	dnp->n_flag |= NMODIFIED;
 	dnp->n_attrstamp = 0;
 
  nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	/* XXX */
 	/*FREE(cnp->cn_pnbuf, M_NAMEI);*/
 	/* nfs_v4postop() may still report an error after the vnode exists. */
 	if (error != 0 && newvp != NULL)
 		vput(newvp);
 	else if (error == 0)
 		*vpp = newvp;
 
 	return (error);
 }
 
 /*
  * Do an nfs rename rpc: rename fnameptr in directory fdvp to tnameptr in
  * directory tdvp.
  *
  * Sends PUTFH(fdvp)/SAVEFH/PUTFH(tdvp)/RENAME.  After a successfully
  * parsed reply both directories are marked NMODIFIED and their attribute
  * cache stamps are cleared.  Returns the request status as filtered by
  * nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
     struct vnode *tdvp, const char *tnameptr, int tnamelen,
     struct ucred *cred, struct thread *td)
 {
 
 	struct nfsnode *fnp = VTONFS(fdvp), *tnp = VTONFS(tdvp);
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_rename r;
 	int error = 0;
 
 	nfsstats.rpccnt[NFSPROC_RENAME]++;
 
 	r.fname = fnameptr;
 	r.fnamelen = fnamelen;
 	r.tname = tnameptr;
 	r.tnamelen = tnamelen;
 	nfs_v4initcompound(&cp);
 
 	mreq = nfsm_reqhead(fdvp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	/* Source directory is saved, target directory becomes current. */
 	nfsm_v4build_compound(&cp, "nfs4_renamerpc()");
 	nfsm_v4build_putfh(&cp, fdvp);
 	nfsm_v4build_savefh(&cp);
 	nfsm_v4build_putfh(&cp, tdvp);
 	nfsm_v4build_rename(&cp, &r);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(fdvp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_savefh(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_rename(&cp);
 
 	/* XXX should this always be performed?  */
 	fnp->n_flag |= NMODIFIED;
 	tnp->n_flag |= NMODIFIED;
 	fnp->n_attrstamp = tnp->n_attrstamp = 0;
 
  nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 /*
  * nfs file create call
  */
 static int
 nfs4_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct nfsnode *dnp = VTONFS(dvp);
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *newvp = NULL;
 	int error = 0, fmode = (O_CREAT | FREAD | FWRITE);
 	struct vattr vattr;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0)
 		return (error);
 
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 
 	error = nfs4_openrpc(dvp, &newvp, cnp, fmode, vap);
 	if (error != 0)
 		goto out;
 
 	VTONFS(newvp)->n_flag |= NCREATED;
 
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, newvp, cnp);
 
 	*ap->a_vpp = newvp;
 
 	dnp->n_flag |= NMODIFIED;
 	dnp->n_attrstamp = 0;	/* XXX; wccflag */
 
  out:
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  * - If v_usecount > 1
  *	  If a rename is not already in the works
  *	     call nfs4_sillyrename() to set it up
  *     else
  *	  do the remove rpc
  */
 static int
 nfs4_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 #ifndef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("nfs4_remove: no name");
 	if (vrefcnt(vp) < 1)
 		panic("nfs4_remove: bad v_usecount");
 #endif
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_thread) == 0 &&
 	    vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		/* Do the rpc */
 		if (error != EINTR)
 			error = nfs4_removerpc(dvp, cnp->cn_nameptr,
 				cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs4_sillyrename(dvp, vp, cnp);
 	np->n_attrstamp = 0;
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 nfs4_removeit(struct sillyrename *sp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs4_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs4_remove() and nfs4_removeit().
  *
  * Sends PUTFH(dvp)/REMOVE for the given name.  The directory is marked
  * NMODIFIED and its attribute cache stamp is cleared whether or not the
  * request succeeded.  Returns the request status as filtered by
  * nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_removerpc(struct vnode *dvp, const char *name, int namelen,
     struct ucred *cred, struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 
 	nfsstats.rpccnt[NFSPROC_REMOVE]++;
 
 	mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	nfs_v4initcompound(&cp);
 
 	/* Build the request: PUTFH(dvp), REMOVE(name). */
 	nfsm_v4build_compound(&cp, "nfs4_removerpc()");
 	nfsm_v4build_putfh(&cp, dvp);
 	nfsm_v4build_remove(&cp, name, namelen);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(dvp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_remove(&cp);
 
  nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	/* Done on the error path too: the directory may have changed. */
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	VTONFS(dvp)->n_attrstamp = 0; /* XXX wccflag */
 
 	return (error);
 }
 
 /*
  * nfs file rename call
  *
  * Refuses cross-mount renames with EXDEV, flushes the source (and the
  * target, if any) with VOP_FSYNC(), sillyrenames an in-use non-directory
  * target, and then issues the rename RPC.  All four vnodes passed in are
  * released on every path through the "out" label.  ENOENT from the RPC is
  * reported as success.
  *
  * NOTE(review): the sanity check below is guarded by #ifndef DIAGNOSTIC,
  * i.e. it is compiled in when DIAGNOSTIC is NOT defined.
  */
 static int
 nfs4_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	int error;
 
  #ifndef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("nfs4_rename: no name");
 #endif
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (fvp == tvp) {
 		printf("nfs4_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE, fcnp->cn_thread)) != 0)
+	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 */
 	VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0, fcnp->cn_thread);
 	if (tvp)
 	    VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs4_sillyrename(tdvp, tvp, tcnp)) {
 		/* Sillyrename succeeded: the target no longer needs handling. */
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs4_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 		tcnp->cn_thread);
 
 	/* Renaming a directory invalidates cached names under it. */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	/* Release everything the caller handed in. */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs4_remove() above
  */
 static int
 nfs4_renameit(struct vnode *sdvp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 	return (nfs4_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
 	    sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
 }
 
 /*
  * nfs hard link create call
  *
  * Creates a link to vp named by cnp in directory tdvp.  Fails with EXDEV
  * when the two vnodes are on different mounts.  Sends
  * PUTFH(vp)/SAVEFH/PUTFH(tdvp)/LINK; after a successfully parsed reply
  * the directory is marked NMODIFIED and the attribute cache stamps of
  * both vnodes are cleared.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and presumably
  * use the locals bpos/dpos/md/mb/mreq/mrep/error and the nfsmout label.
  */
 static int
 nfs4_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_link l;
 
 	if (vp->v_mount != tdvp->v_mount) {
 		return (EXDEV);
 	}
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	nfsstats.rpccnt[NFSPROC_LINK]++;
 
 	l.name = cnp->cn_nameptr;
 	l.namelen = cnp->cn_namelen;
 	nfs_v4initcompound(&cp);
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	/* The file is saved, the target directory becomes current. */
 	nfsm_v4build_compound(&cp, "nfs4_link()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_savefh(&cp);
 	nfsm_v4build_putfh(&cp, tdvp);
 	nfsm_v4build_link(&cp, &l);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, cnp->cn_thread, cnp->cn_cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_savefh(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_link(&cp);
 
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
 	VTONFS(vp)->n_attrstamp = 0;
 	VTONFS(tdvp)->n_attrstamp = 0;
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  */
 static int
 nfs4_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	int error = 0;
 	struct vnode *newvp = NULL;
 
 	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
 
 	error = nfs4_createrpc(ap->a_dvp, &newvp, ap->a_cnp, NFLNK,
 	    ap->a_vap, ap->a_target);
 
 	if (error != 0 && newvp != NULL)
 		vput(newvp);
 	else if (error == 0)
 		 *ap->a_vpp = newvp;
 
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	VTONFS(dvp)->n_attrstamp = 0; /* XXX wccflags */
 
 	return (error);
 }
 
 /*
  * nfs make dir call
  */
 static int
 nfs4_mkdir(struct vop_mkdir_args *ap)
 {
 	return (nfs4_createrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, NFDIR,
 		    ap->a_vap, NULL));
 }
 
 /*
  * nfs remove directory call
  */
 static int
 nfs4_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct nfsnode *dnp = VTONFS(dvp);
 	struct componentname *cnp = ap->a_cnp;
 	int error = 0;
 
 	if (dvp == vp)
 		return (EINVAL);
 
 	error = (nfs4_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred,
 			       NULL));
 	if (error)
 		return (error);
 
 	dnp->n_flag |= NMODIFIED;
 	dnp->n_attrstamp = 0;
 	cache_purge(dvp);
 	cache_purge(vp);
 
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs4_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	int tresid, error;
 	struct vattr vattr;
 
 	if (vp->v_type != VDIR)
 		return (EPERM);
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0 &&
 			!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			nfsstats.direofcache_hits++;
 			return (0);
 		}
 	}
 
 	/*
 	 * Call nfs_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = nfs_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid)
 		nfsstats.direofcache_misses++;
 	return (error);
 }
 
 /*
  * Maps an NFS file type, used as the index, to the matching dirent
  * d_type value.  Types without a dirent equivalent map to DT_UNKNOWN.
  */
 static u_char fty_to_dty[] = {
 	DT_UNKNOWN,	/* NFNON */
 	DT_REG,		/* NFREG */
 	DT_DIR,		/* NFDIR */
 	DT_BLK,		/* NFBLK */
 	DT_CHR,		/* NFCHR */
 	DT_LNK,		/* NFLNK */
 	DT_SOCK,	/* NFSOCK */
 	DT_FIFO,	/* NFFIFO */
 	DT_UNKNOWN,	/* NFATTRDIR */
 	DT_UNKNOWN,	/* NFNAMEDATTR */
 	DT_UNKNOWN,	/* NFBAD */
 };
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by nfs_doio().
  *
  * Fills the single iovec of uiop with struct dirent records, grouped so
  * that no record crosses a DIRBLKSIZ boundary.  Entries for "." and ".."
  * are generated locally while the starting cookie is below 2; everything
  * else comes from PUTFH + READDIR compounds that are repeated until the
  * server reports EOF or the buffer is full.
  *
  * Returns NFSERR_BAD_COOKIE when no cookie is recorded for the starting
  * offset, EBADRPC for a malformed reply, or the RPC error status.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere; they presumably
  * use the local mb/bpos/md/dpos/mreq/mrep/error variables and may jump to
  * nfsmout -- confirm before renaming or reordering anything here.
  */
 int
 nfs4_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int len, left;
 	/* Most recently emitted record; its d_reclen absorbs padding. */
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	caddr_t p;
 	uint64_t *cookiep;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	uint64_t cookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	/*
 	 * blksiz:    bytes used so far in the current DIRBLKSIZ block.
 	 * bigenough: cleared once the next record no longer fits in uiop.
 	 * more_dirs: non-zero while more entries are expected.
 	 */
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_readdir readdir;
 	struct nfsv4_fattr fattr;
 	u_int fty;
 
 	/*
 	 * NOTE(review): this sanity check is compiled only when DIAGNOSTIC
 	 * is NOT defined, which looks inverted -- confirm the intent.
 	 */
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
 		(uiop->uio_resid & (DIRBLKSIZ - 1)))
 		panic("nfs readdirrpc bad uio");
 #endif
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	cookiep = nfs4_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep)
 		cookie = *cookiep;
 	else
 		return (NFSERR_BAD_COOKIE);
 
 	/*
 	 * Generate fake entries for "." and "..".
 	 * After the increment, cookie 1 stands for "." and cookie 2 for
 	 * "..", so the cookie value doubles as the name length.
 	 */
 	while (cookie < 2 && bigenough) {
 		cookie++;
 		len = 4 + DIRHDSIZ;
 
 		if (len > uiop->uio_resid) {
 			bigenough = 0;
 			break;
 		}
 		dp = (struct dirent *)uiop->uio_iov->iov_base;
 
 		dp->d_namlen = cookie;
 		dp->d_reclen = len;
 		dp->d_type = DT_DIR;
 		if (cookie == 1)
 			dp->d_fileno = dnp->n_vattr.va_fileid; /* XXX has problems with pynfs virtualhandles */
 		else
 			/* Parent's fileid if known, else the cookie itself. */
 			dp->d_fileno = dnp->n_dvp != NULL ?
 			    VTONFS(dnp->n_dvp)->n_vattr.va_fileid : cookie;
 
 		p = dp->d_name;
 		*p++ = '.';
 		if (cookie == 2)
 			*p++ = '.';
 		*p = '\0';
 
 		/* Account for the record and advance the uio past it. */
 		blksiz += len;
 		if (blksiz == DIRBLKSIZ)
 			blksiz = 0;
 		uiop->uio_offset += len;
 		uiop->uio_resid -= len;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + len;
 		uiop->uio_iov->iov_len -= len;
 	}
 
 	/* Both fake entries done: the server is asked from cookie 0. */
 	if (cookie == 2)
 		cookie = 0;
 
 	/* This is sort of ugly, to prevent v4postop() from acting weird */
 	bzero(&cp, sizeof(cp));
 
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	/*
 	 * XXX this is sort of ugly for nfsv4; we don't maintain the
 	 * strict abstraction, but do the decoding inline.  that's ok.
 	 */
 	while (more_dirs && bigenough) {
 		nfsstats.rpccnt[NFSPROC_READDIR]++;
 
 		mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 
 		readdir.cnt = nmp->nm_readdirsize;
 		readdir.cookie = cookie;
 		readdir.bm = &nfsv4_readdirbm;
 		/* A zero cookie goes with a zeroed verifier. */
 		if (cookie == 0)
 			bzero(&readdir.verf, sizeof(readdir.verf));
 		else
 			bcopy(&dnp->n_cookieverf, &readdir.verf,
 			    sizeof(readdir.verf));
 
 		nfs_v4initcompound(&cp);
 
 		/* Build: PUTFH(vp), READDIR, then finalize. */
 		nfsm_v4build_compound(&cp, "nfs4_readdirrpc()");
 		nfsm_v4build_putfh(&cp, vp);
 		nfsm_v4build_readdir(&cp, &readdir);
 		nfsm_v4build_finalize(&cp);
 
 		nfsm_request(vp, NFSV4PROC_COMPOUND, uiop->uio_td, cred);
 		if (error != 0)
 			goto nfsmout;
 
 		nfsm_v4dissect_compound(&cp);
 		nfsm_v4dissect_putfh(&cp);
 
 		/*
 		 * XXX - Readdir gets handled inline like in
 		 * NFSv{2,3}.  This is a nasty inconsistency and
 		 * should be fixed.
 		 */
 
 		/*
 		 * Five words: operation number, status, two words of
 		 * cookie verifier, and the "entry follows" flag.
 		 */
 		tl = nfsm_dissect(uint32_t *, 5 * NFSX_UNSIGNED);
 		if (fxdr_unsigned(uint32_t, *tl++) != NFSV4OP_READDIR) {
 			error = EBADRPC;
 			goto nfsmout;
 		}
 		if (fxdr_unsigned(uint32_t, *tl++) != 0) {
 			error = EBADRPC;
 			goto nfsmout;
 		}
 
 		/* Remember the verifier for later non-zero-cookie requests. */
 		bcopy(tl, &dnp->n_cookieverf, NFSX_V4VERF);
 		tl += 2;
 		more_dirs = fxdr_unsigned(int, *tl++);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			/* Three words: 64-bit cookie and the name length. */
 			tl = nfsm_dissect(uint32_t *, 3 * NFSX_UNSIGNED);
 			cookie = fxdr_hyper(tl);
 			tl += 2;
 			/* XXX cookie sanity check */
 			len = fxdr_unsigned(int, *tl++);
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			tlen = nfsm_rndup(len);
 			if (tlen == len)
 				tlen += 4;	/* To ensure null termination */
 			left = DIRBLKSIZ - blksiz;
 			/*
 			 * The record does not fit in what remains of this
 			 * block: stretch the previous record over the gap
 			 * and start a new block.
 			 */
 			if ((tlen + DIRHDSIZ) > left) {
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_offset += left;
 				uiop->uio_resid -= left;
 				blksiz = 0;
 			}
 			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 
 				dp->d_namlen = len;
 				dp->d_reclen = tlen + DIRHDSIZ;
 
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				/* Step over the fixed dirent header. */
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 
 				/* Copy name */
 				nfsm_mtouio(uiop, len);
 				p = uiop->uio_iov->iov_base;
 				/* tlen now holds only the padding after the name. */
 				tlen -= len;
 				*p = '\0';	/* null terminate */
 				/* printf("nfs4_readdirrpc: name: \"%s\" cookie %d\n",
 				   p - len, (int) cookie);*/
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + tlen;
 				uiop->uio_iov->iov_len -= tlen;
 				uiop->uio_offset += tlen;
 				uiop->uio_resid -= tlen;
 
 				/* Copy attributes */
 				nfsm_v4dissect_attrs(&fattr);
 
 				/*
 				 * Use the server's fileid when it is present
 				 * and non-zero, otherwise the cookie.
 				 */
 				dp->d_fileno = nfs_v4fileid4_to_fileid(
 					fattr.fa4_valid & FA4V_FILEID &&
 					    fattr.fa4_fileid ?
 					    fattr.fa4_fileid : cookie);
 
 				/*
 				 * Translate the file type through fty_to_dty
 				 * when it is valid and in range.  sizeof()
 				 * equals the element count only because the
 				 * table elements are one byte wide.
 				 */
 				fty = (u_int)fattr.fa4_type;
 				dp->d_type = fattr.fa4_valid & FA4V_TYPE &&
 				    (fty < sizeof(fty_to_dty)) ?
 				    fty_to_dty[fty] : DT_UNKNOWN;
 			} else
 				/*
 				 * No room: skip over the name only.
 				 * NOTE(review): the entry's attributes are
 				 * not skipped here before the next dissect
 				 * -- confirm this is harmless.
 				 */
 				nfsm_adv(nfsm_rndup(len));
 
 			/* "Another entry follows" flag. */
 			tl = nfsm_dissect(uint32_t *, NFSX_UNSIGNED);
 			more_dirs = fxdr_unsigned(int, *tl++);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = (fxdr_unsigned(int, *tl) == 0);
 		}
 
 		/*
 		 * NOTE(review): a non-zero result from nfs_v4postop() does
 		 * not leave the outer loop by itself -- confirm.
 		 */
 		error = nfs_v4postop(&cp, error);
 
 		m_freem(mrep);
 		mrep = NULL;
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled the
 	 * block.
 	 */
 	if (bigenough)
 		/* Everything fit: record where the directory ends. */
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
 			printf("EEK! readdirrpc resid > 0\n");
 		/* Save the cookie to resume from at this offset. */
 		cookiep = nfs4_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
 	}
 nfsmout:
 	/* Reached directly on errors, without going through nfs_v4postop(). */
 	if (mrep != NULL)
 		m_freem(mrep);
 	return (error);
 }
 
 /*
  * Silly rename.  Instead of removing a vnode that is still in use, the
  * file is renamed to a generated ".nfs..." name inside the same
  * directory, and a sillyrename record (credential, directory reference,
  * removal callback and the generated name) is attached to the nfsnode.
  *
  * Candidate names are probed with nfs4_lookitup(); each time a candidate
  * already exists the fifth character is bumped, and EINVAL is returned
  * once it passes 'z'.  Another client can still create the chosen name
  * between the probe and the rename.
  *
  * Returns 0 on success.  On failure the directory reference and the
  * credential are dropped and the record is freed.
  */
 static int
 nfs4_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	short shortpid;
 	int error;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 #ifndef DIAGNOSTIC
 	if (vp->v_type == VDIR)
 		panic("nfs: sillyrename dir");
 #endif
 
 	/* Set up the record; it owns a cred hold and a reference on dvp. */
 	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
 	    M_NFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	sp->s_removeit = nfs4_removeit;
 	VREF(dvp);
 
 	/* The generated name is built from the calling process's pid. */
 	shortpid = cnp->cn_thread->td_proc->p_pid;
 	sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", shortpid);
 
 	/* Probe until a lookup fails, i.e. the name is not taken. */
 	for (;;) {
 		if (nfs4_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		    cnp->cn_thread, NULL) != 0)
 			break;
 		sp->s_name[4]++;
 		if (sp->s_name[4] > 'z') {
 			error = EINVAL;
 			goto bad;
 		}
 	}
 
 	error = nfs4_renameit(dvp, cnp, sp);
 	if (error != 0)
 		goto bad;
 
 	/*
 	 * Look the new name up again, this time passing the nfsnode.  The
 	 * result of this lookup is stored but does not affect the return
 	 * value.
 	 */
 	error = nfs4_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 	    cnp->cn_thread, &np);
 	np->n_sillyrename = sp;
 	return (0);
 
 bad:
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free((caddr_t)sp, M_NFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  *
  * The request is one compound of PUTFH(dvp), LOOKUP, GETFH and GETATTR.
  * Returns 0 on success or the error produced by the RPC, by nfs_nget()
  * or by nfs_v4postop().
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere; they presumably
  * use the local mb/bpos/md/dpos/mreq/mrep/error variables and may jump to
  * nfsmout -- confirm before renaming or reordering anything here.
  */
 static int
 nfs4_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL;
 	/* np starts out NULL so it is never stored through *npp unset. */
 	struct nfsnode *np = NULL, *dnp = VTONFS(dvp);
 	caddr_t bpos, dpos;
 	int error = 0, fhlen;
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	nfsfh_t *nfhp;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_lookup l;
 	struct nfs4_oparg_getfh gfh;
 	struct nfs4_oparg_getattr ga;
 
 	/*
 	 * NOTE(review): this counts the lookup under NFSPROC_RENAME and the
 	 * compound below is tagged "nfs4_renamerpc()"; both look copied
 	 * from the rename path -- confirm before changing.
 	 */
 	nfsstats.rpccnt[NFSPROC_RENAME]++;
 
 	mreq = nfsm_reqhead(dvp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	l.name = name;
 	l.namelen = len;
 
 	nfs_v4initcompound(&cp);
 
 	ga.bm = &nfsv4_getattrbm;
 
 	nfsm_v4build_compound(&cp, "nfs4_renamerpc()");
 	nfsm_v4build_putfh(&cp, dvp);
 	nfsm_v4build_lookup(&cp, &l);
 	nfsm_v4build_getfh(&cp, &gfh);
 	nfsm_v4build_getattr(&cp, &ga);
 	/*
 	 * Close the compound before sending it, as every other compound
 	 * builder in this file does.
 	 */
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(dvp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_lookup(&cp);
 	nfsm_v4dissect_getfh(&cp, &gfh);
 	nfsm_v4dissect_getattr(&cp, &ga);
 
 	if (npp != NULL && error == 0) {
 		nfhp = &gfh.fh_val;
 		fhlen = gfh.fh_len;
 
 		if (*npp != NULL) {
 			/*
 			 * Caller supplied a node: refresh its file handle,
 			 * switching between the embedded and the separately
 			 * allocated handle storage as the size requires.
 			 */
 			np = *npp;
 			if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
 				free((caddr_t)np->n_fhp, M_NFSBIGFH);
 				np->n_fhp = &np->n_fh;
 			} else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
 				np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
 			bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
 			np->n_fhsize = fhlen;
 			newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
 			/*
 			 * The name resolves to the directory itself: take
 			 * another reference and report the directory's node.
 			 */
 			VREF(dvp);
 			newvp = dvp;
 			np = dnp;
 		} else {
 			error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
 			if (error) {
 				m_freem(mrep);
 				return (error);
 			}
 			newvp = NFSTOV(np);
 		}
 
 		/* Record the parent directory and a private copy of the name. */
 		if (newvp != dvp) {
 			np->n_dvp = dvp;
 			np->n_namelen = len;
 			if (np->n_name != NULL)
 				FREE(np->n_name, M_NFSREQ);
 			MALLOC(np->n_name, u_char *,
 			    np->n_namelen + 1, M_NFSREQ, M_WAITOK);
 			memcpy(np->n_name, name, len);
 			np->n_name[len] = '\0';
 		}
 		nfs4_vnop_loadattrcache(newvp, &ga.fa, NULL);
 	}
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	if (mrep != NULL)
 		m_freem(mrep);
 	/*
 	 * Only when the caller asked for a new node: hand it back on
 	 * success, or drop the vnode obtained above on failure (a plain
 	 * reference for dvp, vput() for anything else).
 	 */
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 
 
 	return (error);
 }
 
 /*
  * NFS Version 4 commit rpc
  *
  * Sends a PUTFH(vp) + COMMIT compound for cnt bytes starting at offset.
  * Nothing is sent, and 0 is returned, while the mount does not have
  * NFSSTA_HASWRITEVERF set.
  *
  * When the verifier in the reply differs from the one stored in the
  * mount, the stored copy is replaced and NFSERR_STALEWRITEVERF is passed
  * on to nfs_v4postop(), whose result is the return value.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere; they presumably
  * use the local mb/bpos/md/dpos/mreq/mrep/error variables and may jump to
  * nfsmout -- confirm before renaming or reordering anything here.
  */
 int
 nfs4_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	caddr_t bpos, dpos;
 	int error = 0;
 	/* mrep starts out NULL so the exit path can tell if a reply exists. */
 	struct mbuf *mreq, *mrep = NULL, *md, *mb;
 	struct nfs4_compound cp;
 	struct nfs4_oparg_commit commit;
 
 	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0)
 		return (0);
 	nfsstats.rpccnt[NFSPROC_COMMIT]++;
 
 	mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, 0);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 
 	commit.start = offset;
 	commit.len = cnt;
 
 	nfs_v4initcompound(&cp);
 
 	nfsm_v4build_compound(&cp, "nfs4_commit()");
 	nfsm_v4build_putfh(&cp, vp);
 	nfsm_v4build_commit(&cp, &commit);
 	nfsm_v4build_finalize(&cp);
 
 	nfsm_request(vp, NFSV4PROC_COMPOUND, td, cred);
 	if (error != 0)
 		goto nfsmout;
 
 	nfsm_v4dissect_compound(&cp);
 	nfsm_v4dissect_putfh(&cp);
 	nfsm_v4dissect_commit(&cp, &commit);
 
 	/* XXX */
 	/* nfsm_wcc_data(vp, wccflag);*/
 	/* A changed verifier is remembered and reported as stale. */
 	if (bcmp(nmp->nm_verf, commit.verf, NFSX_V4VERF)) {
 		bcopy(commit.verf, nmp->nm_verf, NFSX_V4VERF);
 		error = NFSERR_STALEWRITEVERF;
 	}
 
 nfsmout:
 	error = nfs_v4postop(&cp, error);
 
 	/*
 	 * Release the reply when there is one.  The test used to be
 	 * "mrep == NULL", which never freed a received reply.
 	 */
 	if (mrep != NULL)
 		m_freem(mrep);
 	return (error);
 }
 
 /*
  * Strategy routine.
  * For async requests when nfsiod(s) are running, queue the request by
  * calling nfs_asyncio(), otherwise just all nfs_doio() to do the
  * request.
  */
 static int
 nfs4_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct ucred *cr;
 	int error = 0;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("nfs4_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	KASSERT(BUF_REFCNT(bp) > 0, ("nfs4_strategy: buffer %p not locked", bp));
 
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 
 	/*
 	 * If the op is asynchronous and an i/o daemon is waiting
 	 * queue the request, wake it up and wait for completion
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0 ||
 		nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
 		error = nfs_doio(ap->a_vp, bp, cr, curthread);
 	return (error);
 }
 
 /*
  * fsync vnode op. Just call nfs4_flush() with commit == 1.
  */
 /* ARGSUSED */
 static int
 nfs4_fsync(struct vop_fsync_args *ap)
 {
 	return (nfs4_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1));
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  *	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  *
  * vp      - vnode whose dirty buffer list is flushed.
  * waitfor - with MNT_WAIT, busy buffers are waited for and the routine
  *           does not return until outstanding writes have drained.
  * td      - thread handed to nfs4_commit() and nfs4_sigintr().
  * commit  - non-zero enables the commit pass (pass one).
  *
  * Returns 0, EINTR when nfs4_sigintr() reports an interruption, or the
  * node's saved n_error when NWRITEERR was set (the flag is then cleared).
  */
 static int
 nfs4_flush(struct vnode *vp, int waitfor, struct thread *td,
     int commit)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	/* Non-zero while the commit pass is still to be completed. */
 	int passone = 1;
 	/* Byte range [off, endoff) covered by the buffers being committed. */
 	u_quad_t off, endoff, toff;
 	/* Credential shared by all collected buffers, or NOCRED if mixed. */
 	struct ucred* wcred = NULL;
 	/* Buffers collected for commit: on-stack array or a malloc'ed one. */
 	struct buf **bvec = NULL;
 #ifndef NFS_COMMITBVECSIZ
 #define NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
 	int bvecsize = 0, bveccount;
 
 	/* Interruptible mounts sleep with PCATCH. */
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, nfs_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	/*
 	 * NOTE(review): buffers are collected only when NFS_ISV3(vp) is
 	 * true; whether that holds for v4 mounts is not visible here --
 	 * confirm.
 	 */
 	if (NFS_ISV3(vp) && commit) {
 		s = splbio();
 		/* Drop a heap vector left over from an earlier pass. */
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		VI_LOCK(vp);
 		TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs4_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			VI_UNLOCK(vp);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			VI_LOCK(vp);
 			if (bvec == NULL) {
 				/* Allocation failed: fall back to the stack array. */
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		/* Lock and collect up to bvecsize buffers that need a commit. */
 		TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			/* Skip buffers that cannot be locked without sleeping. */
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			VI_UNLOCK(vp);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			VI_LOCK(vp);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			/* Widen [off, endoff) to include this buffer's dirty range. */
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		splx(s);
 		VI_UNLOCK(vp);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = nfs4_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				/* These shadow the function-level off. */
 				off_t off, size;
 				bp = bvec[i];
 				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				size = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = nfs4_commit(vp, off, (int)size,
 						  bp->b_wcred, td);
 				if (retv) break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			nfs_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				s = splbio();
 				bufobj_wref(&vp->v_bufobj);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				splx(s);
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 * The scan restarts from "loop" after every write and after every
 	 * sleep on a busy buffer.
 	 */
 loop:
 	s = splbio();
 	VI_LOCK(vp);
 	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			/* Busy buffer: only waited for with MNT_WAIT after pass one. */
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
 			splx(s);
 			if (error == 0)
 				panic("nfs4_fsync: inconsistent lock");
 			if (error == ENOLCK)
 				goto loop;
 			if (nfs4_sigintr(nmp, NULL, td)) {
 				error = EINTR;
 				goto done;
 			}
 			/* After the first wakeup, switch PCATCH for a 2 second timeout. */
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs4_fsync: not dirty");
 		/* Buffers awaiting commit are left alone in pass one / no-commit mode. */
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		VI_UNLOCK(vp);
 		bremfree(bp);
 		/* NOTE(review): both branches set B_ASYNC -- confirm intent. */
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
 		    bp->b_flags |= B_ASYNC;
 		splx(s);
 		bwrite(bp);
 		goto loop;
 	}
 	splx(s);
 	/* Pass one finished: run everything again without it. */
 	if (passone) {
 		passone = 0;
 		VI_UNLOCK(vp);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		/* Wait until no writes remain outstanding on the vnode. */
 		while (vp->v_bufobj.bo_numoutput) {
 			error = bufobj_wwait(&vp->v_bufobj, slpflag, slptimeo);
 			if (error) {
 			    VI_UNLOCK(vp);
 			    if (nfs4_sigintr(nmp, NULL, td)) {
 				error = EINTR;
 				goto done;
 			    }
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    VI_LOCK(vp);
 			}
 		}
 		/* Buffers dirtied in the meantime get another round. */
 		if (vp->v_bufobj.bo_dirty.bv_cnt > 0 && commit) {
 			VI_UNLOCK(vp);
 			goto loop;
 		}
 	}
 	VI_UNLOCK(vp);
 	/* Report, and clear, a write error saved on the node. */
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  *
  * Always fails with EPERM: the unconditional return at the top makes
  * the rest of the body unreachable.
  */
 static int
 nfs4_advlock(struct vop_advlock_args *ap)
 {
 	/* Every advisory lock request is rejected. */
 	return (EPERM);
 
 	/*
 	 * NOTE(review): everything below is dead code.  It looks like the
 	 * intended implementation -- lf_advlock() on the node's own lock
 	 * list when the mount has NFSMNT_NOLOCKD set, nfs_dolock()
 	 * otherwise -- confirm before enabling it.
 	 */
 	if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 		struct nfsnode *np = VTONFS(ap->a_vp);
 
 		return (lf_advlock(ap, &(np->n_lockf), np->n_size));
 	}
 	return (nfs_dolock(ap));
 }
 
 /*
  * Print out the contents of an nfsnode: the cached file id and
  * filesystem id, followed by FIFO details when the vnode is a FIFO.
  * Always returns 0.
  */
 static int
 nfs4_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 
 	printf("\tfileid %ld fsid 0x%x", np->n_vattr.va_fileid,
 	    np->n_vattr.va_fsid);
 	if (vp->v_type == VFIFO) {
 		fifo_printinfo(vp);
 	}
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  *
  * Panics when the buffer is not locked.  A B_INVAL buffer is simply
  * released.  Otherwise the buffer is marked B_CACHE, undirtied, set up
  * as a write and passed to bstrategy().  When the buffer was not
  * B_ASYNC on entry the write is waited for, the buffer is released and
  * the bufwait() result is returned; in every other case the return
  * value is 0.
  */
 int
 nfs4_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	/* Flags as they were on entry; consulted after the I/O is started. */
 	int saved_flags = bp->b_flags;
 	int error, spl;
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bwrite: buffer is not locked???");
 
 	if ((bp->b_flags & B_INVAL) != 0) {
 		brelse(bp);
 		return (0);
 	}
 
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Undirty the bp.  We will redirty it later if the I/O fails.
 	 */
 	spl = splbio();
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 	splx(spl);
 
 	/*
 	 * Note: to avoid loopback deadlocks, we do not
 	 * assign b_runningbufspace.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/* Asynchronous on entry: nothing to wait for. */
 	if ((saved_flags & B_ASYNC) != 0)
 		return (0);
 
 	error = bufwait(bp);
 
 	if ((saved_flags & B_DELWRI) != 0) {
 		spl = splbio();
 		reassignbuf(bp);
 		splx(spl);
 	}
 
 	brelse(bp);
 	return (error);
 }
 
 /*
  * Just call nfs_writebp() with the force argument set to 1.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs4_bwrite(struct buf *bp)
 {
 
 	return (nfs4_writebp(bp, 1, curthread));
 }
 
 /*
  * Buffer operations vector for NFSv4: only the write hook is specific
  * to this file, the remaining entries name shared buffer routines.
  */
 struct buf_ops buf_ops_nfs4 = {
 	.bop_name = "buf_ops_nfs4",
 	.bop_write = nfs4_bwrite,
 	.bop_strategy = bufstrategy,
 	.bop_sync = bufsync,
 	.bop_bdflush = bufbdflush,
 };
Index: head/sys/nfsclient/nfs_subs.c
===================================================================
--- head/sys/nfsclient/nfs_subs.c	(revision 175201)
+++ head/sys/nfsclient/nfs_subs.c	(revision 175202)
@@ -1,1187 +1,1187 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_subs.c  8.8 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * These functions support the macros and help fiddle mbuf chains for
  * the nfs op functions. They do things like create the rpc header and
  * copy data between mbuf chains and uio lists.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
 #include <sys/sysproto.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <rpc/rpcclnt.h>
 
 #include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsclient/nfsnode.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 #include <nfsclient/nfsmount.h>
 
 #include <netinet/in.h>
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #include <machine/stdarg.h>
 
 /*
  * Data items converted to xdr at startup, since they are constant
  * This is kinda hokey, but may save a little time doing byte swaps
  * (nfs_init() fills these in with txdr_unsigned()).
  */
 u_int32_t	nfs_xdrneg1;
 u_int32_t	rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr,
 		    rpc_mismatch, rpc_auth_unix, rpc_msgaccepted;
 u_int32_t	nfs_true, nfs_false;
 
 /* And other global data */
 /*
  * Most recently issued RPC transaction id.  nfsm_rpchead() seeds it from
  * random() while it is still 0 and increments it under nfs_xid_mtx.
  */
 u_int32_t nfs_xid = 0;
 /* presumably indexed by the NFSv2 on-the-wire file type -- TODO confirm */
 static enum vtype nv2tov_type[8]= {
 	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON,  VNON
 };
 
 /* Set by nfs_init() from hz and NFS_TICKINTVL; never less than 1. */
 int		nfs_ticks;
 /* nfs_init() replaces the initial value with nswbuf / 2 + 1. */
 int		nfs_pbuf_freecnt = -1;	/* start out unlimited */
 
 /* Request queue; initialized with TAILQ_INIT() in nfs_init(). */
 struct nfs_reqq	nfs_reqq;
 struct mtx nfs_reqq_mtx;
 struct nfs_bufq	nfs_bufq;
 /* Serializes updates of nfs_xid in nfsm_rpchead(). */
 struct mtx nfs_xid_mtx;
 
 /*
  * and the reverse mapping from generic to Version 2 procedure numbers
  *
  * Indexed by the generic procedure number: nfsm_rpchead() sends
  * nfsv2_procid[procid] instead of procid when the mount is not NFSv3.
  * Entries set to NFSV2PROC_NOOP presumably mark procedures with no
  * Version 2 counterpart -- TODO confirm against the protocol headers.
  */
 int nfsv2_procid[NFS_NPROCS] = {
 	NFSV2PROC_NULL,
 	NFSV2PROC_GETATTR,
 	NFSV2PROC_SETATTR,
 	NFSV2PROC_LOOKUP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_READLINK,
 	NFSV2PROC_READ,
 	NFSV2PROC_WRITE,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_MKDIR,
 	NFSV2PROC_SYMLINK,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_REMOVE,
 	NFSV2PROC_RMDIR,
 	NFSV2PROC_RENAME,
 	NFSV2PROC_LINK,
 	NFSV2PROC_READDIR,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_STATFS,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 };
 
 LIST_HEAD(nfsnodehashhead, nfsnode);
 
 /*
  * Allocate the first mbuf of an rpc request packet.
  * hsiz, the size of the rest of the nfs request header, is consulted
  * only to decide whether attaching a cluster is worthwhile; vp and
  * procid are not used.  The mbuf is returned empty (m_len == 0).
  */
 struct mbuf *
 nfsm_reqhead(struct vnode *vp, u_long procid, int hsiz)
 {
 	struct mbuf *m;
 	int want_cluster;
 
 	want_cluster = (hsiz >= MINCLSIZE);
 
 	MGET(m, M_TRYWAIT, MT_DATA);
 	if (want_cluster) {
 		MCLGET(m, M_TRYWAIT);
 	}
 	m->m_len = 0;
 
 	return (m);
 }
 
 /*
  * Build the RPC header and fill in the authorization info.
  * The authorization string argument is only used when the credentials
  * come from outside of the kernel.
  * Returns the head of the mbuf list.
  *
  * cr        - credentials; uid and groups are copied for RPCAUTH_UNIX
  * nmflag    - mount flags; with NFSMNT_NFSV3 the version 3 number and
  *             procid are sent, otherwise version 2 and
  *             nfsv2_procid[procid]
  * procid    - generic procedure number
  * auth_type - authentication flavor written into the header
  * auth_len  - byte length of the authenticator body
  * mrest     - rest of the request; linked after the header
  * mrest_len - length of mrest, included in the packet header length
  * mbp       - out: receives mb
  * xidpp     - out: receives the address of the xid word in the header
  */
 struct mbuf *
 nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type,
     int auth_len, struct mbuf *mrest, int mrest_len, struct mbuf **mbp,
     u_int32_t **xidpp)
 {
 	struct mbuf *mb;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int i;
 	struct mbuf *mreq;
 	int grpsiz, authsiz;
 
 	authsiz = nfsm_rndup(auth_len);
 	MGETHDR(mb, M_TRYWAIT, MT_DATA);
 	/*
 	 * The header proper is 10 words (8 built first, 2 for the verifier)
 	 * plus the rounded-up authenticator.
 	 */
 	if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) {
 		MCLGET(mb, M_TRYWAIT);
 	} else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) {
 		MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED);
 	} else {
 		MH_ALIGN(mb, 8 * NFSX_UNSIGNED);
 	}
 	mb->m_len = 0;
 	mreq = mb;
 	bpos = mtod(mb, caddr_t);
 
 	/*
 	 * First the RPC header.
 	 */
 	/* NOTE(review): nfsm_build presumably uses mb and bpos implicitly. */
 	tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED);
 
 	mtx_lock(&nfs_xid_mtx);
 	/* Get a pretty random xid to start with */
 	if (!nfs_xid)
 		nfs_xid = random();
 	/*
 	 * Skip zero xid if it should ever happen.
 	 */
 	if (++nfs_xid == 0)
 		nfs_xid++;
 
 	*xidpp = tl;
 	*tl++ = txdr_unsigned(nfs_xid);
 	mtx_unlock(&nfs_xid_mtx);
 	*tl++ = rpc_call;
 	*tl++ = rpc_vers;
 	*tl++ = txdr_unsigned(NFS_PROG);
 	if (nmflag & NFSMNT_NFSV3) {
 		*tl++ = txdr_unsigned(NFS_VER3);
 		*tl++ = txdr_unsigned(procid);
 	} else {
 		*tl++ = txdr_unsigned(NFS_VER2);
 		*tl++ = txdr_unsigned(nfsv2_procid[procid]);
 	}
 
 	/*
 	 * And then the authorization cred.
 	 */
 	*tl++ = txdr_unsigned(auth_type);
 	*tl = txdr_unsigned(authsiz);
 	switch (auth_type) {
 	case RPCAUTH_UNIX:
 		tl = nfsm_build(u_int32_t *, auth_len);
 		*tl++ = 0;		/* stamp ?? */
 		*tl++ = 0;		/* NULL hostname */
 		*tl++ = txdr_unsigned(cr->cr_uid);
 		*tl++ = txdr_unsigned(cr->cr_groups[0]);
 		/* Words left in auth_len after the five fixed ones. */
 		grpsiz = (auth_len >> 2) - 5;
 		*tl++ = txdr_unsigned(grpsiz);
 		for (i = 1; i <= grpsiz; i++)
 			*tl++ = txdr_unsigned(cr->cr_groups[i]);
 		break;
 	}
 
 	/*
 	 * And the verifier...
 	 */
 	tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(RPCAUTH_NULL);
 	*tl = 0;
 	mb->m_next = mrest;
 	mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len;
 	mreq->m_pkthdr.rcvif = NULL;
 	*mbp = mb;
 	return (mreq);
 }
 
 /*
  * copies a uio scatter/gather list to an mbuf chain.
  * NOTE: can only handle iovcnt == 1
  *
  * uiop - source; its iovec, uio_offset and uio_resid are advanced by
  *        the number of bytes copied
  * mq   - in: mbuf to append to; out: mbuf that now ends the data
  * siz  - number of bytes to copy; the data is then zero padded up to
  *        nfsm_rndup(siz)
  * bpos - out: address just past the last byte written
  *
  * New mbufs are linked onto the chain before anything is copied into
  * them, so they stay reachable from the caller's chain on failure.
  *
  * Returns 0 on success.  For a source that is not UIO_SYSSPACE, a
  * copyin() failure is returned to the caller with *mq and *bpos left
  * unmodified; previously the failure was ignored and whatever happened
  * to be in the mbuf was treated as data.
  */
 int
 nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos)
 {
 	char *uiocp;
 	struct mbuf *mp, *mp2;
 	int xfer, left, mlen;
 	int uiosiz, clflg, rem;
 	int error;
 	char *cp;
 
 #ifdef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1)
 		panic("nfsm_uiotombuf: iovcnt != 1");
 #endif
 
 	if (siz > MLEN)		/* or should it >= MCLBYTES ?? */
 		clflg = 1;
 	else
 		clflg = 0;
 	rem = nfsm_rndup(siz)-siz;
 	mp = mp2 = *mq;
 	while (siz > 0) {
 		left = uiop->uio_iov->iov_len;
 		uiocp = uiop->uio_iov->iov_base;
 		if (left > siz)
 			left = siz;
 		uiosiz = left;
 		while (left > 0) {
 			mlen = M_TRAILINGSPACE(mp);
 			if (mlen == 0) {
 				/* Current mbuf is full; chain a new one. */
 				MGET(mp, M_TRYWAIT, MT_DATA);
 				if (clflg)
 					MCLGET(mp, M_TRYWAIT);
 				mp->m_len = 0;
 				mp2->m_next = mp;
 				mp2 = mp;
 				mlen = M_TRAILINGSPACE(mp);
 			}
 			xfer = (left > mlen) ? mlen : left;
 			if (uiop->uio_segflg == UIO_SYSSPACE) {
 				bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
 			} else {
 				/* Do not send bytes that were never copied. */
 				error = copyin(uiocp,
 				    mtod(mp, caddr_t)+mp->m_len, xfer);
 				if (error != 0)
 					return (error);
 			}
 			mp->m_len += xfer;
 			left -= xfer;
 			uiocp += xfer;
 			uiop->uio_offset += xfer;
 			uiop->uio_resid -= xfer;
 		}
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + uiosiz;
 		uiop->uio_iov->iov_len -= uiosiz;
 		siz -= uiosiz;
 	}
 	if (rem > 0) {
 		/* Zero the XDR padding, in a fresh mbuf if it does not fit. */
 		if (rem > M_TRAILINGSPACE(mp)) {
 			MGET(mp, M_TRYWAIT, MT_DATA);
 			mp->m_len = 0;
 			mp2->m_next = mp;
 			mp2 = mp;
 		}
 		cp = mtod(mp, caddr_t)+mp->m_len;
 		for (left = 0; left < rem; left++)
 			*cp++ = '\0';
 		mp->m_len += rem;
 		*bpos = cp;
 	} else
 		*bpos = mtod(mp, caddr_t)+mp->m_len;
 	*mq = mp;
 	return (0);
 }
 
 /*
  * Copy a string into mbufs for the hard cases...
  *
  * Appends an XDR string (length word followed by siz bytes of cp) to the
  * chain ending at *mb when it does not fit in that mbuf.  Whatever
  * trailing space *mb still has is filled first, then new mbufs are
  * chained on until siz bytes have been copied; the last partial word is
  * zeroed before the final copy so the padding is zero.
  *
  * On return *mb is the last mbuf added and *bpos points just past its
  * data.  Always returns 0.
  *
  * NOTE(review): the first stage copies "left" bytes without comparing
  * against siz, and if siz is not positive afterwards the loop is skipped
  * and m1 is still NULL when it is stored and dereferenced at the end.
  * This looks like it relies on callers only coming here when the string
  * is longer than the space left in *mb -- confirm for every caller.
  */
 int
 nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz)
 {
 	struct mbuf *m1 = NULL, *m2;
 	long left, xfer, len, tlen;
 	u_int32_t *tl;
 	int putsize;
 
 	/* putsize stays 1 until the length word has been written. */
 	putsize = 1;
 	m2 = *mb;
 	left = M_TRAILINGSPACE(m2);
 	if (left > 0) {
 		/* Length word goes at *bpos, string bytes fill the rest. */
 		tl = ((u_int32_t *)(*bpos));
 		*tl++ = txdr_unsigned(siz);
 		putsize = 0;
 		left -= NFSX_UNSIGNED;
 		m2->m_len += NFSX_UNSIGNED;
 		if (left > 0) {
 			bcopy(cp, (caddr_t) tl, left);
 			siz -= left;
 			cp += left;
 			m2->m_len += left;
 			left = 0;
 		}
 	}
 	/* Loop around adding mbufs */
 	while (siz > 0) {
 		MGET(m1, M_TRYWAIT, MT_DATA);
 		if (siz > MLEN)
 			MCLGET(m1, M_TRYWAIT);
 		/* Start with m_len as the capacity; corrected after the copy. */
 		m1->m_len = NFSMSIZ(m1);
 		m2->m_next = m1;
 		m2 = m1;
 		tl = mtod(m1, u_int32_t *);
 		tlen = 0;
 		if (putsize) {
 			*tl++ = txdr_unsigned(siz);
 			m1->m_len -= NFSX_UNSIGNED;
 			tlen = NFSX_UNSIGNED;
 			putsize = 0;
 		}
 		if (siz < m1->m_len) {
 			/* Final piece: pre-zero the word that gets padding. */
 			len = nfsm_rndup(siz);
 			xfer = siz;
 			if (xfer < len)
 				*(tl+(xfer>>2)) = 0;
 		} else {
 			xfer = len = m1->m_len;
 		}
 		bcopy(cp, (caddr_t) tl, xfer);
 		m1->m_len = len+tlen;
 		siz -= xfer;
 		cp += xfer;
 	}
 	*mb = m1;
 	*bpos = mtod(m1, caddr_t)+m1->m_len;
 	return (0);
 }
 
 /*
  * Called once to initialize data structures...
  */
 int
 nfs_init(struct vfsconf *vfsp)
 {
 	int i;
 
 	nfsmount_zone = uma_zcreate("NFSMOUNT", sizeof(struct nfsmount),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	rpc_vers = txdr_unsigned(RPC_VER2);
 	rpc_call = txdr_unsigned(RPC_CALL);
 	rpc_reply = txdr_unsigned(RPC_REPLY);
 	rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
 	rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
 	rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
 	rpc_autherr = txdr_unsigned(RPC_AUTHERR);
 	rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
 	nfs_true = txdr_unsigned(TRUE);
 	nfs_false = txdr_unsigned(FALSE);
 	nfs_xdrneg1 = txdr_unsigned(-1);
 	nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
 	if (nfs_ticks < 1)
 		nfs_ticks = 1;
 	/* Ensure async daemons disabled */
 	for (i = 0; i < NFS_MAXASYNCDAEMON; i++) {
 		nfs_iodwant[i] = NULL;
 		nfs_iodmount[i] = NULL;
 	}
 	nfs_nhinit();			/* Init the nfsnode table */
 
 	/*
 	 * Initialize reply list and start timer
 	 */
 	TAILQ_INIT(&nfs_reqq);
 	callout_init(&nfs_callout, CALLOUT_MPSAFE);
 	mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF);
 	mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF);
 	mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF);
 
 	nfs_pbuf_freecnt = nswbuf / 2 + 1;
 
 	return (0);
 }
 
 /*
  * Tear down what nfs_init() set up: stop the callout, make every nfsiod
  * exit and wait for the last one, then release the nfsnode table and
  * the nfsmount zone.  The request queue is asserted to be empty.
  * vfsp is not examined.  Always returns 0.
  */
 int
 nfs_uninit(struct vfsconf *vfsp)
 {
 	int i;
 
 	callout_stop(&nfs_callout);
 
 	KASSERT(TAILQ_EMPTY(&nfs_reqq),
 	    ("nfs_uninit: request queue not empty"));
 
 	/*
 	 * Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup
 	 * any sleeping nfsiods so they check nfs_iodmax and exit.
 	 */
 	mtx_lock(&nfs_iod_mtx);
 	nfs_iodmax = 0;
 	for (i = 0; i < nfs_numasync; i++)
 		if (nfs_iodwant[i])
 			wakeup(&nfs_iodwant[i]);
 	/* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */
 	while (nfs_numasync)
 		msleep(&nfs_numasync, &nfs_iod_mtx, PWAIT, "ioddie", 0);
 	mtx_unlock(&nfs_iod_mtx);
 	nfs_nhuninit();
 	uma_zdestroy(nfsmount_zone);
 	return (0);
 }
 
 /*
  * Acquire the directory cookie lock of np: sleep on n_flag until the
  * NDIRCOOKIELK bit is clear, then set it.  n_mtx is held while the flag
  * is tested and set.
  */
 void
 nfs_dircookie_lock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	for (;;) {
 		if ((np->n_flag & NDIRCOOKIELK) == 0)
 			break;
 		(void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0);
 	}
 	np->n_flag |= NDIRCOOKIELK;
 	mtx_unlock(&np->n_mtx);
 }
 
 /*
  * Release the directory cookie lock of np: clear NDIRCOOKIELK under
  * n_mtx and wake up anything sleeping on n_flag.
  */
 void
 nfs_dircookie_unlock(struct nfsnode *np)
 {
 	mtx_lock(&np->n_mtx);
 	np->n_flag = np->n_flag & ~NDIRCOOKIELK;
 	wakeup(&np->n_flag);
 	mtx_unlock(&np->n_mtx);
 }
 
 int
 nfs_upgrade_vnlock(struct vnode *vp, struct thread *td)
 {
 	int old_lock;
 	
  	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
  		if (old_lock == LK_SHARED) {
  			/* Upgrade to exclusive lock, this might block */
- 			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
+ 			vn_lock(vp, LK_UPGRADE | LK_RETRY);
  		} else {
- 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  		}
   	}
 	return old_lock;
 }
 
 void
 nfs_downgrade_vnlock(struct vnode *vp, struct thread *td, int old_lock)
 {
 	if (old_lock != LK_EXCLUSIVE) {
  		if (old_lock == LK_SHARED) {
  			/* Downgrade from exclusive lock, this might block */
- 			vn_lock(vp, LK_DOWNGRADE, td);
+ 			vn_lock(vp, LK_DOWNGRADE);
  		} else {
  			VOP_UNLOCK(vp, 0, td);
  		}
   	}
 }
 
 /*
  * printf() wrapper that holds Giant around the output.
  *
  * fmt is a printf-style format string and the remaining arguments are
  * its operands.  They are collected into a va_list, so they must be
  * handed to vprintf(); passing the va_list to printf() made the format
  * consume the va_list object itself instead of the caller's arguments.
  */
 void
 nfs_printf(const char *fmt, ...)
 {
 	va_list ap;
 
 	mtx_lock(&Giant);
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	mtx_unlock(&Giant);
 }
 
 /*
  * Attribute cache routines.
  * nfs_loadattrcache() - loads or updates the cache contents from attributes
  *	that are on the mbuf list
  * nfs_getattrcache() - returns valid attributes if found in cache, returns
  *	error otherwise
  */
 
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the values on the mbuf list and
  * Iff vaper not NULL
  *    copy the attributes to *vaper
  *
  * vpp        - vnode whose nfsnode receives the attributes
  * mdp, dposp - current mbuf and position in the reply; advanced past
  *              the attributes by nfsm_disct()
  * vaper      - optional copy-out of the resulting attributes
  * dontshrink - when set, a regular file's cached size is not reduced;
  *              the attributes are marked stale instead
  *
  * Returns 0, or EBADRPC if the attributes cannot be dissected.
  */
 int
 nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp,
 		  struct vattr *vaper, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap;
 	struct nfs_fattr *fp;
 	struct nfsnode *np;
 	int32_t t1;
 	caddr_t cp2;
 	int rdev;
 	struct mbuf *md;
 	enum vtype vtyp;
 	u_short vmode;
 	struct timespec mtime, mtime_save;
 	int v3 = NFS_ISV3(vp);
 	struct thread *td = curthread;
 
 	md = *mdp;
 	/* Bytes remaining in the current mbuf after *dposp. */
 	t1 = (mtod(md, caddr_t) + md->m_len) - *dposp;
 	cp2 = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, M_TRYWAIT);
 	if (cp2 == NULL)
 		return EBADRPC;
 	fp = (struct nfs_fattr *)cp2;
 	if (v3) {
 		vtyp = nfsv3tov_type(fp->fa_type);
 		vmode = fxdr_unsigned(u_short, fp->fa_mode);
 		rdev = makedev(fxdr_unsigned(int, fp->fa3_rdev.specdata1),
 			fxdr_unsigned(int, fp->fa3_rdev.specdata2));
 		fxdr_nfsv3time(&fp->fa3_mtime, &mtime);
 	} else {
 		vtyp = nfsv2tov_type(fp->fa_type);
 		vmode = fxdr_unsigned(u_short, fp->fa_mode);
 		/*
 		 * XXX
 		 *
 		 * The duplicate information returned in fa_type and fa_mode
 		 * is an ambiguity in the NFS version 2 protocol.
 		 *
 		 * VREG should be taken literally as a regular file.  If a
 		 * server intends to return some type information differently
 		 * in the upper bits of the mode field (e.g. for sockets, or
 		 * FIFOs), NFSv2 mandates fa_type to be VNON.  Anyway, we
 		 * leave the examination of the mode bits even in the VREG
 		 * case to avoid breakage for bogus servers, but we make sure
 		 * that there are actually type bits set in the upper part of
 		 * fa_mode (and failing that, trust the va_type field).
 		 *
 		 * NFSv3 cleared the issue, and requires fa_mode to not
 		 * contain any type information (while also introducing
 		 * sockets and FIFOs for fa_type).
 		 */
 		if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0))
 			vtyp = IFTOVT(vmode);
 		rdev = fxdr_unsigned(int32_t, fp->fa2_rdev);
 		fxdr_nfsv2time(&fp->fa2_mtime, &mtime);
 
 		/*
 		 * Really ugly NFSv2 kludge.
 		 */
 		if (vtyp == VCHR && rdev == 0xffffffff)
 			vtyp = VFIFO;
 	}
 
 	/*
 	 * If v_type == VNON it is a new node, so fill in the v_type,
 	 * n_mtime fields. Check to see if it represents a special
 	 * device, and if so, check for a possible alias. Once the
 	 * correct vnode has been obtained, fill in the rest of the
 	 * information.
 	 */
 	np = VTONFS(vp);
 	mtx_lock(&np->n_mtx);
 	if (vp->v_type != vtyp) {
 		vp->v_type = vtyp;
 		if (vp->v_type == VFIFO)
 			vp->v_op = &nfs_fifoops;
 		np->n_mtime = mtime;
 	}
 	vap = &np->n_vattr;
 	vap->va_type = vtyp;
 	vap->va_mode = (vmode & 07777);
 	vap->va_rdev = rdev;
 	/* Remember the previous mtime for the went-backwards check below. */
 	mtime_save = vap->va_mtime;
 	vap->va_mtime = mtime;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	if (v3) {
 		vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
 		vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
 		vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
 		vap->va_size = fxdr_hyper(&fp->fa3_size);
 		vap->va_blocksize = NFS_FABLKSIZE;
 		vap->va_bytes = fxdr_hyper(&fp->fa3_used);
 		vap->va_fileid = fxdr_unsigned(int32_t,
 		    fp->fa3_fileid.nfsuquad[1]);
 		fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime);
 		fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime);
 		vap->va_flags = 0;
 		vap->va_filerev = 0;
 	} else {
 		vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
 		vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
 		vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
 		vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size);
 		vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize);
 		vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks)
 		    * NFS_FABLKSIZE;
 		vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid);
 		fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime);
 		vap->va_flags = 0;
 		vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t,
 		    fp->fa2_ctime.nfsv2_sec);
 		vap->va_ctime.tv_nsec = 0;
 		/* The v2 ctime microseconds field is stored as va_gen. */
 		vap->va_gen = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_usec);
 		vap->va_filerev = 0;
 	}
 	np->n_attrstamp = time_second;
 	/* Timestamp the NFS otw getattr fetch */
 	if (td->td_proc) {
 		np->n_ac_ts_tid = td->td_tid;
 		np->n_ac_ts_pid = td->td_proc->p_pid;
 		np->n_ac_ts_syscalls = td->td_syscalls;
 	} else
 		bzero(&np->n_ac_ts, sizeof(struct nfs_attrcache_timestamp));
 	
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
 				/*
 				 * We've been told not to shrink the file;
 				 * zero np->n_attrstamp to indicate that
 				 * the attributes are stale.
 				 */
 				vap->va_size = np->n_size;
 				np->n_attrstamp = 0;
 			} else if (np->n_flag & NMODIFIED) {
 				/*
 				 * We've modified the file: Use the larger
 				 * of our size, and the server's size.
 				 */
 				if (vap->va_size < np->n_size) {
 					vap->va_size = np->n_size;
 				} else {
 					np->n_size = vap->va_size;
 					np->n_flag |= NSIZECHANGED;
 				}
 			} else {
 				np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 			}
 			vnode_pager_setsize(vp, np->n_size);
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	/*
 	 * The following checks are added to prevent a race between (say)
 	 * a READDIR+ and a WRITE.
 	 * READDIR+, WRITE requests sent out.
 	 * READDIR+ resp, WRITE resp received on client.
 	 * However, the WRITE resp was handled before the READDIR+ resp
 	 * causing the post op attrs from the write to be loaded first
 	 * and the attrs from the READDIR+ to be loaded later. If this
 	 * happens, we have stale attrs loaded into the attrcache.
 	 * We detect this by checking for the mtime moving back. We
 	 * invalidate the attrcache when this happens.
 	 */
 	if (timespeccmp(&mtime_save, &vap->va_mtime, >))
 		/* mtime went backwards */
 		np->n_attrstamp = 0;
 	if (vaper != NULL) {
 		bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		/* Locally pending access/update times override the copy. */
 		if (np->n_flag & NCHG) {
 			if (np->n_flag & NACC)
 				vaper->va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
 	mtx_unlock(&np->n_mtx);
 	return (0);
 }
 
 #ifdef NFS_ACDEBUG
 #include <sys/sysctl.h>
 SYSCTL_DECL(_vfs_nfs);
 static int nfs_acdebug;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, "");
 #endif
 
 /*
  * Check the time stamp
  * If the cache is valid, copy contents to *vaper and return 0
  * otherwise return ENOENT
  *
  * The cached attributes are valid while their age (time_second minus
  * n_attrstamp) is below a timeout.  The timeout starts as one tenth of
  * the time since n_mtime and is clamped to the mount's
  * acdirmin/acdirmax for directories and acregmin/acregmax for anything
  * else; a node with NMODIFIED set always gets the minimum.
  *
  * On a hit, a size mismatch between the cache and n_size is settled
  * first: for a regular file with NMODIFIED the larger value wins,
  * otherwise the cached size is adopted, and the pager is told the
  * resulting size.  Locally pending access/update times (NACC/NUPD
  * under NCHG) override the copied values.
  *
  * With NFS_ACDEBUG, Giant is held around the body for nfs_printf(); it
  * is released on the miss path as well as on the hit path.
  */
 int
 nfs_getattrcache(struct vnode *vp, struct vattr *vaper)
 {
 	struct nfsnode *np;
 	struct vattr *vap;
 	struct nfsmount *nmp;
 	int timeo;
 
 	np = VTONFS(vp);
 	vap = &np->n_vattr;
 	nmp = VFSTONFS(vp->v_mount);
 #ifdef NFS_ACDEBUG
 	mtx_lock(&Giant);	/* nfs_printf() */
 #endif
 	mtx_lock(&np->n_mtx);
 	/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
 	timeo = (time_second - np->n_mtime.tv_sec) / 10;
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug>1)
 		nfs_printf("nfs_getattrcache: initial timeo = %d\n", timeo);
 #endif
 
 	if (vap->va_type == VDIR) {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin)
 			timeo = nmp->nm_acdirmin;
 		else if (timeo > nmp->nm_acdirmax)
 			timeo = nmp->nm_acdirmax;
 	} else {
 		if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin)
 			timeo = nmp->nm_acregmin;
 		else if (timeo > nmp->nm_acregmax)
 			timeo = nmp->nm_acregmax;
 	}
 
 #ifdef NFS_ACDEBUG
 	if (nfs_acdebug > 2)
 		nfs_printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
 			   nmp->nm_acregmin, nmp->nm_acregmax,
 			   nmp->nm_acdirmin, nmp->nm_acdirmax);
 
 	/* The age is printed with %d, so narrow it explicitly. */
 	if (nfs_acdebug)
 		nfs_printf("nfs_getattrcache: age = %d; final timeo = %d\n",
 			   (int)(time_second - np->n_attrstamp), timeo);
 #endif
 
 	if ((time_second - np->n_attrstamp) >= timeo) {
 		nfsstats.attrcache_misses++;
 		mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 		/* Do not leave Giant held on the miss path. */
 		mtx_unlock(&Giant);	/* nfs_printf() */
 #endif
 		return (ENOENT);
 	}
 	nfsstats.attrcache_hits++;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (np->n_flag & NMODIFIED) {
 				if (vap->va_size < np->n_size)
 					vap->va_size = np->n_size;
 				else
 					np->n_size = vap->va_size;
 			} else {
 				np->n_size = vap->va_size;
 			}
 			vnode_pager_setsize(vp, np->n_size);
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
 	if (np->n_flag & NCHG) {
 		if (np->n_flag & NACC)
 			vaper->va_atime = np->n_atim;
 		if (np->n_flag & NUPD)
 			vaper->va_mtime = np->n_mtim;
 	}
 	mtx_unlock(&np->n_mtx);
 #ifdef NFS_ACDEBUG
 	mtx_unlock(&Giant);	/* nfs_printf() */
 #endif
 	return (0);
 }
 
 /* Cookie returned for the first directory block and for negative offsets. */
 static nfsuint64 nfs_nullcookie = { { 0, 0 } };
 /*
  * This function finds the directory cookie that corresponds to the
  * logical byte offset given.
  *
  * np  - directory nfsnode whose n_cookies list is searched
  * off - logical byte offset; it is divided by NFS_DIRBLKSIZ to get the
  *       block number
  * add - when non-zero, missing list entries are allocated (M_WAITOK)
  *       and the end-of-cookies mark is extended so that a slot exists
  *
  * Returns a pointer to the cookie slot, &nfs_nullcookie for block 0 or
  * a negative offset, or NULL when add is zero and no cookie has been
  * recorded for that block.
  */
 nfsuint64 *
 nfs_getcookie(struct nfsnode *np, off_t off, int add)
 {
 	struct nfsdmap *dp, *dp2;
 	int pos;
 	nfsuint64 *retval = NULL;
 	
 	pos = (uoff_t)off / NFS_DIRBLKSIZ;
 	if (pos == 0 || off < 0) {
 #ifdef DIAGNOSTIC
 		if (add)
 			panic("nfs getcookie add at <= 0");
 #endif
 		return (&nfs_nullcookie);
 	}
 	/* Slot 0 of the first map entry belongs to block 1. */
 	pos--;
 	dp = LIST_FIRST(&np->n_cookies);
 	if (!dp) {
 		if (add) {
 			MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp->ndm_eocookie = 0;
 			LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
 		} else
 			goto out;
 	}
 	/* Each list entry holds NFSNUMCOOKIES slots; walk to the right one. */
 	while (pos >= NFSNUMCOOKIES) {
 		pos -= NFSNUMCOOKIES;
 		if (LIST_NEXT(dp, ndm_list)) {
 			if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
 			    pos >= dp->ndm_eocookie)
 				goto out;
 			dp = LIST_NEXT(dp, ndm_list);
 		} else if (add) {
 			MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap),
 				M_NFSDIROFF, M_WAITOK);
 			dp2->ndm_eocookie = 0;
 			LIST_INSERT_AFTER(dp, dp2, ndm_list);
 			dp = dp2;
 		} else
 			goto out;
 	}
 	/* Slots at or past ndm_eocookie have not been filled in yet. */
 	if (pos >= dp->ndm_eocookie) {
 		if (add)
 			dp->ndm_eocookie = pos + 1;
 		else
 			goto out;
 	}
 	retval = &dp->ndm_cookies[pos];
 out:
 	return (retval);
 }
 
 /*
  * Invalidate cached directory information, except for the actual directory
  * blocks (which are invalidated separately).
  * Done mainly to avoid the use of stale offset cookies.
  */
 void
 nfs_invaldir(struct vnode *vp)
 {
 	struct nfsnode *np = VTONFS(vp);
 
 #ifdef DIAGNOSTIC
 	if (vp->v_type != VDIR)
 		panic("nfs: invaldir not dir");
 #endif
 	nfs_dircookie_lock(np);
 	np->n_direofoffset = 0;
 	np->n_cookieverf.nfsuquad[0] = 0;
 	np->n_cookieverf.nfsuquad[1] = 0;
 	if (LIST_FIRST(&np->n_cookies))
 		LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
 	nfs_dircookie_unlock(np);
 }
 
 /*
  * The write verifier has changed (probably due to a server reboot), so all
  * B_NEEDCOMMIT blocks will have to be written again. Since they are on the
  * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
  * and B_CLUSTEROK flags.  Once done the new write verifier can be set for the
  * mount point.
  *
  * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
  * writes are not clusterable.
  *
  * Every vnode of mp is visited except those marked VI_DOOMED, and only
  * buffers with BUF_REFCNT(bp) == 0 are touched.
  */
 void
 nfs_clearcommit(struct mount *mp)
 {
 	struct vnode *vp, *nvp;
 	struct buf *bp, *nbp;
 	int s;
 
 	s = splbio();
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, nvp) {
 		VI_LOCK(vp);
 		if (vp->v_iflag & VI_DOOMED) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/*
 		 * The mount interlock is dropped while the vnode's dirty list
 		 * is scanned under the vnode interlock, then retaken.
 		 */
 		MNT_IUNLOCK(mp);
 		TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 		VI_UNLOCK(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	splx(s);
 }
 
 /*
  * Helper functions for former macros.  Some of these should be
  * moved to their callers.
  */
 
 /*
  * Dissect an optional file handle followed by optional attributes from a
  * reply, get the matching vnode and load its attribute cache.
  *
  * d        - vnode whose mount is used for the nfs_nget() lookup
  * v        - out: vnode for the dissected file handle (locked
  *            LK_EXCLUSIVE by nfs_nget())
  * v3       - non-zero for a version 3 reply; a version 2 reply always
  *            carries both the handle and the attributes
  * f        - out: non-zero when attributes were present and loaded
  * md, dpos - current mbuf and position in the reply
  *
  * Returns 0, EBADRPC on a short reply, or the error from
  * nfsm_getfh_xx(), nfs_nget() or nfs_loadattrcache().
  *
  * NOTE(review): *f is used first as "handle follows" and then as
  * "attributes follow", so on return with *f == 0 a vnode may or may not
  * have been stored in *v -- confirm how callers tell the two apart.
  */
 int
 nfsm_mtofh_xx(struct vnode *d, struct vnode **v, int v3, int *f,
     struct mbuf **md, caddr_t *dpos)
 {
 	struct nfsnode *ttnp;
 	struct vnode *ttvp;
 	nfsfh_t *ttfhp;
 	u_int32_t *tl;
 	int ttfhsize;
 	int t1;
 
 	/* First flag word (v3 only): is a file handle present? */
 	if (v3) {
 		tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (tl == NULL)
 			return EBADRPC;
 		*f = fxdr_unsigned(int, *tl);
 	} else
 		*f = 1;
 	if (*f) {
 		t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos);
 		if (t1 != 0)
 			return t1;
 		t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp, LK_EXCLUSIVE);
 		if (t1 != 0)
 			return t1;
 		*v = NFSTOV(ttnp);
 	}
 	/* Second flag word (v3 only): are attributes present? */
 	if (v3) {
 		tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (tl == NULL)
 			return EBADRPC;
 		if (*f)
 			*f = fxdr_unsigned(int, *tl);
 		else if (fxdr_unsigned(int, *tl))
 			/* Attributes without a handle: skip over them. */
 			nfsm_adv_xx(NFSX_V3FATTR, md, dpos);
 	}
 	if (*f) {
 		ttvp = *v;
 		t1 = nfs_loadattrcache(&ttvp, md, dpos, NULL, 0);
 		if (t1)
 			return t1;
 		*v = ttvp;
 	}
 	return 0;
 }
 
 /*
  * Dissect a file handle from a reply.
  *
  * For version 3 the handle is preceded by a length word, which must be
  * in the range 1..NFSX_V3FHMAX; for version 2 the length is always
  * NFSX_V2FH.  On success *s holds the length and *f points at the
  * handle inside the reply.  Returns 0 or EBADRPC.
  */
 int
 nfsm_getfh_xx(nfsfh_t **f, int *s, int v3, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *lenp;
 
 	if (!v3) {
 		*s = NFSX_V2FH;
 	} else {
 		lenp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 		if (lenp == NULL)
 			return (EBADRPC);
 		*s = fxdr_unsigned(int, *lenp);
 		if (*s <= 0 || *s > NFSX_V3FHMAX)
 			return (EBADRPC);
 	}
 
 	*f = nfsm_dissect_xx(nfsm_rndup(*s), md, dpos);
 	if (*f != NULL)
 		return (0);
 	return (EBADRPC);
 }
 
 
 /*
  * Load the attributes at the current reply position into the attribute
  * cache of *v and, when va is not NULL, into *va as well (dontshrink is
  * passed as 0).  *v is rewritten only on success.  Returns the result
  * of nfs_loadattrcache().
  */
 int
 nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md,
 		 caddr_t *dpos)
 {
 	struct vnode *nvp = *v;
 	int error;
 
 	error = nfs_loadattrcache(&nvp, md, dpos, va, 0);
 	if (error == 0)
 		*v = nvp;
 	return (error);
 }
 
 /*
  * Dissect optional post-operation attributes.
  *
  * A flag word is read into *f; when it is non-zero the attributes that
  * follow are loaded into the attribute cache of *v with dontshrink set.
  * If loading fails *f is reset to 0 and the error is returned.
  * Returns 0 on success and EBADRPC if the flag word is missing.
  */
 int
 nfsm_postop_attr_xx(struct vnode **v, int *f, struct mbuf **md,
 		    caddr_t *dpos)
 {
 	struct vnode *nvp = *v;
 	u_int32_t *flagp;
 	int error;
 
 	flagp = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 	if (flagp == NULL)
 		return (EBADRPC);
 
 	*f = fxdr_unsigned(int, *flagp);
 	if (*f == 0)
 		return (0);
 
 	error = nfs_loadattrcache(&nvp, md, dpos, NULL, 1);
 	if (error != 0) {
 		*f = 0;
 		return (error);
 	}
 	*v = nvp;
 	return (0);
 }
 
 /*
  * Dissect weak cache consistency data: optional pre-operation
  * attributes followed by optional post-operation attributes.
  *
  * If *f is non-zero on entry it is replaced by whether the
  * pre-operation mtime in the reply (words 2 and 3 of the six) equals
  * the node's n_mtime, which is 0 when no pre-operation attributes were
  * sent.  If *f is zero on entry it is replaced by the post-operation
  * "attributes present" flag.  Returns 0, EBADRPC, or the error from
  * nfsm_postop_attr_xx().
  */
 int
 nfsm_wcc_data_xx(struct vnode **v, int *f, struct mbuf **md, caddr_t *dpos)
 {
 	struct nfsnode *np;
 	u_int32_t *tl;
 	int postop_present, mtime_same = 0;
 	int error;
 
 	tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
 	if (tl == NULL)
 		return (EBADRPC);
 
 	if (*tl == nfs_true) {
 		tl = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos);
 		if (tl == NULL)
 			return (EBADRPC);
 		np = VTONFS(*v);
 		mtx_lock(&np->n_mtx);
 		if (*f) {
 			mtime_same =
 			    (np->n_mtime.tv_sec ==
 				fxdr_unsigned(u_int32_t, tl[2]) &&
 			     np->n_mtime.tv_nsec ==
 				fxdr_unsigned(u_int32_t, tl[3]));
 		}
 		mtx_unlock(&np->n_mtx);
 	}
 
 	error = nfsm_postop_attr_xx(v, &postop_present, md, dpos);
 	if (error)
 		return (error);
 
 	*f = *f ? mtime_same : postop_present;
 	return (0);
 }
 
 /*
  * Append the s-byte string a to a request as an XDR string (length
  * word, data, zero padding to a word boundary).
  *
  * Fails with ENAMETOOLONG when s exceeds m.  When the whole encoding
  * fits in the space left in *mb it is built in place; otherwise the
  * work is handed to nfsm_strtmbuf() and its result is returned.
  */
 int
 nfsm_strtom_xx(const char *a, int s, int m, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *tl;
 	int need;
 
 	if (s > m)
 		return (ENAMETOOLONG);
 
 	/* Length word plus the string rounded up to a whole word. */
 	need = nfsm_rndup(s) + NFSX_UNSIGNED;
 	if (need > M_TRAILINGSPACE(*mb))
 		return (nfsm_strtmbuf(mb, bpos, a, s));
 
 	tl = nfsm_build_xx(need, mb, bpos);
 	*tl++ = txdr_unsigned(s);
 	/* Zero the last word first so the padding bytes end up zero. */
 	tl[(need >> 2) - 2] = 0;
 	bcopy(a, tl, s);
 	return (0);
 }
 
 /*
  * Append the file handle of vnode v to a request.
  *
  * For version 3 the handle is written as variable-length XDR opaque
  * data (length word, n_fhsize bytes, zero padding); for version 2 the
  * fixed NFSX_V2FH bytes are copied.
  *
  * The version 3 encoding is built in place whenever it fits in the
  * space left in *mb, including an exact fit, which is the same test
  * nfsm_strtom_xx() uses.  Only a handle that really does not fit goes
  * to nfsm_strtmbuf(): that routine fills the rest of the current mbuf
  * before chaining new ones and, on an exact fit, would consume the
  * whole handle there and never allocate the mbuf it stores and
  * dereferences at the end.
  *
  * Returns 0 or the error from nfsm_strtmbuf().
  */
 int
 nfsm_fhtom_xx(struct vnode *v, int v3, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *tl;
 	int t1;
 	caddr_t cp;
 
 	if (v3) {
 		/* Length word plus the handle rounded up to a whole word. */
 		t1 = nfsm_rndup(VTONFS(v)->n_fhsize) + NFSX_UNSIGNED;
 		if (t1 <= M_TRAILINGSPACE(*mb)) {
 			tl = nfsm_build_xx(t1, mb, bpos);
 			*tl++ = txdr_unsigned(VTONFS(v)->n_fhsize);
 			/* Zero the last word first so the padding is zero. */
 			*(tl + ((t1 >> 2) - 2)) = 0;
 			bcopy(VTONFS(v)->n_fhp, tl, VTONFS(v)->n_fhsize);
 		} else {
 			t1 = nfsm_strtmbuf(mb, bpos,
 			    (const char *)VTONFS(v)->n_fhp,
 			    VTONFS(v)->n_fhsize);
 			if (t1 != 0)
 				return t1;
 		}
 	} else {
 		cp = nfsm_build_xx(NFSX_V2FH, mb, bpos);
 		bcopy(VTONFS(v)->n_fhp, cp, NFSX_V2FH);
 	}
 	return 0;
 }
 
 /*
  * Append a settable-attributes structure built from va to a request.
  *
  * Six items are emitted in a fixed order: mode, uid, gid, size, atime
  * and mtime.  The first four are each a boolean "present" word followed
  * by the value when present; uid, gid and size are only ever sent when
  * full is non-zero.  Each time is a selector word: DONTCHANGE when
  * tv_sec is VNOVAL, TOSERVER when tv_sec equals time_second, and
  * otherwise TOCLIENT followed by the time itself.
  */
 void
 nfsm_v3attrbuild_xx(struct vattr *va, int full, struct mbuf **mb,
     caddr_t *bpos)
 {
 	u_int32_t *tl;
 
 	/* mode */
 	if (va->va_mode != (mode_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_mode);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	/* uid */
 	if (full && va->va_uid != (uid_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_uid);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	/* gid */
 	if (full && va->va_gid != (gid_t)VNOVAL) {
 		tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		*tl = txdr_unsigned(va->va_gid);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	/* size (two words) */
 	if (full && va->va_size != VNOVAL) {
 		tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 		*tl++ = nfs_true;
 		txdr_hyper(va->va_size, tl);
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = nfs_false;
 	}
 	/* atime */
 	if (va->va_atime.tv_sec != VNOVAL) {
 		if (va->va_atime.tv_sec != time_second) {
 			tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 			*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 			txdr_nfsv3time(&va->va_atime, tl);
 		} else {
 			tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 		}
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 	}
 	/* mtime */
 	if (va->va_mtime.tv_sec != VNOVAL) {
 		if (va->va_mtime.tv_sec != time_second) {
 			tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
 			*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
 			txdr_nfsv3time(&va->va_mtime, tl);
 		} else {
 			tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 			*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
 		}
 	} else {
 		tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
 		*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
 	}
 }
Index: head/sys/nfsclient/nfs_vnops.c
===================================================================
--- head/sys/nfsclient/nfs_vnops.c	(revision 175201)
+++ head/sys/nfsclient/nfs_vnops.c	(revision 175202)
@@ -1,3273 +1,3273 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2 and 3
  */
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <fs/fifofs/fifo.h>
 
 #include <rpc/rpcclnt.h>
 
 #include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsclient/nfsnode.h>
 #include <nfsclient/nfsmount.h>
 #include <nfsclient/nfs_lock.h>
 #include <nfs/xdr_subs.h>
 #include <nfsclient/nfsm_subs.h>
 
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 /* Defs */
 #define	TRUE	1
 #define	FALSE	0
 
 /*
  * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
  * calls are not in getblk() and brelse() so that they would not be necessary
  * here.
  */
 #ifndef B_VMIO
 #define vfs_busy_pages(bp, f)
 #endif
 
 static vop_read_t	nfsfifo_read;
 static vop_write_t	nfsfifo_write;
 static vop_close_t	nfsfifo_close;
 static int	nfs_flush(struct vnode *, int, struct thread *,
 		    int);
 static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 		    struct thread *);
 static vop_lookup_t	nfs_lookup;
 static vop_create_t	nfs_create;
 static vop_mknod_t	nfs_mknod;
 static vop_open_t	nfs_open;
 static vop_close_t	nfs_close;
 static vop_access_t	nfs_access;
 static vop_getattr_t	nfs_getattr;
 static vop_setattr_t	nfs_setattr;
 static vop_read_t	nfs_read;
 static vop_fsync_t	nfs_fsync;
 static vop_remove_t	nfs_remove;
 static vop_link_t	nfs_link;
 static vop_rename_t	nfs_rename;
 static vop_mkdir_t	nfs_mkdir;
 static vop_rmdir_t	nfs_rmdir;
 static vop_symlink_t	nfs_symlink;
 static vop_readdir_t	nfs_readdir;
 static vop_strategy_t	nfs_strategy;
 static	int	nfs_lookitup(struct vnode *, const char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_access_t	nfsspec_access;
 static vop_readlink_t	nfs_readlink;
 static vop_print_t	nfs_print;
 static vop_advlock_t	nfs_advlock;
 
 /*
  * Global vfs data structures for nfs
  *
  * nfs_vnodeops is the vnode operation vector for NFS vnodes.  Its
  * .vop_default entry points at default_vnodeops and .vop_lease is
  * explicitly set to VOP_NULL; every other entry names an nfs_* handler.
  */
 struct vop_vector nfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs_access,
 	.vop_advlock =		nfs_advlock,
 	.vop_close =		nfs_close,
 	.vop_create =		nfs_create,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_getpages =		nfs_getpages,
 	.vop_putpages =		nfs_putpages,
 	.vop_inactive =		nfs_inactive,
 	.vop_lease =		VOP_NULL,
 	.vop_link =		nfs_link,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
 	.vop_open =		nfs_open,
 	.vop_print =		nfs_print,
 	.vop_read =		nfs_read,
 	.vop_readdir =		nfs_readdir,
 	.vop_readlink =		nfs_readlink,
 	.vop_reclaim =		nfs_reclaim,
 	.vop_remove =		nfs_remove,
 	.vop_rename =		nfs_rename,
 	.vop_rmdir =		nfs_rmdir,
 	.vop_setattr =		nfs_setattr,
 	.vop_strategy =		nfs_strategy,
 	.vop_symlink =		nfs_symlink,
 	.vop_write =		nfs_write,
 };
 
 /*
  * Vnode operation vector for fifos that live on an NFS mount.
  * .vop_default points at fifo_specops; read, write and close use the
  * nfsfifo_* wrappers, access uses nfsspec_access, and the remaining
  * listed entries reuse the regular nfs_* handlers.
  */
 struct vop_vector nfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		nfsspec_access,
 	.vop_close =		nfsfifo_close,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_inactive =		nfs_inactive,
 	.vop_print =		nfs_print,
 	.vop_read =		nfsfifo_read,
 	.vop_reclaim =		nfs_reclaim,
 	.vop_setattr =		nfs_setattr,
 	.vop_write =		nfsfifo_write,
 };
 
 static int	nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
 			     struct componentname *cnp, struct vattr *vap);
 static int	nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
 			      int fnamelen, struct vnode *tdvp,
 			      const char *tnameptr, int tnamelen,
 			      struct ucred *cred, struct thread *td);
 static int	nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
 			     struct sillyrename *sp);
 
 /*
  * Global variables
  */
 struct mtx 	nfs_iod_mtx;
 struct proc	*nfs_iodwant[NFS_MAXASYNCDAEMON];
 struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
 int		 nfs_numasync = 0;
 #define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))
 
 SYSCTL_DECL(_vfs_nfs);
 
 static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 static int	nfsv3_commit_on_close = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
 	   &nfsv3_commit_on_close, 0, "write+commit on close, else only write");
 
 static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
 int nfs_directio_enable = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 	   &nfs_directio_enable, 0, "Enable NFS directio");
 
 /*
  * This sysctl allows other processes to mmap a file that has been opened
  * O_DIRECT by a process.  In general, having processes mmap the file while
  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
  * this by default to prevent DoS attacks - to prevent a malicious user from
  * opening up files O_DIRECT preventing other users from mmap'ing these
  * files.  "Protected" environments where stricter consistency guarantees are
  * required can disable this knob.  The process that opened the file O_DIRECT
  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
  * meaningful.
  */
 int nfs_directio_allow_mmap = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 	   &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 
 #if 0
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
 	   &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
 #endif
 
 #define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
 			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
 			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
 
 /*
  * SMP Locking Note:
  * Each entry below names a lock and what it protects.  The indented
  * lines that follow an entry list, in order, the other locks that may
  * be acquired while that lock is held.
  *
  * np->n_mtx : Protects the fields in the nfsnode.
  *	VM Object Lock
  *	VI_MTX (acquired indirectly)
  * nmp->nm_mtx : Protects the fields in the nfsmount.
  *	rep->r_mtx
  * nfs_iod_mtx : Global lock, protects shared nfsiod state.
  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
  *	nmp->nm_mtx
  *	rep->r_mtx
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
 /*
  * Issue an NFSv3 ACCESS rpc ("over the wire") for vp, asking about the
  * access bits in wmode, and cache the reply in the nfsnode.
  *
  * On success the bits returned by the server, the uid of cred and the
  * current time_second are stored in n_mode, n_modeuid and n_modestamp
  * while holding np->n_mtx.  Returns the value left in `error'.
  *
  * NOTE(review): the nfsm_* macros are defined outside this file section;
  * they appear to rely on the local names used here (mb, bpos, md, dpos,
  * mreq, mrep, error) and to jump to nfsmout on failure -- confirm against
  * nfsm_subs.h before renaming or reordering anything.
  */
 static int
 nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred)
 {
 	/* Both callers in this file only get here for version 3 mounts. */
 	const int v3 = 1;
 	u_int32_t *tl;
 	int error = 0, attrflag;
 
 	struct mbuf *mreq, *mrep, *md, *mb;
 	caddr_t bpos, dpos;
 	u_int32_t rmode;
 	struct nfsnode *np = VTONFS(vp);
 
 	nfsstats.rpccnt[NFSPROC_ACCESS]++;
 	/* Request body: file handle followed by one word of access bits. */
 	mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(wmode);
 	nfsm_request(vp, NFSPROC_ACCESS, td, cred);
 	nfsm_postop_attr(vp, attrflag);
 	if (!error) {
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		rmode = fxdr_unsigned(u_int32_t, *tl);
 		/* Publish the three cache fields together under the node lock. */
 		mtx_lock(&np->n_mtx);
 		np->n_mode = rmode;
 		np->n_modeuid = cred->cr_uid;
 		np->n_modestamp = time_second;
 		mtx_unlock(&np->n_mtx);
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs access vnode op.
  * For nfs version 2, just return ok. File accesses may fail later.
  * For nfs version 3, use the access rpc to check accessibility. If file modes
  * are changed on the server, accesses might still fail later.
  */
 static int
 nfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0;
 	u_int32_t mode, wmode;
 	int v3 = NFS_ISV3(vp);
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	if (v3) {
 		if (ap->a_mode & VREAD)
 			mode = NFSV3ACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_mode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
 			if (ap->a_mode & VEXEC)
 				mode |= NFSV3ACCESS_EXECUTE;
 		} else {
 			if (ap->a_mode & VWRITE)
 				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
 					 NFSV3ACCESS_DELETE);
 			if (ap->a_mode & VEXEC)
 				mode |= NFSV3ACCESS_LOOKUP;
 		}
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfsaccess_cache_timeout > 0) {
 			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
 				NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
 				NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 */
 		mtx_lock(&np->n_mtx);
 		if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
 		    (ap->a_cred->cr_uid == np->n_modeuid) &&
 		    ((np->n_mode & mode) == mode)) {
 			nfsstats.accesscache_hits++;
 		} else {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 */
 			nfsstats.accesscache_misses++;
 			mtx_unlock(&np->n_mtx);
 		        error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred);
 			mtx_lock(&np->n_mtx);
 			if (!error) {
 				if ((np->n_mode & mode) != mode) {
 					error = EACCES;
 				}
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 		return (error);
 	} else {
 		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
 		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
 		 * if the user is root and the file is not zero length.
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
 			mtx_unlock(&np->n_mtx);
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = 0;
 			auio.uio_resid = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = ap->a_td;
 
 			if (vp->v_type == VREG)
 				error = nfs_readrpc(vp, &auio, ap->a_cred);
 			else if (vp->v_type == VDIR) {
 				char* bp;
 				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 				aiov.iov_base = bp;
 				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 				error = nfs_readdirrpc(vp, &auio, ap->a_cred);
 				free(bp, M_TEMP);
 			} else if (vp->v_type == VLNK)
 				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
 		} else
 			mtx_unlock(&np->n_mtx);
 		return (error);
 	}
 }
 
 int nfs_otw_getattr_avoid = 0;
 
 /*
  * nfs open vnode op
  * Check to see if the type is ok
  * and that deletion is not in progress.
  * For paged in text files, you will need to flush the page cache
  * if consistency is lost.
  *
  * Only regular files, directories and symlinks can be opened; anything
  * else yields EOPNOTSUPP.  The attributes are refetched and cached data
  * is flushed when it is locally modified or when the modification time
  * has changed.  An O_DIRECT open of a regular file (with
  * nfs_directio_enable set) marks the node NNONCACHE and counts the open.
  *
  * All nfsnode fields written here are updated while holding np->n_mtx;
  * the mutex is always dropped before calling nfs_vinvalbuf() or
  * VOP_GETATTR().
  *
  * Returns 0 on success or an errno value.
  */
 /* ARGSUSED */
 static int
 nfs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Get a valid lease. If cached data is stale, flush it.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		mtx_unlock(&np->n_mtx);
 		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		if (error == EINTR || error == EIO)
 			return (error);
 		/*
 		 * Invalidate the cached attributes under the node lock so
 		 * the update cannot race with other users of these fields.
 		 */
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 		if (vp->v_type == VDIR)
 			np->n_direofoffset = 0;
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 		mtx_unlock(&np->n_mtx);
 	} else {
 		struct thread *td = curthread;
 
 		/*
 		 * Force a fresh attribute fetch unless the cached timestamp
 		 * fields match this thread, process and syscall count.
 		 */
 		if (np->n_ac_ts_syscalls != td->td_syscalls ||
 		    np->n_ac_ts_tid != td->td_tid ||
 		    td->td_proc == NULL ||
 		    np->n_ac_ts_pid != td->td_proc->p_pid) {
 			np->n_attrstamp = 0;
 		}
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 		if (error)
 			return (error);
 		mtx_lock(&np->n_mtx);
 		if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
 			mtx_unlock(&np->n_mtx);
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error == EINTR || error == EIO) {
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 		}
 		mtx_unlock(&np->n_mtx);
 	}
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
 	 * n_directio_opens is read and bumped under n_mtx, matching the
 	 * decrement in nfs_close(); the lock is released around the
 	 * buffer invalidation.
 	 */
 	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		mtx_lock(&np->n_mtx);
 		if (np->n_directio_opens == 0) {
 			mtx_unlock(&np->n_mtx);
 			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error)
 				return (error);
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NNONCACHE;
 		}
 		np->n_directio_opens++;
 		mtx_unlock(&np->n_mtx);
 	}
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
 
 /*
  * nfs close vnode op
  * What an NFS client should do upon close after writing is a debatable issue.
  * Most NFS clients push delayed writes to the server upon close, basically for
  * two reasons:
  * 1 - So that any write errors may be reported back to the client process
  *     doing the close system call. By far the two most likely errors are
  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
  * 2 - To put a worst case upper bound on cache inconsistency between
  *     multiple clients for the file.
  * There is also a consistency problem for Version 2 of the protocol w.r.t.
  * not being able to tell if other clients are writing a file concurrently,
  * since there is no way of knowing if the changed modify time in the reply
  * is only due to the write for this client.
  * (NFS Version 3 provides weak cache consistency data in the reply that
  *  should be sufficient to detect and handle this case.)
  *
  * The current code does the following:
  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
  *                     or commit them (this satisfies 1 and 2 except for the
  *                     case where the server crashes after this close but
  *                     before the commit RPC, which is felt to be "good
  *                     enough". Changing the last argument to nfs_flush() to
  *                     a 1 would force a commit operation, if it is felt a
  *                     commit is necessary now.
  */
 /*
  * Close handler.  For regular files: optionally clean dirty VM pages,
  * push modified data to the server (nfs_flush() for v3, nfs_vinvalbuf()
  * otherwise), invalidate the attribute cache and report a pending write
  * error recorded in the node.  For O_DIRECT opens the open count is
  * dropped and NNONCACHE is cleared with the last one.
  * Returns 0 or an errno value.
  */
 /* ARGSUSED */
 static int
 nfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	int fmode = ap->a_fflag;
 
 	if (vp->v_type == VREG) {
 	    /*
 	     * Examine and clean dirty pages, regardless of NMODIFIED.
 	     * This closes a major hole in close-to-open consistency.
 	     * We want to push out all dirty pages (and buffers) on
 	     * close, regardless of whether they were dirtied by
 	     * mmap'ed writes or via write().
 	     */
 	    if (nfs_clean_pages_on_close && vp->v_object) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	    }
 	    mtx_lock(&np->n_mtx);
 	    if (np->n_flag & NMODIFIED) {
 		/* Drop the node lock across the flush; retaken below. */
 		mtx_unlock(&np->n_mtx);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
 		     * must flush them to the NFS server.  We have the option
 		     * of waiting all the way through the commit rpc or just
 		     * waiting for the initial write.  The default is to only
 		     * wait through the initial write so the data is in the
 		     * server's cache, which is roughly similar to the state
 		     * a standard disk subsystem leaves the file in on close().
 		     *
 		     * We cannot clear the NMODIFIED bit in np->n_flag due to
 		     * potential races with other processes, and certainly
 		     * cannot clear it if we don't commit.
 		     */
 		    int cm = nfsv3_commit_on_close ? 1 : 0;
 		    error = nfs_flush(vp, MNT_WAIT, ap->a_td, cm);
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else
 		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		mtx_lock(&np->n_mtx);
 	    }
  	    /* 
  	     * Invalidate the attribute cache in all cases.
  	     * An open is going to fetch fresh attrs any way, other procs
  	     * on this node that have file open will be forced to do an 
  	     * otw attr fetch, but this is safe.
  	     */
 	    np->n_attrstamp = 0;
 	    /*
 	     * A recorded write error replaces whatever the flush
 	     * returned, and the flag is cleared once reported.
 	     */
 	    if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
 	    mtx_unlock(&np->n_mtx);
 	}
 	/* NOTE(review): n_directio_asyncwr is read here without n_mtx. */
 	if (nfs_directio_enable)
 		KASSERT((np->n_directio_asyncwr == 0),
 			("nfs_close: dirty unflushed (%d) directio buffers\n",
 			 np->n_directio_asyncwr));
 	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		mtx_lock(&np->n_mtx);
 		KASSERT((np->n_directio_opens > 0), 
 			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		/* Last O_DIRECT close: caching is allowed again. */
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
 		mtx_unlock(&np->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  *
  * Fills *ap->a_vap from the attribute cache when nfs_getattrcache()
  * returns 0; otherwise asks the server.  On version 3 mounts with the
  * access cache enabled an ACCESS rpc is issued first (its return value is
  * ignored) and the attribute cache is consulted again before falling back
  * to a GETATTR rpc.  Returns 0 or the error left by the rpc macros.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and appear to
  * depend on the local names and on the nfsmout label -- confirm before
  * restructuring.
  */
 static int
 nfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	caddr_t bpos, dpos;
 	int error = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(vp);
 
 	/*
 	 * Update local times for special files.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	mtx_unlock(&np->n_mtx);
 	/*
 	 * First look in the cache.
 	 */
 	if (nfs_getattrcache(vp, ap->a_vap) == 0)
 		goto nfsmout;
 	if (v3 && nfsaccess_cache_timeout > 0) {
 		/*
 		 * nfs3_access_otw() processes the post-op attributes of the
 		 * ACCESS reply, so a second cache lookup may now succeed.
 		 */
 		nfsstats.accesscache_misses++;
 		nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_td, ap->a_cred);
 		if (nfs_getattrcache(vp, ap->a_vap) == 0)
 			goto nfsmout;
 	}
 	nfsstats.rpccnt[NFSPROC_GETATTR]++;
 	/* The GETATTR request carries only the file handle. */
 	mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_request(vp, NFSPROC_GETATTR, ap->a_td, ap->a_cred);
 	if (!error) {
 		nfsm_loadattr(vp, ap->a_vap);
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs setattr call.
  *
  * Validates the request, handles the local side of a size change and
  * then sends the attributes with nfs_setattrrpc().
  *
  *  - Setting flags or VA_MARK_ATIME yields EOPNOTSUPP.
  *  - Attribute changes on a read-only mount yield EROFS.
  *  - A size change on a directory yields EISDIR; on device, socket and
  *    fifo vnodes the size is dropped from the request (and the call
  *    returns 0 when nothing else was asked for).
  *  - For other vnode types the local size is updated first and, if the
  *    rpc fails, restored to the previous value kept in tsize.
  *
  * Returns 0 or an errno value.
  */
 static int
 nfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	u_quad_t tsize;
 
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags and marking of atimes are not supported.
 	 */
 	if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME))
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		error = EROFS;
 		goto out;
 	}
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			/*
 			 * Size is not sent for these types: succeed right
 			 * away if nothing else was requested, otherwise
 			 * strip the size and carry on.
 			 */
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 			mtx_lock(&np->n_mtx);
 			tsize = np->n_size;
 			mtx_unlock(&np->n_mtx);
 			/*
 			 * NOTE(review): when NMODIFIED is clear this return
 			 * value is overwritten by nfs_setattrrpc() below
 			 * without being checked.
 			 */
 			error = nfs_meta_setsize(vp, ap->a_cred, 
 						 ap->a_td, vap->va_size);
 			mtx_lock(&np->n_mtx);
  			if (np->n_flag & NMODIFIED) {
 			    tsize = np->n_size;
 			    mtx_unlock(&np->n_mtx);
 			    /*
 			     * Truncating to zero discards the buffers;
 			     * any other size saves them first.
 			     */
  			    if (vap->va_size == 0)
  				error = nfs_vinvalbuf(vp, 0, ap->a_td, 1);
  			    else
  				error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
  			    if (error) {
 				vnode_pager_setsize(vp, tsize);
 				goto out;
 			    }
  			} else
 			    mtx_unlock(&np->n_mtx);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in nfs_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * nfs_meta_setsize() and could modify np->n_size.
 			 */
 			mtx_lock(&np->n_mtx);
  			np->n_vattr.va_size = np->n_size = vap->va_size;
 			mtx_unlock(&np->n_mtx);
   		};
   	} else {
 		/*
 		 * No size change: when times are being set on a modified
 		 * regular file, write the dirty buffers out first.  Only
 		 * EINTR and EIO abort the call.
 		 */
 		mtx_lock(&np->n_mtx);
 		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
 		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 			mtx_unlock(&np->n_mtx);
 			if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
 			    (error == EINTR || error == EIO))
 				return error;
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_td);
 	if (error && vap->va_size != VNOVAL) {
 		/* The rpc failed: put the previous size back. */
 		mtx_lock(&np->n_mtx);
 		np->n_size = np->n_vattr.va_size = tsize;
 		vnode_pager_setsize(vp, tsize);
 		mtx_unlock(&np->n_mtx);
 	}
 out:
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  *
  * Builds a SETATTR request for vp from *vap and sends it on behalf of
  * td/cred.  Version 3 uses nfsm_v3attrbuild() followed by a false word;
  * version 2 fills a struct nfsv2_sattr by hand, encoding unset mode, uid
  * and gid as nfs_xdrneg1.  After a version 3 request the cached access
  * timestamp (n_modestamp) is reset.  Returns the value left in `error'.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and appear to
  * depend on the local names and on the nfsmout label -- confirm before
  * restructuring.
  */
 static int
 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsv2_sattr *sp;
 	struct nfsnode *np = VTONFS(vp);
 	caddr_t bpos, dpos;
 	u_int32_t *tl;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(vp);
 
 	nfsstats.rpccnt[NFSPROC_SETATTR]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	if (v3) {
 		nfsm_v3attrbuild(vap, TRUE);
 		/*
 		 * Trailing false word; presumably the "guard" discriminator
 		 * of the v3 SETATTR arguments -- confirm against RFC 1813.
 		 */
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl = nfs_false;
 	} else {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		/* nfs_xdrneg1 stands in for each attribute left at VNOVAL. */
 		if (vap->va_mode == (mode_t)VNOVAL)
 			sp->sa_mode = nfs_xdrneg1;
 		else
 			sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
 		if (vap->va_uid == (uid_t)VNOVAL)
 			sp->sa_uid = nfs_xdrneg1;
 		else
 			sp->sa_uid = txdr_unsigned(vap->va_uid);
 		if (vap->va_gid == (gid_t)VNOVAL)
 			sp->sa_gid = nfs_xdrneg1;
 		else
 			sp->sa_gid = txdr_unsigned(vap->va_gid);
 		sp->sa_size = txdr_unsigned(vap->va_size);
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(vp, NFSPROC_SETATTR, td, cred);
 	if (v3) {
 		/* Reset the access cache timestamp used by nfs_access(). */
 		np->n_modestamp = 0;
 		nfsm_wcc_data(vp, wccflag);
 	} else
 		nfsm_loadattr(vp, NULL);
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  *
  * On success 0 is returned and *ap->a_vpp holds the vnode found.  On
  * failure *vpp is NULLVP and an errno value is returned; a missing last
  * component of a CREATE or RENAME becomes EJUSTRETURN (EROFS on a
  * read-only mount).
  *
  * Once nfsm_request() has succeeded the reply chain mrep belongs to this
  * function, so every explicit return after that point frees it first.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and appear to
  * depend on the local names and on the nfsmout label -- confirm before
  * restructuring.
  */
 static int
 nfs_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	int flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	long len;
 	nfsfh_t *fhp;
 	struct nfsnode *np;
 	int error = 0, attrflag, fhsize;
 	int v3 = NFS_ISV3(dvp);
 	struct thread *td = cnp->cn_thread;
 	
 	*vpp = NULLVP;
 	/* Removing or renaming the last component needs a writable mount. */
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(dvp->v_mount);
 	np = VTONFS(dvp);
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	/*
 	 * Name cache hit: accept it only if the attributes can be fetched
 	 * and the ctime still matches the value recorded in the nfsnode;
 	 * otherwise purge the entry and fall through to the rpc.
 	 */
 	if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
 		struct vattr vattr;
 
 		newvp = *vpp;
 		if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, td)
 		 && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) {
 		     nfsstats.lookupcache_hits++;
 		     if (cnp->cn_nameiop != LOOKUP &&
 			 (flags & ISLASTCN))
 			     cnp->cn_flags |= SAVENAME;
 		     return (0);
 		}
 		cache_purge(newvp);
 		if (dvp != newvp)
 			vput(newvp);
 		else 
 			vrele(newvp);
 		*vpp = NULLVP;
 	}
 	error = 0;
 	newvp = NULLVP;
 	nfsstats.lookupcache_misses++;
 	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 	len = cnp->cn_namelen;
 	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
 		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
 	if (error) {
 		if (v3) {
 			nfsm_postop_attr(dvp, attrflag);
 			m_freem(mrep);
 		}
 		goto nfsmout;
 	}
 	nfsm_getfh(fhp, fhsize, v3);
 
 	/*
 	 * Handle RENAME case...
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		/* The name resolves to the directory itself. */
 		if (NFS_CMPFH(np, fhp, fhsize)) {
 			m_freem(mrep);
 			return (EISDIR);
 		}
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
 		if (error) {
 			m_freem(mrep);
 			return (error);
 		}
 		newvp = NFSTOV(np);
 		if (v3) {
 			nfsm_postop_attr(newvp, attrflag);
 			nfsm_postop_attr(dvp, attrflag);
 		} else
 			nfsm_loadattr(newvp, NULL);
 		*vpp = newvp;
 		m_freem(mrep);
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/*
 		 * The directory is unlocked while the parent node is
 		 * obtained and relocked afterwards, whatever the outcome.
 		 */
 		VOP_UNLOCK(dvp, 0, td);
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error) {
 			/*
 			 * Free the reply before bailing out, as the other
 			 * nfs_nget() failure paths do; returning without
 			 * this leaks the mbuf chain.
 			 */
 			m_freem(mrep);
 			return (error);
 		}
 		newvp = NFSTOV(np);
 	} else if (NFS_CMPFH(np, fhp, fhsize)) {
 		/* Lookup of the directory itself: take another reference. */
 		VREF(dvp);
 		newvp = dvp;
 	} else {
 		error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
 		if (error) {
 			m_freem(mrep);
 			return (error);
 		}
 		newvp = NFSTOV(np);
 	}
 	if (v3) {
 		nfsm_postop_attr(newvp, attrflag);
 		nfsm_postop_attr(dvp, attrflag);
 	} else
 		nfsm_loadattr(newvp, NULL);
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	/*
 	 * Enter the result in the name cache, remembering the ctime that
 	 * the cache-hit check above compares against.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
 		np->n_ctime = np->n_vattr.va_ctime.tv_sec;
 		cache_enter(dvp, newvp, cnp);
 	}
 	*vpp = newvp;
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		if (newvp != NULLVP) {
 			vput(newvp);
 			*vpp = NULLVP;
 		}
 		/*
 		 * A missing last component is not an error for CREATE and
 		 * RENAME, unless the mount is read-only.
 		 */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN) && error == ENOENT) {
 			if (dvp->v_mount->mnt_flag & MNT_RDONLY)
 				error = EROFS;
 			else
 				error = EJUSTRETURN;
 		}
 		if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 			cnp->cn_flags |= SAVENAME;
 	}
 	return (error);
 }
 
 /*
  * nfs read call.
  * Regular files are handed to nfs_bioread(); reading a directory this
  * way yields EISDIR and every other vnode type EOPNOTSUPP.
  */
 static int
 nfs_read(struct vop_read_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 
 	if (node->v_type == VREG)
 		return (nfs_bioread(node, ap->a_uio, ap->a_ioflag, ap->a_cred));
 	if (node->v_type == VDIR)
 		return (EISDIR);
 	return (EOPNOTSUPP);
 }
 
 /*
  * nfs readlink call
  * Only symbolic links can be read (EINVAL otherwise); the work is
  * done by nfs_bioread() with no I/O flags.
  */
 static int
 nfs_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *lnk = ap->a_vp;
 
 	if (lnk->v_type == VLNK)
 		return (nfs_bioread(lnk, ap->a_uio, 0, ap->a_cred));
 	return (EINVAL);
 }
 
 /*
  * Do a readlink rpc.
  * Called by nfs_doio() from below the buffer cache.
  *
  * Sends a READLINK request for vp on behalf of uiop->uio_td/cred and
  * copies the returned link text into uiop.  Returns the value left in
  * `error'.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and appear to
  * depend on the local names and on the nfsmout label -- confirm before
  * restructuring.
  */
 int
 nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	caddr_t bpos, dpos;
 	int error = 0, len, attrflag;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(vp);
 
 	nfsstats.rpccnt[NFSPROC_READLINK]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
 	if (v3)
 		nfsm_postop_attr(vp, attrflag);
 	if (!error) {
 		nfsm_strsiz(len, NFS_MAXPATHLEN);
 		/*
 		 * A reply of exactly NFS_MAXPATHLEN is cut down to the
 		 * cached size when that is non-zero and smaller --
 		 * presumably a workaround for servers that pad the link
 		 * text; confirm.
 		 */
 		if (len == NFS_MAXPATHLEN) {
 			struct nfsnode *np = VTONFS(vp);
 			mtx_lock(&np->n_mtx);
 			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
 				len = np->n_size;
 			mtx_unlock(&np->n_mtx);
 		}
 		nfsm_mtouio(uiop, len);
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Ditto above
  *
  * Reads uiop->uio_resid bytes starting at uiop->uio_offset from vp with
  * one READ rpc per chunk of at most nm_rsize bytes, copying each reply
  * into uiop.  Returns EFBIG when the request would extend past
  * nm_maxfilesize, otherwise the value left in `error'.
  *
  * The loop stops early on end of file: for version 3 when the reply sets
  * eof or returns no data, for version 2 when fewer bytes than requested
  * come back.
  *
  * NOTE(review): the nfsm_* macros are defined elsewhere and appear to
  * depend on the local names and on the nfsmout label -- confirm before
  * restructuring.
  */
 int
 nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	u_int32_t *tl;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct nfsmount *nmp;
 	int error = 0, len, retlen, tsiz, eof, attrflag;
 	int v3 = NFS_ISV3(vp);
 	int rsize;
 
 #ifndef nolint
 	eof = 0;
 #endif
 	nmp = VFSTONFS(vp->v_mount);
 	tsiz = uiop->uio_resid;
 	/* Sample the mount limits under nm_mtx, then work on the copy. */
 	mtx_lock(&nmp->nm_mtx);
 	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EFBIG);
 	}
 	rsize = nmp->nm_rsize;
 	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_READ]++;
 		len = (tsiz > rsize) ? rsize : tsiz;
 		mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
 		if (v3) {
 			/* Two words of 64-bit offset, then the count. */
 			txdr_hyper(uiop->uio_offset, tl);
 			*(tl + 2) = txdr_unsigned(len);
 		} else {
 			/* 32-bit offset, count, and a third word set to 0. */
 			*tl++ = txdr_unsigned(uiop->uio_offset);
 			*tl++ = txdr_unsigned(len);
 			*tl = 0;
 		}
 		nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
 		if (v3) {
 			nfsm_postop_attr(vp, attrflag);
 			if (error) {
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			/* The eof flag is the second of the two words. */
 			tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *(tl + 1));
 		} else {
 			nfsm_loadattr(vp, NULL);
 		}
 		nfsm_strsiz(retlen, rsize);
 		nfsm_mtouio(uiop, retlen);
 		m_freem(mrep);
 		tsiz -= retlen;
 		if (v3) {
 			if (eof || retlen == 0) {
 				tsiz = 0;
 			}
 		} else if (retlen < len) {
 			tsiz = 0;
 		}
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * nfs write call
  *
  * Writes uiop->uio_resid bytes from uiop to the server in WRITE rpcs of at
  * most nm_wsize bytes each.
  *
  *  iomode       in: stability mode sent with each v3 write;
  *               out: the lowest commitment level any reply reported
  *               (forced to NFSV3WRITE_FILESYNC when the mount has
  *               MNTK_ASYNC set).
  *  must_commit  out: set to 1 when a reply's write verifier differs from
  *               the one recorded on the mount, 0 otherwise.
  *
  * Returns EFBIG without sending anything if the write would extend past
  * nm_maxfilesize; otherwise the value left in "error".  On error
  * uiop->uio_resid is reset to the count not yet accounted as written.
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/tl/error by
  * name and to jump to nfsmout on failure -- confirm against their
  * definitions.
  */
 int
 nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
 	     int *iomode, int *must_commit)
 {
 	u_int32_t *tl;
 	int32_t backup;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
 	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
 	int wsize;
 	
 	/*
 	 * NOTE(review): this check is compiled in when DIAGNOSTIC is NOT
 	 * defined (#ifndef) -- confirm that is the intended polarity.
 	 */
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1)
 		panic("nfs: writerpc iovcnt > 1");
 #endif
 	*must_commit = 0;
 	tsiz = uiop->uio_resid;
 	/* Sample the mount limits under nm_mtx, then drop it for the rpcs. */
 	mtx_lock(&nmp->nm_mtx);
 	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
 		mtx_unlock(&nmp->nm_mtx);		
 		return (EFBIG);
 	}
 	wsize = nmp->nm_wsize;
 	mtx_unlock(&nmp->nm_mtx);
 	while (tsiz > 0) {
 		nfsstats.rpccnt[NFSPROC_WRITE]++;
 		len = (tsiz > wsize) ? wsize : tsiz;
 		mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
 			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		if (v3) {
 			/* v3: 64-bit offset, count, stability mode, data length. */
 			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(len);
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else {
 			u_int32_t x;
 
 			tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 			/* Set both "begin" and "current" to non-garbage. */
 			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
 			*tl++ = x;	/* "begin offset" */
 			*tl++ = x;	/* "current offset" */
 			x = txdr_unsigned(len);
 			*tl++ = x;	/* total to this offset */
 			*tl = x;	/* size of this write */
 		}
 		nfsm_uiotom(uiop, len);
 		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
 		if (v3) {
 			wccflag = NFSV3_WCCCHK;
 			nfsm_wcc_data(vp, wccflag);
 			if (!error) {
 				tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
 					+ NFSX_V3WRITEVERF);
 				rlen = fxdr_unsigned(int, *tl++);
 				if (rlen == 0) {
 					/* Server accepted nothing: give up. */
 					error = NFSERR_IO;
 					m_freem(mrep);
 					break;
 				} else if (rlen < len) {
 					/*
 					 * Short write: rewind the uio by the
 					 * part that was not written so the
 					 * next pass resends it.
 					 */
 					backup = len - rlen;
 					uiop->uio_iov->iov_base =
 					    (char *)uiop->uio_iov->iov_base -
 					    backup;
 					uiop->uio_iov->iov_len += backup;
 					uiop->uio_offset -= backup;
 					uiop->uio_resid += backup;
 					len = rlen;
 				}
 				commit = fxdr_unsigned(int, *tl++);
 
 				/*
 				 * Return the lowest commitment level
 				 * obtained by any of the RPCs.
 				 */
 				if (committed == NFSV3WRITE_FILESYNC)
 					committed = commit;
 				else if (committed == NFSV3WRITE_DATASYNC &&
 					commit == NFSV3WRITE_UNSTABLE)
 					committed = commit;
 				/*
 				 * Record the first write verifier seen on
 				 * this mount; if a later one differs, flag
 				 * must_commit and remember the new one.
 				 */
 				mtx_lock(&nmp->nm_mtx);
 				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
 				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
 				} else if (bcmp((caddr_t)tl,
 				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
 				    *must_commit = 1;
 				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 					NFSX_V3WRITEVERF);
 				}
 				mtx_unlock(&nmp->nm_mtx);
 			}
 		} else {
 			nfsm_loadattr(vp, NULL);
 		}
 		/* Copy the cached attribute mtime into n_mtime. */
 		if (wccflag) {
 			mtx_lock(&(VTONFS(vp))->n_mtx);
 			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
 			mtx_unlock(&(VTONFS(vp))->n_mtx);
 		}
 		m_freem(mrep);
 		if (error)
 			break;
 		tsiz -= len;
 	}
 nfsmout:
 	/* Async mounts always report the data as fully committed. */
 	if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
 		committed = NFSV3WRITE_FILESYNC;
 	*iomode = committed;
 	if (error)
 		uiop->uio_resid = tsiz;
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  *
  * Creates the node named by cnp in directory dvp with attributes vap.
  * Only VCHR, VBLK, VFIFO and VSOCK are accepted; any other type returns
  * EOPNOTSUPP.  On success the new vnode is stored in *vpp and, if
  * MAKEENTRY is set, entered into the name cache.  The directory is always
  * marked NMODIFIED, and its attribute timestamp is cleared when wccflag
  * ends up zero.
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/tl/error by
  * name and to jump to nfsmout on failure -- confirm against their
  * definitions.
  */
 static int
 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct vattr *vap)
 {
 	struct nfsv2_sattr *sp;
 	u_int32_t *tl;
 	struct vnode *newvp = NULL;
 	struct nfsnode *np = NULL;
 	struct vattr vattr;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	u_int32_t rdev;
 	int v3 = NFS_ISV3(dvp);
 
 	/* rdev is only consumed by the v2 request (as sa_size) below. */
 	if (vap->va_type == VCHR || vap->va_type == VBLK)
 		rdev = txdr_unsigned(vap->va_rdev);
 	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
 		rdev = nfs_xdrneg1;
 	else {
 		return (EOPNOTSUPP);
 	}
 	/* Only the return value is used; vattr itself is not read here. */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
 		return (error);
 	}
 	nfsstats.rpccnt[NFSPROC_MKNOD]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
 		+ nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	if (v3) {
 		/* v3: node type, attributes, then major/minor for devices. */
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl++ = vtonfsv3_type(vap->va_type);
 		nfsm_v3attrbuild(vap, FALSE);
 		if (vap->va_type == VCHR || vap->va_type == VBLK) {
 			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(umajor(vap->va_rdev));
 			*tl = txdr_unsigned(uminor(vap->va_rdev));
 		}
 	} else {
 		/* v2: type goes in the mode bits, rdev in the size field. */
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = rdev;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
 	if (!error) {
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 		/* No vnode came back in the reply: look the name up. */
 		if (!gotvp) {
 			if (newvp) {
 				vput(newvp);
 				newvp = NULL;
 			}
 			error = nfs_lookitup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
 			if (!error)
 				newvp = NFSTOV(np);
 		}
 	}
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else {
 		if (cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, newvp, cnp);
 		*vpp = newvp;
 	}
 	/* Directory changed: flag it under the node mutex. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs mknod vop
  * just call nfs_mknodrpc() to do the work.
  */
 /* ARGSUSED */
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
 	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
 }
 
 /* Counter used to build the verifier for v3 exclusive creates. */
 static u_long create_verf;
 /*
  * nfs file create call
  *
  * Creates the file named by ap->a_cnp in directory ap->a_dvp with
  * attributes ap->a_vap.  VSOCK requests are forwarded to nfs_mknodrpc().
  * For v3 with VA_EXCLUSIVE an exclusive create is tried first; if the
  * server answers NFSERR_NOTSUPP the request is retried as an unchecked
  * create.  On success the new vnode is stored in *ap->a_vpp and, if
  * MAKEENTRY is set, entered into the name cache.
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/tl/error by
  * name and to jump to nfsmout on failure -- confirm against their
  * definitions.
  */
 static int
 nfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	u_int32_t *tl;
 	struct nfsnode *np = NULL;
 	struct vnode *newvp = NULL;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vattr vattr;
 	int v3 = NFS_ISV3(dvp);
 
 	/*
 	 * Oops, not for me..
 	 */
 	if (vap->va_type == VSOCK)
 		return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
 
 	/* Only the return value is used; vattr itself is not read here. */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
 		return (error);
 	}
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 	/* Re-entered from nfsmout with O_EXCL cleared on NFSERR_NOTSUPP. */
 again:
 	nfsstats.rpccnt[NFSPROC_CREATE]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
 		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	if (v3) {
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		if (fmode & O_EXCL) {
 			*tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
 			/*
 			 * Two-word verifier: the first word is the first
 			 * configured interface address when INET is built
 			 * in and one exists, otherwise the current counter;
 			 * the second word is the incremented counter.
 			 */
 			tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
 #ifdef INET
 			if (!TAILQ_EMPTY(&in_ifaddrhead))
 				*tl++ = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr.s_addr;
 			else
 #endif
 				*tl++ = create_verf;
 			*tl = ++create_verf;
 		} else {
 			*tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
 			nfsm_v3attrbuild(vap, FALSE);
 		}
 	} else {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = 0;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
 	if (!error) {
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 		/* No vnode came back in the reply: look the name up. */
 		if (!gotvp) {
 			if (newvp) {
 				vput(newvp);
 				newvp = NULL;
 			}
 			error = nfs_lookitup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
 			if (!error)
 				newvp = NFSTOV(np);
 		}
 	}
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	if (error) {
 		/* Server lacks exclusive create: retry as a plain create. */
 		if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
 			fmode &= ~O_EXCL;
 			goto again;
 		}
 		if (newvp)
 			vput(newvp);
 	} else if (v3 && (fmode & O_EXCL)) {
 		/*
 		 * We are normally called with only a partially initialized
 		 * VAP.  Since the NFSv3 spec says that server may use the
 		 * file attributes to store the verifier, the spec requires
 		 * us to do a SETATTR RPC. FreeBSD servers store the verifier
 		 * in atime, but we can't really assume that all servers will
 		 * so we ensure that our SETATTR sets both atime and mtime.
 		 */
 		if (vap->va_mtime.tv_sec == VNOVAL)
 			vfs_timestamp(&vap->va_mtime);
 		if (vap->va_atime.tv_sec == VNOVAL)
 			vap->va_atime = vap->va_mtime;
 		error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			vput(newvp);
 	}
 	if (!error) {
 		if (cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, newvp, cnp);
 		*ap->a_vpp = newvp;
 	}
 	/* Directory changed: flag it under the node mutex. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  * - If v_usecount > 1
  *	  If a rename is not already in the works
  *	     call nfs_sillyrename() to set it up
  *     else
  *	  do the remove rpc
  *
  * Directories are refused with EPERM.  The node's cached attribute
  * timestamp is always cleared before returning; the store is done under
  * n_mtx, matching how the rpc routines in this file clear n_attrstamp.
  */
 static int
 nfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 	/*
 	 * NOTE(review): these checks are compiled in when DIAGNOSTIC is NOT
 	 * defined (#ifndef) -- confirm that is the intended polarity.
 	 */
 #ifndef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("nfs_remove: no name");
 	if (vrefcnt(vp) < 1)
 		panic("nfs_remove: bad v_usecount");
 #endif
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_thread) == 0 &&
 	    vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		/* Do the rpc */
 		if (error != EINTR && error != EIO)
 			error = nfs_removerpc(dvp, cnp->cn_nameptr,
 				cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs_sillyrename(dvp, vp, cnp);
 	/* Invalidate the cached attributes under the node mutex. */
 	mtx_lock(&np->n_mtx);
 	np->n_attrstamp = 0;
 	mtx_unlock(&np->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 nfs_removeit(struct sillyrename *sp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
  *
  * Sends a REMOVE request for the entry name/namelen in directory dvp.
  * Whatever the outcome, the directory is marked NMODIFIED, and its
  * attribute timestamp is cleared when wccflag ends up zero.  Returns the
  * value left in "error".
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
     struct ucred *cred, struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_REMOVE]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE,
 		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
 	/* Only the v3 path parses wcc data from the reply. */
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	/* Directory changed: flag it under the node mutex. */
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file rename call
  *
  * Renames ap->a_fcnp in ap->a_fdvp to ap->a_tcnp in ap->a_tdvp.
  * Returns EXDEV when the source and the target (directory or existing
  * file) are on different mounts.  Dirty data for the source and for an
  * existing target is flushed first, and the rename is skipped if a flush
  * fails.  All four vnodes are released at "out" on every path, and an
  * ENOENT result is mapped to success.
  */
 static int
 nfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	int error;
 
 	/*
 	 * NOTE(review): this check is compiled in when DIAGNOSTIC is NOT
 	 * defined (#ifndef) -- confirm that is the intended polarity.
 	 */
 #ifndef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("nfs_rename: no name");
 #endif
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (fvp == tvp) {
 		nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
 	/* fvp is locked only for the duration of the fsync below. */
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE, fcnp->cn_thread)) != 0)
+	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 *
 	 * Skip the rename operation if the fsync fails, this can happen
 	 * due to the server's volume being full, when we pushed out data
 	 * that was written back to our cache earlier. Not checking for
 	 * this condition can result in potential (silent) data loss.
 	 */
 	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0, fcnp->cn_thread);
 	if (!error && tvp)
 		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 	if (error)
 		goto out;
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 		/* Silly rename succeeded: drop tvp now so "out" skips it. */
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 		tcnp->cn_thread);
 
 	/* Renaming a directory: purge name cache entries for the parents. */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	/* tdvp is vrele'd rather than vput when it is the same vnode as tvp. */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs_remove() above
  */
 static int
 nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 
 	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
 	    sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
 }
 
 /*
  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
  *
  * Sends a RENAME request moving fnameptr/fnamelen in fdvp to
  * tnameptr/tnamelen in tdvp.  Whatever the outcome, both directories are
  * marked NMODIFIED, and each one's attribute timestamp is cleared when its
  * wcc flag ends up zero.  Both updates happen under the node's n_mtx, as
  * in nfs_removerpc().  fdvp and tdvp may be the same vnode; the two
  * critical sections are sequential, never nested.  Returns the value left
  * in "error".
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
     struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
     struct thread *td)
 {
 	caddr_t bpos, dpos;
 	int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(fdvp);
 
 	nfsstats.rpccnt[NFSPROC_RENAME]++;
 	mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME,
 		(NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
 		nfsm_rndup(tnamelen));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(fdvp, v3);
 	nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
 	nfsm_fhtom(tdvp, v3);
 	nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
 	nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
 	/* Only the v3 path parses wcc data, source directory first. */
 	if (v3) {
 		nfsm_wcc_data(fdvp, fwccflag);
 		nfsm_wcc_data(tdvp, twccflag);
 	}
 	m_freem(mrep);
 nfsmout:
 	mtx_lock(&(VTONFS(fdvp))->n_mtx);
 	VTONFS(fdvp)->n_flag |= NMODIFIED;
 	if (!fwccflag)
 		VTONFS(fdvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(fdvp))->n_mtx);
 	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
 	if (!twccflag)
 		VTONFS(tdvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs hard link create call
  *
  * Creates a hard link to ap->a_vp named by ap->a_cnp in directory
  * ap->a_tdvp.  Returns EXDEV when the two are on different mounts.
  * Whatever the rpc outcome, the target directory is marked NMODIFIED.
  * The attribute timestamp of the directory is cleared when wccflag ends
  * up zero, and that of the file when attrflag ends up zero.  Each of
  * those stores is made under the owning node's n_mtx, as in
  * nfs_removerpc().  Returns the value left in "error".
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3;
 
 	if (vp->v_mount != tdvp->v_mount) {
 		return (EXDEV);
 	}
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	v3 = NFS_ISV3(vp);
 	nfsstats.rpccnt[NFSPROC_LINK]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_LINK,
 		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, v3);
 	nfsm_fhtom(tdvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
 	/* Only the v3 path parses file attributes and directory wcc data. */
 	if (v3) {
 		nfsm_postop_attr(vp, attrflag);
 		nfsm_wcc_data(tdvp, wccflag);
 	}
 	m_freem(mrep);
 nfsmout:
 	mtx_lock(&(VTONFS(tdvp))->n_mtx);
 	VTONFS(tdvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(tdvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
 	if (!attrflag) {
 		mtx_lock(&(VTONFS(vp))->n_mtx);
 		VTONFS(vp)->n_attrstamp = 0;
 		mtx_unlock(&(VTONFS(vp))->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  *
  * Creates a symbolic link named by ap->a_cnp in directory ap->a_dvp that
  * points at ap->a_target.  On success the new vnode is stored in
  * *ap->a_vpp; when the reply did not yield one, it is obtained with an
  * extra lookup.  Whatever the outcome, the directory is marked NMODIFIED,
  * and its attribute timestamp is cleared when wccflag ends up zero.  Both
  * updates happen under the node's n_mtx, as in nfs_removerpc().
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	caddr_t bpos, dpos;
 	int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vnode *newvp = NULL;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
 	slen = strlen(ap->a_target);
 	mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
 	    nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	/* v3 places the attributes before the target, v2 after it. */
 	if (v3) {
 		nfsm_v3attrbuild(vap, FALSE);
 	}
 	nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
 	if (!v3) {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = nfs_xdrneg1;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 
 	/*
 	 * Issue the NFS request and get the rpc response.
 	 *
 	 * Only NFSv3 responses returning an error of 0 actually return
 	 * a file handle that can be converted into newvp without having
 	 * to do an extra lookup rpc.
 	 */
 	nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
 	if (v3) {
 		if (error == 0)
 			nfsm_mtofh(dvp, newvp, v3, gotvp);
 		nfsm_wcc_data(dvp, wccflag);
 	}
 
 	/*
 	 * out code jumps -> here, mrep is also freed.
 	 */
 
 	m_freem(mrep);
 nfsmout:
 
 	/*
 	 * If we do not have an error and we could not extract the newvp from
 	 * the response due to the request being NFSv2, we have to do a
 	 * lookup in order to obtain a newvp to return.
 	 */
 	if (error == 0 && newvp == NULL) {
 		struct nfsnode *np = NULL;
 
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error)
 			newvp = NFSTOV(np);
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else {
 		*ap->a_vpp = newvp;
 	}
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	return (error);
 }
 
 /*
  * nfs make dir call
  *
  * Creates the directory named by ap->a_cnp in ap->a_dvp with attributes
  * ap->a_vap.  On success the new vnode is stored in *ap->a_vpp; when the
  * reply did not yield one it is looked up, and EEXIST is returned if what
  * is found is not a directory.  Whatever the outcome, the parent is
  * marked NMODIFIED, and its attribute timestamp is cleared when wccflag
  * ends up zero.  Both updates happen under the node's n_mtx, as in
  * nfs_removerpc().
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsv2_sattr *sp;
 	int len;
 	struct nfsnode *np = NULL;
 	struct vnode *newvp = NULL;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	int gotvp = 0;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	struct vattr vattr;
 	int v3 = NFS_ISV3(dvp);
 
 	/* Only the return value is used; vattr itself is not read here. */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
 		return (error);
 	}
 	len = cnp->cn_namelen;
 	nfsstats.rpccnt[NFSPROC_MKDIR]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR,
 	  NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
 	if (v3) {
 		nfsm_v3attrbuild(vap, FALSE);
 	} else {
 		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 		sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
 		sp->sa_uid = nfs_xdrneg1;
 		sp->sa_gid = nfs_xdrneg1;
 		sp->sa_size = nfs_xdrneg1;
 		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 	}
 	nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
 	if (!error)
 		nfsm_mtofh(dvp, newvp, v3, gotvp);
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	/* No vnode from the reply: look the new name up. */
 	if (error == 0 && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
 			cnp->cn_thread, &np);
 		if (!error) {
 			newvp = NFSTOV(np);
 			if (newvp->v_type != VDIR)
 				error = EEXIST;
 		}
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 	} else
 		*ap->a_vpp = newvp;
 	return (error);
 }
 
 /*
  * nfs remove directory call
  *
  * Removes the directory ap->a_vp, named by ap->a_cnp, from ap->a_dvp.
  * Returns EINVAL when asked to remove a directory from itself.  Whatever
  * the rpc outcome, the parent is marked NMODIFIED, and its attribute
  * timestamp is cleared when wccflag ends up zero.  Both updates happen
  * under the node's n_mtx, as in nfs_removerpc().  The name cache is then
  * purged for both vnodes, and an ENOENT result is mapped to success.
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/error by name
  * and to jump to nfsmout on failure -- confirm against their definitions.
  */
 static int
 nfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	int v3 = NFS_ISV3(dvp);
 
 	if (dvp == vp)
 		return (EINVAL);
 	nfsstats.rpccnt[NFSPROC_RMDIR]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR,
 		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
 	/* Only the v3 path parses wcc data from the reply. */
 	if (v3)
 		nfsm_wcc_data(dvp, wccflag);
 	m_freem(mrep);
 nfsmout:
 	mtx_lock(&(VTONFS(dvp))->n_mtx);
 	VTONFS(dvp)->n_flag |= NMODIFIED;
 	if (!wccflag)
 		VTONFS(dvp)->n_attrstamp = 0;
 	mtx_unlock(&(VTONFS(dvp))->n_mtx);
 	cache_purge(dvp);
 	cache_purge(vp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	int tresid, error = 0;
 	struct vattr vattr;
 	
 	if (vp->v_type != VDIR) 
 		return(EPERM);
 
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0) {
 			mtx_lock(&np->n_mtx);
 			if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				nfsstats.direofcache_hits++;
 				goto out;
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 	}
 
 	/*
 	 * Call nfs_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = nfs_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid) {
 		nfsstats.direofcache_misses++;
 	}
 out:
 	return (error);
 }
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by nfs_doio().
  *
  * Fills uiop with struct dirent records built from READDIR replies for
  * directory vp, starting from the cookie cached for uiop->uio_offset.
  * Records are laid out so that none crosses a DIRBLKSIZ boundary; the
  * last record of each block has its d_reclen extended to reach the
  * boundary.  Returns NFSERR_BAD_COOKIE when no cookie is cached for the
  * starting offset, EBADRPC when an entry carries an out-of-range name
  * length, otherwise the value left in "error".
  *
  * NOTE(review): the nfsm_* macros are defined outside this section; they
  * appear to operate on the locals mreq/mrep/mb/md/bpos/dpos/tl/error by
  * name and to jump to nfsmout on failure -- confirm against their
  * definitions.
  */
 int
 nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	caddr_t cp;
 	nfsuint64 *cookiep;
 	caddr_t bpos, dpos;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	nfsuint64 cookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	u_quad_t fileno;
 	/*
 	 * more_dirs: more entries may follow; blksiz: bytes used in the
 	 * current DIRBLKSIZ block; bigenough: the next entry still fits in
 	 * the caller's buffer.
 	 */
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int attrflag;
 	int v3 = NFS_ISV3(vp);
 
 	/*
 	 * NOTE(review): this check is compiled in when DIAGNOSTIC is NOT
 	 * defined (#ifndef) -- confirm that is the intended polarity.
 	 */
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
 		(uiop->uio_resid & (DIRBLKSIZ - 1)))
 		panic("nfs readdirrpc bad uio");
 #endif
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		nfs_dircookie_unlock(dnp);
 	} else {
 		nfs_dircookie_unlock(dnp);		
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		nfsstats.rpccnt[NFSPROC_READDIR]++;
 		mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
 			NFSX_READDIR(v3));
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, v3);
 		if (v3) {
 			/* v3: 64-bit cookie, cookie verifier, then count. */
 			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
 			*tl++ = cookie.nfsuquad[1];
 			mtx_lock(&dnp->n_mtx);
 			*tl++ = dnp->n_cookieverf.nfsuquad[0];
 			*tl++ = dnp->n_cookieverf.nfsuquad[1];
 			mtx_unlock(&dnp->n_mtx);
 		} else {
 			/* v2: 32-bit cookie, then count. */
 			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = cookie.nfsuquad[0];
 		}
 		*tl = txdr_unsigned(nmp->nm_readdirsize);
 		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
 		if (v3) {
 			nfsm_postop_attr(vp, attrflag);
 			if (!error) {
 				/* Remember the verifier the server returned. */
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 				mtx_lock(&dnp->n_mtx);
 				dnp->n_cookieverf.nfsuquad[0] = *tl++;
 				dnp->n_cookieverf.nfsuquad[1] = *tl;
 				mtx_unlock(&dnp->n_mtx);
 			} else {
 				m_freem(mrep);
 				goto nfsmout;
 			}
 		}
 		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 		more_dirs = fxdr_unsigned(int, *tl);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			if (v3) {
 				tl = nfsm_dissect(u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				fileno = fxdr_hyper(tl);
 				len = fxdr_unsigned(int, *(tl + 2));
 			} else {
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 				fileno = fxdr_unsigned(u_quad_t, *tl++);
 				len = fxdr_unsigned(int, *tl);
 			}
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			tlen = nfsm_rndup(len);
 			if (tlen == len)
 				tlen += 4;	/* To ensure null termination */
 			left = DIRBLKSIZ - blksiz;
 			/*
 			 * Entry does not fit in what is left of the current
 			 * block: stretch the previous record over the gap
 			 * and start a new block.
 			 */
 			if ((tlen + DIRHDSIZ) > left) {
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_offset += left;
 				uiop->uio_resid -= left;
 				blksiz = 0;
 			}
 			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				/* Build the dirent header in place in the buffer. */
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				/* fileno is narrowed to int by this cast. */
 				dp->d_fileno = (int)fileno;
 				dp->d_namlen = len;
 				dp->d_reclen = tlen + DIRHDSIZ;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				/* Copy the name, then skip the padding after it. */
 				nfsm_mtouio(uiop, len);
 				cp = uiop->uio_iov->iov_base;
 				tlen -= len;
 				*cp = '\0';	/* null terminate */
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + tlen;
 				uiop->uio_iov->iov_len -= tlen;
 				uiop->uio_offset += tlen;
 				uiop->uio_resid -= tlen;
 			} else
 				nfsm_adv(nfsm_rndup(len));
 			/* Entry cookie followed by the "another entry" flag. */
 			if (v3) {
 				tl = nfsm_dissect(u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 			} else {
 				tl = nfsm_dissect(u_int32_t *,
 				    2 * NFSX_UNSIGNED);
 			}
 			/* Only advance the cookie past entries actually stored. */
 			if (bigenough) {
 				cookie.nfsuquad[0] = *tl++;
 				if (v3)
 					cookie.nfsuquad[1] = *tl++;
 			} else if (v3)
 				tl += 2;
 			else
 				tl++;
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = (fxdr_unsigned(int, *tl) == 0);
 		}
 		m_freem(mrep);
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled the
 	 * block.
 	 */
 	if (bigenough)
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
 			nfs_printf("EEK! readdirrpc resid > 0\n");
 		/*
 		 * Save the cookie for the offset we stopped at.
 		 * NOTE(review): the result of nfs_getcookie() is
 		 * dereferenced without a NULL check here, unlike the lookup
 		 * at the top -- confirm it cannot fail when the last
 		 * argument is 1.
 		 */
 		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
 		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
  *
  * Issues NFSPROC_READDIRPLUS requests for directory vp, starting at the
  * cookie recorded for uiop->uio_offset, and copies the returned names into
  * the uio as struct dirent records packed into DIRBLKSIZ blocks.  Where the
  * reply carries attributes and a file handle for an entry, the matching
  * nfsnode is obtained, its attributes are loaded and the name is entered in
  * the name cache.
  *
  * Returns 0 on success, NFSERR_BAD_COOKIE when no cookie is known for the
  * starting offset, EBADRPC when an entry has an out-of-range name length,
  * or whatever error the nfsm_* macros leave in "error" before jumping to
  * nfsmout.
  */
 int
 nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int len, left;
 	struct dirent *dp;
 	u_int32_t *tl;
 	caddr_t cp;
 	struct vnode *newvp;
 	nfsuint64 *cookiep;
 	caddr_t bpos, dpos, dpossav1, dpossav2;
 	struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
 	struct nameidata nami, *ndp = &nami;
 	struct componentname *cnp = &ndp->ni_cnd;
 	nfsuint64 cookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp), *np;
 	nfsfh_t *fhp;
 	u_quad_t fileno;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
 	int attrflag, fhsize;
 
 #ifndef nolint
 	dp = NULL;
 #endif
 	/*
 	 * NOTE(review): this sanity check is compiled when DIAGNOSTIC is
 	 * NOT defined; confirm whether #ifdef was intended.
 	 */
 #ifndef DIAGNOSTIC
 	if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
 		(uiop->uio_resid & (DIRBLKSIZ - 1)))
 		panic("nfs readdirplusrpc bad uio");
 #endif
 	ndp->ni_dvp = vp;
 	newvp = NULLVP;
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	nfs_dircookie_lock(dnp);
 	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		nfs_dircookie_unlock(dnp);
 	} else {
 		nfs_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize
 	 * truncated to a multiple of DIRBLKSIZ.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
 		mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
 			NFSX_FH(1) + 6 * NFSX_UNSIGNED);
 		mb = mreq;
 		bpos = mtod(mb, caddr_t);
 		nfsm_fhtom(vp, 1);
 		/*
 		 * Request arguments: cookie (2 words), cookie verifier
 		 * (2 words, read under n_mtx), nm_readdirsize, nm_rsize.
 		 */
  		tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
 		*tl++ = cookie.nfsuquad[0];
 		*tl++ = cookie.nfsuquad[1];
 		mtx_lock(&dnp->n_mtx);
 		*tl++ = dnp->n_cookieverf.nfsuquad[0];
 		*tl++ = dnp->n_cookieverf.nfsuquad[1];
 		mtx_unlock(&dnp->n_mtx);
 		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
 		*tl = txdr_unsigned(nmp->nm_rsize);
 		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
 		nfsm_postop_attr(vp, attrflag);
 		if (error) {
 			m_freem(mrep);
 			goto nfsmout;
 		}
 		/* Save the new cookie verifier and the first "entry follows" flag. */
 		tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 		mtx_lock(&dnp->n_mtx);
 		dnp->n_cookieverf.nfsuquad[0] = *tl++;
 		dnp->n_cookieverf.nfsuquad[1] = *tl++;
 		mtx_unlock(&dnp->n_mtx);
 		more_dirs = fxdr_unsigned(int, *tl);
 
 		/* loop thru the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 			fileno = fxdr_hyper(tl);
 			len = fxdr_unsigned(int, *(tl + 2));
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				m_freem(mrep);
 				goto nfsmout;
 			}
 			tlen = nfsm_rndup(len);
 			if (tlen == len)
 				tlen += 4;	/* To ensure null termination*/
 			/*
 			 * If this entry does not fit in the remainder of the
 			 * current DIRBLKSIZ block, stretch the previous
 			 * record to the block boundary and start a new block.
 			 */
 			left = DIRBLKSIZ - blksiz;
 			if ((tlen + DIRHDSIZ) > left) {
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_offset += left;
 				uiop->uio_resid -= left;
 				blksiz = 0;
 			}
 			/* Stop copying once an entry no longer fits in the uio. */
 			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				/* Build the dirent header in place in the uio buffer. */
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				dp->d_fileno = (int)fileno;
 				dp->d_namlen = len;
 				dp->d_reclen = tlen + DIRHDSIZ;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				/* The name copied below doubles as the cache_enter() name. */
 				cnp->cn_nameptr = uiop->uio_iov->iov_base;
 				cnp->cn_namelen = len;
 				nfsm_mtouio(uiop, len);
 				cp = uiop->uio_iov->iov_base;
 				tlen -= len;
 				*cp = '\0';
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + tlen;
 				uiop->uio_iov->iov_len -= tlen;
 				uiop->uio_offset += tlen;
 				uiop->uio_resid -= tlen;
 			} else
 				nfsm_adv(nfsm_rndup(len));
 			/* Entry cookie (2 words) followed by the attributes-present flag. */
 			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 			if (bigenough) {
 				cookie.nfsuquad[0] = *tl++;
 				cookie.nfsuquad[1] = *tl++;
 			} else
 				tl += 2;
 
 			/*
 			 * Since the attributes are before the file handle
 			 * (sigh), we must skip over the attributes and then
 			 * come back and get them.
 			 */
 			attrflag = fxdr_unsigned(int, *tl);
 			if (attrflag) {
 			    dpossav1 = dpos;
 			    mdsav1 = md;
 			    nfsm_adv(NFSX_V3FATTR);
 			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			    doit = fxdr_unsigned(int, *tl);
 			    /*
  			     * Skip loading the attrs for "..". There's a 
  			     * race between loading the attrs here and 
  			     * lookups that look for the directory currently
  			     * being read (in the parent). We try to acquire
  			     * the exclusive lock on ".." here, owning the 
  			     * lock on the directory being read. Lookup will
  			     * hold the lock on ".." and try to acquire the 
  			     * lock on the directory being read.
  			     * 
  			     * There are other ways of fixing this, one would
  			     * be to do a trylock on the ".." vnode and skip
  			     * loading the attrs on ".." if it happens to be 
  			     * locked by another process. But skipping the
  			     * attrload on ".." seems the easiest option.
  			     *
  			     * NOTE(review): dp is only updated in the
  			     * bigenough branch above; when bigenough is
  			     * clear this compares the previously stored
  			     * entry, not the current one -- confirm.
  			     */
  			    if (strcmp(dp->d_name, "..") == 0) {
  				    doit = 0;
  				    /*
  				     * We've already skipped over the attrs, 
  				     * skip over the filehandle. And store d_type
  				     * as VDIR.
  				     */
  				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
  				    i = fxdr_unsigned(int, *tl);
  				    nfsm_adv(nfsm_rndup(i));
  				    dp->d_type = IFTODT(VTTOIF(VDIR));
  			    }	    
 			    if (doit) {
 				nfsm_getfh(fhp, fhsize, 1);
 				/*
 				 * A handle equal to the directory's own reuses
 				 * vp with an extra reference; otherwise get the
 				 * node with an exclusive lock.
 				 */
 				if (NFS_CMPFH(dnp, fhp, fhsize)) {
 				    VREF(vp);
 				    newvp = vp;
 				    np = dnp;
 				} else {
 				    error = nfs_nget(vp->v_mount, fhp,
 					fhsize, &np, LK_EXCLUSIVE);
 				    if (error)
 					doit = 0;
 				    else
 					newvp = NFSTOV(np);
 				}
 			    }
 			    if (doit && bigenough) {
 				/*
 				 * Rewind the parse position to the saved start
 				 * of the attributes, load them, then restore
 				 * the position past the file handle.
 				 */
 				dpossav2 = dpos;
 				dpos = dpossav1;
 				mdsav2 = md;
 				md = mdsav1;
 				nfsm_loadattr(newvp, NULL);
 				dpos = dpossav2;
 				md = mdsav2;
 				dp->d_type =
 				    IFTODT(VTTOIF(np->n_vattr.va_type));
 				ndp->ni_vp = newvp;
 				/* Update n_ctime, so subsequent lookup doesn't purge entry */
 				np->n_ctime = np->n_vattr.va_ctime.tv_sec;
 			        cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
 			    }
 			} else {
 			    /* Just skip over the file handle */
 			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			    i = fxdr_unsigned(int, *tl);
 			    if (i) {
 				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 				    fhsize = fxdr_unsigned(int, *tl);
 				    nfsm_adv(nfsm_rndup(fhsize));
 			    }
 			}
 			/* Drop the per-entry vnode: vrele for vp itself, vput otherwise. */
 			if (newvp != NULLVP) {
 			    if (newvp == vp)
 				vrele(newvp);
 			    else
 				vput(newvp);
 			    newvp = NULLVP;
 			}
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = (fxdr_unsigned(int, *tl) == 0);
 		}
 		m_freem(mrep);
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base =
 		    (char *)uiop->uio_iov->iov_base + left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 	}
 
 	/*
 	 * We are now either at the end of the directory or have filled the
 	 * block.
 	 */
 	if (bigenough)
 		dnp->n_direofoffset = uiop->uio_offset;
 	else {
 		if (uiop->uio_resid > 0)
 			nfs_printf("EEK! readdirplusrpc resid > 0\n");
 		/* Remember the cookie to resume from at the new offset. */
 		nfs_dircookie_lock(dnp);
 		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
 		*cookiep = cookie;
 		nfs_dircookie_unlock(dnp);
 	}
 nfsmout:
 	/* Error exits may arrive here still holding a per-entry vnode. */
 	if (newvp != NULLVP) {
 	        if (newvp == vp)
 			vrele(newvp);
 		else
 			vput(newvp);
 		newvp = NULLVP;
 	}
 	return (error);
 }
 
 /*
  * Silly rename. To make the NFS filesystem that is stateless look a little
  * more like the "ufs" a remove of an active vnode is translated to a rename
  * to a funny looking filename that is removed by nfs_inactive on the
  * nfsnode. There is the potential for another process on a different client
  * to create the same funny name between the nfs_lookitup() fails and the
  * nfs_rename() completes, but...
  */
 static int
 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	int error;
 	short pid;
 	unsigned int lticks;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 #ifndef DIAGNOSTIC
 	if (vp->v_type == VDIR)
 		panic("nfs: sillyrename dir");
 #endif
 	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
 		M_NFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	sp->s_removeit = nfs_removeit;
 	VREF(dvp);
 
 	/* 
 	 * Fudge together a funny name.
 	 * Changing the format of the funny name to accomodate more 
 	 * sillynames per directory.
 	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is 
 	 * CPU ticks since boot.
 	 */
 	pid = cnp->cn_thread->td_proc->p_pid;
 	lticks = (unsigned int)ticks;
 	for ( ; ; ) {
 		sp->s_namlen = sprintf(sp->s_name, 
 				       ".nfs.%08x.%04x4.4", lticks, 
 				       pid);
 		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 				 cnp->cn_thread, NULL))
 			break;
 		lticks++;
 	}
 	error = nfs_renameit(dvp, cnp, sp);
 	if (error)
 		goto bad;
 	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		cnp->cn_thread, &np);
 	np->n_sillyrename = sp;
 	return (0);
 bad:
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free((caddr_t)sp, M_NFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  *
  * Sends an NFSPROC_LOOKUP for "name" (length "len") in directory dvp using
  * cred on behalf of td.  Returns 0 on success, ENOENT when a v3 reply has
  * no post-op attributes and a new node was requested, the nfs_nget() error
  * if obtaining the node fails, or the error left by the nfsm_* macros.
  */
 static int
 nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL;
 	struct nfsnode *np, *dnp = VTONFS(dvp);
 	caddr_t bpos, dpos;
 	int error = 0, fhlen, attrflag;
 	struct mbuf *mreq, *mrep, *md, *mb;
 	nfsfh_t *nfhp;
 	int v3 = NFS_ISV3(dvp);
 
 	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
 		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(dvp, v3);
 	nfsm_strtom(name, len, NFS_MAXNAMLEN);
 	nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
 	if (npp && !error) {
 		nfsm_getfh(nfhp, fhlen, v3);
 		if (*npp) {
 		    /*
 		     * Caller supplied a node: replace its file handle,
 		     * switching between the embedded n_fh storage and a
 		     * separately allocated M_NFSBIGFH buffer when the new
 		     * size crosses NFS_SMALLFH.
 		     */
 		    np = *npp;
 		    if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
 			free((caddr_t)np->n_fhp, M_NFSBIGFH);
 			np->n_fhp = &np->n_fh;
 		    } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
 			np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
 		    bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
 		    np->n_fhsize = fhlen;
 		    newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
 		    /*
 		     * The name resolves to the directory itself.
 		     * NOTE(review): np is not assigned on this path, yet
 		     * the nfsmout code stores np into *npp when error is
 		     * 0 -- confirm this cannot be reached by callers.
 		     */
 		    VREF(dvp);
 		    newvp = dvp;
 		} else {
 		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
 		    if (error) {
 			m_freem(mrep);
 			return (error);
 		    }
 		    newvp = NFSTOV(np);
 		}
 		if (v3) {
 			nfsm_postop_attr(newvp, attrflag);
 			/* A newly obtained node without attributes is treated as not found. */
 			if (!attrflag && *npp == NULL) {
 				m_freem(mrep);
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 				return (ENOENT);
 			}
 		} else
 			nfsm_loadattr(newvp, NULL);
 	}
 	m_freem(mrep);
 nfsmout:
 	/*
 	 * Only when the caller asked for a new node: on error release the
 	 * vnode obtained above, otherwise hand the node back through npp.
 	 */
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 	return (error);
 }
 
 /*
  * Nfs Version 3 commit rpc
  *
  * Sends NFSPROC_COMMIT for "cnt" bytes starting at "offset" of vp.
  * Returns 0 without issuing an RPC when the mount does not have
  * NFSSTA_HASWRITEVERF set.  If the write verifier in the reply differs
  * from the one cached in the mount, the cached copy is replaced and
  * NFSERR_STALEWRITEVERF is returned.
  */
 int
 nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
 	   struct thread *td)
 {
 	u_int32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	caddr_t bpos, dpos;
 	int error = 0, wccflag = NFSV3_WCCRATTR;
 	struct mbuf *mreq, *mrep, *md, *mb;
 
 	/* nm_state is examined under the mount mutex. */
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (0);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 	nfsstats.rpccnt[NFSPROC_COMMIT]++;
 	mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
 	mb = mreq;
 	bpos = mtod(mb, caddr_t);
 	nfsm_fhtom(vp, 1);
 	/* Arguments: 64-bit offset (two words) followed by the byte count. */
 	tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	*tl = txdr_unsigned(cnt);
 	nfsm_request(vp, NFSPROC_COMMIT, td, cred);
 	nfsm_wcc_data(vp, wccflag);
 	if (!error) {
 		/* Compare the returned write verifier with the cached one. */
 		tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
 		if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
 			NFSX_V3WRITEVERF)) {
 			bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
 				NFSX_V3WRITEVERF);
 			error = NFSERR_STALEWRITEVERF;
 		}
 	}
 	m_freem(mrep);
 nfsmout:
 	return (error);
 }
 
 /*
  * Strategy routine.
  * A synchronous buffer is serviced directly with nfs_doio().  An
  * asynchronous buffer is first offered to nfs_asyncio(); if that call
  * returns non-zero the request is serviced here with nfs_doio() as well.
  * Always returns 0.
  */
 static int
 nfs_strategy(struct vop_strategy_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp = ap->a_bp;
 	struct ucred *cr;
 	int queued;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp));
 
 	/* Reads use the buffer's read credential, everything else the write one. */
 	cr = (bp->b_iocmd == BIO_READ) ? bp->b_rcred : bp->b_wcred;
 
 	/*
 	 * Only asynchronous buffers are handed to nfs_asyncio(); a zero
 	 * return from it means the request was taken off our hands.
 	 */
 	queued = 0;
 	if ((bp->b_flags & B_ASYNC) != 0 &&
 	    nfs_asyncio(VFSTONFS(vp->v_mount), bp, NOCRED, curthread) == 0)
 		queued = 1;
 	if (!queued)
 		(void)nfs_doio(vp, bp, cr, curthread);
 	return (0);
 }
 
 /*
  * fsync vnode op: forwards to nfs_flush() with the commit argument set
  * to 1 and returns its result.
  */
 /* ARGSUSED */
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
 	const int commit = 1;
 
 	return (nfs_flush(ap->a_vp, ap->a_waitfor, ap->a_td, commit));
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  * 	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  *
  * vp      - vnode whose dirty buffer list is walked
  * waitfor - with MNT_WAIT, sleeps on busy buffers and waits for pending
  *           output and direct-I/O async writes to finish
  * td      - thread used for signal checks, sleeps and the commit rpc
  * commit  - non-zero enables the V3 commit pass over buffers marked
  *           B_DELWRI | B_NEEDCOMMIT
  *
  * Returns 0, EINTR when interrupted, or the write error recorded in the
  * nfsnode (n_error) when NWRITEERR was set.
  */
 static int
 nfs_flush(struct vnode *vp, int waitfor, struct thread *td,
     int commit)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	int passone = 1;
 	u_quad_t off, endoff, toff;
 	struct ucred* wcred = NULL;
 	struct buf **bvec = NULL;
 #ifndef NFS_COMMITBVECSIZ
 #define NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
 	int bvecsize = 0, bveccount;
 
 	/* Interruptible mounts let the sleeps below catch signals. */
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, nfs_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	if (NFS_ISV3(vp) && commit) {
 		s = splbio();
 		/* Drop a heap vector left over from an earlier pass. */
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		VI_LOCK(vp);
 		TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			VI_UNLOCK(vp);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			VI_LOCK(vp);
 			if (bvec == NULL) {
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		/*
 		 * Collect up to bvecsize lockable buffers that still need a
 		 * commit, tracking the byte range [off, endoff) they cover.
 		 */
 		TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			VI_UNLOCK(vp);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			VI_LOCK(vp);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		splx(s);
 		VI_UNLOCK(vp);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = nfs_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				/* These locals shadow the function-level "off". */
 				off_t off, size;
 				bp = bvec[i];
 				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				size = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = nfs_commit(vp, off, (int)size,
 						  bp->b_wcred, td);
 				if (retv) break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			nfs_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				s = splbio();
 				bufobj_wref(&vp->v_bufobj);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				splx(s);
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 */
 loop:
 	s = splbio();
 	VI_LOCK(vp);
 	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			/* Busy buffer: only wait for it on a MNT_WAIT second pass. */
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
 			splx(s);
 			if (error == 0) {
 				BUF_UNLOCK(bp);
 				goto loop;
 			}
 			if (error == ENOLCK)
 				goto loop;
 			if (nfs_sigintr(nmp, NULL, td)) {
 				error = EINTR;
 				goto done;
 			}
 			/* After one caught sleep, fall back to a 2 second timeout. */
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs_fsync: not dirty");
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		VI_UNLOCK(vp);
 		bremfree(bp);
 		/* NOTE(review): both branches set B_ASYNC; the test has no effect. */
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
 		    bp->b_flags |= B_ASYNC;
 		splx(s);
 		bwrite(bp);
 		if (nfs_sigintr(nmp, NULL, td)) {
 			error = EINTR;
 			goto done;
 		}
 		goto loop;
 	}
 	splx(s);
 	/* First pass finished: rerun from the commit stage as pass two. */
 	if (passone) {
 		passone = 0;
 		VI_UNLOCK(vp);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		while (vp->v_bufobj.bo_numoutput) {
 			error = bufobj_wwait(&vp->v_bufobj, slpflag, slptimeo);
 			if (error) {
 			    VI_UNLOCK(vp);
 			    error = nfs_sigintr(nmp, NULL, td);
 			    if (error)
 				goto done;
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    VI_LOCK(vp);
 			}
 		}
 		/* Buffers dirtied while we slept: go round again. */
 		if (vp->v_bufobj.bo_dirty.bv_cnt != 0 && commit) {
 			VI_UNLOCK(vp);
 			goto loop;
 		}
 		/*
 		 * Wait for all the async IO requests to drain
 		 */
 		VI_UNLOCK(vp);
 		mtx_lock(&np->n_mtx);
 		while (np->n_directio_asyncwr > 0) {
 			np->n_flag |= NFSYNCWAIT;
 			error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
 					   &np->n_mtx, slpflag | (PRIBIO + 1), 
 					   "nfsfsync", 0);
 			if (error) {
 				if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
 					mtx_unlock(&np->n_mtx);
 					error = EINTR;	
 					goto done;
 				}
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 	} else
 		VI_UNLOCK(vp);
 	mtx_lock(&np->n_mtx);
 	/* Report and clear a write error recorded in the node. */
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
 	/* Nothing dirty, in flight or queued: the node is no longer modified. */
   	if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 	    vp->v_bufobj.bo_numoutput == 0 && np->n_directio_asyncwr == 0)
   		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  * With Giant held, mounts flagged NFSMNT_NOLOCKD are served by
  * lf_advlock() on the node's own lock list; all other mounts go through
  * nfs_dolock().  Returns whatever the chosen routine returns.
  */
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
 	struct nfsnode *np;
 	int error;
 
 	mtx_lock(&Giant);
 	if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) == 0) {
 		error = nfs_dolock(ap);
 	} else {
 		np = VTONFS(ap->a_vp);
 		error = lf_advlock(ap, &(np->n_lockf), np->n_size);
 	}
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Print the file id and fsid cached in an nfsnode, followed by the fifo
  * details when the vnode is a VFIFO.  Always returns 0.
  */
 static int
 nfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 
 	nfs_printf("\tfileid %ld fsid 0x%x",
 	    np->n_vattr.va_fileid, np->n_vattr.va_fsid);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	/* Terminate the line started above. */
 	printf("\n");
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  * B_INVAL buffers are simply released.  Any other buffer gets B_CACHE
  * set, is undirtied and handed to bstrategy().  If the buffer was not
  * B_ASYNC on entry, the routine waits for the I/O, releases the buffer
  * and returns the bufwait() result; otherwise it returns 0 at once.
  */
 int
 nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	/* Snapshot the flags that matter later, before they are modified. */
 	const int was_async = (bp->b_flags & B_ASYNC) != 0;
 	const int was_delwri = (bp->b_flags & B_DELWRI) != 0;
 	int rtval;
 	int s;
 #if 0
 	int retv = 1;
 	off_t off;
 #endif
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bwrite: buffer is not locked???");
 
 	if ((bp->b_flags & B_INVAL) != 0) {
 		brelse(bp);
 		return (0);
 	}
 
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Take the buffer off the dirty list and reset its completion and
 	 * error state; it is redirtied later if the I/O fails.
 	 */
 	s = splbio();
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 	splx(s);
 
 	/*
 	 * b_runningbufspace is deliberately left unassigned to avoid
 	 * loopback deadlocks.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/* Asynchronous writes complete on their own. */
 	if (was_async)
 		return (0);
 
 	rtval = bufwait(bp);
 	if (was_delwri) {
 		s = splbio();
 		reassignbuf(bp);
 		splx(s);
 	}
 	brelse(bp);
 	return (rtval);
 }
 
 /*
  * nfs special file access vnode op.
  * Fetches the attributes with VOP_GETATTR() and lets vaccess() decide,
  * since the device itself is local to the client.  Returns EROFS for a
  * write request on a regular file, directory or symlink of a read-only
  * mount, otherwise the VOP_GETATTR() error or the vaccess() result.
  */
 static int
 nfsspec_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred = ap->a_cred;
 	mode_t mode = ap->a_mode;
 	struct vattr va;
 	int error;
 
 	/*
 	 * On a read-only mount, writes are refused for regular files,
 	 * directories and symlinks; every other vnode type is still
 	 * checked below.
 	 */
 	if ((mode & VWRITE) != 0 &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0 &&
 	    (vp->v_type == VREG || vp->v_type == VDIR ||
 	    vp->v_type == VLNK))
 		return (EROFS);
 
 	error = VOP_GETATTR(vp, &va, cred, ap->a_td);
 	if (error == 0)
 		error = vaccess(vp->v_type, va.va_mode, va.va_uid,
 		    va.va_gid, mode, cred, NULL);
 	return error;
 }
 
 /*
  * Read wrapper for fifos: marks the node as accessed (NACC) and stamps
  * n_atim under the node mutex, then defers to the fifo read op.
  */
 static int
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np;
 
 	np = VTONFS(ap->a_vp);
 
 	/* Record the access before handing off. */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NACC;
 	getnanotime(&np->n_atim);
 	mtx_unlock(&np->n_mtx);
 
 	return (fifo_specops.vop_read(ap));
 }
 
 /*
  * Write wrapper for fifos: marks the node as updated (NUPD) and stamps
  * n_mtim under the node mutex, then defers to the fifo write op.
  */
 static int
 nfsfifo_write(struct vop_write_args *ap)
 {
 	struct nfsnode *np;
 	int error;
 
 	np = VTONFS(ap->a_vp);
 
 	/* Record the update before handing off. */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NUPD;
 	getnanotime(&np->n_mtim);
 	mtx_unlock(&np->n_mtx);
 
 	error = fifo_specops.vop_write(ap);
 	return (error);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * If the node was marked accessed or updated, refresh the matching
  * times and flag the node NCHG.  When this is the only reference to the
  * vnode and the mount is writable, the times are also pushed out with
  * VOP_SETATTR().  The fifo close op runs last and supplies the result.
  */
 static int
 nfsfifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	struct timespec now;
 	int push_times = 0;
 
 	mtx_lock(&np->n_mtx);
 	if ((np->n_flag & (NACC | NUPD)) != 0) {
 		getnanotime(&now);
 		if ((np->n_flag & NACC) != 0)
 			np->n_atim = now;
 		if ((np->n_flag & NUPD) != 0)
 			np->n_mtim = now;
 		np->n_flag |= NCHG;
 		if (vrefcnt(vp) == 1 &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			/* Prepare the attribute update while still locked. */
 			VATTR_NULL(&vattr);
 			if ((np->n_flag & NACC) != 0)
 				vattr.va_atime = np->n_atim;
 			if ((np->n_flag & NUPD) != 0)
 				vattr.va_mtime = np->n_mtim;
 			push_times = 1;
 		}
 	}
 	mtx_unlock(&np->n_mtx);
 
 	/* The setattr call is made with the node mutex dropped. */
 	if (push_times)
 		(void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Buffer write hook: calls nfs_writebp() with the force argument set to
  * 1 on behalf of the current thread.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs_bwrite(struct buf *bp)
 {
 	const int force = 1;
 
 	return (nfs_writebp(bp, force, curthread));
 }
 
 /*
  * Buffer operations vector for NFS: only the write hook is NFS
  * specific, the remaining entries are the generic buf routines.
  */
 struct buf_ops buf_ops_nfs = {
 	.bop_name = "buf_ops_nfs",
 	.bop_write = nfs_bwrite,
 	.bop_strategy = bufstrategy,
 	.bop_sync = bufsync,
 	.bop_bdflush = bufbdflush,
 };
Index: head/sys/nfsserver/nfs_serv.c
===================================================================
--- head/sys/nfsserver/nfs_serv.c	(revision 175201)
+++ head/sys/nfsserver/nfs_serv.c	(revision 175202)
@@ -1,4284 +1,4284 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_serv.c  8.8 (Berkeley) 7/31/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * nfs version 2 and 3 server calls to vnode ops
  * - these routines generally have 3 phases
  *   1 - break down and validate rpc request in mbuf list
  *   2 - do the vnode ops for the request
  *       (surprisingly ?? many are very similar to syscalls in vfs_syscalls.c)
  *   3 - build the rpc reply in an mbuf list
  *   nb:
  *	- do not mix the phases, since the nfsm_?? macros can return failures
  *	  on a bad rpc or similar and do not do any vrele() or vput()'s
  *
  *      - the nfsm_reply() macro generates an nfs rpc reply with the nfs
  *	error number iff error != 0 whereas
  *	returning an error from the server function implies a fatal error
  *	such as a badly constructed rpc request that should be dropped without
  *	a reply.
  *	For nfsm_reply(), the case where error == EBADRPC is treated
  *	specially; after constructing a reply, it does an immediate
  *	`goto nfsmout' to avoid getting any V3 post-op status appended.
  *
  * Other notes:
  *	Warning: always pay careful attention to resource cleanup on return
  *	and note that nfsm_*() macros can terminate a procedure on certain
  *	errors.
  *
  *	lookup() and namei()
  *	may return garbage in various structural fields/return elements
  *	if an error is returned, and may garbage up nd.ni_dvp even if no
  *	error is returned and you did not request LOCKPARENT or WANTPARENT.
  *
  *	We use the ni_cnd.cn_flags 'HASBUF' flag to track whether the name
  *	buffer has been freed or not.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/dirent.h>
 #include <sys/stat.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <nfs/nfsproto.h>
 #include <nfs/rpcv2.h>
 #include <nfsserver/nfs.h>
 #include <nfs/xdr_subs.h>
 #include <nfsserver/nfsm_subs.h>
 
 /*
  * Debug tracing.  With NFSRV_DEBUG defined, nfsdbprintf() hands its
  * (parenthesized) argument list to printf; otherwise it expands to nothing.
  */
 #ifdef NFSRV_DEBUG
 #define nfsdbprintf(info)	printf info
 #else
 #define nfsdbprintf(info)
 #endif
 
 #define MAX_COMMIT_COUNT	(1024 * 1024)
 
 /*
  * Tuning for the nfsheur[] table used by nfsrv_read():
  *	NUM_HEURISTIC	number of table entries
  *	NHUSE_INIT	use count given to a freshly claimed entry
  *	NHUSE_INC	amount added to the use count on each read
  *	NHUSE_MAX	upper bound for the use count
  */
 #define NUM_HEURISTIC		1017
 #define NHUSE_INIT		64
 #define NHUSE_INC		16
 #define NHUSE_MAX		2048
 
 /*
  * Per-vnode sequential-read detection state, indexed by a hash of the
  * vnode pointer (see nfsrv_read()).
  */
 static struct nfsheur {
 	struct vnode *nh_vp;	/* vp to match (unreferenced pointer) */
 	off_t nh_nextr;		/* next offset for sequential detection */
 	int nh_use;		/* use count for selection */
 	int nh_seqcount;	/* heuristic */
 } nfsheur[NUM_HEURISTIC];
 
 /* Global vars */
 
 int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000;
 int nfsrvw_procrastinate_v3 = 0;
 
 /*
  * Value placed in the verifier fields of v3 write replies; nfsrv_write()
  * copies boottime into it the first time it finds tv_sec still zero.
  */
 static struct timeval	nfsver = { 0 };
 
 SYSCTL_NODE(_vfs, OID_AUTO, nfsrv, CTLFLAG_RW, 0, "NFS server");
 
 /* Knobs and counters exported read/write under vfs.nfsrv. */
 static int nfs_async;
 static int nfs_commit_blks;
 static int nfs_commit_miss;
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, async, CTLFLAG_RW, &nfs_async, 0, "");
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks, 0, "");
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, "");
 
 struct nfsrvstats nfsrvstats;
 SYSCTL_STRUCT(_vfs_nfsrv, NFS_NFSRVSTATS, nfsrvstats, CTLFLAG_RW,
 	&nfsrvstats, nfsrvstats, "S,nfsrvstats");
 
 /* Forward declarations for helpers defined later in this file. */
 static int	nfsrv_access(struct vnode *, int, struct ucred *, int,
 		    struct thread *, int);
 static void	nfsrvw_coalesce(struct nfsrv_descript *,
 		    struct nfsrv_descript *);
 
 /*
  * Reset the nameidata fields that the nfsmout cleanup code tests.  Call
  * this before using the first nfsm macro, since any of those macros might
  * jump to the cleanup code.
  */
 
 static __inline void
 ndclear(struct nameidata *nd)
 {
 
 	/* Vnode pointers first, then the component-name flags. */
 	nd->ni_startdir = NULL;
 	nd->ni_dvp = NULL;
 	nd->ni_vp = NULL;
 	nd->ni_cnd.cn_flags = 0;
 }
 
 /*
  * Merge two vfslocked integers so that at most one reference to giant
  * remains.  When both are set, the second reference is dropped.  The
  * return value is non-zero if either argument was set, which lets the
  * nfsrv ops track a single vfslocked variable.
  */
 static __inline int
 nfsrv_lockedpair(int vfs1, int vfs2)
 {
 	int held;
 
 	held = vfs1 | vfs2;
 	if (vfs1 != 0 && vfs2 != 0) {
 		/* Both set: release the second reference. */
 		VFS_UNLOCK_GIANT(vfs2);
 	}
 
 	return (held);
 }
 
 /*
  * Variant of nfsrv_lockedpair() that takes its second vfslocked value
  * from NDHASGIANT() on the supplied nameidata.
  */
 static __inline int
 nfsrv_lockedpair_nd(int vfs1, struct nameidata *nd)
 {
 
 	return (nfsrv_lockedpair(vfs1, NDHASGIANT(nd)));
 }
 
 /*
  * nfs v3 access service
  *
  * Decodes a file handle and a requested access mask, clears each group of
  * access bits that nfsrv_access() refuses for the request credentials, and
  * replies with the post-op attributes followed by the remaining mask.
  * Panics if called for a request that is not flagged ND_NFSV3.
  *
  * The nfsm_*() macros may jump straight to nfsmout, so everything that the
  * cleanup code looks at (vp, vfslocked) must be valid before the first
  * macro is used.
  */
 int
 nfsrv3_access(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int error = 0, rdonly, getret;
 	struct mbuf *mb, *mreq;
 	struct vattr vattr, *vap = &vattr;
 	u_long testmode, nfsmode;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	if (!v3)
 		panic("nfsrv3_access: v3 proc called on a v2 connection");
 	/*
 	 * nfsmout passes vfslocked to VFS_UNLOCK_GIANT(); give it a defined
 	 * value before any macro can branch there, as the other handlers do.
 	 */
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		/* Error reply built; clear error so the reply is sent. */
 		nfsm_reply(NFSX_UNSIGNED);
 		nfsm_srvpostop_attr(1, NULL);
 		error = 0;
 		goto nfsmout;
 	}
 	nfsmode = fxdr_unsigned(u_int32_t, *tl);
 	/* Read access. */
 	if ((nfsmode & NFSV3ACCESS_READ) &&
 		nfsrv_access(vp, VREAD, cred, rdonly, td, 0))
 		nfsmode &= ~NFSV3ACCESS_READ;
 	/* Write-class bits; DELETE is only tested for directories. */
 	if (vp->v_type == VDIR)
 		testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
 			NFSV3ACCESS_DELETE);
 	else
 		testmode = (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
 	if ((nfsmode & testmode) &&
 		nfsrv_access(vp, VWRITE, cred, rdonly, td, 0))
 		nfsmode &= ~testmode;
 	/* VEXEC covers LOOKUP on directories and EXECUTE on anything else. */
 	if (vp->v_type == VDIR)
 		testmode = NFSV3ACCESS_LOOKUP;
 	else
 		testmode = NFSV3ACCESS_EXECUTE;
 	if ((nfsmode & testmode) &&
 		nfsrv_access(vp, VEXEC, cred, rdonly, td, 0))
 		nfsmode &= ~testmode;
 	getret = VOP_GETATTR(vp, vap, cred, td);
 	/* Release the vnode before the reply macros can bail out. */
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_POSTOPATTR(1) + NFSX_UNSIGNED);
 	nfsm_srvpostop_attr(getret, vap);
 	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(nfsmode);
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs getattr service
  *
  * Translates the file handle in the request into a vnode, fetches its
  * attributes with VOP_GETATTR() and, when that succeeds, appends them to
  * the reply.  The vnode is released before the reply is built.
  *
  * Several locals (mrep, md, dpos, mb, mreq, bpos, ...) are not referenced
  * directly below; presumably they are consumed by name inside the nfsm_*()
  * macros, which can also jump to nfsmout.
  */
 int
 nfsrv_getattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nfs_fattr *fp;
 	struct vattr va;
 	struct vattr *vap = &va;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	caddr_t bpos;
 	int error = 0, rdonly;
 	struct mbuf *mb, *mreq;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	/* Must be valid before any macro can branch to nfsmout. */
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp, nam,
 	    &rdonly, TRUE);
 	if (error) {
 		/* Error reply built; clear error so the reply is sent. */
 		nfsm_reply(0);
 		error = 0;
 		goto nfsmout;
 	}
 	error = VOP_GETATTR(vp, vap, cred, td);
 	/* Done with the vnode; drop it before building the reply. */
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_FATTR(nfsd->nd_flag & ND_NFSV3));
 	if (error) {
 		error = 0;
 		goto nfsmout;
 	}
 	fp = nfsm_build(struct nfs_fattr *,
 	    NFSX_FATTR(nfsd->nd_flag & ND_NFSV3));
 	nfsm_srvfillattr(vap, fp);
 	/* fall through */
 
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs setattr service
  *
  * Decodes the new attributes (v3 sattr plus optional ctime guard, or a v2
  * sattr), looks up the vnode for the file handle, performs the permission
  * checks and applies the change with VOP_SETATTR().  The reply carries
  * wcc data for v3 or, on success, a v2 fattr.
  *
  * The write is bracketed by vn_start_write()/vn_finished_write() on the
  * mount found through the file handle's fsid.  Note that the nfsm_*()
  * macros may jump to nfsmout at any point.
  */
 int
 nfsrv_setattr(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vattr va, preat;
 	struct vattr *vap = &va;
 	struct nfsv2_sattr *sp;
 	struct nfs_fattr *fp;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int error = 0, rdonly, preat_ret = 1, postat_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3), gcheck = 0;
 	struct mbuf *mb, *mreq;
 	struct timespec guard = { 0, 0 };
 	struct mount *mp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto out;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	VATTR_NULL(vap);
 	if (v3) {
 		nfsm_srvsattr(vap);
 		tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 		/* Optional guard: the ctime the client expects to find. */
 		gcheck = fxdr_unsigned(int, *tl);
 		if (gcheck) {
 			tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED);
 			fxdr_nfsv3time(tl, &guard);
 		}
 	} else {
 		sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR);
 		/*
 		 * Nah nah nah nah na nah
 		 * There is a bug in the Sun client that puts 0xffff in the mode
 		 * field of sattr when it should put in 0xffffffff. The u_short
 		 * doesn't sign extend.
 		 * --> check the low order 2 bytes for 0xffff
 		 */
 		if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
 			vap->va_mode = nfstov_mode(sp->sa_mode);
 		/* Fields equal to nfsrv_nfs_xdrneg1 are left unset. */
 		if (sp->sa_uid != nfsrv_nfs_xdrneg1)
 			vap->va_uid = fxdr_unsigned(uid_t, sp->sa_uid);
 		if (sp->sa_gid != nfsrv_nfs_xdrneg1)
 			vap->va_gid = fxdr_unsigned(gid_t, sp->sa_gid);
 		if (sp->sa_size != nfsrv_nfs_xdrneg1)
 			vap->va_size = fxdr_unsigned(u_quad_t, sp->sa_size);
 		if (sp->sa_atime.nfsv2_sec != nfsrv_nfs_xdrneg1) {
 #ifdef notyet
 			fxdr_nfsv2time(&sp->sa_atime, &vap->va_atime);
 #else
 			vap->va_atime.tv_sec =
 				fxdr_unsigned(int32_t, sp->sa_atime.nfsv2_sec);
 			vap->va_atime.tv_nsec = 0;
 #endif
 		}
 		if (sp->sa_mtime.nfsv2_sec != nfsrv_nfs_xdrneg1)
 			fxdr_nfsv2time(&sp->sa_mtime, &vap->va_mtime);
 
 	}
 
 	/*
 	 * Now that we have all the fields, lets do it.
 	 */
 	error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	/* Fold the two vfslocked values into one. */
 	vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked);
 	if (error) {
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * vp now an active resource, pay careful attention to cleanup
 	 */
 	if (v3) {
 		error = preat_ret = VOP_GETATTR(vp, &preat, cred, td);
 		/* Guard check: refuse if ctime differs from the client's. */
 		if (!error && gcheck &&
 			(preat.va_ctime.tv_sec != guard.tv_sec ||
 			 preat.va_ctime.tv_nsec != guard.tv_nsec))
 			error = NFSERR_NOT_SYNC;
 		if (error) {
 			vput(vp);
 			vp = NULL;
 			nfsm_reply(NFSX_WCCDATA(v3));
 			if (v3)
 				nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap);
 			error = 0;
 			goto nfsmout;
 		}
 	}
 
 	/*
 	 * If the size is being changed write access is required, otherwise
 	 * just check for a read only filesystem.
 	 */
 	if (vap->va_size == ((u_quad_t)((quad_t) -1))) {
 		if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 			error = EROFS;
 			goto out;
 		}
 	} else {
 		if (vp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		} else if ((error = nfsrv_access(vp, VWRITE, cred, rdonly,
 			td, 0)) != 0)
 			goto out;
 	}
 	error = VOP_SETATTR(vp, vap, cred, td);
 	/* vap is reused to hold the post-operation attributes. */
 	postat_ret = VOP_GETATTR(vp, vap, cred, td);
 	if (!error)
 		error = postat_ret;
 out:
 	if (vp != NULL)
 		vput(vp);
 
 	vp = NULL;
 	nfsm_reply(NFSX_WCCORFATTR(v3));
 	if (v3) {
 		nfsm_srvwcc_data(preat_ret, &preat, postat_ret, vap);
 	} else if (!error) {
 		/* v2 non-error case. */
 		fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 		nfsm_srvfillattr(vap, fp);
 	}
 	error = 0;
 	/* fall through */
 
 nfsmout:
 	if (vp)
 		vput(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs lookup rpc
  *
  * Resolves a name relative to the directory file handle in the request
  * via nfs_namei() and replies with the file handle and attributes of the
  * result (plus the directory's post-op attributes for v3).
  *
  * When the public file handle is used and the result is a directory, an
  * attempt is made to substitute the configured index file
  * (nfs_pub.np_index); results outside the publicly exported mount are
  * refused with EPERM.
  *
  * Resource tracking: ndp points at whichever nameidata (nd or ind)
  * currently owns ni_vp and ni_startdir; the name buffer always stays with
  * nd.  The nfsm_*() macros may jump to nfsmout at any point.
  */
 int
 nfsrv_lookup(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nfs_fattr *fp;
 	struct nameidata nd, ind, *ndp = &nd;
 	struct vnode *vp, *dirp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	caddr_t bpos;
 	int error = 0, len, dirattr_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3), pubflag;
 	struct mbuf *mb, *mreq;
 	struct vattr va, dirattr, *vap = &va;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	nfsm_srvnamesiz(len);
 
 	pubflag = nfs_ispublicfh(fhp);
 
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = LOOKUP;
 	nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART | MPSAFE;
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirattr, &dirattr_ret, td, pubflag);
 	vfslocked = NDHASGIANT(&nd);
 
 	/*
 	 * namei failure, only dirp to cleanup.  Clear out garbage from
 	 * structure in case macros jump to nfsmout.
 	 */
 
 	if (error) {
 		if (dirp) {
 			vrele(dirp);
 			dirp = NULL;
 		}
 		nfsm_reply(NFSX_POSTOPATTR(v3));
 		if (v3)
 			nfsm_srvpostop_attr(dirattr_ret, &dirattr);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * Locate index file for public filehandle
 	 *
 	 * error is 0 on entry and 0 on exit from this block.
 	 */
 
 	if (pubflag) {
 		if (nd.ni_vp->v_type == VDIR && nfs_pub.np_index != NULL) {
 			/*
 			 * Setup call to lookup() to see if we can find
 			 * the index file. Arguably, this doesn't belong
 			 * in a kernel.. Ugh.  If an error occurs, do not
 			 * try to install an index file and then clear the
 			 * error.
 			 *
 			 * When we replace nd with ind and redirect ndp,
 			 * maintenance of ni_startdir and ni_vp shift to
 			 * ind and we have to clean them up in the old nd.
 			 * However, the cnd resource continues to be maintained
 			 * via the original nd.  Confused?  You aren't alone!
 			 */
 			ind = nd;
 			VOP_UNLOCK(nd.ni_vp, 0, td);
 			ind.ni_pathlen = strlen(nfs_pub.np_index);
 			ind.ni_cnd.cn_nameptr = ind.ni_cnd.cn_pnbuf =
 			    nfs_pub.np_index;
 			ind.ni_startdir = nd.ni_vp;
 			VREF(ind.ni_startdir);
 			ind.ni_cnd.cn_flags &= ~GIANTHELD;
 			tvfslocked = VFS_LOCK_GIANT(ind.ni_startdir->v_mount);
 			/*
 			 * NOTE(review): GIANTHELD is set on nd here although
 			 * it was just cleared on ind, which is the structure
 			 * handed to lookup() and examined afterwards --
 			 * confirm whether ind was intended.
 			 */
 			if (tvfslocked)
 				nd.ni_cnd.cn_flags |= GIANTHELD;
 			error = lookup(&ind);
 			ind.ni_dvp = NULL;
 			vfslocked = nfsrv_lockedpair_nd(vfslocked, &ind);
 			ind.ni_cnd.cn_flags &= ~GIANTHELD;
 
 			if (error == 0) {
 				/*
 				 * Found an index file. Get rid of
 				 * the old references.  transfer nd.ni_vp'
 				 */
 				if (dirp)
 					vrele(dirp);
 				dirp = nd.ni_vp;
 				nd.ni_vp = NULL;
 				vrele(nd.ni_startdir);
 				nd.ni_startdir = NULL;
 				ndp = &ind;
 			}
 			error = 0;
 		}
 		/*
 		 * If the public filehandle was used, check that this lookup
 		 * didn't result in a filehandle outside the publicly exported
 		 * filesystem.  We clear the poor vp here to avoid lockups due
 		 * to NFS I/O.
 		 */
 
 		if (ndp->ni_vp->v_mount != nfs_pub.np_mount) {
 			/*
 			 * Release through ndp: once an index file has been
 			 * installed ndp points at ind and nd.ni_vp is NULL,
 			 * so the vnode that was just tested is ndp->ni_vp.
 			 */
 			vput(ndp->ni_vp);
 			ndp->ni_vp = NULL;
 			error = EPERM;
 		}
 	}
 
 	/*
 	 * Resources at this point:
 	 *	ndp->ni_vp	may not be NULL
 	 */
 
 	if (error) {
 		nfsm_reply(NFSX_POSTOPATTR(v3));
 		if (v3)
 			nfsm_srvpostop_attr(dirattr_ret, &dirattr);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * Get underlying attribute, then release remaining resources ( for
 	 * the same potential blocking reason ) and reply.
 	 */
 	vp = ndp->ni_vp;
 	bzero((caddr_t)fhp, sizeof(nfh));
 	fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fhp->fh_fid);
 	if (!error)
 		error = VOP_GETATTR(vp, vap, cred, td);
 
 	vput(vp);
 	vrele(ndp->ni_startdir);
 	vrele(dirp);
 	/* Everything is released; keep nfsmout from touching it again. */
 	ndp->ni_vp = NULL;
 	ndp->ni_startdir = NULL;
 	dirp = NULL;
 	nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPORFATTR(v3) + NFSX_POSTOPATTR(v3));
 	if (error) {
 		if (v3)
 			nfsm_srvpostop_attr(dirattr_ret, &dirattr);
 		error = 0;
 		goto nfsmout;
 	}
 	nfsm_srvfhtom(fhp, v3);
 	if (v3) {
 		nfsm_srvpostop_attr(0, vap);
 		nfsm_srvpostop_attr(dirattr_ret, &dirattr);
 	} else {
 		fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 		nfsm_srvfillattr(vap, fp);
 	}
 
 nfsmout:
 	if (ndp->ni_vp || dirp || ndp->ni_startdir) {
 		if (ndp->ni_vp)
 			vput(ndp->ni_vp);
 		if (dirp)
 			vrele(dirp);
 		if (ndp->ni_startdir)
 			vrele(ndp->ni_startdir);
 	}
 	/* The path name buffer is always owned by the original nd. */
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs readlink service
  *
  * Pre-allocates an mbuf chain with room for NFS_MAXPATHLEN bytes, points a
  * uio at it, reads the link target with VOP_READLINK() and then trims the
  * chain to the rounded-up length actually read before linking it into the
  * reply.  Replies with EINVAL (v3) or ENXIO (v2) if the vnode is not a
  * symbolic link.
  *
  * mp3 is the head of the data chain; it is set to NULL once ownership has
  * passed to the reply so that nfsmout does not free it.
  */
 int
 nfsrv_readlink(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
 	struct iovec *ivp = iv;
 	struct mbuf *mp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int error = 0, rdonly, i, tlen, len, getret;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mp3, *nmp, *mreq;
 	struct vnode *vp = NULL;
 	struct vattr attr;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct uio io, *uiop = &io;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 #ifndef nolint
 	mp = NULL;
 #endif
 	mp3 = NULL;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	len = 0;
 	i = 0;
 	/* Build the receive chain, one cluster mbuf and one iovec at a time. */
 	while (len < NFS_MAXPATHLEN) {
 		MGET(nmp, M_TRYWAIT, MT_DATA);
 		MCLGET(nmp, M_TRYWAIT);
 		nmp->m_len = NFSMSIZ(nmp);
 		if (len == 0)
 			mp3 = mp = nmp;
 		else {
 			mp->m_next = nmp;
 			mp = nmp;
 		}
 		/* Clip the last mbuf so the total is exactly NFS_MAXPATHLEN. */
 		if ((len + mp->m_len) > NFS_MAXPATHLEN) {
 			mp->m_len = NFS_MAXPATHLEN - len;
 			len = NFS_MAXPATHLEN;
 		} else
 			len += mp->m_len;
 		ivp->iov_base = mtod(mp, caddr_t);
 		ivp->iov_len = mp->m_len;
 		i++;
 		ivp++;
 	}
 	uiop->uio_iov = iv;
 	uiop->uio_iovcnt = i;
 	uiop->uio_offset = 0;
 	uiop->uio_resid = len;
 	uiop->uio_rw = UIO_READ;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = NULL;
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvpostop_attr(1, NULL);
 		error = 0;
 		goto nfsmout;
 	}
 	if (vp->v_type != VLNK) {
 		if (v3)
 			error = EINVAL;
 		else
 			error = ENXIO;
 	} else 
 		error = VOP_READLINK(vp, uiop, cred);
 	getret = VOP_GETATTR(vp, &attr, cred, td);
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_UNSIGNED);
 	if (v3)
 		nfsm_srvpostop_attr(getret, &attr);
 	if (error) {
 		error = 0;
 		goto nfsmout;
 	}
 	/* Short read: shrink len and trim the unused tail of the chain. */
 	if (uiop->uio_resid > 0) {
 		len -= uiop->uio_resid;
 		tlen = nfsm_rndup(len);
 		nfsm_adj(mp3, NFS_MAXPATHLEN-tlen, tlen-len);
 	}
 	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(len);
 	/* Hand the data chain over to the reply. */
 	mb->m_next = mp3;
 	mp3 = NULL;
 nfsmout:
 	if (mp3)
 		m_freem(mp3);
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs read service
  */
 int
 nfsrv_read(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct iovec *iv;
 	struct iovec *iv2;
 	struct mbuf *m;
 	struct nfs_fattr *fp;
 	u_int32_t *tl;
 	int i;
 	caddr_t bpos;
 	int error = 0, rdonly, cnt, len, left, siz, tlen, getret;
 	int v3 = (nfsd->nd_flag & ND_NFSV3), reqlen;
 	struct mbuf *mb, *mreq;
 	struct mbuf *m2;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct uio io, *uiop = &io;
 	struct vattr va, *vap = &va;
 	struct nfsheur *nh;
 	off_t off;
 	int ioflag = 0;
 	int vfslocked;
 
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if (v3) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED);
 		off = fxdr_hyper(tl);
 	} else {
 		tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 		off = (off_t)fxdr_unsigned(u_int32_t, *tl);
 	}
 	nfsm_srvstrsiz(reqlen, NFS_SRVMAXDATA(nfsd));
 
 	/*
 	 * Reference vp.  If an error occurs, vp will be invalid, but we
 	 * have to NULL it just in case.  The macros might goto nfsmout
 	 * as well.
 	 */
 
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		vp = NULL;
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvpostop_attr(1, NULL);
 		error = 0;
 		goto nfsmout;
 	}
 
 	if (vp->v_type != VREG) {
 		if (v3)
 			error = EINVAL;
 		else
 			error = (vp->v_type == VDIR) ? EISDIR : EACCES;
 	}
 	if (!error) {
 		if ((error = nfsrv_access(vp, VREAD, cred, rdonly,
 		    td, 1)) != 0)
 			error = nfsrv_access(vp, VEXEC, cred,
 			    rdonly, td, 1);
 	}
 	getret = VOP_GETATTR(vp, vap, cred, td);
 	if (!error)
 		error = getret;
 	if (error) {
 		vput(vp);
 		vp = NULL;
 		nfsm_reply(NFSX_POSTOPATTR(v3));
 		if (v3)
 			nfsm_srvpostop_attr(getret, vap);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * Calculate byte count to read
 	 */
 
 	if (off >= vap->va_size)
 		cnt = 0;
 	else if ((off + reqlen) > vap->va_size)
 		cnt = vap->va_size - off;
 	else
 		cnt = reqlen;
 
 	/*
 	 * Calculate seqcount for heuristic
 	 */
 
 	{
 		int hi;
 		int try = 32;
 
 		/*
 		 * Locate best candidate
 		 */
 
 		hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
 		nh = &nfsheur[hi];
 
 		while (try--) {
 			if (nfsheur[hi].nh_vp == vp) {
 				nh = &nfsheur[hi];
 				break;
 			}
 			if (nfsheur[hi].nh_use > 0)
 				--nfsheur[hi].nh_use;
 			hi = (hi + 1) % NUM_HEURISTIC;
 			if (nfsheur[hi].nh_use < nh->nh_use)
 				nh = &nfsheur[hi];
 		}
 
 		if (nh->nh_vp != vp) {
 			nh->nh_vp = vp;
 			nh->nh_nextr = off;
 			nh->nh_use = NHUSE_INIT;
 			if (off == 0)
 				nh->nh_seqcount = 4;
 			else
 				nh->nh_seqcount = 1;
 		}
 
 		/*
 		 * Calculate heuristic
 		 */
 
 		if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
 			if (++nh->nh_seqcount > IO_SEQMAX)
 				nh->nh_seqcount = IO_SEQMAX;
 		} else if (nh->nh_seqcount > 1) {
 			nh->nh_seqcount = 1;
 		} else {
 			nh->nh_seqcount = 0;
 		}
 		nh->nh_use += NHUSE_INC;
 		if (nh->nh_use > NHUSE_MAX)
 			nh->nh_use = NHUSE_MAX;
 		ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
         }
 
 	nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt));
 	if (v3) {
 		tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED);
 		*tl++ = nfsrv_nfs_true;
 		fp = (struct nfs_fattr *)tl;
 		tl += (NFSX_V3FATTR / sizeof (u_int32_t));
 	} else {
 		tl = nfsm_build(u_int32_t *, NFSX_V2FATTR + NFSX_UNSIGNED);
 		fp = (struct nfs_fattr *)tl;
 		tl += (NFSX_V2FATTR / sizeof (u_int32_t));
 	}
 	len = left = nfsm_rndup(cnt);
 	if (cnt > 0) {
 		/*
 		 * Generate the mbuf list with the uio_iov ref. to it.
 		 */
 		i = 0;
 		m = m2 = mb;
 		while (left > 0) {
 			siz = min(M_TRAILINGSPACE(m), left);
 			if (siz > 0) {
 				left -= siz;
 				i++;
 			}
 			if (left > 0) {
 				MGET(m, M_TRYWAIT, MT_DATA);
 				MCLGET(m, M_TRYWAIT);
 				m->m_len = 0;
 				m2->m_next = m;
 				m2 = m;
 			}
 		}
 		MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
 		       M_TEMP, M_WAITOK);
 		uiop->uio_iov = iv2 = iv;
 		m = mb;
 		left = len;
 		i = 0;
 		while (left > 0) {
 			if (m == NULL)
 				panic("nfsrv_read iov");
 			siz = min(M_TRAILINGSPACE(m), left);
 			if (siz > 0) {
 				iv->iov_base = mtod(m, caddr_t) + m->m_len;
 				iv->iov_len = siz;
 				m->m_len += siz;
 				left -= siz;
 				iv++;
 				i++;
 			}
 			m = m->m_next;
 		}
 		uiop->uio_iovcnt = i;
 		uiop->uio_offset = off;
 		uiop->uio_resid = len;
 		uiop->uio_rw = UIO_READ;
 		uiop->uio_segflg = UIO_SYSSPACE;
 		error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
 		off = uiop->uio_offset;
 		nh->nh_nextr = off;
 		FREE((caddr_t)iv2, M_TEMP);
 		if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) {
 			if (!error)
 				error = getret;
 			m_freem(mreq);
 			vput(vp);
 			vp = NULL;
 			nfsm_reply(NFSX_POSTOPATTR(v3));
 			if (v3)
 				nfsm_srvpostop_attr(getret, vap);
 			error = 0;
 			goto nfsmout;
 		}
 	} else
 		uiop->uio_resid = 0;
 	vput(vp);
 	vp = NULL;
 	nfsm_srvfillattr(vap, fp);
 	tlen = len - uiop->uio_resid;
 	cnt = cnt < tlen ? cnt : tlen;
 	tlen = nfsm_rndup(cnt);
 	if (len != tlen || tlen != cnt)
 		nfsm_adj(mb, len - tlen, tlen - cnt);
 	if (v3) {
 		*tl++ = txdr_unsigned(cnt);
 		if (cnt < reqlen)
 			*tl++ = nfsrv_nfs_true;
 		else
 			*tl++ = nfsrv_nfs_false;
 	}
 	*tl = txdr_unsigned(cnt);
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs write service
  *
  * Decodes the file handle, offset, stability level (v3) and length, trims
  * the request mbuf chain in place so that only the data bytes remain,
  * checks that the target is a regular file writable by the caller and
  * writes the data with VOP_WRITE() using iovecs that point into the
  * request mbufs.
  *
  * The reply carries wcc data, the byte count, the committed level and the
  * write verifier for v3, or a v2 fattr on success.  A NULL request mbuf
  * produces no reply (*mrq is set to NULL).
  *
  * The operation is bracketed by vn_start_write()/vn_finished_write(), and
  * the nfsm_*() macros may jump to nfsmout at any point.
  */
 int
 nfsrv_write(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct iovec *ivp;
 	int i, cnt;
 	struct mbuf *mp;
 	struct nfs_fattr *fp;
 	struct iovec *iv;
 	struct vattr va, forat;
 	struct vattr *vap = &va;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int error = 0, rdonly, len, forat_ret = 1;
 	int ioflags, aftat_ret = 1, retlen = 0, zeroing, adjust;
 	int stable = NFSV3WRITE_FILESYNC;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct uio io, *uiop = &io;
 	off_t off;
 	struct mount *mntp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	if (mrep == NULL) {
 		*mrq = NULL;
 		error = 0;
 		goto nfsmout;
 	}
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mntp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mntp);
 	(void) vn_start_write(NULL, &mntp, V_WAIT);
 	vfs_rel(mntp);		/* The write holds a ref. */
 	if (v3) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED);
 		off = fxdr_hyper(tl);
 		tl += 3;
 		stable = fxdr_unsigned(int, *tl++);
 	} else {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 4 * NFSX_UNSIGNED);
 		off = (off_t)fxdr_unsigned(u_int32_t, *++tl);
 		tl += 2;
 		if (nfs_async)
 	    		stable = NFSV3WRITE_UNSTABLE;
 	}
 	retlen = len = fxdr_unsigned(int32_t, *tl);
 	cnt = i = 0;
 
 	/*
 	 * For NFS Version 2, it is not obvious what a write of zero length
 	 * should do, but I might as well be consistent with Version 3,
 	 * which is to return ok so long as there are no permission problems.
 	 */
 	/*
 	 * Trim the request chain in place: mbufs before the data position
 	 * and after the first len data bytes get m_len 0.  On exit i is the
 	 * number of data bytes seen and cnt the number of non-empty mbufs.
 	 */
 	if (len > 0) {
 	    zeroing = 1;
 	    mp = mrep;
 	    while (mp) {
 		if (mp == md) {
 			zeroing = 0;
 			adjust = dpos - mtod(mp, caddr_t);
 			mp->m_len -= adjust;
 			if (mp->m_len > 0 && adjust > 0)
 				mp->m_data += adjust;
 		}
 		if (zeroing)
 			mp->m_len = 0;
 		else if (mp->m_len > 0) {
 			i += mp->m_len;
 			if (i > len) {
 				mp->m_len -= (i - len);
 				zeroing	= 1;
 			}
 			if (mp->m_len > 0)
 				cnt++;
 		}
 		mp = mp->m_next;
 	    }
 	}
 	/* Reject oversized, negative or truncated requests. */
 	if (len > NFS_MAXDATA || len < 0 || i < len) {
 		error = EIO;
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap);
 		error = 0;
 		goto nfsmout;
 	}
 	error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	/* Fold the two vfslocked values into one. */
 	vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked);
 	if (error) {
 		vp = NULL;
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap);
 		error = 0;
 		goto nfsmout;
 	}
 	if (v3)
 		forat_ret = VOP_GETATTR(vp, &forat, cred, td);
 	if (vp->v_type != VREG) {
 		if (v3)
 			error = EINVAL;
 		else
 			error = (vp->v_type == VDIR) ? EISDIR : EACCES;
 	}
 	if (!error)
 		error = nfsrv_access(vp, VWRITE, cred, rdonly, td, 1);
 	if (error) {
 		vput(vp);
 		vp = NULL;
 		nfsm_reply(NFSX_WCCDATA(v3));
 		if (v3)
 			nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap);
 		error = 0;
 		goto nfsmout;
 	}
 
 	if (len > 0) {
 	    /* One iovec per non-empty mbuf counted above. */
 	    MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
 		M_WAITOK);
 	    uiop->uio_iov = iv = ivp;
 	    uiop->uio_iovcnt = cnt;
 	    mp = mrep;
 	    while (mp) {
 		if (mp->m_len > 0) {
 			ivp->iov_base = mtod(mp, caddr_t);
 			ivp->iov_len = mp->m_len;
 			ivp++;
 		}
 		mp = mp->m_next;
 	    }
 
 	    /*
 	     * XXX
 	     * The IO_METASYNC flag indicates that all metadata (and not just
 	     * enough to ensure data integrity) must be written to stable storage
 	     * synchronously.
 	     * (IO_METASYNC is not yet implemented in 4.4BSD-Lite.)
 	     */
 	    if (stable == NFSV3WRITE_UNSTABLE)
 		ioflags = IO_NODELOCKED;
 	    else if (stable == NFSV3WRITE_DATASYNC)
 		ioflags = (IO_SYNC | IO_NODELOCKED);
 	    else
 		ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED);
 	    uiop->uio_resid = len;
 	    uiop->uio_rw = UIO_WRITE;
 	    uiop->uio_segflg = UIO_SYSSPACE;
 	    uiop->uio_td = NULL;
 	    uiop->uio_offset = off;
 	    error = VOP_WRITE(vp, uiop, ioflags, cred);
 	    /* XXXRW: unlocked write. */
 	    nfsrvstats.srvvop_writes++;
 	    FREE((caddr_t)iv, M_TEMP);
 	}
 	aftat_ret = VOP_GETATTR(vp, vap, cred, td);
 	vput(vp);
 	vp = NULL;
 	if (!error)
 		error = aftat_ret;
 ereply:
 	nfsm_reply(NFSX_PREOPATTR(v3) + NFSX_POSTOPORFATTR(v3) +
 		2 * NFSX_UNSIGNED + NFSX_WRITEVERF(v3));
 	if (v3) {
 		nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, vap);
 		if (error) {
 			error = 0;
 			goto nfsmout;
 		}
 		tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(retlen);
 		/*
 		 * If nfs_async is set, then pretend the write was FILESYNC.
 		 */
 		if (stable == NFSV3WRITE_UNSTABLE && !nfs_async)
 			*tl++ = txdr_unsigned(stable);
 		else
 			*tl++ = txdr_unsigned(NFSV3WRITE_FILESYNC);
 		/*
 		 * Actually, there is no need to txdr these fields,
 		 * but it may make the values more human readable,
 		 * for debugging purposes.
 		 */
 		/* The verifier is taken from boottime on first use. */
 		if (nfsver.tv_sec == 0)
 			nfsver = boottime;
 		*tl++ = txdr_unsigned(nfsver.tv_sec);
 		*tl = txdr_unsigned(nfsver.tv_usec);
 	} else if (!error) {
 		/* v2 non-error case. */
 		fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 		nfsm_srvfillattr(vap, fp);
 	}
 	error = 0;
 nfsmout:
 	if (vp)
 		vput(vp);
 	vn_finished_write(mntp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * For the purposes of write gathering, we must decide if the credential
  * associated with two pending requests have equivilent privileges.  Since
  * NFS only uses a subset of the BSD ucred -- the effective uid and group
  * IDs -- we have a compare routine that checks only the relevant fields.
  */
 static int
 nfsrv_samecred(struct ucred *cr1, struct ucred *cr2)
 {
 	int i;
 
 	if (cr1->cr_uid != cr2->cr_uid)
 		return (0);
 	if (cr1->cr_ngroups != cr2->cr_ngroups)
 		return (0);
 	for (i = 0; i < cr1->cr_ngroups; i++) {
 		if (cr1->cr_groups[i] != cr2->cr_groups[i])
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * NFS write service with write gathering support. Called when
  * nfsrvw_procrastinate > 0.
  * See: Chet Juszczak, "Improving the Write Performance of an NFS Server",
  * in Proc. of the Winter 1994 Usenix Conference, pg. 247-259, San Franscisco,
  * Jan. 1994.
  */
 int
 nfsrv_writegather(struct nfsrv_descript **ndp, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct iovec *ivp;
 	struct mbuf *mp;
 	struct nfsrv_descript *wp, *nfsd, *owp, *swp;
 	struct nfs_fattr *fp;
 	int i;
 	struct iovec *iov;
 	struct nfsrvw_delayhash *wpp;
 	struct ucred *cred;
 	struct vattr va, forat;
 	u_int32_t *tl;
 	caddr_t bpos, dpos;
 	int error = 0, rdonly, len, forat_ret = 1;
 	int ioflags, aftat_ret = 1, s, adjust, v3, zeroing;
 	struct mbuf *mb, *mreq, *mrep, *md;
 	struct vnode *vp = NULL;
 	struct uio io, *uiop = &io;
 	u_quad_t cur_usec;
 	struct mount *mntp = NULL;
 	int mvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 #ifndef nolint
 	i = 0;
 	len = 0;
 #endif
 	*mrq = NULL;
 	if (*ndp) {
 	    nfsd = *ndp;
 	    *ndp = NULL;
 	    mrep = nfsd->nd_mrep;
 	    md = nfsd->nd_md;
 	    dpos = nfsd->nd_dpos;
 	    cred = nfsd->nd_cr;
 	    v3 = (nfsd->nd_flag & ND_NFSV3);
 	    LIST_INIT(&nfsd->nd_coalesce);
 	    nfsd->nd_mreq = NULL;
 	    nfsd->nd_stable = NFSV3WRITE_FILESYNC;
 	    cur_usec = nfs_curusec();
 	    nfsd->nd_time = cur_usec +
 		(v3 ? nfsrvw_procrastinate_v3 : nfsrvw_procrastinate);
 
 	    /*
 	     * Now, get the write header..
 	     */
 	    nfsm_srvmtofh(&nfsd->nd_fh);
 	    if (v3) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED);
 		nfsd->nd_off = fxdr_hyper(tl);
 		tl += 3;
 		nfsd->nd_stable = fxdr_unsigned(int, *tl++);
 	    } else {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 4 * NFSX_UNSIGNED);
 		nfsd->nd_off = (off_t)fxdr_unsigned(u_int32_t, *++tl);
 		tl += 2;
 		if (nfs_async)
 			nfsd->nd_stable = NFSV3WRITE_UNSTABLE;
 	    }
 	    len = fxdr_unsigned(int32_t, *tl);
 	    nfsd->nd_len = len;
 	    nfsd->nd_eoff = nfsd->nd_off + len;
 
 	    /*
 	     * Trim the header out of the mbuf list and trim off any trailing
 	     * junk so that the mbuf list has only the write data.
 	     */
 	    zeroing = 1;
 	    i = 0;
 	    mp = mrep;
 	    while (mp) {
 		if (mp == md) {
 		    zeroing = 0;
 		    adjust = dpos - mtod(mp, caddr_t);
 		    mp->m_len -= adjust;
 		    if (mp->m_len > 0 && adjust > 0)
 			mp->m_data += adjust;
 		}
 		if (zeroing)
 		    mp->m_len = 0;
 		else {
 		    i += mp->m_len;
 		    if (i > len) {
 			mp->m_len -= (i - len);
 			zeroing = 1;
 		    }
 		}
 		mp = mp->m_next;
 	    }
 	    if (len > NFS_MAXDATA || len < 0  || i < len) {
 nfsmout:
 		m_freem(mrep);
 		error = EIO;
 		nfsm_writereply(2 * NFSX_UNSIGNED);
 		if (v3)
 		    nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va);
 		nfsd->nd_mreq = mreq;
 		nfsd->nd_mrep = NULL;
 		nfsd->nd_time = 0;
 	    }
 
 	    /*
 	     * Add this entry to the hash and time queues.
 	     */
 	    s = splsoftclock();
 	    owp = NULL;
 	    wp = LIST_FIRST(&slp->ns_tq);
 	    while (wp && wp->nd_time < nfsd->nd_time) {
 		owp = wp;
 		wp = LIST_NEXT(wp, nd_tq);
 	    }
 	    NFS_DPF(WG, ("Q%03x", nfsd->nd_retxid & 0xfff));
 	    if (owp) {
 		LIST_INSERT_AFTER(owp, nfsd, nd_tq);
 	    } else {
 		LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq);
 	    }
 	    if (nfsd->nd_mrep) {
 		wpp = NWDELAYHASH(slp, nfsd->nd_fh.fh_fid.fid_data);
 		owp = NULL;
 		wp = LIST_FIRST(wpp);
 		while (wp &&
 		    bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh, NFSX_V3FH)){
 		    owp = wp;
 		    wp = LIST_NEXT(wp, nd_hash);
 		}
 		while (wp && wp->nd_off < nfsd->nd_off &&
 		    !bcmp((caddr_t)&nfsd->nd_fh,(caddr_t)&wp->nd_fh, NFSX_V3FH)) {
 		    owp = wp;
 		    wp = LIST_NEXT(wp, nd_hash);
 		}
 		if (owp) {
 		    LIST_INSERT_AFTER(owp, nfsd, nd_hash);
 
 		    /*
 		     * Search the hash list for overlapping entries and
 		     * coalesce.
 		     */
 		    for(; nfsd && NFSW_CONTIG(owp, nfsd); nfsd = wp) {
 			wp = LIST_NEXT(nfsd, nd_hash);
 			if (nfsrv_samecred(owp->nd_cr, nfsd->nd_cr))
 			    nfsrvw_coalesce(owp, nfsd);
 		    }
 		} else {
 		    LIST_INSERT_HEAD(wpp, nfsd, nd_hash);
 		}
 	    }
 	    splx(s);
 	}
 
 	/*
 	 * Now, do VOP_WRITE()s for any one(s) that need to be done now
 	 * and generate the associated reply mbuf list(s).
 	 */
 loop1:
 	cur_usec = nfs_curusec();
 	s = splsoftclock();
 	for (nfsd = LIST_FIRST(&slp->ns_tq); nfsd; nfsd = owp) {
 		owp = LIST_NEXT(nfsd, nd_tq);
 		if (nfsd->nd_time > cur_usec)
 		    break;
 		if (nfsd->nd_mreq)
 		    continue;
 		NFS_DPF(WG, ("P%03x", nfsd->nd_retxid & 0xfff));
 		LIST_REMOVE(nfsd, nd_tq);
 		LIST_REMOVE(nfsd, nd_hash);
 		splx(s);
 		mrep = nfsd->nd_mrep;
 		nfsd->nd_mrep = NULL;
 		cred = nfsd->nd_cr;
 		v3 = (nfsd->nd_flag & ND_NFSV3);
 		forat_ret = aftat_ret = 1;
 		error = nfsrv_fhtovp(&nfsd->nd_fh, 1, &vp, &vfslocked, cred,
 		    slp, nfsd->nd_nam, &rdonly, TRUE);
 		if (!error) {
 		    if (v3)
 			forat_ret = VOP_GETATTR(vp, &forat, cred, td);
 		    if (vp->v_type != VREG) {
 			if (v3)
 			    error = EINVAL;
 			else
 			    error = (vp->v_type == VDIR) ? EISDIR : EACCES;
 		    }
 		} else {
 		    vp = NULL;
 		}
 		if (!error)
 		    error = nfsrv_access(vp, VWRITE, cred, rdonly,
 			td, 1);
 		if (nfsd->nd_stable == NFSV3WRITE_UNSTABLE)
 		    ioflags = IO_NODELOCKED;
 		else if (nfsd->nd_stable == NFSV3WRITE_DATASYNC)
 		    ioflags = (IO_SYNC | IO_NODELOCKED);
 		else
 		    ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED);
 		uiop->uio_rw = UIO_WRITE;
 		uiop->uio_segflg = UIO_SYSSPACE;
 		uiop->uio_td = NULL;
 		uiop->uio_offset = nfsd->nd_off;
 		uiop->uio_resid = nfsd->nd_eoff - nfsd->nd_off;
 		if (uiop->uio_resid > 0) {
 		    mp = mrep;
 		    i = 0;
 		    while (mp) {
 			if (mp->m_len > 0)
 			    i++;
 			mp = mp->m_next;
 		    }
 		    uiop->uio_iovcnt = i;
 		    MALLOC(iov, struct iovec *, i * sizeof (struct iovec),
 			M_TEMP, M_WAITOK);
 		    uiop->uio_iov = ivp = iov;
 		    mp = mrep;
 		    while (mp) {
 			if (mp->m_len > 0) {
 			    ivp->iov_base = mtod(mp, caddr_t);
 			    ivp->iov_len = mp->m_len;
 			    ivp++;
 			}
 			mp = mp->m_next;
 		    }
 		    mvfslocked = 0;
 		    if (!error) {
 			if (vn_start_write(vp, &mntp, V_NOWAIT) != 0) {
 			    VOP_UNLOCK(vp, 0, td);
 			    error = vn_start_write(NULL, &mntp, V_WAIT);
-			    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			}
 		        mvfslocked = VFS_LOCK_GIANT(mntp);
 		    }
 		    if (!error) {
 			error = VOP_WRITE(vp, uiop, ioflags, cred);
 			/* XXXRW: unlocked write. */
 			nfsrvstats.srvvop_writes++;
 			vn_finished_write(mntp);
 		    }
 		    VFS_UNLOCK_GIANT(mvfslocked);
 		    FREE((caddr_t)iov, M_TEMP);
 		}
 		m_freem(mrep);
 		if (vp) {
 		    aftat_ret = VOP_GETATTR(vp, &va, cred, td);
 		    vput(vp);
 		    vp = NULL;
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		/*
 		 * Loop around generating replies for all write rpcs that have
 		 * now been completed.
 		 */
 		swp = nfsd;
 		do {
 		    NFS_DPF(WG, ("R%03x", nfsd->nd_retxid & 0xfff));
 		    if (error) {
 			nfsm_writereply(NFSX_WCCDATA(v3));
 			if (v3) {
 			    nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va);
 			}
 		    } else {
 			nfsm_writereply(NFSX_PREOPATTR(v3) +
 			    NFSX_POSTOPORFATTR(v3) + 2 * NFSX_UNSIGNED +
 			    NFSX_WRITEVERF(v3));
 			if (v3) {
 			    nfsm_srvwcc_data(forat_ret, &forat, aftat_ret, &va);
 			    tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 			    *tl++ = txdr_unsigned(nfsd->nd_len);
 			    *tl++ = txdr_unsigned(swp->nd_stable);
 			    /*
 			     * Actually, there is no need to txdr these fields,
 			     * but it may make the values more human readable,
 			     * for debugging purposes.
 			     */
 			    if (nfsver.tv_sec == 0)
 				    nfsver = boottime;
 			    *tl++ = txdr_unsigned(nfsver.tv_sec);
 			    *tl = txdr_unsigned(nfsver.tv_usec);
 			} else {
 			    fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 			    nfsm_srvfillattr(&va, fp);
 			}
 		    }
 		    nfsd->nd_mreq = mreq;
 		    if (nfsd->nd_mrep)
 			panic("nfsrv_write: nd_mrep not free");
 
 		    /*
 		     * Done. Put it at the head of the timer queue so that
 		     * the final phase can return the reply.
 		     */
 		    s = splsoftclock();
 		    if (nfsd != swp) {
 			nfsd->nd_time = 0;
 			LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq);
 		    }
 		    nfsd = LIST_FIRST(&swp->nd_coalesce);
 		    if (nfsd) {
 			LIST_REMOVE(nfsd, nd_tq);
 		    }
 		    splx(s);
 		} while (nfsd);
 		s = splsoftclock();
 		swp->nd_time = 0;
 		LIST_INSERT_HEAD(&slp->ns_tq, swp, nd_tq);
 		splx(s);
 		goto loop1;
 	}
 	splx(s);
 
 	/*
 	 * Search for a reply to return.
 	 */
 	s = splsoftclock();
 	LIST_FOREACH(nfsd, &slp->ns_tq, nd_tq)
 		if (nfsd->nd_mreq) {
 		    NFS_DPF(WG, ("X%03x", nfsd->nd_retxid & 0xfff));
 		    LIST_REMOVE(nfsd, nd_tq);
 		    *mrq = nfsd->nd_mreq;
 		    *ndp = nfsd;
 		    break;
 		}
 	splx(s);
 	return (0);
 }
 
 /*
  * Coalesce the write request nfsd into owp. To do this we must:
  * - remove nfsd from the queues
  * - merge nfsd->nd_mrep into owp->nd_mrep
  * - update the nd_eoff and nd_stable for owp
  * - put nfsd on owp's nd_coalesce list
  * NB: Must be called at splsoftclock().
  */
 static void
 nfsrvw_coalesce(struct nfsrv_descript *owp, struct nfsrv_descript *nfsd)
 {
         int overlap;
         struct mbuf *mp;
 	struct nfsrv_descript *p;
 
 	NFS_DPF(WG, ("C%03x-%03x",
 		     nfsd->nd_retxid & 0xfff, owp->nd_retxid & 0xfff));
         LIST_REMOVE(nfsd, nd_hash);
         LIST_REMOVE(nfsd, nd_tq);
         if (owp->nd_eoff < nfsd->nd_eoff) {
             overlap = owp->nd_eoff - nfsd->nd_off;
             if (overlap < 0)
                 panic("nfsrv_coalesce: bad off");
             if (overlap > 0)
                 m_adj(nfsd->nd_mrep, overlap);
             mp = owp->nd_mrep;
             while (mp->m_next)
                 mp = mp->m_next;
             mp->m_next = nfsd->nd_mrep;
             owp->nd_eoff = nfsd->nd_eoff;
         } else
             m_freem(nfsd->nd_mrep);
         nfsd->nd_mrep = NULL;
         if (nfsd->nd_stable == NFSV3WRITE_FILESYNC)
             owp->nd_stable = NFSV3WRITE_FILESYNC;
         else if (nfsd->nd_stable == NFSV3WRITE_DATASYNC &&
             owp->nd_stable == NFSV3WRITE_UNSTABLE)
             owp->nd_stable = NFSV3WRITE_DATASYNC;
         LIST_INSERT_HEAD(&owp->nd_coalesce, nfsd, nd_tq);
 
 	/*
 	 * If nfsd had anything else coalesced into it, transfer them
 	 * to owp, otherwise their replies will never get sent.
 	 */
 	for (p = LIST_FIRST(&nfsd->nd_coalesce); p;
 	     p = LIST_FIRST(&nfsd->nd_coalesce)) {
 	    LIST_REMOVE(p, nd_tq);
 	    LIST_INSERT_HEAD(&owp->nd_coalesce, p, nd_tq);
 	}
 }
 
 /*
  * nfs create service
  * now does a truncate to 0 length via. setattr if it already exists
  */
 int
 nfsrv_create(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nfs_fattr *fp;
 	struct vattr va, dirfor, diraft;
 	struct vattr *vap = &va;
 	struct nfsv2_sattr *sp;
 	u_int32_t *tl;
 	struct nameidata nd;
 	caddr_t bpos;
 	int error = 0, rdev, len, tsize, dirfor_ret = 1, diraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3), how, exclusive_flag = 0;
 	caddr_t cp;
 	struct mbuf *mb, *mreq;
 	struct vnode *dirp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	u_quad_t tempsize;
 	u_char cverf[NFSX_V3CREATEVERF];
 	struct mount *mp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 #ifndef nolint
 	rdev = 0;
 #endif
 	ndclear(&nd);
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = CREATE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE;
 
 	/*
 	 * Call namei and do initial cleanup to get a few things
 	 * out of the way.  If we get an initial error we cleanup
 	 * and return here to avoid special-casing the invalid nd
 	 * structure through the rest of the case.  dirp may be
 	 * set even if an error occurs, but the nd structure will not
 	 * be valid at all if an error occurs so we have to invalidate it
 	 * prior to calling nfsm_reply ( which might goto nfsmout ).
 	 */
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error) {
 		nfsm_reply(NFSX_WCCDATA(v3));
 		if (v3)
 			nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * No error.  Continue.  State:
 	 *
 	 *	startdir	is valid ( we release this immediately )
 	 *	dirp 		may be valid
 	 *	nd.ni_vp	may be valid
 	 *	nd.ni_dvp	is valid
 	 *
 	 * The error state is set through the code and we may also do some
 	 * opportunistic releasing of vnodes to avoid holding locks through
 	 * NFS I/O.  The cleanup at the end is a catch-all
 	 */
 
 	VATTR_NULL(vap);
 	if (v3) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 		how = fxdr_unsigned(int, *tl);
 		switch (how) {
 		case NFSV3CREATE_GUARDED:
 			if (nd.ni_vp) {
 				error = EEXIST;
 				break;
 			}
 			/* fall through */
 		case NFSV3CREATE_UNCHECKED:
 			nfsm_srvsattr(vap);
 			break;
 		case NFSV3CREATE_EXCLUSIVE:
 			cp = nfsm_dissect_nonblock(caddr_t, NFSX_V3CREATEVERF);
 			bcopy(cp, cverf, NFSX_V3CREATEVERF);
 			exclusive_flag = 1;
 			break;
 		};
 		vap->va_type = VREG;
 	} else {
 		sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR);
 		vap->va_type = IFTOVT(fxdr_unsigned(u_int32_t, sp->sa_mode));
 		if (vap->va_type == VNON)
 			vap->va_type = VREG;
 		vap->va_mode = nfstov_mode(sp->sa_mode);
 		switch (vap->va_type) {
 		case VREG:
 			tsize = fxdr_unsigned(int32_t, sp->sa_size);
 			if (tsize != -1)
 				vap->va_size = (u_quad_t)tsize;
 			break;
 		case VCHR:
 		case VBLK:
 		case VFIFO:
 			rdev = fxdr_unsigned(long, sp->sa_size);
 			break;
 		default:
 			break;
 		};
 	}
 
 	/*
 	 * Iff doesn't exist, create it
 	 * otherwise just truncate to 0 length
 	 *   should I set the mode too ?
 	 *
 	 * The only possible error we can have at this point is EEXIST.
 	 * nd.ni_vp will also be non-NULL in that case.
 	 */
 	if (nd.ni_vp == NULL) {
 		if (vap->va_mode == (mode_t)VNOVAL)
 			vap->va_mode = 0;
 		if (vap->va_type == VREG || vap->va_type == VSOCK) {
 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap);
 			if (error)
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 			else {
 				if (exclusive_flag) {
 					exclusive_flag = 0;
 					VATTR_NULL(vap);
 					bcopy(cverf, (caddr_t)&vap->va_atime,
 						NFSX_V3CREATEVERF);
 					error = VOP_SETATTR(nd.ni_vp, vap, cred,
 						td);
 				}
 			}
 		} else if (vap->va_type == VCHR || vap->va_type == VBLK ||
 		    vap->va_type == VFIFO) {
 			/*
 			 * NFSv2-specific code for creating device nodes
 			 * and fifos.
 			 *
 			 * Handle SysV FIFO node special cases.  All other
 			 * devices require super user to access.
 			 */
 			if (vap->va_type == VCHR && rdev == 0xffffffff)
 				vap->va_type = VFIFO;
                         if (vap->va_type != VFIFO &&
                             (error = suser_cred(cred, 0))) {
 				goto ereply;
                         }
 			vap->va_rdev = rdev;
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap);
 			if (error) {
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				goto ereply;
 			}
 			vput(nd.ni_vp);
 			nd.ni_vp = NULL;
 
 			/*
 			 * release dvp prior to lookup
 			 */
 			vput(nd.ni_dvp);
 			nd.ni_dvp = NULL;
 			/*
 			 * Setup for lookup.
 			 *
 			 * Even though LOCKPARENT was cleared, ni_dvp may
 			 * be garbage.
 			 */
 			nd.ni_cnd.cn_nameiop = LOOKUP;
 			nd.ni_cnd.cn_flags &= ~(LOCKPARENT);
 			nd.ni_cnd.cn_thread = td;
 			nd.ni_cnd.cn_cred = cred;
 			tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount);
 			if (tvfslocked)
 				nd.ni_cnd.cn_flags |= GIANTHELD;
 			error = lookup(&nd);
 			nd.ni_dvp = NULL;
 			vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 			nd.ni_cnd.cn_flags &= ~GIANTHELD;
 			if (error)
 				goto ereply;
 
 			if (nd.ni_cnd.cn_flags & ISSYMLINK) {
 				error = EINVAL;
 				goto ereply;
 			}
 		} else {
 			error = ENXIO;
 		}
 	} else {
 		if (vap->va_size != -1) {
 			error = nfsrv_access(nd.ni_vp, VWRITE,
 			    cred, (nd.ni_cnd.cn_flags & RDONLY), td, 0);
 			if (!error) {
 				tempsize = vap->va_size;
 				VATTR_NULL(vap);
 				vap->va_size = tempsize;
 				error = VOP_SETATTR(nd.ni_vp, vap, cred,
 					 td);
 			}
 		}
 	}
 
 	if (!error) {
 		bzero((caddr_t)fhp, sizeof(nfh));
 		fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid;
 		error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid);
 		if (!error)
 			error = VOP_GETATTR(nd.ni_vp, vap, cred, td);
 	}
 	if (v3) {
 		if (exclusive_flag && !error &&
 			bcmp(cverf, (caddr_t)&vap->va_atime, NFSX_V3CREATEVERF))
 			error = EEXIST;
 		if (dirp == nd.ni_dvp)
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		else {
 			/* Drop the other locks to avoid deadlock. */
 			if (nd.ni_dvp) {
 				if (nd.ni_dvp == nd.ni_vp)
 					vrele(nd.ni_dvp);
 				else
 					vput(nd.ni_dvp);
 			}
 			if (nd.ni_vp)
 				vput(nd.ni_vp);
 			nd.ni_dvp = NULL;
 			nd.ni_vp = NULL;
 
-			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 			VOP_UNLOCK(dirp, 0, td);
 		}
 	}
 ereply:
 	nfsm_reply(NFSX_SRVFH(v3) + NFSX_FATTR(v3) + NFSX_WCCDATA(v3));
 	if (v3) {
 		if (!error) {
 			nfsm_srvpostop_fh(fhp);
 			nfsm_srvpostop_attr(0, vap);
 		}
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 	} else if (!error) {
 		/* v2 non-error case. */
 		nfsm_srvfhtom(fhp, v3);
 		fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 		nfsm_srvfillattr(vap, fp);
 	}
 	error = 0;
 
 nfsmout:
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp)
 		vput(nd.ni_vp);
 	if (nd.ni_startdir) {
 		vrele(nd.ni_startdir);
 		nd.ni_startdir = NULL;
 	}
 	if (dirp)
 		vrele(dirp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs v3 mknod service
  */
 int
 nfsrv_mknod(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vattr va, dirfor, diraft;
 	struct vattr *vap = &va;
 	u_int32_t *tl;
 	struct nameidata nd;
 	caddr_t bpos;
 	int error = 0, len, dirfor_ret = 1, diraft_ret = 1;
 	u_int32_t major, minor;
 	enum vtype vtyp;
 	struct mbuf *mb, *mreq;
 	struct vnode *vp, *dirp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct mount *mp = NULL;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	if (!v3)
 		panic("nfsrv_mknod: v3 proc called on a v2 connection");
 	ndclear(&nd);
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = CREATE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE;
 
 	/*
 	 * Handle nfs_namei() call.  If an error occurs, the nd structure
 	 * is not valid.  However, nfsm_*() routines may still jump to
 	 * nfsmout.
 	 */
 
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (error) {
 		nfsm_reply(NFSX_WCCDATA(1));
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 		goto nfsmout;
 	}
 	tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 	vtyp = nfsv3tov_type(*tl);
 	if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
 		error = NFSERR_BADTYPE;
 		goto out;
 	}
 	VATTR_NULL(vap);
 	nfsm_srvsattr(vap);
 	if (vtyp == VCHR || vtyp == VBLK) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED);
 		major = fxdr_unsigned(u_int32_t, *tl++);
 		minor = fxdr_unsigned(u_int32_t, *tl);
 		vap->va_rdev = makedev(major, minor);
 	}
 
 	/*
 	 * Iff doesn't exist, create it.
 	 */
 	if (nd.ni_vp) {
 		error = EEXIST;
 		goto out;
 	}
 	vap->va_type = vtyp;
 	if (vap->va_mode == (mode_t)VNOVAL)
 		vap->va_mode = 0;
 	if (vtyp == VSOCK) {
 		vrele(nd.ni_startdir);
 		nd.ni_startdir = NULL;
 		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap);
 		if (error)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 	} else {
 		if (vtyp != VFIFO && (error = suser_cred(cred, 0)))
 			goto out;
 		error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap);
 		if (error) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			goto out;
 		}
 		vput(nd.ni_vp);
 		nd.ni_vp = NULL;
 
 		/*
 		 * Release dvp prior to lookup
 		 */
 		vput(nd.ni_dvp);
 		nd.ni_dvp = NULL;
 
 		nd.ni_cnd.cn_nameiop = LOOKUP;
 		nd.ni_cnd.cn_flags &= ~(LOCKPARENT);
 		nd.ni_cnd.cn_thread = td;
 		nd.ni_cnd.cn_cred = td->td_ucred;
 		tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount);
 		if (tvfslocked)
 			nd.ni_cnd.cn_flags |= GIANTHELD;
 		error = lookup(&nd);
 		nd.ni_dvp = NULL;
 		vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 		nd.ni_cnd.cn_flags &= ~GIANTHELD;
 
 		if (error)
 			goto out;
 		if (nd.ni_cnd.cn_flags & ISSYMLINK)
 			error = EINVAL;
 	}
 
 	/*
 	 * send response, cleanup, return.
 	 */
 out:
 	vp = nd.ni_vp;
 	if (!error) {
 		bzero((caddr_t)fhp, sizeof(nfh));
 		fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 		error = VOP_VPTOFH(vp, &fhp->fh_fid);
 		if (!error)
 			error = VOP_GETATTR(vp, vap, cred, td);
 	}
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		nd.ni_dvp = NULL;
 	}
 	if (vp) {
 		vput(vp);
 		vp = NULL;
 		nd.ni_vp = NULL;
 	}
 	if (nd.ni_startdir) {
 		vrele(nd.ni_startdir);
 		nd.ni_startdir = NULL;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (dirp) {
-		vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 		diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		VOP_UNLOCK(dirp, 0, td);
 	}
 ereply:
 	nfsm_reply(NFSX_SRVFH(1) + NFSX_POSTOPATTR(1) + NFSX_WCCDATA(1));
 	if (v3) {
 		if (!error) {
 			nfsm_srvpostop_fh(fhp);
 			nfsm_srvpostop_attr(0, vap);
 		}
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 	}
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 nfsmout:
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp)
 		vput(nd.ni_vp);
 	if (dirp)
 		vrele(dirp);
 	if (nd.ni_startdir)
 		vrele(nd.ni_startdir);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs remove service
  */
 int
 nfsrv_remove(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nameidata nd;
 	caddr_t bpos;
 	int error = 0, len, dirfor_ret = 1, diraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *dirp;
 	struct vattr dirfor, diraft;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct mount *mp = NULL;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = DELETE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | MPSAFE;
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3,  &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = NDHASGIANT(&nd);
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error == 0) {
 		if (nd.ni_vp->v_type == VDIR) {
 			error = EPERM;		/* POSIX */
 			goto out;
 		}
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 */
 		if (nd.ni_vp->v_vflag & VV_ROOT) {
 			error = EBUSY;
 			goto out;
 		}
 out:
 		if (!error) {
 			error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		}
 	}
 	if (dirp && v3) {
 		if (dirp == nd.ni_dvp)
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		else {
 			/* Drop the other locks to avoid deadlock. */
 			if (nd.ni_dvp) {
 				if (nd.ni_dvp == nd.ni_vp)
 					vrele(nd.ni_dvp);
 				else
 					vput(nd.ni_dvp);
 			}
 			if (nd.ni_vp)
 				vput(nd.ni_vp);
 			nd.ni_dvp = NULL;
 			nd.ni_vp = NULL;
 
-			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 			VOP_UNLOCK(dirp, 0, td);
 		}
 		vrele(dirp);
 		dirp = NULL;
 	}
 ereply:
 	nfsm_reply(NFSX_WCCDATA(v3));
 	if (v3) {
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 	}
 nfsmout:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs rename service
  */
 int
 nfsrv_rename(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	caddr_t bpos;
 	int error = 0, len, len2, fdirfor_ret = 1, fdiraft_ret = 1;
 	int tdirfor_ret = 1, tdiraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct nameidata fromnd, tond;
 	struct vnode *fvp, *tvp, *tdvp, *fdirp = NULL;
 	struct vnode *tdirp = NULL;
 	struct vattr fdirfor, fdiraft, tdirfor, tdiraft;
 	nfsfh_t fnfh, tnfh;
 	fhandle_t *ffhp, *tfhp;
 	uid_t saved_uid;
 	struct mount *mp = NULL;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 #ifndef nolint
 	fvp = NULL;
 #endif
 	ffhp = &fnfh.fh_generic;
 	tfhp = &tnfh.fh_generic;
 
 	/*
 	 * Clear fields incase goto nfsmout occurs from macro.
 	 */
 
 	ndclear(&fromnd);
 	ndclear(&tond);
 
 	nfsm_srvmtofh(ffhp);
 	if ((mp = vfs_getvfs(&ffhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto out1;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 	/*
 	 * Remember our original uid so that we can reset cr_uid before
 	 * the second nfs_namei() call, in case it is remapped.
 	 */
 	saved_uid = cred->cr_uid;
 	fromnd.ni_cnd.cn_cred = cred;
 	fromnd.ni_cnd.cn_nameiop = DELETE;
 	fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART | MPSAFE;
 	error = nfs_namei(&fromnd, ffhp, len, slp, nam, &md,
 		&dpos, &fdirp, v3, &fdirfor, &fdirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &fromnd);
 	if (fdirp && !v3) {
 		vrele(fdirp);
 		fdirp = NULL;
 	}
 	if (error) {
 		nfsm_reply(2 * NFSX_WCCDATA(v3));
 		if (v3) {
 			nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft);
 			nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft);
 		}
 		error = 0;
 		goto nfsmout;
 	}
 	fvp = fromnd.ni_vp;
 	nfsm_srvmtofh(tfhp);
 	nfsm_srvnamesiz(len2);
 	cred->cr_uid = saved_uid;
 	tond.ni_cnd.cn_cred = cred;
 	tond.ni_cnd.cn_nameiop = RENAME;
 	tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE;
 	error = nfs_namei(&tond, tfhp, len2, slp, nam, &md,
 		&dpos, &tdirp, v3, &tdirfor, &tdirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &tond);
 	if (tdirp && !v3) {
 		vrele(tdirp);
 		tdirp = NULL;
 	}
 	if (error)
 		goto out1;
 
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			if (v3)
 				error = EEXIST;
 			else
 				error = EISDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			if (v3)
 				error = EEXIST;
 			else
 				error = ENOTDIR;
 			goto out;
 		}
 		if (tvp->v_type == VDIR && tvp->v_mountedhere) {
 			if (v3)
 				error = EXDEV;
 			else
 				error = ENOTEMPTY;
 			goto out;
 		}
 	}
 	if (fvp->v_type == VDIR && fvp->v_mountedhere) {
 		if (v3)
 			error = EXDEV;
 		else
 			error = ENOTEMPTY;
 		goto out;
 	}
 	if (fvp->v_mount != tdvp->v_mount) {
 		if (v3)
 			error = EXDEV;
 		else
 			error = ENOTEMPTY;
 		goto out;
 	}
 	if (fvp == tdvp) {
 		if (v3)
 			error = EINVAL;
 		else
 			error = ENOTEMPTY;
 	}
 	/*
 	 * If source is the same as the destination (that is the
 	 * same vnode with the same name in the same directory),
 	 * then there is nothing to do.
 	 */
 	if (fvp == tvp && fromnd.ni_dvp == tdvp &&
 	    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
 	    !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
 	      fromnd.ni_cnd.cn_namelen))
 		error = -1;
 out:
 	if (!error) {
 		/*
 		 * The VOP_RENAME function releases all vnode references &
 		 * locks prior to returning so we need to clear the pointers
 		 * to bypass cleanup code later on.
 		 */
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		fromnd.ni_dvp = NULL;
 		fromnd.ni_vp = NULL;
 		tond.ni_dvp = NULL;
 		tond.ni_vp = NULL;
 		if (error) {
 			NDFREE(&fromnd, NDF_ONLY_PNBUF);
 			NDFREE(&tond, NDF_ONLY_PNBUF);
 		}
 	} else {
 		if (error == -1)
 			error = 0;
 	}
 	/* fall through */
 out1:
 	nfsm_reply(2 * NFSX_WCCDATA(v3));
 	if (v3) {
 		/* Release existing locks to prevent deadlock. */
 		if (tond.ni_dvp) {
 			if (tond.ni_dvp == tond.ni_vp)
 				vrele(tond.ni_dvp);
 			else
 				vput(tond.ni_dvp);
 		}
 		if (tond.ni_vp)
 			vput(tond.ni_vp);
 		tond.ni_dvp = NULL;
 		tond.ni_vp = NULL;
 
 		if (fdirp) {
-			vn_lock(fdirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(fdirp, LK_EXCLUSIVE | LK_RETRY);
 			fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, td);
 			VOP_UNLOCK(fdirp, 0, td);
 		}
 		if (tdirp) {
-			vn_lock(tdirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(tdirp, LK_EXCLUSIVE | LK_RETRY);
 			tdiraft_ret = VOP_GETATTR(tdirp, &tdiraft, cred, td);
 			VOP_UNLOCK(tdirp, 0, td);
 		}
 		nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft);
 		nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft);
 	}
 	error = 0;
 	/* fall through */
 
 nfsmout:
 	/*
 	 * Clear out tond related fields
 	 */
 	if (tond.ni_dvp) {
 		if (tond.ni_dvp == tond.ni_vp)
 			vrele(tond.ni_dvp);
 		else
 			vput(tond.ni_dvp);
 	}
 	if (tond.ni_vp)
 		vput(tond.ni_vp);
 	if (tdirp)
 		vrele(tdirp);
 	if (tond.ni_startdir)
 		vrele(tond.ni_startdir);
 	NDFREE(&tond, NDF_ONLY_PNBUF);
 	/*
 	 * Clear out fromnd related fields
 	 */
 	if (fdirp)
 		vrele(fdirp);
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	NDFREE(&fromnd, NDF_ONLY_PNBUF);
 	if (fromnd.ni_dvp)
 		vrele(fromnd.ni_dvp);
 	if (fromnd.ni_vp)
 		vrele(fromnd.ni_vp);
 
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs link service
  */
 int
 nfsrv_link(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nameidata nd;
 	caddr_t bpos;
 	int error = 0, rdonly, len, dirfor_ret = 1, diraft_ret = 1;
 	int getret = 1, v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *vp = NULL, *xp, *dirp = NULL;
 	struct vattr dirfor, diraft, at;
 	nfsfh_t nfh, dnfh;
 	fhandle_t *fhp, *dfhp;
 	struct mount *mp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	dfhp = &dnfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvmtofh(dfhp);
 	nfsm_srvnamesiz(len);
 
 	error = nfsrv_fhtovp(fhp, TRUE, &vp, &tvfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked);
 	if (error) {
 		nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
 		if (v3) {
 			nfsm_srvpostop_attr(getret, &at);
 			nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		}
 		vp = NULL;
 		error = 0;
 		goto nfsmout;
 	}
 	if (v3)
 		getret = VOP_GETATTR(vp, &at, cred, td);
 	if (vp->v_type == VDIR) {
 		error = EPERM;		/* POSIX */
 		goto out1;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = CREATE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | MPSAFE | MPSAFE;
 	error = nfs_namei(&nd, dfhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error) {
 		vrele(vp);
 		vp = NULL;
 		goto out2;
 	}
 	xp = nd.ni_vp;
 	if (xp != NULL) {
 		error = EEXIST;
 		vrele(vp);
 		vp = NULL;
 		goto out2;
 	}
 	xp = nd.ni_dvp;
 	if (vp->v_mount != xp->v_mount) {
 		error = EXDEV;
 		vrele(vp);
 		vp = NULL;
 		goto out2;
 	}
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/* fall through */
 
 out1:
 	if (v3)
 		getret = VOP_GETATTR(vp, &at, cred, td);
 out2:
 	if (dirp) {
 		if (dirp == nd.ni_dvp)
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		else {
 			/* Release existing locks to prevent deadlock. */
 			if (nd.ni_dvp) {
 				if (nd.ni_dvp == nd.ni_vp)
 					vrele(nd.ni_dvp);
 				else
 					vput(nd.ni_dvp);
 			}
 			if (nd.ni_vp)
 				vrele(nd.ni_vp);
 			nd.ni_dvp = NULL;
 			nd.ni_vp = NULL;
 
-			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 			VOP_UNLOCK(dirp, 0, td);
 		}
 	}
 ereply:
 	nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
 	if (v3) {
 		nfsm_srvpostop_attr(getret, &at);
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 	}
 	/* fall through */
 
 nfsmout:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (vp)
 		vput(vp);
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (dirp)
 		vrele(dirp);
 	if (nd.ni_vp)
 		vrele(nd.ni_vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs symbolic link service
  */
 int
 nfsrv_symlink(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vattr va, dirfor, diraft;
 	struct nameidata nd;
 	struct vattr *vap = &va;
 	struct nfsv2_sattr *sp;
 	char *bpos, *pathcp = NULL;
 	struct uio io;
 	struct iovec iv;
 	int error = 0, len, len2, dirfor_ret = 1, diraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *dirp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct mount *mp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto out;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = CREATE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | SAVESTART | MPSAFE;
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (error == 0) {
 		VATTR_NULL(vap);
 		if (v3)
 			nfsm_srvsattr(vap);
 		nfsm_srvpathsiz(len2);
 	}
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error)
 		goto out;
 	MALLOC(pathcp, caddr_t, len2 + 1, M_TEMP, M_WAITOK);
 	iv.iov_base = pathcp;
 	iv.iov_len = len2;
 	io.uio_resid = len2;
 	io.uio_offset = 0;
 	io.uio_iov = &iv;
 	io.uio_iovcnt = 1;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_td = NULL;
 	nfsm_mtouio(&io, len2);
 	if (!v3) {
 		sp = nfsm_dissect_nonblock(struct nfsv2_sattr *, NFSX_V2SATTR);
 		vap->va_mode = nfstov_mode(sp->sa_mode);
 	}
 	*(pathcp + len2) = '\0';
 	if (nd.ni_vp) {
 		error = EEXIST;
 		goto out;
 	}
 
 	/*
 	 * issue symlink op.  SAVESTART is set so the underlying path component
 	 * is only freed by the VOP if an error occurs.
 	 */
 	if (vap->va_mode == (mode_t)VNOVAL)
 		vap->va_mode = 0;
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap, pathcp);
 	if (error)
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 	else
 		vput(nd.ni_vp);
 	nd.ni_vp = NULL;
 	/*
 	 * releases directory prior to potential lookup op.
 	 */
 	vput(nd.ni_dvp);
 	nd.ni_dvp = NULL;
 
 	if (error == 0) {
 	    if (v3) {
 		/*
 		 * Issue lookup.  Leave SAVESTART set so we can easily free
 		 * the name buffer later on.
 		 *
 		 * since LOCKPARENT is not set, ni_dvp will be garbage on
 		 * return whether an error occurs or not.
 		 */
 		nd.ni_cnd.cn_nameiop = LOOKUP;
 		nd.ni_cnd.cn_flags &= ~(LOCKPARENT | FOLLOW);
 		nd.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF);
 		nd.ni_cnd.cn_thread = td;
 		nd.ni_cnd.cn_cred = cred;
 		tvfslocked = VFS_LOCK_GIANT(nd.ni_startdir->v_mount);
 		if (tvfslocked)
 			nd.ni_cnd.cn_flags |= GIANTHELD;
 		error = lookup(&nd);
 		nd.ni_dvp = NULL;
 		vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 		nd.ni_cnd.cn_flags &= ~GIANTHELD;
 
 		if (error == 0) {
 			bzero((caddr_t)fhp, sizeof(nfh));
 			fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid;
 			error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid);
 			if (!error)
 				error = VOP_GETATTR(nd.ni_vp, vap, cred,
 					td);
 			vput(nd.ni_vp);
 			nd.ni_vp = NULL;
 		}
 	    }
 	}
 out:
 	/*
 	 * These releases aren't strictly required, does even doing them
 	 * make any sense? XXX can nfsm_reply() block?
 	 */
 	if (pathcp) {
 		FREE(pathcp, M_TEMP);
 		pathcp = NULL;
 	}
 	if (dirp) {
-		vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 		diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		VOP_UNLOCK(dirp, 0, td);
 	}
 	if (nd.ni_startdir) {
 		vrele(nd.ni_startdir);
 		nd.ni_startdir = NULL;
 	}
 	nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
 	if (v3) {
 		if (!error) {
 			nfsm_srvpostop_fh(fhp);
 			nfsm_srvpostop_attr(0, vap);
 		}
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 	}
 	error = 0;
 	/* fall through */
 
 nfsmout:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp)
 		vrele(nd.ni_vp);
 	if (nd.ni_startdir)
 		vrele(nd.ni_startdir);
 	if (dirp)
 		vrele(dirp);
 	if (pathcp)
 		FREE(pathcp, M_TEMP);
 
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs mkdir service
  */
 int
 nfsrv_mkdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vattr va, dirfor, diraft;
 	struct vattr *vap = &va;
 	struct nfs_fattr *fp;
 	struct nameidata nd;
 	u_int32_t *tl;
 	caddr_t bpos;
 	int error = 0, len, dirfor_ret = 1, diraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *dirp = NULL;
 	int vpexcl = 0;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct mount *mp = NULL;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto out;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = CREATE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | MPSAFE;
 
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error) {
 		nfsm_reply(NFSX_WCCDATA(v3));
 		if (v3)
 			nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 		goto nfsmout;
 	}
 	VATTR_NULL(vap);
 	if (v3) {
 		nfsm_srvsattr(vap);
 	} else {
 		tl = nfsm_dissect_nonblock(u_int32_t *, NFSX_UNSIGNED);
 		vap->va_mode = nfstov_mode(*tl++);
 	}
 
 	/*
 	 * At this point nd.ni_dvp is referenced and exclusively locked and
 	 * nd.ni_vp, if it exists, is referenced but not locked.
 	 */
 
 	vap->va_type = VDIR;
 	if (nd.ni_vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		error = EEXIST;
 		goto out;
 	}
 
 	/*
 	 * Issue mkdir op.  Since SAVESTART is not set, the pathname
 	 * component is freed by the VOP call.  This will fill-in
 	 * nd.ni_vp, reference, and exclusively lock it.
 	 */
 	if (vap->va_mode == (mode_t)VNOVAL)
 		vap->va_mode = 0;
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vpexcl = 1;
 
 	vput(nd.ni_dvp);
 	nd.ni_dvp = NULL;
 
 	if (!error) {
 		bzero((caddr_t)fhp, sizeof(nfh));
 		fhp->fh_fsid = nd.ni_vp->v_mount->mnt_stat.f_fsid;
 		error = VOP_VPTOFH(nd.ni_vp, &fhp->fh_fid);
 		if (!error)
 			error = VOP_GETATTR(nd.ni_vp, vap, cred, td);
 	}
 out:
 	if (dirp) {
 		if (dirp == nd.ni_dvp) {
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		} else {
 			/* Release existing locks to prevent deadlock. */
 			if (nd.ni_dvp) {
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				if (nd.ni_dvp == nd.ni_vp && vpexcl)
 					vrele(nd.ni_dvp);
 				else
 					vput(nd.ni_dvp);
 			}
 			if (nd.ni_vp) {
 				if (vpexcl)
 					vput(nd.ni_vp);
 				else
 					vrele(nd.ni_vp);
 			}
 			nd.ni_dvp = NULL;
 			nd.ni_vp = NULL;
-			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 			VOP_UNLOCK(dirp, 0, td);
 		}
 	}
 	nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
 	if (v3) {
 		if (!error) {
 			nfsm_srvpostop_fh(fhp);
 			nfsm_srvpostop_attr(0, vap);
 		}
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 	} else if (!error) {
 		/* v2 non-error case. */
 		nfsm_srvfhtom(fhp, v3);
 		fp = nfsm_build(struct nfs_fattr *, NFSX_V2FATTR);
 		nfsm_srvfillattr(vap, fp);
 	}
 	error = 0;
 	/* fall through */
 
 nfsmout:
 	if (nd.ni_dvp) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp && vpexcl)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp) {
 		if (vpexcl)
 			vput(nd.ni_vp);
 		else
 			vrele(nd.ni_vp);
 	}
 	if (dirp)
 		vrele(dirp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * nfs rmdir service
  */
 int
 nfsrv_rmdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	caddr_t bpos;
 	int error = 0, len, dirfor_ret = 1, diraft_ret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *vp, *dirp = NULL;
 	struct vattr dirfor, diraft;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct nameidata nd;
 	struct mount *mp = NULL;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	ndclear(&nd);
 	vfslocked = 0;
 
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto out;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	nfsm_srvnamesiz(len);
 	nd.ni_cnd.cn_cred = cred;
 	nd.ni_cnd.cn_nameiop = DELETE;
 	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | MPSAFE;
 	error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
 		&dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
 	vfslocked = nfsrv_lockedpair_nd(vfslocked, &nd);
 	if (dirp && !v3) {
 		vrele(dirp);
 		dirp = NULL;
 	}
 	if (error) {
 		nfsm_reply(NFSX_WCCDATA(v3));
 		if (v3)
 			nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 		error = 0;
 		goto nfsmout;
 	}
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT)
 		error = EBUSY;
 out:
 	/*
 	 * Issue or abort op.  Since SAVESTART is not set, path name
 	 * component is freed by the VOP after either.
 	 */
 	if (!error)
 		error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	if (dirp) {
 		if (dirp == nd.ni_dvp)
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 		else {
 			/* Release existing locks to prevent deadlock. */
 			if (nd.ni_dvp) {
 				if (nd.ni_dvp == nd.ni_vp)
 					vrele(nd.ni_dvp);
 				else
 					vput(nd.ni_dvp);
 			}
 			if (nd.ni_vp)
 				vput(nd.ni_vp);
 			nd.ni_dvp = NULL;
 			nd.ni_vp = NULL;
-			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY);
 			diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
 			VOP_UNLOCK(dirp, 0, td);
 		}
 	}
 	nfsm_reply(NFSX_WCCDATA(v3));
 	error = 0;
 	if (v3)
 		nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
 	/* fall through */
 
 nfsmout:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_dvp) {
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 	}
 	if (nd.ni_vp)
 		vput(nd.ni_vp);
 	if (dirp)
 		vrele(dirp);
 
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs readdir service
  * - mallocs what it thinks is enough to read
  *	count rounded up to a multiple of NFS_DIRBLKSIZ <= NFS_MAXREADDIR
  * - calls VOP_READDIR()
  * - loops around building the reply
  *	if the output generated exceeds count break out of loop
  *	The nfsm_clget macro is used here so that the reply will be packed
  *	tightly in mbuf clusters.
  * - it only knows that it has encountered eof when the VOP_READDIR()
  *	reads nothing
  * - as such one readdir rpc will return eof false although you are there
  *	and then the next will return eof
  * - it trims out records with d_fileno == 0
  *	this doesn't matter for Unix clients, but they might confuse clients
  *	for other os'.
  * NB: It is tempting to set eof to true if the VOP_READDIR() reads less
  *	than requested, but this may not apply to all filesystems. For
  *	example, client NFS does not { although it is never remote mounted
  *	anyhow }
  *     The alternate call nfsrv_readdirplus() does lookups as well.
  * PS: The NFS protocol spec. does not clarify what the "count" byte
  *	argument is a count of.. just name strings and file id's or the
  *	entire reply rpc or ...
  *	I tried just file name and id sizes and it confused the Sun client,
  *	so I am using the full rpc size now. The "paranoia.." comment refers
  *	to including the status longwords that are not a part of the dir.
  *	"entry" structures, but are in the rpc.
  */
 struct flrep {
 	nfsuint64	fl_off;
 	u_int32_t	fl_postopok;
 	u_int32_t	fl_fattr[NFSX_V3FATTR / sizeof (u_int32_t)];
 	u_int32_t	fl_fhok;
 	u_int32_t	fl_fhsize;
 	u_int32_t	fl_nfh[NFSX_V3FH / sizeof (u_int32_t)];
 };
 
 int
 nfsrv_readdir(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	char *bp, *be;
 	struct mbuf *mp;
 	struct dirent *dp;
 	caddr_t cp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	struct mbuf *mb, *mreq;
 	char *cpos, *cend, *rbuf;
 	struct vnode *vp = NULL;
 	struct vattr at;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct uio io;
 	struct iovec iv;
 	int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1;
 	int siz, cnt, fullsiz, eofflag, rdonly, ncookies;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	u_quad_t off, toff, verf;
 	u_long *cookies = NULL, *cookiep; /* needs to be int64_t or off_t */
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	if (v3) {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 5 * NFSX_UNSIGNED);
 		toff = fxdr_hyper(tl);
 		tl += 2;
 		verf = fxdr_hyper(tl);
 		tl += 2;
 	} else {
 		tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED);
 		toff = fxdr_unsigned(u_quad_t, *tl++);
 		verf = 0;	/* shut up gcc */
 	}
 	off = toff;
 	cnt = fxdr_unsigned(int, *tl);
 	siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
 	xfer = NFS_SRVMAXDATA(nfsd);
 	if (cnt > xfer)
 		cnt = xfer;
 	if (siz > xfer)
 		siz = xfer;
 	fullsiz = siz;
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (!error && vp->v_type != VDIR) {
 		error = ENOTDIR;
 		vput(vp);
 		vp = NULL;
 	}
 	if (error) {
 		nfsm_reply(NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/*
 	 * Obtain lock on vnode for this section of the code
 	 */
 	if (v3) {
 		error = getret = VOP_GETATTR(vp, &at, cred, td);
 #if 0
 		/*
 		 * XXX This check may be too strict for Solaris 2.5 clients.
 		 */
 		if (!error && toff && verf && verf != at.va_filerev)
 			error = NFSERR_BAD_COOKIE;
 #endif
 	}
 	if (!error)
 		error = nfsrv_access(vp, VEXEC, cred, rdonly, td, 0);
 	if (error) {
 		vput(vp);
 		vp = NULL;
 		nfsm_reply(NFSX_POSTOPATTR(v3));
 		if (v3)
 			nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	VOP_UNLOCK(vp, 0, td);
 
 	/*
 	 * end section.  Allocate rbuf and continue
 	 */
 	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
 again:
 	iv.iov_base = rbuf;
 	iv.iov_len = fullsiz;
 	io.uio_iov = &iv;
 	io.uio_iovcnt = 1;
 	io.uio_offset = (off_t)off;
 	io.uio_resid = fullsiz;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_td = NULL;
 	eofflag = 0;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (cookies) {
 		free((caddr_t)cookies, M_TEMP);
 		cookies = NULL;
 	}
 	error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies);
 	off = (off_t)io.uio_offset;
 	if (!cookies && !error)
 		error = NFSERR_PERM;
 	if (v3) {
 		getret = VOP_GETATTR(vp, &at, cred, td);
 		if (!error)
 			error = getret;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	if (error) {
 		vrele(vp);
 		vp = NULL;
 		free((caddr_t)rbuf, M_TEMP);
 		if (cookies)
 			free((caddr_t)cookies, M_TEMP);
 		nfsm_reply(NFSX_POSTOPATTR(v3));
 		if (v3)
 			nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	if (io.uio_resid) {
 		siz -= io.uio_resid;
 
 		/*
 		 * If nothing read, return eof
 		 * rpc reply
 		 */
 		if (siz == 0) {
 			vrele(vp);
 			vp = NULL;
 			nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) +
 				2 * NFSX_UNSIGNED);
 			if (v3) {
 				nfsm_srvpostop_attr(getret, &at);
 				tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 				txdr_hyper(at.va_filerev, tl);
 				tl += 2;
 			} else
 				tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = nfsrv_nfs_false;
 			*tl = nfsrv_nfs_true;
 			FREE((caddr_t)rbuf, M_TEMP);
 			FREE((caddr_t)cookies, M_TEMP);
 			error = 0;
 			goto nfsmout;
 		}
 	}
 
 	/*
 	 * Check for degenerate cases of nothing useful read.
 	 * If so go try again
 	 */
 	cpos = rbuf;
 	cend = rbuf + siz;
 	dp = (struct dirent *)cpos;
 	cookiep = cookies;
 	/*
 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
 	 * directory offset up to a block boundary, so it is necessary to
 	 * skip over the records that precede the requested offset. This
 	 * requires the assumption that file offset cookies monotonically
 	 * increase.
 	 */
 	while (cpos < cend && ncookies > 0 &&
 		(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
 		 ((u_quad_t)(*cookiep)) <= toff)) {
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	if (cpos >= cend || ncookies == 0) {
 		toff = off;
 		siz = fullsiz;
 		goto again;
 	}
 
 	len = 3 * NFSX_UNSIGNED;	/* paranoia, probably can be 0 */
 	nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_COOKIEVERF(v3) + siz);
 	if (v3) {
 		nfsm_srvpostop_attr(getret, &at);
 		tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 		txdr_hyper(at.va_filerev, tl);
 	}
 	mp = mb;
 	bp = bpos;
 	be = bp + M_TRAILINGSPACE(mp);
 
 	/* Loop through the records and build reply */
 	while (cpos < cend && ncookies > 0) {
 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT) {
 			nlen = dp->d_namlen;
 			rem = nfsm_rndup(nlen) - nlen;
 			len += (4 * NFSX_UNSIGNED + nlen + rem);
 			if (v3)
 				len += 2 * NFSX_UNSIGNED;
 			if (len > cnt) {
 				eofflag = 0;
 				break;
 			}
 			/*
 			 * Build the directory record xdr from
 			 * the dirent entry.
 			 */
 			nfsm_clget;
 			*tl = nfsrv_nfs_true;
 			bp += NFSX_UNSIGNED;
 			if (v3) {
 				nfsm_clget;
 				*tl = 0;
 				bp += NFSX_UNSIGNED;
 			}
 			nfsm_clget;
 			*tl = txdr_unsigned(dp->d_fileno);
 			bp += NFSX_UNSIGNED;
 			nfsm_clget;
 			*tl = txdr_unsigned(nlen);
 			bp += NFSX_UNSIGNED;
 
 			/* And loop around copying the name */
 			xfer = nlen;
 			cp = dp->d_name;
 			while (xfer > 0) {
 				nfsm_clget;
 				if ((bp+xfer) > be)
 					tsiz = be-bp;
 				else
 					tsiz = xfer;
 				bcopy(cp, bp, tsiz);
 				bp += tsiz;
 				xfer -= tsiz;
 				if (xfer > 0)
 					cp += tsiz;
 			}
 			/* And null pad to an int32_t boundary. */
 			for (i = 0; i < rem; i++)
 				*bp++ = '\0';
 			nfsm_clget;
 
 			/* Finish off the record */
 			if (v3) {
 				*tl = 0;
 				bp += NFSX_UNSIGNED;
 				nfsm_clget;
 			}
 			*tl = txdr_unsigned(*cookiep);
 			bp += NFSX_UNSIGNED;
 		}
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	vrele(vp);
 	vp = NULL;
 	nfsm_clget;
 	*tl = nfsrv_nfs_false;
 	bp += NFSX_UNSIGNED;
 	nfsm_clget;
 	if (eofflag)
 		*tl = nfsrv_nfs_true;
 	else
 		*tl = nfsrv_nfs_false;
 	bp += NFSX_UNSIGNED;
 	if (mp != mb) {
 		if (bp < be)
 			mp->m_len = bp - mtod(mp, caddr_t);
 	} else
 		mp->m_len += bp - bpos;
 	FREE((caddr_t)rbuf, M_TEMP);
 	FREE((caddr_t)cookies, M_TEMP);
 
 nfsmout:
 	if (vp)
 		vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 int
 nfsrv_readdirplus(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	char *bp, *be;
 	struct mbuf *mp;
 	struct dirent *dp;
 	caddr_t cp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	struct mbuf *mb, *mreq;
 	char *cpos, *cend, *rbuf;
 	struct vnode *vp = NULL, *nvp;
 	struct flrep fl;
 	nfsfh_t nfh;
 	fhandle_t *fhp, *nfhp = (fhandle_t *)fl.fl_nfh;
 	struct uio io;
 	struct iovec iv;
 	struct vattr va, at, *vap = &va;
 	struct nfs_fattr *fp;
 	int len, nlen, rem, xfer, tsiz, i, error = 0, getret = 1;
 	int siz, cnt, fullsiz, eofflag, rdonly, dirlen, ncookies;
 	u_quad_t off, toff, verf;
 	u_long *cookies = NULL, *cookiep; /* needs to be int64_t or off_t */
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	vfslocked = 0;
 	if (!v3)
 		panic("nfsrv_readdirplus: v3 proc called on a v2 connection");
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	tl = nfsm_dissect_nonblock(u_int32_t *, 6 * NFSX_UNSIGNED);
 	toff = fxdr_hyper(tl);
 	tl += 2;
 	verf = fxdr_hyper(tl);
 	tl += 2;
 	siz = fxdr_unsigned(int, *tl++);
 	cnt = fxdr_unsigned(int, *tl);
 	off = toff;
 	siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
 	xfer = NFS_SRVMAXDATA(nfsd);
 	if (cnt > xfer)
 		cnt = xfer;
 	if (siz > xfer)
 		siz = xfer;
 	fullsiz = siz;
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (!error && vp->v_type != VDIR) {
 		error = ENOTDIR;
 		vput(vp);
 		vp = NULL;
 	}
 	if (error) {
 		nfsm_reply(NFSX_UNSIGNED);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	error = getret = VOP_GETATTR(vp, &at, cred, td);
 #if 0
 	/*
 	 * XXX This check may be too strict for Solaris 2.5 clients.
 	 */
 	if (!error && toff && verf && verf != at.va_filerev)
 		error = NFSERR_BAD_COOKIE;
 #endif
 	if (!error)
 		error = nfsrv_access(vp, VEXEC, cred, rdonly, td, 0);
 	if (error) {
 		vput(vp);
 		vp = NULL;
 		nfsm_reply(NFSX_V3POSTOPATTR);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
 again:
 	iv.iov_base = rbuf;
 	iv.iov_len = fullsiz;
 	io.uio_iov = &iv;
 	io.uio_iovcnt = 1;
 	io.uio_offset = (off_t)off;
 	io.uio_resid = fullsiz;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_td = NULL;
 	eofflag = 0;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (cookies) {
 		free((caddr_t)cookies, M_TEMP);
 		cookies = NULL;
 	}
 	error = VOP_READDIR(vp, &io, cred, &eofflag, &ncookies, &cookies);
 	off = (u_quad_t)io.uio_offset;
 	getret = VOP_GETATTR(vp, &at, cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	if (!cookies && !error)
 		error = NFSERR_PERM;
 	if (!error)
 		error = getret;
 	if (error) {
 		vrele(vp);
 		vp = NULL;
 		if (cookies)
 			free((caddr_t)cookies, M_TEMP);
 		free((caddr_t)rbuf, M_TEMP);
 		nfsm_reply(NFSX_V3POSTOPATTR);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	if (io.uio_resid) {
 		siz -= io.uio_resid;
 
 		/*
 		 * If nothing read, return eof
 		 * rpc reply
 		 */
 		if (siz == 0) {
 			vrele(vp);
 			vp = NULL;
 			nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF +
 				2 * NFSX_UNSIGNED);
 			nfsm_srvpostop_attr(getret, &at);
 			tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
 			txdr_hyper(at.va_filerev, tl);
 			tl += 2;
 			*tl++ = nfsrv_nfs_false;
 			*tl = nfsrv_nfs_true;
 			FREE((caddr_t)cookies, M_TEMP);
 			FREE((caddr_t)rbuf, M_TEMP);
 			error = 0;
 			goto nfsmout;
 		}
 	}
 
 	/*
 	 * Check for degenerate cases of nothing useful read.
 	 * If so go try again
 	 */
 	cpos = rbuf;
 	cend = rbuf + siz;
 	dp = (struct dirent *)cpos;
 	cookiep = cookies;
 	/*
 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
 	 * directory offset up to a block boundary, so it is necessary to
 	 * skip over the records that precede the requested offset. This
 	 * requires the assumption that file offset cookies monotonically
 	 * increase.
 	 */
 	while (cpos < cend && ncookies > 0 &&
 		(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
 		 ((u_quad_t)(*cookiep)) <= toff)) {
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	if (cpos >= cend || ncookies == 0) {
 		toff = off;
 		siz = fullsiz;
 		goto again;
 	}
 
 	/*
 	 * Probe one of the directory entries to see if the filesystem
 	 * supports VGET.
 	 */
 	if (VFS_VGET(vp->v_mount, dp->d_fileno, LK_EXCLUSIVE, &nvp) ==
 	    EOPNOTSUPP) {
 		error = NFSERR_NOTSUPP;
 		vrele(vp);
 		vp = NULL;
 		free((caddr_t)cookies, M_TEMP);
 		free((caddr_t)rbuf, M_TEMP);
 		nfsm_reply(NFSX_V3POSTOPATTR);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	vput(nvp);
 	nvp = NULL;
 
 	dirlen = len = NFSX_V3POSTOPATTR + NFSX_V3COOKIEVERF +
 	    2 * NFSX_UNSIGNED;
 	nfsm_reply(cnt);
 	nfsm_srvpostop_attr(getret, &at);
 	tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
 	txdr_hyper(at.va_filerev, tl);
 	mp = mb;
 	bp = bpos;
 	be = bp + M_TRAILINGSPACE(mp);
 
 	/* Loop through the records and build reply */
 	while (cpos < cend && ncookies > 0) {
 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT) {
 			nlen = dp->d_namlen;
 			rem = nfsm_rndup(nlen)-nlen;
 
 			/*
 			 * For readdir_and_lookup get the vnode using
 			 * the file number.
 			 */
 			if (VFS_VGET(vp->v_mount, dp->d_fileno, LK_EXCLUSIVE,
 			    &nvp))
 				goto invalid;
 			bzero((caddr_t)nfhp, NFSX_V3FH);
 			nfhp->fh_fsid =
 				nvp->v_mount->mnt_stat.f_fsid;
 			/*
 			 * XXXRW: Assert the mountpoints are the same so that
 			 * we know that acquiring Giant based on the
 			 * directory is the right thing for the child.
 			 */
 			KASSERT(nvp->v_mount == vp->v_mount,
 			    ("nfsrv_readdirplus: nvp mount != vp mount"));
 			if (VOP_VPTOFH(nvp, &nfhp->fh_fid)) {
 				vput(nvp);
 				nvp = NULL;
 				goto invalid;
 			}
 			if (VOP_GETATTR(nvp, vap, cred, td)) {
 				vput(nvp);
 				nvp = NULL;
 				goto invalid;
 			}
 			vput(nvp);
 			nvp = NULL;
 
 			/*
 			 * If either the dircount or maxcount will be
 			 * exceeded, get out now. Both of these lengths
 			 * are calculated conservatively, including all
 			 * XDR overheads.
 			 */
 			len += (8 * NFSX_UNSIGNED + nlen + rem + NFSX_V3FH +
 				NFSX_V3POSTOPATTR);
 			dirlen += (6 * NFSX_UNSIGNED + nlen + rem);
 			if (len > cnt || dirlen > fullsiz) {
 				eofflag = 0;
 				break;
 			}
 
 			/*
 			 * Build the directory record xdr from
 			 * the dirent entry.
 			 */
 			fp = (struct nfs_fattr *)&fl.fl_fattr;
 			nfsm_srvfillattr(vap, fp);
 			fl.fl_fhsize = txdr_unsigned(NFSX_V3FH);
 			fl.fl_fhok = nfsrv_nfs_true;
 			fl.fl_postopok = nfsrv_nfs_true;
 			fl.fl_off.nfsuquad[0] = 0;
 			fl.fl_off.nfsuquad[1] = txdr_unsigned(*cookiep);
 
 			nfsm_clget;
 			*tl = nfsrv_nfs_true;
 			bp += NFSX_UNSIGNED;
 			nfsm_clget;
 			*tl = 0;
 			bp += NFSX_UNSIGNED;
 			nfsm_clget;
 			*tl = txdr_unsigned(dp->d_fileno);
 			bp += NFSX_UNSIGNED;
 			nfsm_clget;
 			*tl = txdr_unsigned(nlen);
 			bp += NFSX_UNSIGNED;
 
 			/* And loop around copying the name */
 			xfer = nlen;
 			cp = dp->d_name;
 			while (xfer > 0) {
 				nfsm_clget;
 				if ((bp + xfer) > be)
 					tsiz = be - bp;
 				else
 					tsiz = xfer;
 				bcopy(cp, bp, tsiz);
 				bp += tsiz;
 				xfer -= tsiz;
 				if (xfer > 0)
 					cp += tsiz;
 			}
 			/* And null pad to an int32_t boundary. */
 			for (i = 0; i < rem; i++)
 				*bp++ = '\0';
 
 			/*
 			 * Now copy the flrep structure out.
 			 */
 			xfer = sizeof (struct flrep);
 			cp = (caddr_t)&fl;
 			while (xfer > 0) {
 				nfsm_clget;
 				if ((bp + xfer) > be)
 					tsiz = be - bp;
 				else
 					tsiz = xfer;
 				bcopy(cp, bp, tsiz);
 				bp += tsiz;
 				xfer -= tsiz;
 				if (xfer > 0)
 					cp += tsiz;
 			}
 		}
 invalid:
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	vrele(vp);
 	vp = NULL;
 	nfsm_clget;
 	*tl = nfsrv_nfs_false;
 	bp += NFSX_UNSIGNED;
 	nfsm_clget;
 	if (eofflag)
 		*tl = nfsrv_nfs_true;
 	else
 		*tl = nfsrv_nfs_false;
 	bp += NFSX_UNSIGNED;
 	if (mp != mb) {
 		if (bp < be)
 			mp->m_len = bp - mtod(mp, caddr_t);
 	} else
 		mp->m_len += bp - bpos;
 	FREE((caddr_t)cookies, M_TEMP);
 	FREE((caddr_t)rbuf, M_TEMP);
 nfsmout:
 	if (vp)
 		vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs commit service
  *
  * NFSv3-only handler (it panics on a v2 request).  Decodes a file handle,
  * a 64-bit offset and a 32-bit count from the request, pushes the matching
  * dirty data to stable storage and builds a reply made of wcc data
  * followed, on success, by the write verifier.
  *
  * Requests larger than MAX_COMMIT_COUNT sync the whole file through
  * VOP_FSYNC(); smaller ones clean the affected VM pages and then write out
  * every delayed-write buffer found in the block range.
  *
  * The request/reply locals (mrep, md, dpos, mb, mreq, bpos, error) are
  * used by name by the nfsm_* macros; those macros presumably also jump to
  * the nfsmout label on malformed requests -- confirm in nfsm_subs.h.
  *
  * Returns the value left in error; every explicit path visible here
  * clears it to 0 once the reply has been built.
  */
 int
 nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct vattr bfor, aft;
 	struct vnode *vp = NULL;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	u_int32_t *tl;
 	caddr_t bpos;
 	/*
 	 * for_ret/aft_ret start non-zero and are only overwritten by the
 	 * VOP_GETATTR() results below.
 	 */
 	int error = 0, rdonly, for_ret = 1, aft_ret = 1, cnt;
 	struct mbuf *mb, *mreq;
 	u_quad_t off;
 	struct mount *mp = NULL;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	/* Must be valid before any path can reach VFS_UNLOCK_GIANT() below. */
 	vfslocked = 0;
 	if (!v3)
 		panic("nfsrv_commit: v3 proc called on a v2 connection");
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	/* No mount for this fsid: answer with ESTALE. */
 	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL) {
 		error = ESTALE;
 		goto ereply;
 	}
 	vfslocked = VFS_LOCK_GIANT(mp);
 	(void) vn_start_write(NULL, &mp, V_WAIT);
 	vfs_rel(mp);		/* The write holds a ref. */
 	tl = nfsm_dissect_nonblock(u_int32_t *, 3 * NFSX_UNSIGNED);
 
 	/*
 	 * XXX At this time VOP_FSYNC() does not accept offset and byte
 	 * count parameters, so these arguments are useless (someday maybe).
 	 */
 	off = fxdr_hyper(tl);
 	tl += 2;
 	/*
 	 * NOTE(review): cnt is a signed int decoded from an unsigned wire
 	 * value, so a count with the top bit set appears negative here --
 	 * confirm that this is the intended handling.
 	 */
 	cnt = fxdr_unsigned(int, *tl);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &tvfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	/*
 	 * Combine our Giant state with the one reported by nfsrv_fhtovp()
 	 * (presumably reconciles the two acquisitions -- see
 	 * nfsrv_lockedpair()).
 	 */
 	vfslocked = nfsrv_lockedpair(vfslocked, tvfslocked);
 	if (error) {
 		/* Handle lookup failed: short reply with wcc data only. */
 		nfsm_reply(2 * NFSX_UNSIGNED);
 		nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft);
 		error = 0;
 		goto nfsmout;
 	}
 	/* Pre-operation attributes for the wcc data. */
 	for_ret = VOP_GETATTR(vp, &bfor, cred, td);
 
 	if (cnt > MAX_COMMIT_COUNT) {
 		/*
 		 * Give up and do the whole thing
 		 */
 		if (vp->v_object &&
 		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
 			VM_OBJECT_LOCK(vp->v_object);
 			vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_UNLOCK(vp->v_object);
 		}
 		error = VOP_FSYNC(vp, MNT_WAIT, td);
 	} else {
 		/*
 		 * Locate and synchronously write any buffers that fall
 		 * into the requested range.  Note:  we are assuming that
 		 * f_iosize is a power of 2.
 		 */
 		int iosize = vp->v_mount->mnt_stat.f_iosize;
 		int iomask = iosize - 1;
 		int s;
 		daddr_t lblkno;
 
 		/*
 		 * Align to iosize boundary, super-align to page boundary.
 		 * The count grows by whatever the offset is rounded down by,
 		 * so the end of the range stays where the client put it.
 		 */
 		if (off & iomask) {
 			cnt += off & iomask;
 			off &= ~(u_quad_t)iomask;
 		}
 		if (off & PAGE_MASK) {
 			cnt += off & PAGE_MASK;
 			off &= ~(u_quad_t)PAGE_MASK;
 		}
 		lblkno = off / iosize;
 
 		/* Flush the dirty VM pages that back the range first. */
 		if (vp->v_object &&
 		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
 			VM_OBJECT_LOCK(vp->v_object);
 			vm_object_page_clean(vp->v_object, off / PAGE_SIZE, (cnt + PAGE_MASK) / PAGE_SIZE, OBJPC_SYNC);
 			VM_OBJECT_UNLOCK(vp->v_object);
 		}
 
 		s = splbio();
 		/*
 		 * The vnode interlock is held at the top of every iteration;
 		 * BUF_LOCK() is handed the interlock via LK_INTERLOCK, and
 		 * VI_LOCK() is called again afterwards on both outcomes.
 		 */
 		VI_LOCK(vp);
 		while (cnt > 0) {
 			struct buf *bp;
 
 			/*
 			 * If we have a buffer and it is marked B_DELWRI we
 			 * have to lock and write it.  Otherwise the prior
 			 * write is assumed to have already been committed.
 			 *
 			 * gbincore() can return invalid buffers now so we
 			 * have to check that bit as well (though B_DELWRI
 			 * should not be set if B_INVAL is set there could be
 			 * a race here since we haven't locked the buffer).
 			 */
 			if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 				    LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) {
 					VI_LOCK(vp);
 					continue; /* retry */
 				}
 			    	if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
 				    B_DELWRI) {
 					/* Force a synchronous write. */
 					bremfree(bp);
 					bp->b_flags &= ~B_ASYNC;
 					bwrite(bp);
 					++nfs_commit_miss;
 				} else
 					BUF_UNLOCK(bp);
 				VI_LOCK(vp);
 			}
 			++nfs_commit_blks;
 			/* Last (possibly partial) block has been handled. */
 			if (cnt < iosize)
 				break;
 			cnt -= iosize;
 			++lblkno;
 		}
 		VI_UNLOCK(vp);
 		splx(s);
 	}
 
 	/* Post-operation attributes for the wcc data. */
 	aft_ret = VOP_GETATTR(vp, &aft, cred, td);
 	vput(vp);
 	vp = NULL;
 	/*
 	 * Reached with vp == NULL: either after the vput() above or straight
 	 * from the vfs_getvfs() failure.
 	 */
 ereply:
 	nfsm_reply(NFSX_V3WCCDATA + NFSX_V3WRITEVERF);
 	nfsm_srvwcc_data(for_ret, &bfor, aft_ret, &aft);
 	if (!error) {
 		/* Write verifier; seeded from boottime on first use. */
 		tl = nfsm_build(u_int32_t *, NFSX_V3WRITEVERF);
 		if (nfsver.tv_sec == 0)
 			nfsver = boottime;
 		*tl++ = txdr_unsigned(nfsver.tv_sec);
 		*tl = txdr_unsigned(nfsver.tv_usec);
 	} else {
 		/* The reply has been built; do not fail the RPC itself. */
 		error = 0;
 	}
 nfsmout:
 	if (vp)
 		vput(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs statfs service
  *
  * Serves both protocol versions: the v3 flag selects between a reply with
  * 64-bit byte counts plus file counts and a reply with 32-bit block
  * counts.  The file handle from the request is turned into a vnode,
  * VFS_STATFS() is run on its mount, and for v3 the post-op attributes of
  * that vnode are sent ahead of the statistics.
  *
  * The request/reply locals (mrep, md, dpos, mb, mreq, bpos, error) are
  * used by name by the nfsm_* macros; those macros presumably also jump to
  * the nfsmout label on malformed requests -- confirm in nfsm_subs.h.
  *
  * Returns the value left in error; every explicit path visible here
  * clears it to 0 once the reply has been built.
  */
 int
 nfsrv_statfs(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct statfs *sf;
 	struct nfs_statfs *sfp;
 	caddr_t bpos;
 	/* getret stays non-zero unless VOP_GETATTR() below succeeds. */
 	int error = 0, rdonly, getret = 1;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	struct mbuf *mb, *mreq;
 	struct vnode *vp = NULL;
 	struct vattr at;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct statfs statfs;
 	u_quad_t tval;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	/* Must be valid before any path can reach VFS_UNLOCK_GIANT() below. */
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		/* Handle lookup failed: short reply, no statistics. */
 		nfsm_reply(NFSX_UNSIGNED);
 		if (v3)
 			nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	sf = &statfs;
 	/*
 	 * A VFS_STATFS() failure is kept in error until the reply header
 	 * and the post-op attributes have been emitted.
 	 */
 	error = VFS_STATFS(vp->v_mount, sf, td);
 	getret = VOP_GETATTR(vp, &at, cred, td);
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_STATFS(v3));
 	if (v3)
 		nfsm_srvpostop_attr(getret, &at);
 	if (error) {
 		error = 0;
 		goto nfsmout;
 	}
 	sfp = nfsm_build(struct nfs_statfs *, NFSX_STATFS(v3));
 	if (v3) {
 		/* v3 reports sizes in bytes: block count times block size. */
 		tval = (u_quad_t)sf->f_blocks;
 		tval *= (u_quad_t)sf->f_bsize;
 		txdr_hyper(tval, &sfp->sf_tbytes);
 		tval = (u_quad_t)sf->f_bfree;
 		tval *= (u_quad_t)sf->f_bsize;
 		txdr_hyper(tval, &sfp->sf_fbytes);
 		/*
 		 * Don't send negative values for available space,
 		 * since this field is unsigned in the NFS protocol.
 		 * Otherwise, the client would see absurdly high
 		 * numbers for free space.
 		 */
 		if (sf->f_bavail < 0)
 			tval = 0;
 		else
 			tval = (u_quad_t)sf->f_bavail;
 		tval *= (u_quad_t)sf->f_bsize;
 		txdr_hyper(tval, &sfp->sf_abytes);
 		/*
 		 * File counts are sent as 32-bit values with the other word
 		 * of each quad zeroed; the "available" count reuses f_ffree.
 		 */
 		sfp->sf_tfiles.nfsuquad[0] = 0;
 		sfp->sf_tfiles.nfsuquad[1] = txdr_unsigned(sf->f_files);
 		sfp->sf_ffiles.nfsuquad[0] = 0;
 		sfp->sf_ffiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree);
 		sfp->sf_afiles.nfsuquad[0] = 0;
 		sfp->sf_afiles.nfsuquad[1] = txdr_unsigned(sf->f_ffree);
 		sfp->sf_invarsec = 0;
 	} else {
 		/* v2 reports a block size and block counts. */
 		sfp->sf_tsize = txdr_unsigned(NFS_MAXDGRAMDATA);
 		sfp->sf_bsize = txdr_unsigned(sf->f_bsize);
 		sfp->sf_blocks = txdr_unsigned(sf->f_blocks);
 		sfp->sf_bfree = txdr_unsigned(sf->f_bfree);
 		/* Same clamp as the v3 branch: never send a negative count. */
 		if (sf->f_bavail < 0)
 			sfp->sf_bavail = 0;
 		else
 			sfp->sf_bavail = txdr_unsigned(sf->f_bavail);
 	}
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs fsinfo service
  *
  * NFSv3-only handler (it panics on a v2 request).  Replies with the
  * post-op attributes of the vnode named by the file handle, followed by
  * static transfer-size hints, a guessed maximum file size, the time
  * granularity and the filesystem property flags.
  *
  * The request/reply locals (mrep, md, dpos, mb, mreq, bpos, error) are
  * used by name by the nfsm_* macros; those macros presumably also jump to
  * the nfsmout label on malformed requests -- confirm in nfsm_subs.h.
  */
 int
 nfsrv_fsinfo(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nfsv3_fsinfo *sip;
 	caddr_t bpos;
 	/* getret stays non-zero unless VOP_GETATTR() below succeeds. */
 	int error = 0, rdonly, getret = 1, pref;
 	struct mbuf *mb, *mreq;
 	struct vnode *vp = NULL;
 	struct vattr at;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	u_quad_t maxfsize;
 	struct statfs sb;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	if (!v3)
 		panic("nfsrv_fsinfo: v3 proc called on a v2 connection");
 	fhp = &nfh.fh_generic;
 	/* Must be valid before any path can reach VFS_UNLOCK_GIANT() below. */
 	vfslocked = 0;
 	nfsm_srvmtofh(fhp);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		/* Handle lookup failed: short reply, no fsinfo body. */
 		nfsm_reply(NFSX_UNSIGNED);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 
 	/* XXX Try to make a guess on the max file size. */
 	/*
 	 * NOTE(review): the VFS_STATFS() result is ignored, so on failure
 	 * sb.f_bsize may be read uninitialised -- confirm whether that can
 	 * happen for an exported filesystem.
 	 */
 	VFS_STATFS(vp->v_mount, &sb, td);
 	maxfsize = (u_quad_t)0x80000000 * sb.f_bsize - 1;
 
 	getret = VOP_GETATTR(vp, &at, cred, td);
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3FSINFO);
 	nfsm_srvpostop_attr(getret, &at);
 	sip = nfsm_build(struct nfsv3_fsinfo *, NFSX_V3FSINFO);
 
 	/*
 	 * XXX
 	 * There should be filesystem VFS OP(s) to get this information.
 	 * For now, assume ufs.
 	 */
 	/* Datagram sockets get the smaller transfer-size limit. */
 	if (slp->ns_so->so_type == SOCK_DGRAM)
 		pref = NFS_MAXDGRAMDATA;
 	else
 		pref = NFS_MAXDATA;
 	/* The same value is used for every max/preferred size. */
 	sip->fs_rtmax = txdr_unsigned(pref);
 	sip->fs_rtpref = txdr_unsigned(pref);
 	sip->fs_rtmult = txdr_unsigned(NFS_FABLKSIZE);
 	sip->fs_wtmax = txdr_unsigned(pref);
 	sip->fs_wtpref = txdr_unsigned(pref);
 	sip->fs_wtmult = txdr_unsigned(NFS_FABLKSIZE);
 	sip->fs_dtpref = txdr_unsigned(pref);
 	txdr_hyper(maxfsize, &sip->fs_maxfilesize);
 	/* Advertised time granularity: 0 seconds, 1 nanosecond. */
 	sip->fs_timedelta.nfsv3_sec = 0;
 	sip->fs_timedelta.nfsv3_nsec = txdr_unsigned(1);
 	sip->fs_properties = txdr_unsigned(NFSV3FSINFO_LINK |
 		NFSV3FSINFO_SYMLINK | NFSV3FSINFO_HOMOGENEOUS |
 		NFSV3FSINFO_CANSETTIME);
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * nfs pathconf service
  *
  * NFSv3-only handler (it panics on a v2 request).  Queries link max, name
  * max, chown-restricted and no-trunc through VOP_PATHCONF() for the vnode
  * named by the file handle and replies with the post-op attributes
  * followed by those values; case handling is reported with fixed Unix
  * defaults.
  *
  * The request/reply locals (mrep, md, dpos, mb, mreq, bpos, error) are
  * used by name by the nfsm_* macros; those macros presumably also jump to
  * the nfsmout label on malformed requests -- confirm in nfsm_subs.h.
  *
  * Returns the value left in error; every explicit path visible here
  * clears it to 0 once the reply has been built.
  */
 int
 nfsrv_pathconf(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep, *md = nfsd->nd_md;
 	struct sockaddr *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = nfsd->nd_cr;
 	struct nfsv3_pathconf *pc;
 	caddr_t bpos;
 	/* getret stays non-zero unless VOP_GETATTR() below succeeds. */
 	int error = 0, rdonly, getret = 1;
 	register_t linkmax, namemax, chownres, notrunc;
 	struct mbuf *mb, *mreq;
 	struct vnode *vp = NULL;
 	struct vattr at;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	if (!v3)
 		panic("nfsrv_pathconf: v3 proc called on a v2 connection");
 	/*
 	 * Must be valid before any path can reach VFS_UNLOCK_GIANT() at
 	 * nfsmout, including an early exit taken before nfsrv_fhtovp() has
 	 * had a chance to store into it.
 	 */
 	vfslocked = 0;
 	fhp = &nfh.fh_generic;
 	nfsm_srvmtofh(fhp);
 	error = nfsrv_fhtovp(fhp, 1, &vp, &vfslocked, cred, slp,
 	    nam, &rdonly, TRUE);
 	if (error) {
 		/* Handle lookup failed: short reply, no pathconf body. */
 		nfsm_reply(NFSX_UNSIGNED);
 		nfsm_srvpostop_attr(getret, &at);
 		error = 0;
 		goto nfsmout;
 	}
 	/* Stop querying at the first failure; error keeps that result. */
 	error = VOP_PATHCONF(vp, _PC_LINK_MAX, &linkmax);
 	if (!error)
 		error = VOP_PATHCONF(vp, _PC_NAME_MAX, &namemax);
 	if (!error)
 		error = VOP_PATHCONF(vp, _PC_CHOWN_RESTRICTED, &chownres);
 	if (!error)
 		error = VOP_PATHCONF(vp, _PC_NO_TRUNC, &notrunc);
 	getret = VOP_GETATTR(vp, &at, cred, td);
 	vput(vp);
 	vp = NULL;
 	nfsm_reply(NFSX_V3POSTOPATTR + NFSX_V3PATHCONF);
 	nfsm_srvpostop_attr(getret, &at);
 	if (error) {
 		/* The values above may be unset; send attributes only. */
 		error = 0;
 		goto nfsmout;
 	}
 	pc = nfsm_build(struct nfsv3_pathconf *, NFSX_V3PATHCONF);
 
 	pc->pc_linkmax = txdr_unsigned(linkmax);
 	pc->pc_namemax = txdr_unsigned(namemax);
 	pc->pc_notrunc = txdr_unsigned(notrunc);
 	pc->pc_chownrestricted = txdr_unsigned(chownres);
 
 	/*
 	 * These should probably be supported by VOP_PATHCONF(), but
 	 * until msdosfs is exportable (why would you want to?), the
 	 * Unix defaults should be ok.
 	 */
 	pc->pc_caseinsensitive = nfsrv_nfs_false;
 	pc->pc_casepreserving = nfsrv_nfs_true;
 nfsmout:
 	if (vp)
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
 /*
  * Null operation, used by clients to ping server
  *
  * Builds an empty reply with error preset to NFSERR_RETVOID and returns
  * that value unless nfsm_reply() changes it.  slp and td are unused.
  * The locals mrep, mb, mreq, bpos and error are used by name by
  * nfsm_reply(), which presumably is also what jumps to the nfsmout
  * label -- confirm in nfsm_subs.h.
  */
 /* ARGSUSED */
 int
 nfsrv_null(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep;
 	caddr_t bpos;
 	int error = NFSERR_RETVOID;
 	struct mbuf *mb, *mreq;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	nfsm_reply(0);
 nfsmout:
 	return (error);
 }
 
 /*
  * No operation, used for obsolete procedures
  */
 /* ARGSUSED */
 int
 nfsrv_noop(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
     struct thread *td, struct mbuf **mrq)
 {
 	struct mbuf *mrep = nfsd->nd_mrep;
 	caddr_t bpos;
 	int error;
 	struct mbuf *mb, *mreq;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 	if (nfsd->nd_repstat)
 		error = nfsd->nd_repstat;
 	else
 		error = EPROCUNAVAIL;
 	nfsm_reply(0);
 	error = 0;
 nfsmout:
 	return (error);
 }
 
 /*
  * Perform access checking for vnodes obtained from file handles that would
  * refer to files already opened by a Unix client. You cannot just use
  * vn_writechk() and VOP_ACCESS() for two reasons.
  * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
  *     case.
  * 2 - The owner is to be given access irrespective of mode bits for some
  *     operations, so that processes that chmod after opening a file don't
  *     break. I don't like this because it opens a security hole, but since
  *     the nfs server opens a security hole the size of a barn door anyhow,
  *     what the heck.
  *
  * The exception to rule 2 is EPERM. If a file is IMMUTABLE, VOP_ACCESS()
  * will return EPERM instead of EACCESS. EPERM is always an error.
  */
 static int
 nfsrv_access(struct vnode *vp, int flags, struct ucred *cred,
     int rdonly, struct thread *td, int override)
 {
 	struct vattr vattr;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
 
 	if (flags & VWRITE) {
 		/* Just vn_writechk() changed to check rdonly */
 		/*
 		 * Disallow write attempts on read-only filesystems;
 		 * unless the file is a socket or a block or character
 		 * device resident on the filesystem.
 		 */
 		if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				return (EROFS);
 			default:
 				break;
 			}
 		}
 		/*
 		 * If there's shared text associated with
 		 * the inode, we can't allow writing.
 		 */
 		if (vp->v_vflag & VV_TEXT)
 			return (ETXTBSY);
 	}
 
 	error = VOP_GETATTR(vp, &vattr, cred, td);
 	if (error)
 		return (error);
 	error = VOP_ACCESS(vp, flags, cred, td);
 	/*
 	 * Allow certain operations for the owner (reads and writes
 	 * on files that are already open).
 	 */
 	if (override && error == EACCES && cred->cr_uid == vattr.va_uid)
 		error = 0;
 	return (error);
 }
Index: head/sys/nfsserver/nfs_srvsubs.c
===================================================================
--- head/sys/nfsserver/nfs_srvsubs.c	(revision 175201)
+++ head/sys/nfsserver/nfs_srvsubs.c	(revision 175202)
@@ -1,1472 +1,1472 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_subs.c  8.8 (Berkeley) 5/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * These functions support the macros and help fiddle mbuf chains for
  * the nfs op functions. They do things like create the rpc header and
  * copy data between mbuf chains and uio lists.
  */
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mbuf.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
 #include <sys/sysproto.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfsserver/nfs.h>
 #include <nfs/xdr_subs.h>
 #include <nfsserver/nfsm_subs.h>
 
 #include <netinet/in.h>
 
 /*
  * Data items converted to xdr at startup, since they are constant
  * This is kinda hokey, but may save a little time doing byte swaps
  *
  * They are filled in by the MOD_LOAD case of nfsrv_modevent().
  */
 u_int32_t nfsrv_nfs_xdrneg1;
 u_int32_t nfsrv_rpc_call, nfsrv_rpc_vers, nfsrv_rpc_reply,
 	nfsrv_rpc_msgdenied, nfsrv_rpc_autherr,
 	nfsrv_rpc_mismatch, nfsrv_rpc_auth_unix, nfsrv_rpc_msgaccepted;
 u_int32_t nfsrv_nfs_prog, nfsrv_nfs_true, nfsrv_nfs_false;
 
 /* And other global data */
 /*
  * NFSv2 file type table, indexed through vtonfsv2_type() (presumably by
  * vnode type -- confirm against the callers).
  */
 static const nfstype nfsv2_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR,
 				       NFLNK, NFNON, NFCHR, NFNON };
 #define vtonfsv2_type(a)	txdr_unsigned(nfsv2_type[((int32_t)(a))])
 #define vtonfsv3_mode(m)	txdr_unsigned((m) & ALLPERMS)
 
 /* Timer interval in ticks; computed in nfsrv_modevent(), never below 1. */
 int nfsrv_ticks;
 
 struct nfssvc_sockhead nfssvc_sockhead;
 int nfssvc_sockhead_flag;
 struct nfsd_head nfsd_head;
 int nfsd_head_flag;
 
 /* nfssvc system call slot, registered and removed by nfsrv_modevent(). */
 static int nfssvc_offset = SYS_nfssvc;
 static struct sysent nfssvc_prev_sysent;
 MAKE_SYSENT(nfssvc);
 
 /* Initialised on MOD_LOAD and destroyed on MOD_UNLOAD. */
 struct mtx nfsd_mtx;
 
 /*
  * Mapping of old NFS Version 2 RPC numbers to generic numbers.
  *
  * Entry i is the generic procedure for Version 2 procedure number i;
  * NFSPROC_NOOP fills the slots that have no generic procedure listed.
  */
 const int nfsrv_nfsv3_procid[NFS_NPROCS] = {
 	NFSPROC_NULL,
 	NFSPROC_GETATTR,
 	NFSPROC_SETATTR,
 	NFSPROC_NOOP,
 	NFSPROC_LOOKUP,
 	NFSPROC_READLINK,
 	NFSPROC_READ,
 	NFSPROC_NOOP,
 	NFSPROC_WRITE,
 	NFSPROC_CREATE,
 	NFSPROC_REMOVE,
 	NFSPROC_RENAME,
 	NFSPROC_LINK,
 	NFSPROC_SYMLINK,
 	NFSPROC_MKDIR,
 	NFSPROC_RMDIR,
 	NFSPROC_READDIR,
 	NFSPROC_FSSTAT,
 	NFSPROC_NOOP,
 	NFSPROC_NOOP,
 	NFSPROC_NOOP,
 	NFSPROC_NOOP,
 	NFSPROC_NOOP,
 };
 
 /*
  * and the reverse mapping from generic to Version 2 procedure numbers
  *
  * Entry i is the Version 2 procedure for generic procedure number i;
  * NFSV2PROC_NOOP fills the slots that have no Version 2 procedure listed.
  */
 const int nfsrvv2_procid[NFS_NPROCS] = {
 	NFSV2PROC_NULL,
 	NFSV2PROC_GETATTR,
 	NFSV2PROC_SETATTR,
 	NFSV2PROC_LOOKUP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_READLINK,
 	NFSV2PROC_READ,
 	NFSV2PROC_WRITE,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_MKDIR,
 	NFSV2PROC_SYMLINK,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_REMOVE,
 	NFSV2PROC_RMDIR,
 	NFSV2PROC_RENAME,
 	NFSV2PROC_LINK,
 	NFSV2PROC_READDIR,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_STATFS,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 };
 
 /*
  * Maps errno values to nfs error numbers.
  * Use 0 (which gets converted to NFSERR_IO) as the catch all for ones not
  * specifically defined in RFC 1094.
  *
  * The table has ELAST slots and its first slot holds NFSERR_PERM, so it
  * is presumably indexed by errno - 1 -- confirm against the lookup code.
  * Slots past the last initialiser are implicitly 0.
  */
 static const u_char nfsrv_v2errmap[ELAST] = {
   NFSERR_PERM,	NFSERR_NOENT,	0,		0,		0,	
   NFSERR_NXIO,	0,		0,		0,		0,	
   0,		0,		NFSERR_ACCES,	0,		0,	
   0,		NFSERR_EXIST,	0,		NFSERR_NODEV,	NFSERR_NOTDIR,
   NFSERR_ISDIR,	0,		0,		0,		0,	
   0,		NFSERR_FBIG,	NFSERR_NOSPC,	0,		NFSERR_ROFS,
   0,		0,		0,		0,		0,	
   0,		0,		0,		0,		0,	
   0,		0,		0,		0,		0,	
   0,		0,		0,		0,		0,	
   0,		0,		0,		0,		0,	
   0,		0,		0,		0,		0,	
   0,		0,		NFSERR_NAMETOL,	0,		0,	
   NFSERR_NOTEMPTY, 0,		0,		NFSERR_DQUOT,	NFSERR_STALE,
   0
 };
 
 /*
  * Maps errno values to nfs error numbers.
  * Although it is not obvious whether or not NFS clients really care if
  * a returned error value is in the specified list for the procedure, the
  * safest thing to do is filter them appropriately. For Version 2, the
  * X/Open XNFS document is the only specification that defines error values
  * for each RPC (The RFC simply lists all possible error values for all RPCs),
  * so I have decided to not do this for Version 2.
  * The first entry is the default error return and the rest are the valid
  * errors for that RPC in increasing numeric order.
  *
  * Layout of every nfsv3err_* table below: entry [0] is the default,
  * entries [1..] are the permitted errors, and a 0 entry ends the list.
  * The tables are collected, one per procedure, in nfsrv_v3errmap[].
  */
 static const short nfsv3err_null[] = {
 	0,
 	0,
 };
 
 static const short nfsv3err_getattr[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_setattr[] = {
 	NFSERR_IO,
 	NFSERR_PERM,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_INVAL,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOT_SYNC,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_lookup[] = {
 	NFSERR_IO,
 	NFSERR_NOENT,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_NOTDIR,
 	NFSERR_NAMETOL,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_access[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_readlink[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_INVAL,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_read[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_NXIO,
 	NFSERR_ACCES,
 	NFSERR_INVAL,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_write[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_INVAL,
 	NFSERR_FBIG,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_create[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_NOTDIR,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_mkdir[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_NOTDIR,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_symlink[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_NOTDIR,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_mknod[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_NOTDIR,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	NFSERR_BADTYPE,
 	0,
 };
 
 static const short nfsv3err_remove[] = {
 	NFSERR_IO,
 	NFSERR_NOENT,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_NOTDIR,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_rmdir[] = {
 	NFSERR_IO,
 	NFSERR_NOENT,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_NOTDIR,
 	NFSERR_INVAL,
 	NFSERR_ROFS,
 	NFSERR_NAMETOL,
 	NFSERR_NOTEMPTY,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_rename[] = {
 	NFSERR_IO,
 	NFSERR_NOENT,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_XDEV,
 	NFSERR_NOTDIR,
 	NFSERR_ISDIR,
 	NFSERR_INVAL,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_MLINK,
 	NFSERR_NAMETOL,
 	NFSERR_NOTEMPTY,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_link[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_EXIST,
 	NFSERR_XDEV,
 	NFSERR_NOTDIR,
 	NFSERR_INVAL,
 	NFSERR_NOSPC,
 	NFSERR_ROFS,
 	NFSERR_MLINK,
 	NFSERR_NAMETOL,
 	NFSERR_DQUOT,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_NOTSUPP,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_readdir[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_NOTDIR,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_BAD_COOKIE,
 	NFSERR_TOOSMALL,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_readdirplus[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_ACCES,
 	NFSERR_NOTDIR,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_BAD_COOKIE,
 	NFSERR_NOTSUPP,
 	NFSERR_TOOSMALL,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_fsstat[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_fsinfo[] = {
 	NFSERR_STALE,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_pathconf[] = {
 	NFSERR_STALE,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 static const short nfsv3err_commit[] = {
 	NFSERR_IO,
 	NFSERR_IO,
 	NFSERR_STALE,
 	NFSERR_BADHANDLE,
 	NFSERR_SERVERFAULT,
 	0,
 };
 
 /*
  * One error table per procedure, in the order null, getattr, setattr,
  * ... commit.  Presumably indexed by the NFSv3 procedure number --
  * confirm against the lookup code.
  */
 static const short *nfsrv_v3errmap[] = {
 	nfsv3err_null,
 	nfsv3err_getattr,
 	nfsv3err_setattr,
 	nfsv3err_lookup,
 	nfsv3err_access,
 	nfsv3err_readlink,
 	nfsv3err_read,
 	nfsv3err_write,
 	nfsv3err_create,
 	nfsv3err_mkdir,
 	nfsv3err_symlink,
 	nfsv3err_mknod,
 	nfsv3err_remove,
 	nfsv3err_rmdir,
 	nfsv3err_rename,
 	nfsv3err_link,
 	nfsv3err_readdir,
 	nfsv3err_readdirplus,
 	nfsv3err_fsstat,
 	nfsv3err_fsinfo,
 	nfsv3err_pathconf,
 	nfsv3err_commit,
 };
 
 /*
  * Called once to initialize data structures...
  *
  * Module event handler for the NFS server.
  *
  * MOD_LOAD:   initialises nfsd_mtx, precomputes the XDR-encoded
  *             constants and nfsrv_ticks, sets up the request cache and
  *             the server data structures, starts the timer and registers
  *             the nfssvc system call.
  * MOD_UNLOAD: refused with EBUSY while nfsrv_numnfsd is non-zero;
  *             otherwise undoes the above.
  * other:      EOPNOTSUPP.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 nfsrv_modevent(module_t mod, int type, void *data)
 {
 	/* Set once syscall_register() succeeded; checked on unload. */
 	static int registered;
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		mtx_init(&nfsd_mtx, "nfsd_mtx", NULL, MTX_DEF);
 		nfsrv_rpc_vers = txdr_unsigned(RPC_VER2);
 		nfsrv_rpc_call = txdr_unsigned(RPC_CALL);
 		nfsrv_rpc_reply = txdr_unsigned(RPC_REPLY);
 		nfsrv_rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
 		nfsrv_rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
 		nfsrv_rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
 		nfsrv_rpc_autherr = txdr_unsigned(RPC_AUTHERR);
 		nfsrv_rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
 		nfsrv_nfs_prog = txdr_unsigned(NFS_PROG);
 		nfsrv_nfs_true = txdr_unsigned(TRUE);
 		nfsrv_nfs_false = txdr_unsigned(FALSE);
 		nfsrv_nfs_xdrneg1 = txdr_unsigned(-1);
 		/* NFS_TICKINTVL converted to ticks, rounded, at least 1. */
 		nfsrv_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
 		if (nfsrv_ticks < 1)
 			nfsrv_ticks = 1;
 
 		nfsrv_initcache();	/* Init the server request cache */
 		NFSD_LOCK();
 		nfsrv_init(0);		/* Init server data structures */
 		callout_init(&nfsrv_callout, CALLOUT_MPSAFE);
 		NFSD_UNLOCK();
 		nfsrv_timer(0);
 
 		/*
 		 * NOTE(review): when registration fails the state set up
 		 * above is left in place -- confirm whether the module
 		 * framework issues MOD_UNLOAD after a failed MOD_LOAD.
 		 */
 		error = syscall_register(&nfssvc_offset, &nfssvc_sysent,
 		    &nfssvc_prev_sysent);
 		if (error)
 			break;
 		registered = 1;
 		break;
 
 	case MOD_UNLOAD:
 		if (nfsrv_numnfsd != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		if (registered)
 			syscall_deregister(&nfssvc_offset, &nfssvc_prev_sysent);
 		callout_drain(&nfsrv_callout);
 		/*
 		 * Tear the cache down exactly once: it is set up by a
 		 * single nfsrv_initcache() call on load.
 		 */
 		nfsrv_destroycache();	/* Free the server request cache */
 		mtx_destroy(&nfsd_mtx);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return error;
 }
 /* Module description: name, event handler, no extra argument. */
 static moduledata_t nfsserver_mod = {
 	"nfsserver",
 	nfsrv_modevent,
 	NULL,
 };
 DECLARE_MODULE(nfsserver, nfsserver_mod, SI_SUB_VFS, SI_ORDER_ANY);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfsserver, 1);
 
 /*
  * Set up nameidata for a lookup() call and do it.
  *
  * If pubflag is set, this call is done for a lookup operation on the
  * public filehandle. In that case we allow crossing mountpoints and
  * absolute pathnames. However, the caller is expected to check that
  * the lookup result is within the public fs, and deny access if
  * it is not.
  *
  * nfs_namei() clears out garbage fields that namei() might leave garbage.
  * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no
  * error occurs but the parent was not requested.
  *
  * dirp may be set whether an error is returned or not, and must be
  * released by the caller.
  */
 int
 nfs_namei(struct nameidata *ndp, fhandle_t *fhp, int len,
     struct nfssvc_sock *slp, struct sockaddr *nam, struct mbuf **mdp,
     caddr_t *dposp, struct vnode **retdirp, int v3, struct vattr *retdirattrp,
     int *retdirattr_retp, struct thread *td, int pubflag)
 {
 	int i, rem;
 	struct mbuf *md;
 	char *fromcp, *tocp, *cp;
 	struct iovec aiov;
 	struct uio auio;
 	struct vnode *dp;
 	int error, rdonly, linklen;
 	struct componentname *cnp = &ndp->ni_cnd;
 	int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0;
 	int dvfslocked;
 	int vfslocked;
 
 	vfslocked = 0;
 	dvfslocked = 0;
 	*retdirp = NULL;
 	cnp->cn_flags |= NOMACCHECK;
 	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 
 	/*
 	 * Copy the name from the mbuf list to ndp->ni_pnbuf
 	 * and set the various ndp fields appropriately.
 	 */
 	fromcp = *dposp;
 	tocp = cnp->cn_pnbuf;
 	md = *mdp;
 	rem = mtod(md, caddr_t) + md->m_len - fromcp;
 	for (i = 0; i < len; i++) {
 		while (rem == 0) {
 			md = md->m_next;
 			if (md == NULL) {
 				error = EBADRPC;
 				goto out;
 			}
 			fromcp = mtod(md, caddr_t);
 			rem = md->m_len;
 		}
 		if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) {
 			error = EACCES;
 			goto out;
 		}
 		*tocp++ = *fromcp++;
 		rem--;
 	}
 	*tocp = '\0';
 	*mdp = md;
 	*dposp = fromcp;
 	len = nfsm_rndup(len)-len;
 	if (len > 0) {
 		if (rem >= len)
 			*dposp += len;
 		else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0)
 			goto out;
 	}
 
 	/*
 	 * Extract and set starting directory.
 	 */
 	error = nfsrv_fhtovp(fhp, FALSE, &dp, &dvfslocked,
 	    ndp->ni_cnd.cn_cred, slp, nam, &rdonly, pubflag);
 	if (error)
 		goto out;
 	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 	if (dp->v_type != VDIR) {
 		vrele(dp);
 		error = ENOTDIR;
 		goto out;
 	}
 
 	if (rdonly)
 		cnp->cn_flags |= RDONLY;
 
 	/*
 	 * Set return directory.  Reference to dp is implicitly transfered
 	 * to the returned pointer
 	 */
 	*retdirp = dp;
 	if (v3) {
-		vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
 		*retdirattr_retp = VOP_GETATTR(dp, retdirattrp,
 			ndp->ni_cnd.cn_cred, td);
 		VOP_UNLOCK(dp, 0, td);
 	}
 
 	if (pubflag) {
 		/*
 		 * Oh joy. For WebNFS, handle those pesky '%' escapes,
 		 * and the 'native path' indicator.
 		 */
 		cp = uma_zalloc(namei_zone, M_WAITOK);
 		fromcp = cnp->cn_pnbuf;
 		tocp = cp;
 		if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) {
 			switch ((unsigned char)*fromcp) {
 			case WEBNFS_NATIVE_CHAR:
 				/*
 				 * 'Native' path for us is the same
 				 * as a path according to the NFS spec,
 				 * just skip the escape char.
 				 */
 				fromcp++;
 				break;
 			/*
 			 * More may be added in the future, range 0x80-0xff
 			 */
 			default:
 				error = EIO;
 				uma_zfree(namei_zone, cp);
 				goto out;
 			}
 		}
 		/*
 		 * Translate the '%' escapes, URL-style.
 		 */
 		while (*fromcp != '\0') {
 			if (*fromcp == WEBNFS_ESC_CHAR) {
 				if (fromcp[1] != '\0' && fromcp[2] != '\0') {
 					fromcp++;
 					*tocp++ = HEXSTRTOI(fromcp);
 					fromcp += 2;
 					continue;
 				} else {
 					error = ENOENT;
 					uma_zfree(namei_zone, cp);
 					goto out;
 				}
 			} else
 				*tocp++ = *fromcp++;
 		}
 		*tocp = '\0';
 		uma_zfree(namei_zone, cnp->cn_pnbuf);
 		cnp->cn_pnbuf = cp;
 	}
 
 	ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1;
 	ndp->ni_segflg = UIO_SYSSPACE;
 
 	if (pubflag) {
 		ndp->ni_rootdir = rootvnode;
 		ndp->ni_loopcnt = 0;
 		if (cnp->cn_pnbuf[0] == '/') {
 			int tvfslocked;
 
 			tvfslocked = VFS_LOCK_GIANT(rootvnode->v_mount);
 			VFS_UNLOCK_GIANT(vfslocked);
 			dp = rootvnode;
 			vfslocked = tvfslocked;
 		}
 	} else {
 		cnp->cn_flags |= NOCROSSMOUNT;
 	}
 
 	/*
 	 * Initialize for scan, set ni_startdir and bump ref on dp again
 	 * because lookup() will dereference ni_startdir.
 	 */
 
 	cnp->cn_thread = td;
 	VREF(dp);
 	ndp->ni_startdir = dp;
 
 	if (!lockleaf)
 		cnp->cn_flags |= LOCKLEAF;
 	for (;;) {
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		/*
 		 * Call lookup() to do the real work.  If an error occurs,
 		 * ndp->ni_vp and ni_dvp are left uninitialized or NULL and
 		 * we do not have to dereference anything before returning.
 		 * In either case ni_startdir will be dereferenced and NULLed
 		 * out.
 		 */
 		if (vfslocked)
 			ndp->ni_cnd.cn_flags |= GIANTHELD;
 		error = lookup(ndp);
 		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
 		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 		if (error)
 			break;
 
 		/*
 		 * Check for encountering a symbolic link.  Trivial
 		 * termination occurs if no symlink encountered.
 		 * Note: zfree is safe because error is 0, so we will
 		 * not zfree it again when we break.
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			if (cnp->cn_flags & (SAVENAME | SAVESTART))
 				cnp->cn_flags |= HASBUF;
 			else
 				uma_zfree(namei_zone, cnp->cn_pnbuf);
 			if (ndp->ni_vp && !lockleaf)
 				VOP_UNLOCK(ndp->ni_vp, 0, td);
 			break;
 		}
 
 		/*
 		 * Validate symlink
 		 */
 		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
 			VOP_UNLOCK(ndp->ni_dvp, 0, td);
 		if (!pubflag) {
 			error = EINVAL;
 			goto badlink2;
 		}
 
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			goto badlink2;
 		}
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = NULL;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error) {
 		badlink1:
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 		badlink2:
 			vput(ndp->ni_vp);
 			vrele(ndp->ni_dvp);
 			break;
 		}
 		linklen = MAXPATHLEN - auio.uio_resid;
 		if (linklen == 0) {
 			error = ENOENT;
 			goto badlink1;
 		}
 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
 			error = ENAMETOOLONG;
 			goto badlink1;
 		}
 
 		/*
 		 * Adjust or replace path
 		 */
 		if (ndp->ni_pathlen > 1) {
 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 
 		/*
 		 * Cleanup refs for next loop and check if root directory
 		 * should replace current directory.  Normally ni_dvp
 		 * becomes the new base directory and is cleaned up when
 		 * we loop.  Explicitly null pointers after invalidation
 		 * to clarify operation.
 		 */
 		vput(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 
 		if (cnp->cn_pnbuf[0] == '/') {
 			vrele(ndp->ni_dvp);
 			ndp->ni_dvp = ndp->ni_rootdir;
 			VREF(ndp->ni_dvp);
 		}
 		ndp->ni_startdir = ndp->ni_dvp;
 		ndp->ni_dvp = NULL;
 	}
 	if (!lockleaf)
 		cnp->cn_flags &= ~LOCKLEAF;
 	if (cnp->cn_flags & GIANTHELD) {
 		mtx_unlock(&Giant);
 		cnp->cn_flags &= ~GIANTHELD;
 	}
 
 	/*
 	 * nfs_namei() guarentees that fields will not contain garbage
 	 * whether an error occurs or not.  This allows the caller to track
 	 * cleanup state trivially.
 	 */
 out:
 	if (error) {
 		uma_zfree(namei_zone, cnp->cn_pnbuf);
 		ndp->ni_vp = NULL;
 		ndp->ni_dvp = NULL;
 		ndp->ni_startdir = NULL;
 		cnp->cn_flags &= ~HASBUF;
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
 		ndp->ni_dvp = NULL;
 	}
 	/*
 	 * This differs from normal namei() in that even on failure we may
 	 * return with Giant held due to the dirp return.  Make sure we only
 	 * have not recursed however.  The calling code only expects to drop
 	 * one acquire.
 	 */
 	if (vfslocked || dvfslocked)
 		ndp->ni_cnd.cn_flags |= GIANTHELD;
 	if (vfslocked && dvfslocked)
 		VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * A variant of m_adj() that only trims from the tail of the chain: remove
  * "len" bytes from the end of "mp" and, when "nul" is positive, overwrite
  * the last "nul" bytes that remain with zeros.
  */
 void
 nfsm_adj(struct mbuf *mp, int len, int nul)
 {
 	struct mbuf *cur, *last;
 	int remain, n;
 	char *fill;
 
 	/*
 	 * Walk to the final mbuf, summing the data length of the whole
 	 * chain on the way.
 	 */
 	remain = mp->m_len;
 	for (last = mp; last->m_next != NULL; ) {
 		last = last->m_next;
 		remain += last->m_len;
 	}
 
 	/* Easy case: the trim fits entirely inside the final mbuf. */
 	if (last->m_len > len) {
 		last->m_len -= len;
 		if (nul > 0) {
 			fill = mtod(last, caddr_t) + last->m_len - nul;
 			for (n = nul; n > 0; n--)
 				*fill++ = '\0';
 		}
 		return;
 	}
 
 	/* From here on "remain" is the length the chain must end up with. */
 	remain -= len;
 	if (remain < 0)
 		remain = 0;
 
 	/*
 	 * Locate the mbuf holding the new last byte, shorten it, and free
 	 * whatever is chained behind it.
 	 */
 	for (cur = mp; cur != NULL; cur = cur->m_next) {
 		if (cur->m_len < remain) {
 			remain -= cur->m_len;
 			continue;
 		}
 		cur->m_len = remain;
 		if (nul > 0) {
 			fill = mtod(cur, caddr_t) + cur->m_len - nul;
 			for (n = nul; n > 0; n--)
 				*fill++ = '\0';
 		}
 		if (cur->m_next != NULL) {
 			m_freem(cur->m_next);
 			cur->m_next = NULL;
 		}
 		break;
 	}
 }
 
 /*
  * Make these functions instead of macros, so that the kernel text size
  * doesn't get too big...
  */
 /*
  * Append weak-cache-consistency data to the reply being built.
  *
  * The pre-operation part is written here: a single "false" word when
  * before_ret is non-zero, otherwise a "true" word followed by the size,
  * mtime and ctime taken from before_vap (seven 32-bit words in total).
  * The post-operation part is then appended by nfsm_srvpostopattr().
  *
  * *mbp and *bposp are the current build mbuf and position; they are
  * written back before nfsm_srvpostopattr() is called so that it continues
  * from where this function stopped.
  */
 void
 nfsm_srvwcc(struct nfsrv_descript *nfsd, int before_ret,
     struct vattr *before_vap, int after_ret, struct vattr *after_vap,
     struct mbuf **mbp, char **bposp)
 {
 	/*
 	 * NOTE(review): nfsm_build() is not passed mb/bpos explicitly, so it
 	 * presumably is a macro that uses these locals by name -- confirm
 	 * before renaming them.
 	 */
 	struct mbuf *mb = *mbp;
 	char *bpos = *bposp;
 	u_int32_t *tl;
 
 	if (before_ret) {
 		/* No pre-op attributes available: emit only the flag. */
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl = nfsrv_nfs_false;
 	} else {
 		/* Flag (1 word) + size (2) + mtime (2) + ctime (2). */
 		tl = nfsm_build(u_int32_t *, 7 * NFSX_UNSIGNED);
 		*tl++ = nfsrv_nfs_true;
 		txdr_hyper(before_vap->va_size, tl);
 		tl += 2;
 		txdr_nfsv3time(&(before_vap->va_mtime), tl);
 		tl += 2;
 		txdr_nfsv3time(&(before_vap->va_ctime), tl);
 	}
 	/* Publish the build position before handing off. */
 	*bposp = bpos;
 	*mbp = mb;
 	nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp);
 }
 
 /*
  * Append post-operation attributes to the reply being built.
  *
  * When after_ret is non-zero only a "false" word is written.  Otherwise a
  * "true" word is written followed by NFSX_V3FATTR bytes that
  * nfsm_srvfattr() fills in from after_vap.
  *
  * *mbp and *bposp are the current build mbuf and position and are updated
  * on return.
  */
 void
 nfsm_srvpostopattr(struct nfsrv_descript *nfsd, int after_ret,
     struct vattr *after_vap, struct mbuf **mbp, char **bposp)
 {
 	/*
 	 * NOTE(review): nfsm_build() is not passed mb/bpos explicitly, so it
 	 * presumably is a macro that uses these locals by name -- confirm
 	 * before renaming them.
 	 */
 	struct mbuf *mb = *mbp;
 	char *bpos = *bposp;
 	u_int32_t *tl;
 	struct nfs_fattr *fp;
 
 	if (after_ret) {
 		/* Attributes could not be obtained: emit only the flag. */
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 		*tl = nfsrv_nfs_false;
 	} else {
 		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR);
 		*tl++ = nfsrv_nfs_true;
 		/* The attribute structure starts right after the flag word. */
 		fp = (struct nfs_fattr *)tl;
 		nfsm_srvfattr(nfsd, after_vap, fp);
 	}
 	*mbp = mb;
 	*bposp = bpos;
 }
 
 void
 nfsm_srvfattr(struct nfsrv_descript *nfsd, struct vattr *vap,
     struct nfs_fattr *fp)
 {
 
 	fp->fa_nlink = txdr_unsigned(vap->va_nlink);
 	fp->fa_uid = txdr_unsigned(vap->va_uid);
 	fp->fa_gid = txdr_unsigned(vap->va_gid);
 	if (nfsd->nd_flag & ND_NFSV3) {
 		fp->fa_type = vtonfsv3_type(vap->va_type);
 		fp->fa_mode = vtonfsv3_mode(vap->va_mode);
 		txdr_hyper(vap->va_size, &fp->fa3_size);
 		txdr_hyper(vap->va_bytes, &fp->fa3_used);
 		fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev));
 		fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev));
 		fp->fa3_fsid.nfsuquad[0] = 0;
 		fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid);
 		fp->fa3_fileid.nfsuquad[0] = 0;
 		fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid);
 		txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime);
 		txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime);
 		txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime);
 	} else {
 		fp->fa_type = vtonfsv2_type(vap->va_type);
 		fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
 		fp->fa2_size = txdr_unsigned(vap->va_size);
 		fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize);
 		if (vap->va_type == VFIFO)
 			fp->fa2_rdev = 0xffffffff;
 		else
 			fp->fa2_rdev = txdr_unsigned(vap->va_rdev);
 		fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE);
 		fp->fa2_fsid = txdr_unsigned(vap->va_fsid);
 		fp->fa2_fileid = txdr_unsigned(vap->va_fileid);
 		txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime);
 		txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime);
 		txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime);
 	}
 }
 
 /*
  * nfsrv_fhtovp() - convert a file handle to a vnode pointer (optionally
  * left locked).
  *	- a public file handle is replaced by nfs_pub.np_handle, or rejected
  *	  with ESTALE when pubflag is clear or no public handle is valid
  *	- look up the mount for the handle's fsid (ESTALE if not found)
  *	- check export rights with VFS_CHECKEXP() and get the vnode with
  *	  VFS_FHTOVP()
  *	- if cred->cr_uid == 0 or MNT_EXPORTANON is set, overwrite the uid
  *	  and group list in cred with those of credanon
  *	- report MNT_EXRDONLY through *rdonlyp
  *	- if lockflag is zero, unlock the vnode with VOP_UNLOCK()
  *
  * On success 0 is returned, *vpp holds the vnode and *vfslockedp holds the
  * value returned by VFS_LOCK_GIANT(), which the caller must later pass to
  * VFS_UNLOCK_GIANT().  On failure an error is returned, *vfslockedp is 0
  * and the Giant state taken here has already been released.
  *
  * "slp" is accepted for interface compatibility but is not used here.
  */
 int
 nfsrv_fhtovp(fhandle_t *fhp, int lockflag, struct vnode **vpp, int *vfslockedp,
     struct ucred *cred, struct nfssvc_sock *slp, struct sockaddr *nam,
     int *rdonlyp, int pubflag)
 {
 	struct thread *td = curthread; /* XXX */
 	struct mount *mp;
 	int i;
 	struct ucred *credanon;
 	int error, exflags;
 #ifdef MNT_EXNORESPORT		/* XXX needs mountd and /etc/exports help yet */
 	struct sockaddr_in *saddr;
 #endif
 	int vfslocked;
 
 	*vfslockedp = 0;
 	*vpp = NULL;
 
 	if (nfs_ispublicfh(fhp)) {
 		if (!pubflag || !nfs_pub.np_valid)
 			return (ESTALE);
 		fhp = &nfs_pub.np_handle;
 	}
 
 	mp = vfs_getvfs(&fhp->fh_fsid);
 	if (!mp)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_CHECKEXP(mp, nam, &exflags, &credanon);
 	if (error)
 		goto out;
 	error = VFS_FHTOVP(mp, &fhp->fh_fid, vpp);
 	if (error)
 		goto out;
 #ifdef MNT_EXNORESPORT
 	if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) {
 		saddr = (struct sockaddr_in *)nam;
 		if ((saddr->sin_family == AF_INET ||
 		     saddr->sin_family == AF_INET6) &&
 	/* same code for INET and INET6: sin*_port at same offset */
 		    ntohs(saddr->sin_port) >= IPPORT_RESERVED) {
 			vput(*vpp);
 			*vpp = NULL;
 			error = NFSERR_AUTHERR | AUTH_TOOWEAK;
 			/*
 			 * The vnode is gone; leave now so that neither the
 			 * credentials are rewritten nor VOP_UNLOCK() is
 			 * called on a NULL vnode below.
 			 */
 			goto out;
 		}
 	}
 #endif
 	/*
 	 * Check/setup credentials.
 	 */
 	if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
 		cred->cr_uid = credanon->cr_uid;
 		for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++)
 			cred->cr_groups[i] = credanon->cr_groups[i];
 		cred->cr_ngroups = i;
 	}
 	if (exflags & MNT_EXRDONLY)
 		*rdonlyp = 1;
 	else
 		*rdonlyp = 0;
 
 	if (!lockflag)
 		VOP_UNLOCK(*vpp, 0, td);
 out:
 	/* Drop the mount obtained from vfs_getvfs() on every path. */
 	vfs_rel(mp);
 	if (error) {
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else
 		*vfslockedp = vfslocked;
 	return (error);
 }
 
 
 /*
  * WebNFS: report whether a file handle is the public file handle.  On the
  * wire that is a zero-length handle for v3 and an all-zero handle for v2;
  * nfsm_srvmtofh has already turned both into all zeroes, so simply test
  * the first NFSX_V3FH bytes for that.
  */
 int
 nfs_ispublicfh(fhandle_t *fhp)
 {
 	const char *bytes = (const char *)fhp;
 	int off;
 
 	NFSD_LOCK_DONTCARE();
 
 	for (off = 0; off < NFSX_V3FH; off++) {
 		if (bytes[off] != 0)
 			return (FALSE);
 	}
 	return (TRUE);
 }
 
 /*
  * Compare a stored host address with a socket address for the given
  * family.  Returns 1 when both name the same host and 0 otherwise,
  * including whenever there is any doubt.
  * AF_INET is special-cased so that address mbufs need not be kept around
  * just to hold a 4 byte "struct in_addr".
  */
 int
 netaddr_match(int family, union nethostaddr *haddr, struct sockaddr *nam)
 {
 
 	NFSD_LOCK_DONTCARE();
 
 	if (family == AF_INET) {
 		struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 
 		return (sin->sin_family == AF_INET &&
 		    sin->sin_addr.s_addr == haddr->had_inetaddr);
 	}
 #ifdef INET6
 	if (family == AF_INET6) {
 		struct sockaddr_in6 *peer = (struct sockaddr_in6 *)nam;
 		struct sockaddr_in6 *host =
 		    (struct sockaddr_in6 *)haddr->had_nam;
 
 		/* XXX - should test sin6_scope_id ? */
 		return (peer->sin6_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(&peer->sin6_addr, &host->sin6_addr));
 	}
 #endif
 	/* Unknown or unsupported family: never a match. */
 	return (0);
 }
 
 /*
  * Map errnos to NFS error numbers. For Version 3 also filter out error
  * numbers not specified for the associated procedure.
  *
  * NFSv3, procedure <= NFSPROC_COMMIT: nfsrv_v3errmap[procnum] is a
  * zero-terminated list whose first entry is the default.  "err" is
  * returned unchanged when it appears after the first entry, otherwise
  * the default is returned.  The scan stops as soon as an entry exceeds
  * "err", so the list is presumably kept in ascending order.
  * NFSv3, other procedures: the low 16 bits of "err" are returned.
  * NFSv2: "err" is looked up in nfsrv_v2errmap; values outside 1..ELAST,
  * or with a zero table entry, map to NFSERR_IO.
  */
 int
 nfsrv_errmap(struct nfsrv_descript *nd, int err)
 {
 	const short *defaulterrp, *errp;
 	int e;
 
 
 	if (nd->nd_flag & ND_NFSV3) {
 	    if (nd->nd_procnum <= NFSPROC_COMMIT) {
 		errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum];
 		while (*++errp) {
 			if (*errp == err)
 				return (err);
 			else if (*errp > err)
 				break;
 		}
 		return ((int)*defaulterrp);
 	    } else
 		return (err & 0xffff);
 	}
 	e = 0;
 	/*
 	 * The table is indexed by err - 1, so reject err <= 0 as well as
 	 * err > ELAST to avoid reading outside of it.
 	 */
 	if (err > 0 && err <= ELAST)
 		e = nfsrv_v2errmap[err - 1];
 	if (e != 0)
 		return (e);
 	return (NFSERR_IO);
 }
 
 /*
  * Sort the first "num" entries of the group list into increasing
  * numerical order, in place.
  * (Insertion sort, originally by Chris Torek, who was grossed out by the
  *  bubble sort that used to be here.)
  */
 void
 nfsrvw_sort(gid_t *list, int num)
 {
 	int pos, slot;
 	gid_t key;
 
 	for (pos = 1; pos < num; pos++) {
 		key = list[pos];
 		/* Shift larger entries up until the slot for key is found. */
 		slot = pos;
 		while (slot > 0 && key < list[slot - 1]) {
 			list[slot] = list[slot - 1];
 			slot--;
 		}
 		list[slot] = key;
 	}
 }
 
 /*
  * Helper functions for macros.
  */
 
 /*
  * Append file handle "f" to the reply.  For v3 the handle is preceded by
  * a length word holding NFSX_V3FH; for v2 exactly NFSX_V2FH bytes are
  * written with no length.
  */
 void
 nfsm_srvfhtom_xx(fhandle_t *f, int v3, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *wp;
 
 	if (!v3) {
 		wp = nfsm_build_xx(NFSX_V2FH, mb, bpos);
 		bcopy(f, wp, NFSX_V2FH);
 		return;
 	}
 	wp = nfsm_build_xx(NFSX_UNSIGNED + NFSX_V3FH, mb, bpos);
 	*wp++ = txdr_unsigned(NFSX_V3FH);
 	bcopy(f, wp, NFSX_V3FH);
 }
 
 /*
  * Append an optional v3 file handle that is present: a "true" word, the
  * length word NFSX_V3FH, then the NFSX_V3FH handle bytes from "f".
  */
 void
 nfsm_srvpostop_fh_xx(fhandle_t *f, struct mbuf **mb, caddr_t *bpos)
 {
 	u_int32_t *wp;
 
 	wp = nfsm_build_xx(2 * NFSX_UNSIGNED + NFSX_V3FH, mb, bpos);
 	wp[0] = nfsrv_nfs_true;
 	wp[1] = txdr_unsigned(NFSX_V3FH);
 	bcopy(f, &wp[2], NFSX_V3FH);
 }
 
 /*
  * Dissect a string length word into *s.  Returns EBADRPC when the word
  * cannot be dissected or the length is not in the range 1..m, else 0.
  */
 int
 nfsm_srvstrsiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *wp;
 
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	*s = fxdr_unsigned(int32_t, *wp);
 	if (*s <= 0 || *s > m)
 		return (EBADRPC);
 	return (0);
 }
 
 /*
  * Dissect a name length word into *s.  Returns EBADRPC when the word
  * cannot be dissected, NFSERR_NAMETOL when the length exceeds m, EBADRPC
  * when it is zero or negative, and 0 otherwise.
  */
 int
 nfsm_srvnamesiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *wp;
 
 	NFSD_LOCK_DONTCARE();
 
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	*s = fxdr_unsigned(int32_t, *wp);
 	/* The too-long check takes precedence over the lower bound. */
 	if (*s > m)
 		return (NFSERR_NAMETOL);
 	return ((*s <= 0) ? EBADRPC : 0);
 }
 
 /*
  * Like nfsm_srvnamesiz_xx(), except that a length of zero is accepted:
  * only a negative length yields EBADRPC.
  */
 int
 nfsm_srvnamesiz0_xx(int *s, int m, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *wp;
 
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	*s = fxdr_unsigned(int32_t, *wp);
 	/* The too-long check takes precedence over the lower bound. */
 	if (*s > m)
 		return (NFSERR_NAMETOL);
 	return ((*s < 0) ? EBADRPC : 0);
 }
 
 /*
  * Point *tl at the current build position *bp.  When *bp has reached the
  * end of the current buffer (*be), a new mbuf with a cluster is first
  * chained after *mp and *mp, *bp and *be are moved to it.  If *mp is
  * still the original mbuf "mb", its length is first grown by the bytes
  * written since "bpos".
  */
 void
 nfsm_clget_xx(u_int32_t **tl, struct mbuf *mb, struct mbuf **mp,
     char **bp, char **be, caddr_t bpos)
 {
 	struct mbuf *fresh;
 
 	NFSD_UNLOCK_ASSERT();
 
 	/* Room left in the current buffer: nothing to allocate. */
 	if (*bp < *be) {
 		*tl = (u_int32_t *)*bp;
 		return;
 	}
 
 	if (*mp == mb)
 		(*mp)->m_len += *bp - bpos;
 	MGET(fresh, M_TRYWAIT, MT_DATA);
 	MCLGET(fresh, M_TRYWAIT);
 	fresh->m_len = NFSMSIZ(fresh);
 	(*mp)->m_next = fresh;
 	*mp = fresh;
 	*bp = mtod(fresh, caddr_t);
 	*be = *bp + fresh->m_len;
 	*tl = (u_int32_t *)*bp;
 }
 
 /*
  * Dissect a file handle from the request into "f".  For v3 a length word
  * comes first and must be 0 or NFSX_V3FH; for v2 the handle is always
  * NFSX_V2FH bytes.  A zero-length handle is stored as NFSX_V3FH zero
  * bytes.  Returns EBADRPC on a short request or bad length, else 0.
  */
 int
 nfsm_srvmtofh_xx(fhandle_t *f, struct nfsrv_descript *nfsd, struct mbuf **md,
     caddr_t *dpos)
 {
 	u_int32_t *wp;
 	int fhlen = NFSX_V2FH;
 
 	if (nfsd->nd_flag & ND_NFSV3) {
 		wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		fhlen = fxdr_unsigned(int, *wp);
 		if (fhlen != 0 && fhlen != NFSX_V3FH)
 			return (EBADRPC);
 	}
 
 	if (fhlen == 0) {
 		bzero((caddr_t)(f), NFSX_V3FH);
 		return (0);
 	}
 
 	wp = nfsm_dissect_xx_nonblock(fhlen, md, dpos);
 	if (wp == NULL)
 		return (EBADRPC);
 	bcopy((caddr_t)wp, (caddr_t)(f), fhlen);
 	return (0);
 }
 
 /*
  * Dissect a set-attributes structure from the request into "a".  The wire
  * data is a sequence of optional mode, uid, gid and size values, each
  * preceded by a presence flag, followed by one selector each for atime
  * and mtime: take the time from the client, take it from the server
  * clock, or leave it alone.  Returns EBADRPC as soon as the request runs
  * short, else 0.
  */
 int
 nfsm_srvsattr_xx(struct vattr *a, struct mbuf **md, caddr_t *dpos)
 {
 	u_int32_t *wp;
 	int how;
 	int atime_from_client = 0;
 
 	/* Optional mode. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	if (*wp == nfsrv_nfs_true) {
 		wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		a->va_mode = nfstov_mode(*wp);
 	}
 
 	/* Optional uid. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	if (*wp == nfsrv_nfs_true) {
 		wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		a->va_uid = fxdr_unsigned(uid_t, *wp);
 	}
 
 	/* Optional gid. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	if (*wp == nfsrv_nfs_true) {
 		wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		a->va_gid = fxdr_unsigned(gid_t, *wp);
 	}
 
 	/* Optional 64-bit size. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	if (*wp == nfsrv_nfs_true) {
 		wp = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		a->va_size = fxdr_hyper(wp);
 	}
 
 	/* Access time selector. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	how = fxdr_unsigned(int, *wp);
 	switch (how) {
 	case NFSV3SATTRTIME_TOCLIENT:
 		wp = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		fxdr_nfsv3time(wp, &a->va_atime);
 		atime_from_client = 1;
 		break;
 	case NFSV3SATTRTIME_TOSERVER:
 		getnanotime(&a->va_atime);
 		a->va_vaflags |= VA_UTIMES_NULL;
 		break;
 	}
 
 	/* Modification time selector. */
 	if ((wp = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos)) == NULL)
 		return (EBADRPC);
 	how = fxdr_unsigned(int, *wp);
 	switch (how) {
 	case NFSV3SATTRTIME_TOCLIENT:
 		wp = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos);
 		if (wp == NULL)
 			return (EBADRPC);
 		fxdr_nfsv3time(wp, &a->va_mtime);
 		/* An explicit client mtime cancels VA_UTIMES_NULL. */
 		a->va_vaflags &= ~VA_UTIMES_NULL;
 		break;
 	case NFSV3SATTRTIME_TOSERVER:
 		getnanotime(&a->va_mtime);
 		/* Only flag it when atime was not supplied by the client. */
 		if (!atime_from_client)
 			a->va_vaflags |= VA_UTIMES_NULL;
 		break;
 	}
 	return (0);
 }
Index: head/sys/security/audit/audit_arg.c
===================================================================
--- head/sys/security/audit/audit_arg.c	(revision 175201)
+++ head/sys/security/audit/audit_arg.c	(revision 175202)
@@ -1,856 +1,856 @@
 /*
  * Copyright (c) 1999-2005 Apple Computer, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  *     its contributors may be used to endorse or promote products derived
  *     from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/filedesc.h>
 #include <sys/ipc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/un.h>
 #include <sys/vnode.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
 #include <security/audit/audit.h>
 #include <security/audit/audit_private.h>
 
 /*
  * Calls to manipulate elements of the audit record structure from system
  * call code.  Macro wrappers will prevent this functions from being entered
  * if auditing is disabled, avoiding the function call cost.  We check the
  * thread audit record pointer anyway, as the audit condition could change,
  * and pre-selection may not have allocated an audit record for this event.
  *
  * XXXAUDIT: Should we assert, in each case, that this field of the record
  * hasn't already been filled in?
  */
 /*
  * Store an address argument in the record returned by currecord(), if
  * there is one, and mark ARG_ADDR valid.
  */
 void
 audit_arg_addr(void *addr)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_addr = addr;
 		ARG_SET_VALID(rec, ARG_ADDR);
 	}
 }
 
 /*
  * Store an exit status and return value in the record returned by
  * currecord(), if there is one, and mark ARG_EXIT valid.
  */
 void
 audit_arg_exit(int status, int retval)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_exitstatus = status;
 		rec->k_ar.ar_arg_exitretval = retval;
 		ARG_SET_VALID(rec, ARG_EXIT);
 	}
 }
 
 /*
  * Store a length argument in the record returned by currecord(), if there
  * is one, and mark ARG_LEN valid.
  */
 void
 audit_arg_len(int len)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_len = len;
 		ARG_SET_VALID(rec, ARG_LEN);
 	}
 }
 
 /*
  * Store a file descriptor argument in the record returned by currecord(),
  * if there is one, and mark ARG_FD valid.
  */
 void
 audit_arg_fd(int fd)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_fd = fd;
 		ARG_SET_VALID(rec, ARG_FD);
 	}
 }
 
 /*
  * Store file flags in the record returned by currecord(), if there is
  * one, and mark ARG_FFLAGS valid.
  */
 void
 audit_arg_fflags(int fflags)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_fflags = fflags;
 		ARG_SET_VALID(rec, ARG_FFLAGS);
 	}
 }
 
 /*
  * Store a group id in the record returned by currecord(), if there is
  * one, and mark ARG_GID valid.
  */
 void
 audit_arg_gid(gid_t gid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_gid = gid;
 		ARG_SET_VALID(rec, ARG_GID);
 	}
 }
 
 /*
  * Store a user id in the record returned by currecord(), if there is one,
  * and mark ARG_UID valid.
  */
 void
 audit_arg_uid(uid_t uid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_uid = uid;
 		ARG_SET_VALID(rec, ARG_UID);
 	}
 }
 
 /*
  * Store an effective group id in the record returned by currecord(), if
  * there is one, and mark ARG_EGID valid.
  */
 void
 audit_arg_egid(gid_t egid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_egid = egid;
 		ARG_SET_VALID(rec, ARG_EGID);
 	}
 }
 
 /*
  * Store an effective user id in the record returned by currecord(), if
  * there is one, and mark ARG_EUID valid.
  */
 void
 audit_arg_euid(uid_t euid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_euid = euid;
 		ARG_SET_VALID(rec, ARG_EUID);
 	}
 }
 
 /*
  * Store a real group id in the record returned by currecord(), if there
  * is one, and mark ARG_RGID valid.
  */
 void
 audit_arg_rgid(gid_t rgid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_rgid = rgid;
 		ARG_SET_VALID(rec, ARG_RGID);
 	}
 }
 
 /*
  * Store a real user id in the record returned by currecord(), if there is
  * one, and mark ARG_RUID valid.
  */
 void
 audit_arg_ruid(uid_t ruid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_ruid = ruid;
 		ARG_SET_VALID(rec, ARG_RUID);
 	}
 }
 
 /*
  * Store a saved group id in the record returned by currecord(), if there
  * is one, and mark ARG_SGID valid.
  */
 void
 audit_arg_sgid(gid_t sgid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_sgid = sgid;
 		ARG_SET_VALID(rec, ARG_SGID);
 	}
 }
 
 /*
  * Store a saved user id in the record returned by currecord(), if there
  * is one, and mark ARG_SUID valid.
  */
 void
 audit_arg_suid(uid_t suid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_suid = suid;
 		ARG_SET_VALID(rec, ARG_SUID);
 	}
 }
 
 /*
  * Copy "gidset_size" group ids from "gidset" into the record returned by
  * currecord(), if there is one, and mark ARG_GROUPSET valid.
  *
  * NOTE(review): gidset_size is not checked against the capacity of the
  * destination array here -- presumably callers bound it; confirm.
  */
 void
 audit_arg_groupset(gid_t *gidset, u_int gidset_size)
 {
 	struct kaudit_record *rec = currecord();
 	u_int idx;
 
 	if (rec == NULL)
 		return;
 
 	idx = 0;
 	while (idx < gidset_size) {
 		rec->k_ar.ar_arg_groups.gidset[idx] = gidset[idx];
 		idx++;
 	}
 	rec->k_ar.ar_arg_groups.gidset_size = gidset_size;
 	ARG_SET_VALID(rec, ARG_GROUPSET);
 }
 
 /*
  * Copy a login name, truncated to fit MAXLOGNAME bytes, into the record
  * returned by currecord(), if there is one, and mark ARG_LOGIN valid.
  */
 void
 audit_arg_login(char *login)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		strlcpy(rec->k_ar.ar_arg_login, login, MAXLOGNAME);
 		ARG_SET_VALID(rec, ARG_LOGIN);
 	}
 }
 
 /*
  * Copy "namelen" integers of a control name into the record returned by
  * currecord(), if there is one; also stores namelen as the length
  * argument and marks ARG_CTLNAME and ARG_LEN valid.
  *
  * NOTE(review): namelen is not checked against the size of the
  * destination here -- presumably callers bound it; confirm.
  */
 void
 audit_arg_ctlname(int *name, int namelen)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		bcopy(name, &rec->k_ar.ar_arg_ctlname, namelen * sizeof(int));
 		rec->k_ar.ar_arg_len = namelen;
 		ARG_SET_VALID(rec, ARG_CTLNAME | ARG_LEN);
 	}
 }
 
 /*
  * Store a mask argument in the record returned by currecord(), if there
  * is one, and mark ARG_MASK valid.
  */
 void
 audit_arg_mask(int mask)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_mask = mask;
 		ARG_SET_VALID(rec, ARG_MASK);
 	}
 }
 
 /*
  * Store a file mode in the record returned by currecord(), if there is
  * one, and mark ARG_MODE valid.
  */
 void
 audit_arg_mode(mode_t mode)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_mode = mode;
 		ARG_SET_VALID(rec, ARG_MODE);
 	}
 }
 
 /*
  * Store a device number in the record returned by currecord(), if there
  * is one, and mark ARG_DEV valid.
  */
 void
 audit_arg_dev(int dev)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_dev = dev;
 		ARG_SET_VALID(rec, ARG_DEV);
 	}
 }
 
 /*
  * Store a generic value argument in the record returned by currecord(),
  * if there is one, and mark ARG_VALUE valid.
  */
 void
 audit_arg_value(long value)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_value = value;
 		ARG_SET_VALID(rec, ARG_VALUE);
 	}
 }
 
 /*
  * Store an owner (uid and gid pair) in the record returned by
  * currecord(), if there is one, and mark ARG_UID and ARG_GID valid.
  */
 void
 audit_arg_owner(uid_t uid, gid_t gid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_uid = uid;
 		rec->k_ar.ar_arg_gid = gid;
 		ARG_SET_VALID(rec, ARG_UID | ARG_GID);
 	}
 }
 
 /*
  * Store a process id in the record returned by currecord(), if there is
  * one, and mark ARG_PID valid.
  */
 void
 audit_arg_pid(pid_t pid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_pid = pid;
 		ARG_SET_VALID(rec, ARG_PID);
 	}
 }
 
 void
 audit_arg_process(struct proc *p)
 {
 	struct kaudit_record *ar;
 
 	KASSERT(p != NULL, ("audit_arg_process: p == NULL"));
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	ar->k_ar.ar_arg_auid = p->p_ucred->cr_audit.ai_auid;
 	ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid;
 	ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0];
 	ar->k_ar.ar_arg_ruid = p->p_ucred->cr_ruid;
 	ar->k_ar.ar_arg_rgid = p->p_ucred->cr_rgid;
 	ar->k_ar.ar_arg_asid = p->p_ucred->cr_audit.ai_asid;
 	ar->k_ar.ar_arg_termid_addr = p->p_ucred->cr_audit.ai_termid;
 	ar->k_ar.ar_arg_pid = p->p_pid;
 	ARG_SET_VALID(ar, ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID |
 	    ARG_RGID | ARG_ASID | ARG_TERMID_ADDR | ARG_PID | ARG_PROCESS);
 }
 
 /*
  * Store a signal number in the record returned by currecord(), if there
  * is one, and mark ARG_SIGNUM valid.
  */
 void
 audit_arg_signum(u_int signum)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_signum = signum;
 		ARG_SET_VALID(rec, ARG_SIGNUM);
 	}
 }
 
 /*
  * Store the domain, type and protocol of a socket in the record returned
  * by currecord(), if there is one, and mark ARG_SOCKINFO valid.
  */
 void
 audit_arg_socket(int sodomain, int sotype, int soprotocol)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_sockinfo.so_domain = sodomain;
 		rec->k_ar.ar_arg_sockinfo.so_type = sotype;
 		rec->k_ar.ar_arg_sockinfo.so_protocol = soprotocol;
 		ARG_SET_VALID(rec, ARG_SOCKINFO);
 	}
 }
 
 /*
  * Copy socket address "sa" (sa->sa_len bytes) into the record returned by
  * currecord(), if there is one, and flag it by address family.  For
  * AF_UNIX the socket path is additionally recorded as ARG_UPATH1.
  * Addresses of any other family are copied but no flag is set.
  */
 void
 audit_arg_sockaddr(struct thread *td, struct sockaddr *sa)
 {
 	struct kaudit_record *rec;
 
 	KASSERT(td != NULL, ("audit_arg_sockaddr: td == NULL"));
 	KASSERT(sa != NULL, ("audit_arg_sockaddr: sa == NULL"));
 
 	rec = currecord();
 	if (rec == NULL)
 		return;
 
 	bcopy(sa, &rec->k_ar.ar_arg_sockaddr, sa->sa_len);
 
 	if (sa->sa_family == AF_INET) {
 		ARG_SET_VALID(rec, ARG_SADDRINET);
 	} else if (sa->sa_family == AF_INET6) {
 		ARG_SET_VALID(rec, ARG_SADDRINET6);
 	} else if (sa->sa_family == AF_UNIX) {
 		audit_arg_upath(td, ((struct sockaddr_un *)sa)->sun_path,
 		    ARG_UPATH1);
 		ARG_SET_VALID(rec, ARG_SADDRUNIX);
 	}
 	/* XXXAUDIT: default:? */
 }
 
 /*
  * Store an audit user id in the record returned by currecord(), if there
  * is one, and mark ARG_AUID valid.
  */
 void
 audit_arg_auid(uid_t auid)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_auid = auid;
 		ARG_SET_VALID(rec, ARG_AUID);
 	}
 }
 
 /*
  * Copy the audit id, session id, preselection masks and terminal id from
  * "au_info" into the record returned by currecord(), if there is one, and
  * mark ARG_AUID, ARG_ASID, ARG_AMASK and ARG_TERMID valid.
  */
 void
 audit_arg_auditinfo(struct auditinfo *au_info)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec == NULL)
 		return;
 
 	rec->k_ar.ar_arg_auid = au_info->ai_auid;
 	rec->k_ar.ar_arg_asid = au_info->ai_asid;
 
 	rec->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success;
 	rec->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure;
 
 	rec->k_ar.ar_arg_termid.port = au_info->ai_termid.port;
 	rec->k_ar.ar_arg_termid.machine = au_info->ai_termid.machine;
 
 	ARG_SET_VALID(rec, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID);
 }
 
 /*
  * Copy the audit id, session id, preselection masks and the extended
  * terminal id (type, port and four address words) from "au_info" into the
  * record returned by currecord(), if there is one, and mark ARG_AUID,
  * ARG_ASID, ARG_AMASK and ARG_TERMID_ADDR valid.
  */
 void
 audit_arg_auditinfo_addr(struct auditinfo_addr *au_info)
 {
 	struct kaudit_record *rec = currecord();
 	int word;
 
 	if (rec == NULL)
 		return;
 
 	rec->k_ar.ar_arg_auid = au_info->ai_auid;
 	rec->k_ar.ar_arg_asid = au_info->ai_asid;
 
 	rec->k_ar.ar_arg_amask.am_success = au_info->ai_mask.am_success;
 	rec->k_ar.ar_arg_amask.am_failure = au_info->ai_mask.am_failure;
 
 	rec->k_ar.ar_arg_termid_addr.at_type = au_info->ai_termid.at_type;
 	rec->k_ar.ar_arg_termid_addr.at_port = au_info->ai_termid.at_port;
 	for (word = 0; word < 4; word++) {
 		rec->k_ar.ar_arg_termid_addr.at_addr[word] =
 		    au_info->ai_termid.at_addr[word];
 	}
 
 	ARG_SET_VALID(rec, ARG_AUID | ARG_ASID | ARG_AMASK | ARG_TERMID_ADDR);
 }
 
 /*
  * Store a text string in the record returned by currecord(), if there is
  * one.  A MAXPATHLEN byte buffer is allocated on first use and reused by
  * later calls; the string is truncated to fit and is always
  * NUL-terminated.  ARG_TEXT is cleared while the buffer is being
  * rewritten and set again afterwards.
  */
 void
 audit_arg_text(char *text)
 {
 	struct kaudit_record *ar;
 
 	KASSERT(text != NULL, ("audit_arg_text: text == NULL"));
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	/* Invalidate the text string */
 	ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_TEXT);
 
 	if (ar->k_ar.ar_arg_text == NULL)
 		ar->k_ar.ar_arg_text = malloc(MAXPATHLEN, M_AUDITTEXT,
 		    M_WAITOK);
 
 	/*
 	 * strlcpy() rather than strncpy(): the latter leaves the buffer
 	 * unterminated when text is MAXPATHLEN bytes or longer.
 	 */
 	strlcpy(ar->k_ar.ar_arg_text, text, MAXPATHLEN);
 	ARG_SET_VALID(ar, ARG_TEXT);
 }
 
 /*
  * Store a command argument in the record returned by currecord(), if
  * there is one, and mark ARG_CMD valid.
  */
 void
 audit_arg_cmd(int cmd)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_cmd = cmd;
 		ARG_SET_VALID(rec, ARG_CMD);
 	}
 }
 
 /*
  * Store a System V IPC command in the record returned by currecord(), if
  * there is one, and mark ARG_SVIPC_CMD valid.
  */
 void
 audit_arg_svipc_cmd(int cmd)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_svipc_cmd = cmd;
 		ARG_SET_VALID(rec, ARG_SVIPC_CMD);
 	}
 }
 
 /*
  * Copy a System V IPC permission structure into the record returned by
  * currecord(), if there is one, and mark ARG_SVIPC_PERM valid.  The copy
  * length is the size of the destination field.
  */
 void
 audit_arg_svipc_perm(struct ipc_perm *perm)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		bcopy(perm, &rec->k_ar.ar_arg_svipc_perm,
 		    sizeof(rec->k_ar.ar_arg_svipc_perm));
 		ARG_SET_VALID(rec, ARG_SVIPC_PERM);
 	}
 }
 
 /*
  * Store a System V IPC identifier in the record returned by currecord(),
  * if there is one, and mark ARG_SVIPC_ID valid.
  */
 void
 audit_arg_svipc_id(int id)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_svipc_id = id;
 		ARG_SET_VALID(rec, ARG_SVIPC_ID);
 	}
 }
 
 /*
  * Store a System V IPC address in the record returned by currecord(), if
  * there is one, and mark ARG_SVIPC_ADDR valid.
  */
 void
 audit_arg_svipc_addr(void * addr)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_svipc_addr = addr;
 		ARG_SET_VALID(rec, ARG_SVIPC_ADDR);
 	}
 }
 
 /*
  * Store POSIX IPC permissions (uid, gid and mode) in the record returned
  * by currecord(), if there is one, and mark ARG_POSIX_IPC_PERM valid.
  */
 void
 audit_arg_posix_ipc_perm(uid_t uid, gid_t gid, mode_t mode)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		rec->k_ar.ar_arg_pipc_perm.pipc_uid = uid;
 		rec->k_ar.ar_arg_pipc_perm.pipc_gid = gid;
 		rec->k_ar.ar_arg_pipc_perm.pipc_mode = mode;
 		ARG_SET_VALID(rec, ARG_POSIX_IPC_PERM);
 	}
 }
 
 /*
  * Copy the auditon() argument union into the record returned by
  * currecord(), if there is one, and mark ARG_AUDITON valid.  The copy
  * length is the size of the destination field.
  */
 void
 audit_arg_auditon(union auditon_udata *udata)
 {
 	struct kaudit_record *rec = currecord();
 
 	if (rec != NULL) {
 		bcopy((void *)udata, &rec->k_ar.ar_arg_auditon,
 		    sizeof(rec->k_ar.ar_arg_auditon));
 		ARG_SET_VALID(rec, ARG_AUDITON);
 	}
 }
 
 /*
  * Audit information about a file, either the file's vnode info, or its
  * socket address info.
  *
  * Nothing is recorded when currecord() returns no record.  For vnode and
  * FIFO files the vnode is locked exclusively around audit_arg_vnode(),
  * which records it as ARG_VNODE1.  For sockets, information is recorded
  * only when INP_CHECK_SOCKAF(so, PF_INET) holds.  Other file types are
  * ignored.
  *
  * "p" is not referenced in the body.
  */
 void
 audit_arg_file(struct proc *p, struct file *fp)
 {
 	struct kaudit_record *ar;
 	struct socket *so;
 	struct inpcb *pcb;
 	struct vnode *vp;
 	int vfslocked;
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 	case DTYPE_FIFO:
 		/*
 		 * XXXAUDIT: Only possibly to record as first vnode?
 		 */
 		vp = fp->f_vnode;
 		/* Giant state is taken before, and dropped after, the lock. */
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		audit_arg_vnode(vp, ARG_VNODE1);
 		VOP_UNLOCK(vp, 0, curthread);
 		VFS_UNLOCK_GIANT(vfslocked);
 		break;
 
 	case DTYPE_SOCKET:
 		so = (struct socket *)fp->f_data;
 		if (INP_CHECK_SOCKAF(so, PF_INET)) {
 			/* Type, domain and protocol, read under the socket lock. */
 			SOCK_LOCK(so);
 			ar->k_ar.ar_arg_sockinfo.so_type =
 			    so->so_type;
 			ar->k_ar.ar_arg_sockinfo.so_domain =
 			    INP_SOCKAF(so);
 			ar->k_ar.ar_arg_sockinfo.so_protocol =
 			    so->so_proto->pr_protocol;
 			SOCK_UNLOCK(so);
 			/* Addresses and ports, read under the inpcb lock. */
 			pcb = (struct inpcb *)so->so_pcb;
 			INP_LOCK(pcb);
 			ar->k_ar.ar_arg_sockinfo.so_raddr =
 			    pcb->inp_faddr.s_addr;
 			ar->k_ar.ar_arg_sockinfo.so_laddr =
 			    pcb->inp_laddr.s_addr;
 			ar->k_ar.ar_arg_sockinfo.so_rport =
 			    pcb->inp_fport;
 			ar->k_ar.ar_arg_sockinfo.so_lport =
 			    pcb->inp_lport;
 			INP_UNLOCK(pcb);
 			ARG_SET_VALID(ar, ARG_SOCKINFO);
 		}
 		break;
 
 	default:
 		/* XXXAUDIT: else? */
 		break;
 	}
 }
 
 /*
  * Store a path as given by the user process for auditing into the audit
  * record stored on the user thread. This function will allocate the memory
  * to store the path info if not already available. This memory will be freed
  * when the audit record is freed.
  *
  * 'td' is the thread whose root/working directory canon_path() uses to
  * canonicalize 'upath'.  'flag' selects the destination slot and must be
  * exactly ARG_UPATH1 or ARG_UPATH2; the same value is marked valid in the
  * record on return.  Does nothing when currecord() returns NULL.
  *
  * XXXAUDIT: Possibly assert that the memory isn't already allocated?
  */
 void
 audit_arg_upath(struct thread *td, char *upath, u_int64_t flag)
 {
 	struct kaudit_record *ar;
 	char **pathp;
 
 	KASSERT(td != NULL, ("audit_arg_upath: td == NULL"));
 	KASSERT(upath != NULL, ("audit_arg_upath: upath == NULL"));
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	/*
 	 * A single equality check is sufficient here.  The former second
 	 * assertion, (flag != ARG_UPATH1) || (flag != ARG_UPATH2), was true
 	 * for every possible value and therefore checked nothing.
 	 */
 	KASSERT((flag == ARG_UPATH1) || (flag == ARG_UPATH2),
 	    ("audit_arg_upath: flag %llu", (unsigned long long)flag));
 
 	if (flag == ARG_UPATH1)
 		pathp = &ar->k_ar.ar_arg_upath1;
 	else
 		pathp = &ar->k_ar.ar_arg_upath2;
 
 	/* Reuse a buffer left by an earlier call on the same record. */
 	if (*pathp == NULL)
 		*pathp = malloc(MAXPATHLEN, M_AUDITPATH, M_WAITOK);
 
 	canon_path(td, upath, *pathp);
 
 	ARG_SET_VALID(ar, flag);
 }
 
 /*
  * Capture the attributes of a vnode into one of the two vnode slots of the
  * current audit record.
  *
  * 'flags' must be exactly ARG_VNODE1 or ARG_VNODE2 and selects the slot.
  * The caller is expected to hold whatever vnode lock VOP_GETATTR() needs,
  * and Giant if the vnode's file system requires it; both are asserted.
  *
  * XXX: The attribute handling closely mirrors vfs_vnops.c:vn_stat(), except
  * that the generation number is always exported, since it is needed to
  * build the BSM file ID.
  *
  * XXX: The process argument should come from the caller, which very likely
  * already holds a reference.
  *
  * XXX: Error handling in this function is poor.
  *
  * XXXAUDIT: Possibly KASSERT the path pointer is NULL?
  */
 void
 audit_arg_vnode(struct vnode *vp, u_int64_t flags)
 {
 	struct kaudit_record *ar;
 	struct vnode_au_info *info;
 	struct vattr va;
 	u_int64_t slot;
 
 	KASSERT(vp != NULL, ("audit_arg_vnode: vp == NULL"));
 	KASSERT((flags == ARG_VNODE1) || (flags == ARG_VNODE2),
 	    ("audit_arg_vnode: flags %jd", (intmax_t)flags));
 
 	/*
 	 * A caller passing a non-MPSAFE vnode is assumed to have taken
 	 * Giant already.
 	 */
 	VFS_ASSERT_GIANT(vp->v_mount);
 	ASSERT_VOP_LOCKED(vp, "audit_arg_vnode");
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	/*
 	 * XXXAUDIT: The slot's valid bit is cleared first and only set
 	 * again once the attributes have been fetched.  Ideally either the
 	 * new vnode or the old one would end up recorded.
 	 */
 	if ((flags & ARG_VNODE1) != 0) {
 		slot = ARG_VNODE1;
 		info = &ar->k_ar.ar_arg_vnode1;
 	} else {
 		slot = ARG_VNODE2;
 		info = &ar->k_ar.ar_arg_vnode2;
 	}
 	ar->k_ar.ar_valid_arg &= (ARG_ALL ^ slot);
 
 	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread) != 0) {
 		/* XXX: How to handle this case? */
 		return;
 	}
 
 	info->vn_mode = va.va_mode;
 	info->vn_uid = va.va_uid;
 	info->vn_gid = va.va_gid;
 	info->vn_dev = va.va_rdev;
 	info->vn_fsid = va.va_fsid;
 	info->vn_fileid = va.va_fileid;
 	info->vn_gen = va.va_gen;
 	ARG_SET_VALID(ar, slot);
 }
 
 /*
  * Record the argument strings passed to exec.
  *
  * Does nothing when the audit_argv knob is zero or there is no current
  * record.  Otherwise 'length' bytes starting at 'argv' are copied into a
  * freshly allocated M_AUDITTEXT buffer that is attached to the record
  * together with 'argc', and ARG_ARGV is marked valid.
  */
 void
 audit_arg_argv(char *argv, int argc, int length)
 {
 	struct kaudit_record *ar;
 	char *copy;
 
 	if (audit_argv == 0)
 		return;
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	copy = malloc(length, M_AUDITTEXT, M_WAITOK);
 	bcopy(argv, copy, length);
 	ar->k_ar.ar_arg_argv = copy;
 	ar->k_ar.ar_arg_argc = argc;
 	ARG_SET_VALID(ar, ARG_ARGV);
 }
 
 /*
  * Record the environment strings passed to exec.
  *
  * Does nothing when the audit_arge knob is zero or there is no current
  * record.  Otherwise 'length' bytes starting at 'envv' are copied into a
  * freshly allocated M_AUDITTEXT buffer that is attached to the record
  * together with 'envc', and ARG_ENVV is marked valid.
  */
 void
 audit_arg_envv(char *envv, int envc, int length)
 {
 	struct kaudit_record *ar;
 	char *copy;
 
 	if (audit_arge == 0)
 		return;
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	copy = malloc(length, M_AUDITTEXT, M_WAITOK);
 	bcopy(envv, copy, length);
 	ar->k_ar.ar_arg_envv = copy;
 	ar->k_ar.ar_arg_envc = envc;
 	ARG_SET_VALID(ar, ARG_ENVV);
 }
 
 /*
  * The close() system call uses its own audit call to capture the path/vnode
  * information because those pieces are not easily obtained within the system
  * call itself.
  *
  * Records 'fd' via audit_arg_fd() and, when getvnode() succeeds for that
  * descriptor, the attributes of its vnode as ARG_VNODE1.  Does nothing when
  * there is no current audit record.
  */
 void
 audit_sysclose(struct thread *td, int fd)
 {
 	struct kaudit_record *ar;
 	struct vnode *vp;
 	struct file *fp;
 	int vfslocked;
 
 	KASSERT(td != NULL, ("audit_sysclose: td == NULL"));
 
 	ar = currecord();
 	if (ar == NULL)
 		return;
 
 	audit_arg_fd(fd);
 
 	/* Only the descriptor number is recorded if getvnode() fails. */
 	if (getvnode(td->td_proc->p_fd, fd, &fp) != 0)
 		return;
 
 	vp = fp->f_vnode;
 	/*
 	 * audit_arg_vnode() asserts both Giant (where required) and the
 	 * vnode lock, so take them around the call.
 	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	audit_arg_vnode(vp, ARG_VNODE1);
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Release the file obtained from getvnode(). */
 	fdrop(fp, td);
 }
Index: head/sys/security/audit/audit_bsm_klib.c
===================================================================
--- head/sys/security/audit/audit_bsm_klib.c	(revision 175201)
+++ head/sys/security/audit/audit_bsm_klib.c	(revision 175202)
@@ -1,547 +1,547 @@
 /*
  * Copyright (c) 1999-2005 Apple Computer, Inc.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  *     its contributors may be used to endorse or promote products derived
  *     from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/sem.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <bsm/audit.h>
 #include <bsm/audit_kevents.h>
 #include <security/audit/audit.h>
 #include <security/audit/audit_private.h>
 
 /*
  * Hash table functions for the audit event number to event class mask
  * mapping.
  */
 /* Number of buckets; an event lands in bucket "event % this value". */
 #define EVCLASSMAP_HASH_TABLE_SIZE 251
 /* One event -> class mapping, linked into its bucket's list. */
 struct evclass_elem {
 	au_event_t event;
 	au_class_t class;
 	LIST_ENTRY(evclass_elem) entry;
 };
 /* One hash bucket: the head of a list of evclass_elem entries. */
 struct evclass_list {
 	LIST_HEAD(, evclass_elem) head;
 };
 
 static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class");
 /* Held around every lookup and insertion on evclass_hash below. */
 static struct mtx		evclass_mtx;
 static struct evclass_list	evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE];
 
 /*
  * Return the class recorded for 'event' in the class mapping table, or 0
  * when the table holds no entry for it.  The lookup runs with evclass_mtx
  * held.
  */
 au_class_t
 au_event_class(au_event_t event)
 {
 	struct evclass_list *bucket;
 	struct evclass_elem *elem;
 	au_class_t result;
 
 	result = 0;
 	mtx_lock(&evclass_mtx);
 	bucket = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE];
 	LIST_FOREACH(elem, &bucket->head, entry) {
 		if (elem->event == event) {
 			result = elem->class;
 			break;
 		}
 	}
 	mtx_unlock(&evclass_mtx);
 	return (result);
 }
 
 /*
  * Map 'event' to 'class'.  An existing mapping for the event is overwritten
  * in place; otherwise a new entry is added at the head of its bucket.
  *
  * XXX There is currently no constraints placed on the number of mappings.
  * May want to either limit to a number, or in terms of memory usage.
  */
 void
 au_evclassmap_insert(au_event_t event, au_class_t class)
 {
 	struct evclass_list *bucket;
 	struct evclass_elem *cur, *fresh;
 
 	/*
 	 * Allocate up front, before the mutex is taken (M_WAITOK), and
 	 * discard the allocation if the event turns out to be mapped
 	 * already.
 	 */
 	fresh = malloc(sizeof(*fresh), M_AUDITEVCLASS, M_WAITOK);
 
 	mtx_lock(&evclass_mtx);
 	bucket = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE];
 	LIST_FOREACH(cur, &bucket->head, entry) {
 		if (cur->event != event)
 			continue;
 		cur->class = class;
 		mtx_unlock(&evclass_mtx);
 		free(fresh, M_AUDITEVCLASS);
 		return;
 	}
 	fresh->event = event;
 	fresh->class = class;
 	LIST_INSERT_HEAD(&bucket->head, fresh, entry);
 	mtx_unlock(&evclass_mtx);
 }
 
 /*
  * Initialize the event-to-class table: set up its mutex, empty every
  * bucket, then enter each audited native system call with class 0.
  */
 void
 au_evclassmap_init(void)
 {
 	int bucket, sc;
 
 	mtx_init(&evclass_mtx, "evclass_mtx", NULL, MTX_DEF);
 	for (bucket = 0; bucket < EVCLASSMAP_HASH_TABLE_SIZE; bucket++)
 		LIST_INIT(&evclass_hash[bucket].head);
 
 	/*
 	 * Seed the mapping from the native system call table.
 	 *
 	 * XXXRW: Really, this should walk all possible audit events, not all
 	 * native ABI system calls, as there may be audit events reachable
 	 * only through non-native system calls.  It also seems a shame to
 	 * frob the mutex this early.
 	 */
 	for (sc = 0; sc < SYS_MAXSYSCALL; sc++) {
 		if (sysent[sc].sy_auevent == AUE_NULL)
 			continue;
 		au_evclassmap_insert(sysent[sc].sy_auevent, 0);
 	}
 }
 
 /*
  * Check whether an event is auditable by comparing the mask of classes this
  * event is part of against the given mask.
  *
  * 'sorf' selects which halves of 'mask_p' are consulted (AU_PRS_SUCCESS
  * and/or AU_PRS_FAILURE).  Returns 1 when 'class' intersects a selected
  * half, 0 when it does not, and -1 when 'mask_p' is NULL.  The 'event'
  * argument is not referenced.
  */
 int
 au_preselect(au_event_t event, au_class_t class, au_mask_t *mask_p, int sorf)
 {
 	au_class_t matched;
 
 	if (mask_p == NULL)
 		return (-1);
 
 	/* Accumulate the class bits selected by each requested outcome. */
 	matched = 0;
 	if ((sorf & AU_PRS_SUCCESS) != 0)
 		matched |= (class & mask_p->am_success);
 	if ((sorf & AU_PRS_FAILURE) != 0)
 		matched |= (class & mask_p->am_failure);
 
 	return (matched != 0 ? 1 : 0);
 }
 
 /*
  * Choose the audit event for a sysctl request from its top-level name and
  * the set of arguments captured for it.
  *
  * 'valid_arg' is the record's valid-argument mask.  Unless both ARG_CTLNAME
  * and ARG_LEN are present the name cannot be interpreted and AUE_SYSCTL is
  * returned.
  */
 au_event_t
 ctlname_to_sysctlevent(int name[], uint64_t valid_arg)
 {
 	au_event_t event;
 
 	/* can't parse it - so return the worst case */
 	if ((valid_arg & (ARG_CTLNAME | ARG_LEN)) != (ARG_CTLNAME | ARG_LEN))
 		return (AUE_SYSCTL);
 
 	switch (name[0]) {
 	/* non-admin "lookups" treat them special */
 	case KERN_OSTYPE:
 	case KERN_OSRELEASE:
 	case KERN_OSREV:
 	case KERN_VERSION:
 	case KERN_ARGMAX:
 	case KERN_CLOCKRATE:
 	case KERN_BOOTTIME:
 	case KERN_POSIX1:
 	case KERN_NGROUPS:
 	case KERN_JOB_CONTROL:
 	case KERN_SAVED_IDS:
 	case KERN_OSRELDATE:
 	case KERN_DUMMY:
 		event = AUE_SYSCTL_NONADMIN;
 		break;
 
 	/* only treat the changeable controls as admin */
 	case KERN_MAXVNODES:
 	case KERN_MAXPROC:
 	case KERN_MAXFILES:
 	case KERN_MAXPROCPERUID:
 	case KERN_MAXFILESPERPROC:
 	case KERN_HOSTID:
 	case KERN_SECURELVL:
 	case KERN_HOSTNAME:
 	case KERN_VNODE:
 	case KERN_PROC:
 	case KERN_FILE:
 	case KERN_PROF:
 	case KERN_NISDOMAINNAME:
 	case KERN_UPDATEINTERVAL:
 	case KERN_NTP_PLL:
 	case KERN_BOOTFILE:
 	case KERN_DUMPDEV:
 	case KERN_IPC:
 	case KERN_PS_STRINGS:
 	case KERN_USRSTACK:
 	case KERN_LOGSIGEXIT:
 	case KERN_IOV_MAX:
 	case KERN_MAXID:
 		/* Admin event only when ARG_VALUE was captured. */
 		if (valid_arg & ARG_VALUE)
 			event = AUE_SYSCTL;
 		else
 			event = AUE_SYSCTL_NONADMIN;
 		break;
 
 	default:
 		event = AUE_SYSCTL;
 		break;
 	}
 	return (event);
 }
 
 /*
  * Map the flags of an open request onto the matching AUE_OPEN_* audit
  * event.  Only the access mode, O_CREAT and O_TRUNC take part in the
  * decision; any combination without a dedicated event yields AUE_OPEN.
  * 'error' is only consulted by the disabled block at the end.
  */
 au_event_t
 flags_and_error_to_openevent(int oflags, int error)
 {
 	au_event_t aevent;
 	int relevant;
 
 	/*
 	 * Strip every flag that does not influence the event choice.
 	 */
 	relevant = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY);
 
 	/*
 	 * Each case matches one exact combination of the remaining flags.
 	 */
 	switch (relevant) {
 	case O_RDONLY:
 		aevent = AUE_OPEN_R;
 		break;
 	case O_RDONLY | O_CREAT:
 		aevent = AUE_OPEN_RC;
 		break;
 	case O_RDONLY | O_CREAT | O_TRUNC:
 		aevent = AUE_OPEN_RTC;
 		break;
 	case O_RDONLY | O_TRUNC:
 		aevent = AUE_OPEN_RT;
 		break;
 
 	case O_RDWR:
 		aevent = AUE_OPEN_RW;
 		break;
 	case O_RDWR | O_CREAT:
 		aevent = AUE_OPEN_RWC;
 		break;
 	case O_RDWR | O_CREAT | O_TRUNC:
 		aevent = AUE_OPEN_RWTC;
 		break;
 	case O_RDWR | O_TRUNC:
 		aevent = AUE_OPEN_RWT;
 		break;
 
 	case O_WRONLY:
 		aevent = AUE_OPEN_W;
 		break;
 	case O_WRONLY | O_CREAT:
 		aevent = AUE_OPEN_WC;
 		break;
 	case O_WRONLY | O_CREAT | O_TRUNC:
 		aevent = AUE_OPEN_WTC;
 		break;
 	case O_WRONLY | O_TRUNC:
 		aevent = AUE_OPEN_WT;
 		break;
 
 	default:
 		aevent = AUE_OPEN;
 		break;
 	}
 
 #if 0
 	/*
 	 * Convert chatty errors to better matching events.  Failures to
 	 * find a file are really just attribute events -- so recast them as
 	 * such.
 	 *
 	 * XXXAUDIT: Solaris defines that AUE_OPEN will never be returned, it
 	 * is just a placeholder.  However, in Darwin we return that in
 	 * preference to other events.  For now, comment this out as we don't
 	 * have a BSM conversion routine for AUE_OPEN.
 	 */
 	switch (aevent) {
 	case AUE_OPEN_R:
 	case AUE_OPEN_RT:
 	case AUE_OPEN_RW:
 	case AUE_OPEN_RWT:
 	case AUE_OPEN_W:
 	case AUE_OPEN_WT:
 		if (error == ENOENT)
 			aevent = AUE_OPEN;
 	}
 #endif
 	return (aevent);
 }
 
 /*
  * Map a msgctl() command onto its audit event.  Commands without a
  * dedicated event are reported as the generic AUE_MSGCTL.
  */
 int
 msgctl_to_event(int cmd)
 {
 	int event;
 
 	switch (cmd) {
 	case IPC_RMID:
 		event = AUE_MSGCTL_RMID;
 		break;
 	case IPC_SET:
 		event = AUE_MSGCTL_SET;
 		break;
 	case IPC_STAT:
 		event = AUE_MSGCTL_STAT;
 		break;
 	default:
 		/* We will audit a bad command. */
 		event = AUE_MSGCTL;
 		break;
 	}
 	return (event);
 }
 
 /*
  * Map a semctl() command onto its audit event.  Commands without a
  * dedicated event are reported as the generic AUE_SEMCTL.
  */
 int
 semctl_to_event(int cmd)
 {
 	int event;
 
 	switch (cmd) {
 	case GETALL:
 		event = AUE_SEMCTL_GETALL;
 		break;
 	case GETNCNT:
 		event = AUE_SEMCTL_GETNCNT;
 		break;
 	case GETPID:
 		event = AUE_SEMCTL_GETPID;
 		break;
 	case GETVAL:
 		event = AUE_SEMCTL_GETVAL;
 		break;
 	case GETZCNT:
 		event = AUE_SEMCTL_GETZCNT;
 		break;
 	case IPC_RMID:
 		event = AUE_SEMCTL_RMID;
 		break;
 	case IPC_SET:
 		event = AUE_SEMCTL_SET;
 		break;
 	case SETALL:
 		event = AUE_SEMCTL_SETALL;
 		break;
 	case SETVAL:
 		event = AUE_SEMCTL_SETVAL;
 		break;
 	case IPC_STAT:
 		event = AUE_SEMCTL_STAT;
 		break;
 	default:
 		/* We will audit a bad command */
 		event = AUE_SEMCTL;
 		break;
 	}
 	return (event);
 }
 
 /*
  * Map an auditon() command onto its audit event.  Commands without a
  * dedicated event, including unknown ones, are reported as the generic
  * AUE_AUDITON.
  */
 int
 auditon_command_event(int cmd)
 {
 	int event;
 
 	switch (cmd) {
 	case A_GETPOLICY:
 		event = AUE_AUDITON_GPOLICY;
 		break;
 	case A_SETPOLICY:
 		event = AUE_AUDITON_SPOLICY;
 		break;
 	case A_GETKMASK:
 		event = AUE_AUDITON_GETKMASK;
 		break;
 	case A_SETKMASK:
 		event = AUE_AUDITON_SETKMASK;
 		break;
 	case A_GETQCTRL:
 		event = AUE_AUDITON_GQCTRL;
 		break;
 	case A_SETQCTRL:
 		event = AUE_AUDITON_SQCTRL;
 		break;
 	case A_GETCWD:
 		event = AUE_AUDITON_GETCWD;
 		break;
 	case A_GETCAR:
 		event = AUE_AUDITON_GETCAR;
 		break;
 	case A_GETSTAT:
 		event = AUE_AUDITON_GETSTAT;
 		break;
 	case A_SETSTAT:
 		event = AUE_AUDITON_SETSTAT;
 		break;
 	case A_SETUMASK:
 		event = AUE_AUDITON_SETUMASK;
 		break;
 	case A_SETSMASK:
 		event = AUE_AUDITON_SETSMASK;
 		break;
 	case A_GETCOND:
 		event = AUE_AUDITON_GETCOND;
 		break;
 	case A_SETCOND:
 		event = AUE_AUDITON_SETCOND;
 		break;
 	case A_GETCLASS:
 		event = AUE_AUDITON_GETCLASS;
 		break;
 	case A_SETCLASS:
 		event = AUE_AUDITON_SETCLASS;
 		break;
 
 	/* Listed explicitly to show they share the generic event. */
 	case A_GETPINFO:
 	case A_SETPMASK:
 	case A_SETFSIZE:
 	case A_GETFSIZE:
 	case A_GETPINFO_ADDR:
 	case A_GETKAUDIT:
 	case A_SETKAUDIT:
 	default:
 		event = AUE_AUDITON;	/* No special record */
 		break;
 	}
 	return (event);
 }
 
 /*
  * Create a canonical path from given path by prefixing either the root
  * directory, or the current working directory.  If the process working
  * directory is NULL, we could use 'rootvnode' to obtain the root directory,
  * but this results in a volfs name written to the audit log. So we will
  * leave the filename starting with '/' in the audit log in this case.
  *
  * 'td' supplies the file descriptor table whose root and working directory
  * are consulted.  'path' is the path to canonicalize.  'cpath' receives the
  * result; every write to it is bounded by MAXPATHLEN, and it is set to the
  * empty string when vn_fullpath() fails.
  *
  * XXXRW: Since we combine two paths here, ideally a buffer of size
  * MAXPATHLEN * 2 would be passed in.
  */
 void
 canon_path(struct thread *td, char *path, char *cpath)
 {
 	char *bufp;
 	char *retbuf, *freebuf;
 	struct vnode *vnp;
 	struct filedesc *fdp;
 	int cisr, error, vfslocked;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "canon_path() at %s:%d", __FILE__, __LINE__);
 
 	fdp = td->td_proc->p_fd;
 	bufp = path;
 	/* cisr: non-zero when the working directory is the process root. */
 	cisr = 0;
 	/* fd_rdir/fd_cdir are examined and referenced under the shared lock. */
 	FILEDESC_SLOCK(fdp);
 	if (*(path) == '/') {
 		while (*(bufp) == '/')
 			bufp++;			/* Skip leading '/'s. */
 		/*
 		 * If no process root, or it is the same as the system root,
 		 * audit the path as passed in with a single '/'.
 		 */
 		if ((fdp->fd_rdir == NULL) ||
 		    (fdp->fd_rdir == rootvnode)) {
 			vnp = NULL;
 			bufp--;			/* Restore one '/'. */
 		} else {
 			vnp = fdp->fd_rdir;	/* Use process root. */
 			vref(vnp);
 		}
 	} else {
 		vnp = fdp->fd_cdir;	/* Prepend the current dir. */
 		cisr = (fdp->fd_rdir == fdp->fd_cdir);
 		vref(vnp);
 		bufp = path;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	if (vnp != NULL) {
 		/*
 		 * XXX: vn_fullpath() on FreeBSD is "less reliable" than
 		 * vn_getpath() on Darwin, so this will need more attention
 		 * in the future.  Also, the string bounding here seems a
 		 * bit questionable and will also require attention.
 		 */
 		vfslocked = VFS_LOCK_GIANT(vnp->v_mount);
-		vn_lock(vnp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vnp, LK_EXCLUSIVE | LK_RETRY);
 		error = vn_fullpath(td, vnp, &retbuf, &freebuf);
 		if (error == 0) {
 			/* Copy and free buffer allocated by vn_fullpath().
 			 * If the current working directory was the same as
 			 * the root directory, and the path was a relative
 			 * pathname, do not separate the two components with
 			 * the '/' character.
 			 */
 			snprintf(cpath, MAXPATHLEN, "%s%s%s", retbuf,
 			    cisr ? "" : "/", bufp);
 			free(freebuf, M_TEMP);
 		} else
 			cpath[0] = '\0';
 		/*
 		 * NOTE(review): vput() presumably drops both the lock and the
 		 * reference taken above, as no separate unlock follows --
 		 * confirm against vput(9).
 		 */
 		vput(vnp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else
 		/* No directory prefix: record the path as given. */
 		strlcpy(cpath, bufp, MAXPATHLEN);
 }
Index: head/sys/security/audit/audit_worker.c
===================================================================
--- head/sys/security/audit/audit_worker.c	(revision 175201)
+++ head/sys/security/audit/audit_worker.c	(revision 175202)
@@ -1,551 +1,551 @@
 /*
  * Copyright (c) 1999-2005 Apple Computer, Inc.
  * Copyright (c) 2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  *     its contributors may be used to endorse or promote products derived
  *     from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/ipc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/sysproto.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/ucred.h>
 #include <sys/uio.h>
 #include <sys/un.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <bsm/audit.h>
 #include <bsm/audit_internal.h>
 #include <bsm/audit_kevents.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
 #include <security/audit/audit.h>
 #include <security/audit/audit_private.h>
 
 #include <vm/uma.h>
 
 /*
  * Worker thread that will schedule disk I/O, etc.
  */
 static struct proc		*audit_thread;
 
 /*
  * When an audit log is rotated, the actual rotation must be performed by the
  * audit worker thread, as it may have outstanding writes on the current
  * audit log.  audit_replacement_vp holds the vnode replacing the current
  * vnode.  We can't let more than one replacement occur at a time, so if more
  * than one thread requests a replacement, only one can have the replacement
  * "in progress" at any given moment.  If a thread tries to replace the audit
  * vnode and discovers a replacement is already in progress (i.e.,
  * audit_replacement_flag != 0), then it will sleep on audit_replacement_cv
  * waiting its turn to perform a replacement.  When a replacement is
  * completed, this cv is signalled by the worker thread so a waiting thread
  * can start another replacement.  We also store a credential to perform
  * audit log write operations with.
  *
  * The current credential and vnode are thread-local to audit_worker.
  */
 static struct cv		audit_replacement_cv;
 
 /*
  * Pending replacement request.  audit_worker_rotate() consumes it with
  * audit_mtx held: it takes over the vnode and credential, resets both
  * pointers to NULL and clears the flag.
  */
 static int			audit_replacement_flag;
 static struct vnode		*audit_replacement_vp;
 static struct ucred		*audit_replacement_cred;
 
 /*
  * Flags related to Kernel->user-space communication.
  *
  * audit_file_rotate_wait is set to 1 by audit_record_write() when it sends
  * AUDIT_TRIGGER_ROTATE_KERNEL, and no further rotation trigger is sent
  * while it remains non-zero.
  */
 static int			audit_file_rotate_wait;
 
 /*
  * Write an audit record to a file, performed as the last stage after both
  * preselection and BSM conversion.  Both space management and write failures
  * are handled in this function.
  *
  * No attempt is made to deal with possible failure to deliver a trigger to
  * the audit daemon, since the message is asynchronous anyway.
  *
  * 'vp' is the audit log vnode; a NULL vnode makes the call a no-op.  'cred'
  * is the credential used for VOP_GETATTR() and the write itself.  'td' is
  * the thread passed to the VFS and vnode operations.  'data'/'len' describe
  * the record, which is appended with vn_rdwr(IO_APPEND | IO_UNIT).
  *
  * Nothing is returned: failures are reported through triggers, printf() or
  * panic(), depending on audit_fail_stop and audit_panic_on_write_fail.
  */
 static void
 audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
     void *data, size_t len)
 {
 	static struct timeval last_lowspace_trigger;
 	static struct timeval last_fail;
 	static int cur_lowspace_trigger;
 	struct statfs *mnt_stat;
 	int error, vfslocked;
 	static int cur_fail;
 	struct vattr vattr;
 	long temp;
 
 	if (vp == NULL)
 		return;
 
  	mnt_stat = &vp->v_mount->mnt_stat;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 
 	/*
 	 * First, gather statistics on the audit log file and file system so
 	 * that we know how we're doing on space.  Consider failure of these
 	 * operations to indicate a future inability to write to the file.
 	 */
 	error = VFS_STATFS(vp->v_mount, mnt_stat, td);
 	if (error)
 		goto fail;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_GETATTR(vp, &vattr, cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	if (error)
 		goto fail;
 	audit_fstat.af_currsz = vattr.va_size;
 
 	/*
 	 * We handle four different space-related limits:
 	 *
 	 * - A fixed (hard) limit on the minimum free blocks we require on
 	 *   the file system, and results in record loss, a trigger, and
 	 *   possible fail stop due to violating invariants.
 	 *
 	 * - An administrative (soft) limit, which when fallen below, results
 	 *   in the kernel notifying the audit daemon of low space.
 	 *
 	 * - An audit trail size limit, which when gone above, results in the
 	 *   kernel notifying the audit daemon that rotation is desired.
 	 *
 	 * - The total depth of the kernel audit record exceeding free space,
 	 *   which can lead to possible fail stop (with drain), in order to
 	 *   prevent violating invariants.  Failure here doesn't halt
 	 *   immediately, but prevents new records from being generated.
 	 *
 	 * Possibly, the last of these should be handled differently, always
 	 * allowing a full queue to be lost, rather than trying to prevent
 	 * loss.
 	 *
 	 * First, handle the hard limit, which generates a trigger and may
 	 * fail stop.  This is handled in the same manner as ENOSPC from
 	 * VOP_WRITE, and results in record loss.
 	 */
 	if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) {
 		error = ENOSPC;
 		goto fail_enospc;
 	}
 
 	/*
 	 * Second, handle falling below the soft limit, if defined; we send
 	 * the daemon a trigger and continue processing the record.  Triggers
 	 * are limited to 1/sec.
 	 */
 	if (audit_qctrl.aq_minfree != 0) {
 		/*
 		 * XXXAUDIT: Check math and block size calculations here.
 		 *
 		 * NOTE(review): "100 / aq_minfree" is an integer division,
 		 * so the threshold is only approximate, and the divisor
 		 * would be zero for aq_minfree > 100 -- presumably the
 		 * setter bounds the value; confirm.
 		 */
 		temp = mnt_stat->f_blocks / (100 / audit_qctrl.aq_minfree);
 		if (mnt_stat->f_bfree < temp) {
 			if (ppsratecheck(&last_lowspace_trigger,
 			    &cur_lowspace_trigger, 1)) {
 				(void)send_trigger(AUDIT_TRIGGER_LOW_SPACE);
 				printf("Warning: audit space low\n");
 			}
 		}
 	}
 
 	/*
 	 * If the current file is getting full, generate a rotation trigger
 	 * to the daemon.  This is only approximate, which is fine as more
 	 * records may be generated before the daemon rotates the file.
 	 */
 	if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) &&
 	    (vattr.va_size >= audit_fstat.af_filesz)) {
 		audit_file_rotate_wait = 1;
 		(void)send_trigger(AUDIT_TRIGGER_ROTATE_KERNEL);
 	}
 
 	/*
 	 * If the estimated amount of audit data in the audit event queue
 	 * (plus records allocated but not yet queued) has reached the amount
 	 * of free space on the disk, then we need to go into an audit fail
 	 * stop state, in which we do not permit the allocation/committing of
 	 * any new audit records.  We continue to process records but don't
 	 * allow any activities that might generate new records.  In the
 	 * future, we might want to detect when space is available again and
 	 * allow operation to continue, but this behavior is sufficient to
 	 * meet fail stop requirements in CAPP.
 	 */
 	if (audit_fail_stop) {
 		if ((unsigned long)((audit_q_len + audit_pre_q_len + 1) *
 		    MAX_AUDIT_RECORD_SIZE) / mnt_stat->f_bsize >=
 		    (unsigned long)(mnt_stat->f_bfree)) {
 			if (ppsratecheck(&last_fail, &cur_fail, 1))
 				printf("audit_record_write: free space "
 				    "below size of audit queue, failing "
 				    "stop\n");
 			audit_in_failure = 1;
 		} else if (audit_in_failure) {
 			/*
 			 * Note: if we want to handle recovery, this is the
 			 * spot to do it: unset audit_in_failure, and issue a
 			 * wakeup on the cv.
 			 */
 		}
 	}
 
 	/* Append the record; ENOSPC gets the dedicated handling below. */
 	error = vn_rdwr(UIO_WRITE, vp, data, len, (off_t)0, UIO_SYSSPACE,
 	    IO_APPEND|IO_UNIT, cred, NULL, NULL, td);
 	if (error == ENOSPC)
 		goto fail_enospc;
 	else if (error)
 		goto fail;
 
 	/*
 	 * Catch completion of a queue drain here; if we're draining and the
 	 * queue is now empty, fail stop.  That audit_fail_stop is implicitly
 	 * true, since audit_in_failure can only be set if audit_fail_stop is
 	 * set.
 	 *
 	 * Note: if we handle recovery from audit_in_failure, then we need to
 	 * make panic here conditional.
 	 */
 	if (audit_in_failure) {
 		if (audit_q_len == 0 && audit_pre_q_len == 0) {
 			VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 			(void)VOP_FSYNC(vp, MNT_WAIT, td);
 			VOP_UNLOCK(vp, 0, td);
 			panic("Audit store overflow; record queue drained.");
 		}
 	}
 
 	VFS_UNLOCK_GIANT(vfslocked);
 	return;
 
 fail_enospc:
 	/*
 	 * ENOSPC is considered a special case with respect to failures, as
 	 * this can reflect either our preemptive detection of insufficient
 	 * space, or ENOSPC returned by the vnode write call.
 	 */
 	if (audit_fail_stop) {
 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 		(void)VOP_FSYNC(vp, MNT_WAIT, td);
 		VOP_UNLOCK(vp, 0, td);
 		panic("Audit log space exhausted and fail-stop set.");
 	}
 	(void)send_trigger(AUDIT_TRIGGER_NO_SPACE);
 	audit_suspended = 1;
 
 	/* FALLTHROUGH */
 fail:
 	/*
 	 * We have failed to write to the file, so the current record is
 	 * lost, which may require an immediate system halt.
 	 */
 	if (audit_panic_on_write_fail) {
 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 		(void)VOP_FSYNC(vp, MNT_WAIT, td);
 		VOP_UNLOCK(vp, 0, td);
 		panic("audit_worker: write error %d\n", error);
 	} else if (ppsratecheck(&last_fail, &cur_fail, 1))
 		printf("audit_worker: write error %d\n", error);
 	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 /*
  * Install any pending replacement audit log (vnode and credential) for the
  * worker and close the log it replaces, then wake up everybody waiting on
  * the replacement condition variable.
  *
  * '*audit_credp' and '*audit_vpp' are the worker's current credential and
  * vnode and are updated in place.  'audit_td' is the thread used to close
  * the old vnode.  audit_mtx must be held on entry; it is dropped while the
  * old vnode is being closed and is held again on return.
  *
  * The global variables and CVs used to signal the audit_worker to perform a
  * rotation are essentially a message queue of depth 1.  It would be much
  * nicer to actually use a message queue.
  */
 static void
 audit_worker_rotate(struct ucred **audit_credp, struct vnode **audit_vpp,
     struct thread *audit_td)
 {
 	struct ucred *prev_cred;
 	struct vnode *prev_vp;
 	int replaced, vfslocked;
 
 	mtx_assert(&audit_mtx, MA_OWNED);
 
 	replaced = 0;
 	/*
 	 * Loop rather than test once: a new request may arrive while the
 	 * mutex is dropped below.
 	 */
 	while (audit_replacement_flag != 0) {
 		replaced = 1;
 
 		/* Swap in the replacement and consume the request. */
 		prev_cred = *audit_credp;
 		prev_vp = *audit_vpp;
 		*audit_credp = audit_replacement_cred;
 		*audit_vpp = audit_replacement_vp;
 		audit_replacement_cred = NULL;
 		audit_replacement_vp = NULL;
 		audit_replacement_flag = 0;
 
 		audit_enabled = (*audit_vpp != NULL);
 
 		if (prev_vp == NULL)
 			continue;
 
 		/* Closing the old log happens without audit_mtx held. */
 		mtx_unlock(&audit_mtx);
 		vfslocked = VFS_LOCK_GIANT(prev_vp->v_mount);
 		vn_close(prev_vp, AUDIT_CLOSE_FLAGS, prev_cred, audit_td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		crfree(prev_cred);
 		mtx_lock(&audit_mtx);
 	}
 
 	/*
 	 * Broadcast, not signal: this both lets further replacements be
 	 * queued and lets the requester(s) of the completed replacement
 	 * return successfully.  We carry on with our own work meanwhile.
 	 */
 	if (replaced)
 		cv_broadcast(&audit_replacement_cv);
 }
 
 /*
  * Given a kernel audit record, process as required.  Kernel audit records
  * are converted to one, or possibly two, BSM records, depending on whether
  * there is a user audit record present also.  Kernel records need be
  * converted to BSM before they can be written out.  Both types will be
  * written to disk, and audit pipes.
  */
 static void
 audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
     struct thread *audit_td, struct kaudit_record *ar)
 {
 	struct au_record *rec;
 	au_class_t ev_class;
 	au_event_t ev;
 	au_id_t subj_auid;
 	int rc, sorf;
 
 	/*
 	 * User-supplied record first: it is already serialized in k_udata,
 	 * so it goes to the trail and/or the pipes exactly as submitted.
 	 */
 	if (ar->k_ar_commit & AR_COMMIT_USER) {
 		if (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)
 			audit_record_write(audit_vp, audit_cred, audit_td,
 			    ar->k_udata, ar->k_ulen);
 		if (ar->k_ar_commit & AR_PRESELECT_USER_PIPE)
 			audit_pipe_submit_user(ar->k_udata, ar->k_ulen);
 	}
 
 	/*
 	 * Nothing more to do unless a kernel record was committed and at
 	 * least one destination (trail or pipe) preselected it.
 	 */
 	if ((ar->k_ar_commit & AR_COMMIT_KERNEL) == 0)
 		return;
 	if ((ar->k_ar_commit & (AR_PRESELECT_PIPE | AR_PRESELECT_TRAIL)) == 0)
 		return;
 
 	/* Capture the fields the pipe layer needs before conversion. */
 	subj_auid = ar->k_ar.ar_subj_auid;
 	ev = ar->k_ar.ar_event;
 	ev_class = au_event_class(ev);
 	sorf = (ar->k_ar.ar_errno == 0) ? AU_PRS_SUCCESS : AU_PRS_FAILURE;
 
 	/* Convert the kernel record to BSM; only success yields output. */
 	rc = kaudit_to_bsm(ar, &rec);
 	if (rc == BSM_NOAUDIT)
 		return;
 	if (rc == BSM_FAILURE) {
 		printf("audit_worker_process_record: BSM_FAILURE\n");
 		return;
 	}
 	if (rc != BSM_SUCCESS)
 		panic("kaudit_to_bsm returned %d", rc);
 
 	if (ar->k_ar_commit & AR_PRESELECT_TRAIL)
 		audit_record_write(audit_vp, audit_cred, audit_td, rec->data,
 		    rec->len);
 
 	/* The pipe layer is also told whether the trail took the record. */
 	if (ar->k_ar_commit & AR_PRESELECT_PIPE)
 		audit_pipe_submit(subj_auid, ev, ev_class, sorf,
 		    ar->k_ar_commit & AR_PRESELECT_TRAIL, rec->data,
 		    rec->len);
 
 	kau_free(rec);
 }
 
 /*
  * The audit_worker thread is responsible for watching the event queue,
  * dequeueing records, converting them to BSM format, and committing them to
  * disk.  In order to minimize lock thrashing, records are dequeued in sets
  * to a thread-local work queue.  In addition, the audit_worker performs the
  * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
  * variable.
  */
 static void
 audit_worker(void *arg)
 {
 	struct kaudit_queue ar_worklist;
 	struct kaudit_record *ar;
 	struct ucred *audit_cred;
 	struct thread *audit_td;
 	struct vnode *audit_vp;
 	int lowater_signal;
 
 	/*
 	 * These are thread-local variables requiring no synchronization.
 	 */
 	TAILQ_INIT(&ar_worklist);
 	audit_cred = NULL;
 	audit_td = curthread;
 	audit_vp = NULL;
 
 	mtx_lock(&audit_mtx);
 	while (1) {
 		mtx_assert(&audit_mtx, MA_OWNED);
 
 		/*
 		 * Wait for record or rotation events.
 		 */
 		while (!audit_replacement_flag && TAILQ_EMPTY(&audit_q))
 			cv_wait(&audit_worker_cv, &audit_mtx);
 
 		/*
 		 * First priority: replace the audit log target if requested.
 		 */
 		audit_worker_rotate(&audit_cred, &audit_vp, audit_td);
 
 		/*
 		 * If there are records in the global audit record queue,
 		 * transfer them to a thread-local queue and process them
 		 * one by one.  If we cross the low watermark threshold,
 		 * signal any waiting processes that they may wake up and
 		 * continue generating records.
 		 */
 		lowater_signal = 0;
 		while ((ar = TAILQ_FIRST(&audit_q))) {
 			TAILQ_REMOVE(&audit_q, ar, k_q);
 			audit_q_len--;
 			if (audit_q_len == audit_qctrl.aq_lowater)
 				lowater_signal++;
 			TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
 		}
 		if (lowater_signal)
 			cv_broadcast(&audit_watermark_cv);
 
 		mtx_unlock(&audit_mtx);
 		while ((ar = TAILQ_FIRST(&ar_worklist))) {
 			TAILQ_REMOVE(&ar_worklist, ar, k_q);
 			audit_worker_process_record(audit_vp, audit_cred,
 			    audit_td, ar);
 			audit_free(ar);
 		}
 		mtx_lock(&audit_mtx);
 	}
 }
 
 /*
  * audit_rotate_vnode() is called by a user or kernel thread to configure or
  * de-configure auditing on a vnode.  The arguments are the replacement
  * credential and vnode to substitute for the current credential and vnode,
  * if any.  If either is set to NULL, both should be NULL, and this is used
  * to indicate that audit is being disabled.  The real work is done in the
  * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
  * to complete.
  *
  * The vnode should be referenced and opened by the caller.  The credential
  * should be referenced.  audit_rotate_vnode() will own both references as of
  * this call, so the caller should not release either.
  *
  * XXXAUDIT: Review synchronize communication logic.  Really, this is a
  * message queue of depth 1.  We are essentially acquiring ownership of the
  * communications queue, inserting our message, and waiting for an
  * acknowledgement.
  */
 void
 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
 {
 
 	mtx_lock(&audit_mtx);
 
 	/*
 	 * The replacement slot holds a single request; sleep until any
 	 * request that is already posted has been consumed.
 	 */
 	while (audit_replacement_flag)
 		cv_wait(&audit_replacement_cv, &audit_mtx);
 
 	/* Post our request into the now-empty slot. */
 	audit_replacement_vp = vp;
 	audit_replacement_cred = cred;
 	audit_replacement_flag = 1;
 
 	/*
 	 * Nudge the worker.  It cannot act until audit_mtx is released,
 	 * which happens inside the cv_wait() below.
 	 */
 	cv_signal(&audit_worker_cv);
 
 	/*
 	 * Block until a broadcast on audit_replacement_cv arrives; the worker
 	 * issues one after it has swapped replacements in.
 	 */
 	cv_wait(&audit_replacement_cv, &audit_mtx);
 	audit_file_rotate_wait = 0; /* We can now request another rotation */
 	mtx_unlock(&audit_mtx);
 }
 
 void
 audit_worker_init(void)
 {
 	int rc;
 
 	/* The CV must exist before the worker thread can be asked to rotate. */
 	cv_init(&audit_replacement_cv, "audit_replacement_cv");
 
 	/* Start the worker; failing to do so is treated as fatal. */
 	rc = kproc_create(audit_worker, NULL, &audit_thread, RFHIGHPID, 0,
 	    "audit");
 	if (rc != 0)
 		panic("audit_worker_init: kproc_create returned %d", rc);
 }
Index: head/sys/security/mac/mac_process.c
===================================================================
--- head/sys/security/mac/mac_process.c	(revision 175201)
+++ head/sys/security/mac/mac_process.c	(revision 175202)
@@ -1,633 +1,633 @@
 /*-
  * Copyright (c) 1999-2002 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2003 Networks Associates Technology, Inc.
  * Copyright (c) 2005 Samy Al Bahra
  * Copyright (c) 2006 SPARTA, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * security.mac.mmap_revocation: when zero,
  * mac_cred_mmapped_drop_perms_recurse() returns without touching any
  * mapping.  Enabled by default.
  */
 static int	mac_mmap_revocation = 1;
 SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW,
     &mac_mmap_revocation, 0, "Revoke mmap access to files on subject "
     "relabel");
 
 /*
  * security.mac.mmap_revocation_via_cow: when zero, revoking write access
  * clears VM_PROT_WRITE from the affected map entry outright.  Disabled by
  * default.
  */
 static int	mac_mmap_revocation_via_cow = 0;
 SYSCTL_INT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW,
     &mac_mmap_revocation_via_cow, 0, "Revoke mmap access to files via "
     "copy-on-write semantics, or by removing all write access");
 
 /* Forward declaration: the function is used before its definition. */
 static void	mac_cred_mmapped_drop_perms_recurse(struct thread *td,
 		    struct ucred *cred, struct vm_map *map);
 
 /*
  * Allocate a credential label with mac_labelzone_alloc(M_WAITOK), pass it
  * to the cred_init_label entry point through MAC_PERFORM, and return it.
  * MAC_PERFORM is defined elsewhere; presumably it invokes the named entry
  * point of each registered policy.
  */
 struct label *
 mac_cred_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_PERFORM(cred_init_label, label);
 	return (label);
 }
 
 void
 mac_cred_init(struct ucred *cred)
 {
 	struct label *fresh;
 
 	/* Attach a newly allocated, policy-initialized label to the cred. */
 	fresh = mac_cred_label_alloc();
 	cred->cr_label = fresh;
 }
 
 /*
  * Allocate a process label with mac_labelzone_alloc(M_WAITOK), pass it to
  * the proc_init_label entry point through MAC_PERFORM, and return it.
  */
 static struct label *
 mac_proc_label_alloc(void)
 {
 	struct label *label;
 
 	label = mac_labelzone_alloc(M_WAITOK);
 	MAC_PERFORM(proc_init_label, label);
 	return (label);
 }
 
 void
 mac_proc_init(struct proc *p)
 {
 	struct label *fresh;
 
 	/* Attach a newly allocated, policy-initialized label to the process. */
 	fresh = mac_proc_label_alloc();
 	p->p_label = fresh;
 }
 
 /*
  * Release a credential label: run the cred_destroy_label entry point on it
  * through MAC_PERFORM, then return the storage with mac_labelzone_free().
  */
 void
 mac_cred_label_free(struct label *label)
 {
 
 	MAC_PERFORM(cred_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 void
 mac_cred_destroy(struct ucred *cred)
 {
 	struct label *doomed;
 
 	/* Free the label first, then clear the now-stale pointer. */
 	doomed = cred->cr_label;
 	mac_cred_label_free(doomed);
 	cred->cr_label = NULL;
 }
 
 /*
  * Release a process label: run the proc_destroy_label entry point on it
  * through MAC_PERFORM, then return the storage with mac_labelzone_free().
  */
 static void
 mac_proc_label_free(struct label *label)
 {
 
 	MAC_PERFORM(proc_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 void
 mac_proc_destroy(struct proc *p)
 {
 	struct label *doomed;
 
 	/* Free the label first, then clear the now-stale pointer. */
 	doomed = p->p_label;
 	mac_proc_label_free(doomed);
 	p->p_label = NULL;
 }
 
 /*
  * Externalize a credential label via the MAC_EXTERNALIZE macro, handing it
  * the label, the 'elements' string, and the output buffer with its length.
  * Returns the value left in 'error'; that variable is not assigned in this
  * function's own text, so the macro (defined elsewhere) is presumably what
  * stores the result into it.
  */
 int
 mac_cred_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_EXTERNALIZE(cred, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 /*
  * Internalize 'string' into a credential label via the MAC_INTERNALIZE
  * macro.  Returns the value left in 'error'; that variable is not assigned
  * in this function's own text, so the macro (defined elsewhere) is
  * presumably what stores the result into it.
  */
 int
 mac_cred_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_INTERNALIZE(cred, label, string);
 
 	return (error);
 }
 
 /*
  * Initialize MAC label for the first kernel process, from which other kernel
  * processes and threads are spawned.  The credential is handed to the
  * proc_create_swapper entry point through MAC_PERFORM.
  */
 void
 mac_proc_create_swapper(struct ucred *cred)
 {
 
 	MAC_PERFORM(proc_create_swapper, cred);
 }
 
 /*
  * Initialize MAC label for the first userland process, from which other
  * userland processes and threads are spawned.  The credential is handed to
  * the proc_create_init entry point through MAC_PERFORM.
  */
 void
 mac_proc_create_init(struct ucred *cred)
 {
 
 	MAC_PERFORM(proc_create_init, cred);
 }
 
 /*
  * When a thread becomes an NFS server daemon, its credential may need to be
  * updated to reflect this so that policies can recognize when file system
  * operations originate from the network.
  *
  * At some point, it would be desirable if the credential used for each NFS
  * RPC could be set based on the RPC context (i.e., source system, etc.) to
  * provide more fine-grained access control.
  *
  * The credential is handed to the proc_associate_nfsd entry point through
  * MAC_PERFORM.
  */
 void
 mac_proc_associate_nfsd(struct ucred *cred)
 {
 
 	MAC_PERFORM(proc_associate_nfsd, cred);
 }
 
 /*
  * Hand the thread to the thread_userret entry point through MAC_PERFORM.
  * NOTE(review): the name suggests this runs on return to user mode; the
  * call site is not visible here -- confirm.
  */
 void
 mac_thread_userret(struct thread *td)
 {
 
 	MAC_PERFORM(thread_userret, td);
 }
 
 /*
  * When a new process is created, its label must be initialized.  Generally,
  * this involves inheritance from the parent process, modulo possible deltas.
  * This function allows that processing to take place: the source and
  * destination credential labels are handed to the cred_copy_label entry
  * point through MAC_PERFORM.
  */
 void
 mac_cred_copy(struct ucred *src, struct ucred *dest)
 {
 
 	MAC_PERFORM(cred_copy_label, src->cr_label, dest->cr_label);
 }
 
 int
 mac_execve_enter(struct image_params *imgp, struct mac *mac_p)
 {
 	struct label *label;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	if (mac_p == NULL)
 		return (0);
 
 	error = copyin(mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	label = mac_cred_label_alloc();
 	error = mac_cred_internalize_label(label, buffer);
 	free(buffer, M_MACTEMP);
 	if (error) {
 		mac_cred_label_free(label);
 		return (error);
 	}
 	imgp->execlabel = label;
 	return (0);
 }
 
 void
 mac_execve_exit(struct image_params *imgp)
 {
 
 	/* Nothing to release if no exec label was ever attached. */
 	if (imgp->execlabel == NULL)
 		return;
 
 	mac_cred_label_free(imgp->execlabel);
 	imgp->execlabel = NULL;
 }
 
 /*
  * When relabeling a process, call out to the policies for the maximum
  * permission allowed for each object type we know about in its memory space,
  * and revoke access (in the least surprising ways we know) when necessary.
  * The process lock is not held here.
  */
 void
 mac_cred_mmapped_drop_perms(struct thread *td, struct ucred *cred)
 {
 	struct vm_map *procmap;
 
 	/* Start the walk at the top-level map of the calling process. */
 	procmap = &td->td_proc->p_vmspace->vm_map;
 
 	/* XXX freeze all other threads */
 	mac_cred_mmapped_drop_perms_recurse(td, cred, procmap);
 	/* XXX allow other threads to continue */
 }
 
 static __inline const char *
 prot2str(vm_prot_t prot)
 {
 	vm_prot_t bits;
 
 	/*
 	 * Render the protection as an "rwx"-style triple.  Bits outside
 	 * VM_PROT_ALL are ignored; any value not matched below, including
 	 * zero, is shown as "---".
 	 */
 	bits = prot & VM_PROT_ALL;
 
 	if (bits == VM_PROT_READ)
 		return ("r--");
 	if (bits == (VM_PROT_READ | VM_PROT_WRITE))
 		return ("rw-");
 	if (bits == (VM_PROT_READ | VM_PROT_EXECUTE))
 		return ("r-x");
 	if (bits == (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return ("rwx");
 	if (bits == VM_PROT_WRITE)
 		return ("-w-");
 	if (bits == VM_PROT_EXECUTE)
 		return ("--x");
 	if (bits == (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return ("-wx");
 	return ("---");
 }
 
 /*
  * Walk every entry of 'map' (descending into sub-maps) and, for entries
  * whose deepest backing object is a vnode, ask
  * mac_vnode_check_mmap_downgrade() which of the entry's max_protection bits
  * 'cred' may keep.  Bits that are no longer allowed are revoked from the
  * entry and from the pmap.  Does nothing when mac_mmap_revocation is zero.
  *
  * td   - calling thread; supplies the pid for the log message and is passed
  *        to VOP_UNLOCK().
  * cred - credential the mappings are checked against.
  * map  - map to walk; read-locked here for the duration of the walk and
  *        temporarily upgraded while an entry is modified.
  */
 static void
 mac_cred_mmapped_drop_perms_recurse(struct thread *td, struct ucred *cred,
     struct vm_map *map)
 {
 	struct vm_map_entry *vme;
 	int vfslocked, result;
 	vm_prot_t revokeperms;
 	vm_object_t backing_object, object;
 	vm_ooffset_t offset;
 	struct vnode *vp;
 	struct mount *mp;
 
 	if (!mac_mmap_revocation)
 		return;
 
 	vm_map_lock_read(map);
 	for (vme = map->header.next; vme != &map->header; vme = vme->next) {
 		if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			mac_cred_mmapped_drop_perms_recurse(td, cred,
 			    vme->object.sub_map);
 			continue;
 		}
 		/*
 		 * Skip over entries that obviously are not shared.
 		 */
 		if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) ||
 		    !vme->max_protection)
 			continue;
 		/*
 		 * Drill down to the deepest backing object.  The locks are
 		 * taken hand-over-hand: the next object is locked before the
 		 * current one is released.
 		 */
 		offset = vme->offset;
 		object = vme->object.vm_object;
 		if (object == NULL)
 			continue;
 		VM_OBJECT_LOCK(object);
 		while ((backing_object = object->backing_object) != NULL) {
 			VM_OBJECT_LOCK(backing_object);
 			offset += object->backing_object_offset;
 			VM_OBJECT_UNLOCK(object);
 			object = backing_object;
 		}
 		VM_OBJECT_UNLOCK(object);
 		/*
 		 * At the moment, vm_maps and objects aren't considered by
 		 * the MAC system, so only things with backing by a normal
 		 * object (read: vnodes) are checked.
 		 *
 		 * NOTE(review): object->type and object->handle are read
 		 * after the object lock was dropped above -- confirm this is
 		 * safe.
 		 */
 		if (object->type != OBJT_VNODE)
 			continue;
 		vp = (struct vnode *)object->handle;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		/* Start from everything allowed; the check may clear bits. */
 		result = vme->max_protection;
 		mac_vnode_check_mmap_downgrade(cred, vp, &result);
 		VOP_UNLOCK(vp, 0, td);
 		/*
 		 * Find out what maximum protection we may be allowing now
 		 * but a policy needs to get removed.
 		 */
 		revokeperms = vme->max_protection & ~result;
 		if (!revokeperms) {
 			VFS_UNLOCK_GIANT(vfslocked);
 			continue;
 		}
 		printf("pid %ld: revoking %s perms from %#lx:%ld "
 		    "(max %s/cur %s)\n", (long)td->td_proc->p_pid,
 		    prot2str(revokeperms), (u_long)vme->start,
 		    (long)(vme->end - vme->start),
 		    prot2str(vme->max_protection), prot2str(vme->protection));
 		/*
 		 * NOTE(review): the return value of vm_map_lock_upgrade() is
 		 * not examined; whether it can fail is not visible here.
 		 */
 		vm_map_lock_upgrade(map);
 		/*
 		 * This is the really simple case: if a map has more
 		 * max_protection than is allowed, but it's not being
 		 * actually used (that is, the current protection is still
 		 * allowed), we can just wipe it out and do nothing more.
 		 */
 		if ((vme->protection & revokeperms) == 0) {
 			vme->max_protection -= revokeperms;
 		} else {
 			if (revokeperms & VM_PROT_WRITE) {
 				/*
 				 * In the more complicated case, flush out all
 				 * pending changes to the object then turn it
 				 * copy-on-write.
 				 */
 				vm_object_reference(object);
 				(void) vn_start_write(vp, &mp, V_WAIT);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				VM_OBJECT_LOCK(object);
 				vm_object_page_clean(object,
 				    OFF_TO_IDX(offset),
 				    OFF_TO_IDX(offset + vme->end - vme->start +
 					PAGE_MASK),
 				    OBJPC_SYNC);
 				VM_OBJECT_UNLOCK(object);
 				VOP_UNLOCK(vp, 0, td);
 				vn_finished_write(mp);
 				vm_object_deallocate(object);
 				/*
 				 * Why bother if there's no read permissions
 				 * anymore?  For the rest, we need to leave
 				 * the write permissions on for COW, or
 				 * remove them entirely if configured to.
 				 *
 				 * NOTE(review): the second "if" below follows
 				 * the closing brace without an "else", so the
 				 * entry is marked COW whenever read access is
 				 * kept, even when write permission has just
 				 * been removed -- confirm this is intended.
 				 */
 				if (!mac_mmap_revocation_via_cow) {
 					vme->max_protection &= ~VM_PROT_WRITE;
 					vme->protection &= ~VM_PROT_WRITE;
 				} if ((revokeperms & VM_PROT_READ) == 0)
 					vme->eflags |= MAP_ENTRY_COW |
 					    MAP_ENTRY_NEEDS_COPY;
 			}
 			if (revokeperms & VM_PROT_EXECUTE) {
 				vme->max_protection &= ~VM_PROT_EXECUTE;
 				vme->protection &= ~VM_PROT_EXECUTE;
 			}
 			/* Losing read access removes every permission. */
 			if (revokeperms & VM_PROT_READ) {
 				vme->max_protection = 0;
 				vme->protection = 0;
 			}
 			/* Apply the reduced protection to the pmap as well. */
 			pmap_protect(map->pmap, vme->start, vme->end,
 			    vme->protection & ~revokeperms);
 			vm_map_simplify_entry(map, vme);
 		}
 		vm_map_lock_downgrade(map);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	vm_map_unlock_read(map);
 }
 
 /*
  * When the subject's label changes, it may require revocation of privilege
  * to mapped objects.  This can't be done on-the-fly later with a unified
  * buffer cache.
  *
  * This function itself only hands the credential and the new label to the
  * cred_relabel entry point through MAC_PERFORM; no revocation is performed
  * here.
  */
 void
 mac_cred_relabel(struct ucred *cred, struct label *newlabel)
 {
 
 	MAC_PERFORM(cred_relabel, cred, newlabel);
 }
 
 /*
  * Run the cred_check_relabel check on (cred, newlabel) via MAC_CHECK and
  * return the result.  'error' is not assigned in this function's own text;
  * MAC_CHECK (defined elsewhere) is presumably what stores into it.
  */
 int
 mac_cred_check_relabel(struct ucred *cred, struct label *newlabel)
 {
 	int error;
 
 	MAC_CHECK(cred_check_relabel, cred, newlabel);
 
 	return (error);
 }
 
 /*
  * Run the cred_check_visible check on the credential pair (cr1, cr2) via
  * MAC_CHECK and return the result.  'error' is not assigned in this
  * function's own text; MAC_CHECK (defined elsewhere) is presumably what
  * stores into it.
  */
 int
 mac_cred_check_visible(struct ucred *cr1, struct ucred *cr2)
 {
 	int error;
 
 	MAC_CHECK(cred_check_visible, cr1, cr2);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_debug check on (cred, p) via MAC_CHECK and return the
  * result.  The caller must hold the lock on p (asserted).  'error' is
  * presumably assigned inside MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_debug(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_debug, cred, p);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_sched check on (cred, p) via MAC_CHECK and return the
  * result.  The caller must hold the lock on p (asserted).  'error' is
  * presumably assigned inside MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_sched(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_sched, cred, p);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_signal check on (cred, p, signum) via MAC_CHECK and
  * return the result.  The caller must hold the lock on p (asserted).
  * 'error' is presumably assigned inside MAC_CHECK, which is defined
  * elsewhere.
  */
 int
 mac_proc_check_signal(struct ucred *cred, struct proc *p, int signum)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_signal, cred, p, signum);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_setuid check on (cred, uid) via MAC_CHECK and return
  * the result.  p is used only for the lock assertion and is not passed to
  * the check.  'error' is presumably assigned inside MAC_CHECK, which is
  * defined elsewhere.
  */
 int
 mac_proc_check_setuid(struct proc *p, struct ucred *cred, uid_t uid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setuid, cred, uid);
 	return (error);
 }
 
 /*
  * Run the proc_check_seteuid check on (cred, euid) via MAC_CHECK and return
  * the result.  p is used only for the lock assertion and is not passed to
  * the check.  'error' is presumably assigned inside MAC_CHECK, which is
  * defined elsewhere.
  */
 int
 mac_proc_check_seteuid(struct proc *p, struct ucred *cred, uid_t euid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_seteuid, cred, euid);
 	return (error);
 }
 
 /*
  * Run the proc_check_setgid check on (cred, gid) via MAC_CHECK and return
  * the result.  p is used only for the lock assertion and is not passed to
  * the check.  'error' is presumably assigned inside MAC_CHECK, which is
  * defined elsewhere.
  */
 int
 mac_proc_check_setgid(struct proc *p, struct ucred *cred, gid_t gid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setgid, cred, gid);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_setegid check on (cred, egid) via MAC_CHECK and return
  * the result.  p is used only for the lock assertion and is not passed to
  * the check.  'error' is presumably assigned inside MAC_CHECK, which is
  * defined elsewhere.
  */
 int
 mac_proc_check_setegid(struct proc *p, struct ucred *cred, gid_t egid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setegid, cred, egid);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_setgroups check on (cred, ngroups, gidset) via
  * MAC_CHECK and return the result.  p is used only for the lock assertion
  * and is not passed to the check.  'error' is presumably assigned inside
  * MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_setgroups(struct proc *p, struct ucred *cred, int ngroups,
     gid_t *gidset)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setgroups, cred, ngroups, gidset);
 	return (error);
 }
 
 /*
  * Run the proc_check_setreuid check on (cred, ruid, euid) via MAC_CHECK and
  * return the result.  p is used only for the lock assertion and is not
  * passed to the check.  'error' is presumably assigned inside MAC_CHECK,
  * which is defined elsewhere.
  */
 int
 mac_proc_check_setreuid(struct proc *p, struct ucred *cred, uid_t ruid,
     uid_t euid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setreuid, cred, ruid, euid);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_setregid check on (cred, rgid, egid) via MAC_CHECK and
  * return the result.  proc is used only for the lock assertion and is not
  * passed to the check.  'error' is presumably assigned inside MAC_CHECK,
  * which is defined elsewhere.
  */
 int
 mac_proc_check_setregid(struct proc *proc, struct ucred *cred, gid_t rgid,
     gid_t egid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 
 	MAC_CHECK(proc_check_setregid, cred, rgid, egid);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_setresuid check on (cred, ruid, euid, suid) via
  * MAC_CHECK and return the result.  p is used only for the lock assertion
  * and is not passed to the check.  'error' is presumably assigned inside
  * MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_setresuid(struct proc *p, struct ucred *cred, uid_t ruid,
     uid_t euid, uid_t suid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setresuid, cred, ruid, euid, suid);
 	return (error);
 }
 
 /*
  * Run the proc_check_setresgid check on (cred, rgid, egid, sgid) via
  * MAC_CHECK and return the result.  p is used only for the lock assertion
  * and is not passed to the check.  'error' is presumably assigned inside
  * MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_setresgid(struct proc *p, struct ucred *cred, gid_t rgid,
     gid_t egid, gid_t sgid)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_setresgid, cred, rgid, egid, sgid);
 
 	return (error);
 }
 
 /*
  * Run the proc_check_wait check on (cred, p) via MAC_CHECK and return the
  * result.  The caller must hold the lock on p (asserted).  'error' is
  * presumably assigned inside MAC_CHECK, which is defined elsewhere.
  */
 int
 mac_proc_check_wait(struct ucred *cred, struct proc *p)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	MAC_CHECK(proc_check_wait, cred, p);
 
 	return (error);
 }
Index: head/sys/security/mac/mac_syscalls.c
===================================================================
--- head/sys/security/mac/mac_syscalls.c	(revision 175201)
+++ head/sys/security/mac/mac_syscalls.c	(revision 175202)
@@ -1,702 +1,702 @@
 /*-
  * Copyright (c) 1999-2002, 2006 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract 
  * N66001-04-C-6019 ("SEFOS").
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/pipe.h>
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 #ifdef MAC
 
 int
 __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
 {
 	char *elements, *buffer;
 	struct mac mac;
 	struct proc *tproc;
 	struct ucred *tcred;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	tproc = pfind(uap->pid);
 	if (tproc == NULL)
 		return (ESRCH);
 
 	tcred = NULL;				/* Satisfy gcc. */
 	error = p_cansee(td, tproc);
 	if (error == 0)
 		tcred = crhold(tproc->p_ucred);
 	PROC_UNLOCK(tproc);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		crfree(tcred);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	error = mac_cred_externalize_label(tcred->cr_label, elements,
 	    buffer, mac.m_buflen);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	crfree(tcred);
 	return (error);
 }
 
 int
 __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
 {
 	char *elements, *buffer;
 	struct mac mac;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	error = mac_cred_externalize_label(td->td_ucred->cr_label,
 	    elements, buffer, mac.m_buflen);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	return (error);
 }
 
 int
 __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
 {
 	struct ucred *newcred, *oldcred;
 	struct label *intlabel;
 	struct proc *p;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	intlabel = mac_cred_label_alloc();
 	error = mac_cred_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	newcred = crget();
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 	error = mac_cred_check_relabel(oldcred, intlabel);
 	if (error) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 		goto out;
 	}
 
 	setsugid(p);
 	crcopy(newcred, oldcred);
 	mac_cred_relabel(newcred, intlabel);
 	p->p_ucred = newcred;
 
 	/*
 	 * Grab additional reference for use while revoking mmaps, prior to
 	 * releasing the proc lock and sharing the cred.
 	 */
 	crhold(newcred);
 	PROC_UNLOCK(p);
 
 	mac_cred_mmapped_drop_perms(td, newcred);
 
 	crfree(newcred);	/* Free revocation reference. */
 	crfree(oldcred);
 
 out:
 	mac_cred_label_free(intlabel);
 	return (error);
 }
 
 /*
  * System call: return, as text, the MAC label of the object behind file
  * descriptor uap->fd.  uap->mac_p points at a userland struct mac whose
  * m_string holds the requested element list on entry and receives the
  * externalized label on success.
  *
  * Vnodes and FIFOs, pipes, and sockets are handled; any other descriptor
  * type yields EINVAL.  In each case the object's label is first copied into
  * a temporary label under the object's lock and externalized after the lock
  * has been released.
  *
  * Returns 0 on success or an errno value.
  */
 int
 __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
 {
 	char *elements, *buffer;
 	struct label *intlabel;
 	struct file *fp;
 	struct mac mac;
 	struct vnode *vp;
 	struct pipe *pipe;
 	struct socket *so;
 	short label_type;
 	int vfslocked, error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	/* Copy in the list of label elements being asked for. */
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	/* Zeroed so that strlen() below is safe on an untouched buffer. */
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	error = fget(td, uap->fd, &fp);
 	if (error)
 		goto out;
 
 	/*
 	 * NOTE(review): label_type is assigned here but not read anywhere
 	 * else in this function.
 	 */
 	label_type = fp->f_type;
 	switch (fp->f_type) {
 	case DTYPE_FIFO:
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		intlabel = mac_vnode_label_alloc();
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		mac_vnode_copy_label(vp->v_label, intlabel);
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		error = mac_vnode_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_vnode_label_free(intlabel);
 		break;
 
 	case DTYPE_PIPE:
 		pipe = fp->f_data;
 		intlabel = mac_pipe_label_alloc();
 		PIPE_LOCK(pipe);
 		mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
 		PIPE_UNLOCK(pipe);
 		error = mac_pipe_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_pipe_label_free(intlabel);
 		break;
 
 	case DTYPE_SOCKET:
 		so = fp->f_data;
 		intlabel = mac_socket_label_alloc(M_WAITOK);
 		SOCK_LOCK(so);
 		mac_socket_copy_label(so->so_label, intlabel);
 		SOCK_UNLOCK(so);
 		error = mac_socket_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_socket_label_free(intlabel);
 		break;
 
 	default:
 		error = EINVAL;
 	}
 	fdrop(fp, td);
 	/* Copy the text out, including its terminating NUL. */
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 out:
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	return (error);
 }
 
 int
 __mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
 {
 	char *elements, *buffer;
 	struct nameidata nd;
 	struct label *intlabel;
 	struct mac mac;
 	int vfslocked, error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
 	    uap->path_p, td);
 	error = namei(&nd);
 	if (error)
 		goto out;
 
 	intlabel = mac_vnode_label_alloc();
 	vfslocked = NDHASGIANT(&nd);
 	mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
 	error = mac_vnode_externalize_label(intlabel, elements, buffer,
 	    mac.m_buflen);
 
 	NDFREE(&nd, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	mac_vnode_label_free(intlabel);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 out:
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 int
 __mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
 {
 	char *elements, *buffer;
 	struct nameidata nd;
 	struct label *intlabel;
 	struct mac mac;
 	int vfslocked, error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
 	    uap->path_p, td);
 	error = namei(&nd);
 	if (error)
 		goto out;
 
 	intlabel = mac_vnode_label_alloc();
 	vfslocked = NDHASGIANT(&nd);
 	mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
 	error = mac_vnode_externalize_label(intlabel, elements, buffer,
 	    mac.m_buflen);
 	NDFREE(&nd, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	mac_vnode_label_free(intlabel);
 
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 out:
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 int
 __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
 {
 	struct label *intlabel;
 	struct pipe *pipe;
 	struct socket *so;
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
 	struct mac mac;
 	char *buffer;
 	int error, vfslocked;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	error = fget(td, uap->fd, &fp);
 	if (error)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_FIFO:
 	case DTYPE_VNODE:
 		intlabel = mac_vnode_label_alloc();
 		error = mac_vnode_internalize_label(intlabel, buffer);
 		if (error) {
 			mac_vnode_label_free(intlabel);
 			break;
 		}
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0) {
 			VFS_UNLOCK_GIANT(vfslocked);
 			mac_vnode_label_free(intlabel);
 			break;
 		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = vn_setlabel(vp, intlabel, td->td_ucred);
 		VOP_UNLOCK(vp, 0, td);
 		vn_finished_write(mp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		mac_vnode_label_free(intlabel);
 		break;
 
 	case DTYPE_PIPE:
 		intlabel = mac_pipe_label_alloc();
 		error = mac_pipe_internalize_label(intlabel, buffer);
 		if (error == 0) {
 			pipe = fp->f_data;
 			PIPE_LOCK(pipe);
 			error = mac_pipe_label_set(td->td_ucred,
 			    pipe->pipe_pair, intlabel);
 			PIPE_UNLOCK(pipe);
 		}
 		mac_pipe_label_free(intlabel);
 		break;
 
 	case DTYPE_SOCKET:
 		intlabel = mac_socket_label_alloc(M_WAITOK);
 		error = mac_socket_internalize_label(intlabel, buffer);
 		if (error == 0) {
 			so = fp->f_data;
 			error = mac_socket_label_set(td->td_ucred, so,
 			    intlabel);
 		}
 		mac_socket_label_free(intlabel);
 		break;
 
 	default:
 		error = EINVAL;
 	}
 	fdrop(fp, td);
 out:
 	free(buffer, M_MACTEMP);
 	return (error);
 }
 
 int
 __mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
 {
 	struct label *intlabel;
 	struct nameidata nd;
 	struct mount *mp;
 	struct mac mac;
 	char *buffer;
 	int vfslocked, error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	intlabel = mac_vnode_label_alloc();
 	error = mac_vnode_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
 	    uap->path_p, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
 		if (error == 0) {
 			error = vn_setlabel(nd.ni_vp, intlabel,
 			    td->td_ucred);
 			vn_finished_write(mp);
 		}
 	}
 
 	NDFREE(&nd, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 out:
 	mac_vnode_label_free(intlabel);
 	return (error);
 }
 
 int
 __mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
 {
 	struct label *intlabel;
 	struct nameidata nd;
 	struct mount *mp;
 	struct mac mac;
 	char *buffer;
 	int vfslocked, error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	intlabel = mac_vnode_label_alloc();
 	error = mac_vnode_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
 	    uap->path_p, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
 		if (error == 0) {
 			error = vn_setlabel(nd.ni_vp, intlabel,
 			    td->td_ucred);
 			vn_finished_write(mp);
 		}
 	}
 
 	NDFREE(&nd, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 out:
 	mac_vnode_label_free(intlabel);
 	return (error);
 }
 
 int
 mac_syscall(struct thread *td, struct mac_syscall_args *uap)
 {
 	struct mac_policy_conf *mpc;
 	char target[MAC_MAX_POLICY_NAME];
 	int entrycount, error;
 
 	error = copyinstr(uap->policy, target, sizeof(target), NULL);
 	if (error)
 		return (error);
 
 	error = ENOSYS;
 	LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
 		if (strcmp(mpc->mpc_name, target) == 0 &&
 		    mpc->mpc_ops->mpo_syscall != NULL) {
 			error = mpc->mpc_ops->mpo_syscall(td,
 			    uap->call, uap->arg);
 			goto out;
 		}
 	}
 
 	if ((entrycount = mac_policy_list_conditional_busy()) != 0) {
 		LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
 			if (strcmp(mpc->mpc_name, target) == 0 &&
 			    mpc->mpc_ops->mpo_syscall != NULL) {
 				error = mpc->mpc_ops->mpo_syscall(td,
 				    uap->call, uap->arg);
 				break;
 			}
 		}
 		mac_policy_list_unbusy();
 	}
 out:
 	return (error);
 }
 
 #else /* !MAC */
 
 int
 __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 __mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 int
 mac_syscall(struct thread *td, struct mac_syscall_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 #endif /* !MAC */
Index: head/sys/sys/vnode.h
===================================================================
--- head/sys/sys/vnode.h	(revision 175201)
+++ head/sys/sys/vnode.h	(revision 175202)
@@ -1,742 +1,742 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 /*
  * XXX - compatability until lockmgr() goes away or all the #includes are
  * updated.
  */
 #include <sys/lockmgr.h>
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/lock.h>
 #include <sys/_mutex.h>
 #include <sys/mutex.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	f - freelist mutex
  *	G - Giant
  *	i - interlock
  *	m - mntvnodes mutex
  *	p - pollinfo lock
  *	s - spechash mutex
  *	S - syncer mutex
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  *
  * XXX Not all fields are locked yet and some fields that are marked are not
  * locked consistently.  This is a work in progress.  Requires Giant!
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type;			/* u vnode type */
 	const char *v_tag;			/* u type of underlying data */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 * See #defines below for renaming to v_* namespace.
 	 */
 	union {
 		struct mount	*vu_mount;	/* v ptr to mountpoint (VDIR) */
 		struct socket	*vu_socket;	/* v unix domain net (VSOCK) */
 		struct cdev	*vu_cdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*vu_fifoinfo;	/* v fifo (VFIFO) */
 	} v_un;
 
 	/*
 	 * vfs_hash:  (mount + inode) -> vnode hash.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 	u_int			v_hash;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct	vnode *v_dd;			/* c .. vnode */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 	int	v_holdcnt;			/* i prevents recycling. */
 	int	v_usecount;			/* i ref count of users */
 	u_long	v_iflag;			/* i vnode flags (see below) */
 	u_long	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_freelist;		/* f vnode freelist */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* G Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	v_mountedhere	v_un.vu_mount
 #define	v_socket	v_un.vu_socket
 #define	v_rdev		v_un.vu_cdev
 #define	v_fifoinfo	v_un.vu_fifoinfo
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* socket, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /* We don't need to lock the knlist */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, 1)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_AGE		0x0040	/* Insert vnode at head of free list */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_OBJDIRTY	0x0400	/* object might be dirty */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_TEXT		0x0020	/* vnode is a pure text prototype */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	short		va_nlink;	/* number of references to file */
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	dev_t		va_fsid;	/* filesystem id */
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_MARK_ATIME	0x04		/* setting atime for execve/mmap */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather then bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
  */
 #define	VEXEC	000100		/* execute/search permission */
 #define	VWRITE	000200		/* write permission */
 #define	VREAD	000400		/* read permission */
 #define	VSVTX	001000		/* save swapped text even after use */
 #define	VSGID	002000		/* set group id on execution */
 #define	VSUID	004000		/* set user id on execution */
 #define	VADMIN	010000		/* permission to administer */
 #define	VSTAT	020000		/* permission to retrieve attrs */
 #define	VAPPEND	040000		/* permission to write/append */
 #define	VALLPERM	(VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	DOCLOSE		0x0008	/* vclean: close active files */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	int async_io_version;		/* 0 or POSIX version of AIO i'face */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	int prtactive;			/* nonzero to call vprint() */
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 /*
  * Macro/function to check for client cache inconsistency w.r.t. leasing.
  */
 #define	LEASE_READ	0x1		/* Check lease for readers */
 #define	LEASE_WRITE	0x2		/* Check lease for modifiers */
 
 extern void	(*lease_updatetime)(int deltat);
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 #if 0
 void	assert_vop_elocked_other(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_locked(struct vnode *vp, const char *str);
 #if 0
 void	assert_vop_slocked(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str))
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)	assert_vop_slocked((vp), (str))
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)
 #define	ASSERT_VI_UNLOCKED(vp, str)
 #define	ASSERT_VOP_ELOCKED(vp, str)
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str)
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 #include "vnode_if.h"
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vnode;
 
 extern int	(*lease_check_hook)(struct vop_lease_args *);
 
 /* cache_* may belong in namei.h. */
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp);
 void	cache_purge(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
 int	change_dir(struct vnode *vp, struct thread *td);
 int	change_root(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	lease_check(struct vop_lease_args *ap);
 int	speedup_syncer(void);
 #define textvp_fullpath(p, rb, rfb) \
 	vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb)
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, mode_t acc_mode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, mode_t acc_mode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 int	vinvalbuf(struct vnode *vp, int save,
 	    struct thread *td, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
 	    off_t length, int blksize);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 #define vprint(label, vp) vn_printf((vp), "%s\n", (label))
 int	vrecycle(struct vnode *vp, struct thread *td);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_isdisk(struct vnode *vp, int *errp);
-int	_vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line);
-#define vn_lock(vp, flags, td) _vn_lock(vp, flags, td, __FILE__, __LINE__)
+int	_vn_lock(struct vnode *vp, int flags, char *file, int line);
+#define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    struct ucred *cred, struct file *fp);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, int *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_write_suspend_wait(struct vnode *vp, struct mount *mp,
 	    int flags);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp);
 int	vfs_write_suspend(struct mount *mp);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_create_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
 void	vop_lock_pre(void *a);
 void	vop_lock_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_post(void *a, int rc);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_strategy_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
 void	vop_unlock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error, osize, ooffset, noffset;				\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred,	\
 		    curthread);						\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = va.va_size;					\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags, td) VOP_LOCK1(vp, flags, td, __FILE__, __LINE__)
 
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 void vfs_mark_atime(struct vnode *vp, struct thread *td);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 175201)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 175202)
@@ -1,2527 +1,2527 @@
 /*-
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/sched.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #define KERNCRED thread0.td_ucred
 #define DEBUG 1
 
 #include "opt_ffs.h"
 
 #ifdef NO_FFS_SNAPSHOT
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	return (EINVAL);
 }
 
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	return (EINVAL);
 }
 
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 }
 
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 }
 
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 }
 
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 }
 
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	return (EINVAL);
 }
 
 #else
 
 TAILQ_HEAD(snaphead, inode);
 
 struct snapdata {
 	struct snaphead sn_head;
 	daddr_t sn_listsize;
 	daddr_t *sn_blklist;
 	struct lock sn_lock;
 };
 
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 static void process_deferred_inactive(struct mount *);
 static void try_free_snapdata(struct vnode *devvp, struct thread *td);
 static int ffs_bp_snapblk(struct vnode *, struct buf *);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
  * synchronously write out copied blocks before allowing the
  * originals to be modified. Because of the rather severe speed
  * penalty that this imposes, the following flag allows this
  * crash persistence to be disabled.
  */
 int dopersistence = 0;
 
 #ifdef DEBUG
 #include <sys/sysctl.h>
 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 static int snapdebug = 0;
 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 int collectsnapstats = 0;
 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 	0, "");
 #endif /* DEBUG */
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
 	int flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0, snaplistsize = 0;
 	int32_t *lp;
 	void *space;
 	struct fs *copy_fs = NULL, *fs;
 	struct thread *td = curthread;
 	struct inode *ip, *xp;
 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
 	struct nameidata nd;
 	struct mount *wrtmp;
 	struct vattr vat;
 	struct vnode *vp, *xvp, *mvp, *devvp;
 	struct uio auio;
 	struct iovec aiov;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	sn = NULL;
 	MNT_ILOCK(mp);
 	flag = mp->mnt_flag;
 	MNT_IUNLOCK(mp);
 
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
 	 */
 	/*
 	 * Assign a snapshot slot in the superblock.
 	 */
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 	UFS_UNLOCK(ump);
 	if (snaploc == FSMAXSNAP)
 		return (ENOSPC);
 	/*
 	 * Create the snapshot file.
 	 */
 restart:
 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		vput(nd.ni_vp);
 		error = EEXIST;
 	}
 	if (nd.ni_dvp->v_mount != mp)
 		error = EXDEV;
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		return (error);
 	}
 	VATTR_NULL(&vat);
 	vat.va_type = VREG;
 	vat.va_mode = S_IRUSR;
 	vat.va_vaflags |= VA_EXCLUSIVE;
 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
 	vfs_rel(wrtmp);
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &wrtmp,
 		    V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 	VOP_UNLOCK(nd.ni_dvp, 0, td);
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		vrele(nd.ni_dvp);
 		return (error);
 	}
 	vp = nd.ni_vp;
 	vp->v_vflag |= VV_SYSTEM;
 	ip = VTOI(vp);
 	devvp = ip->i_devvp;
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error)
 		goto out;
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP_SET(ip, i_size, ip->i_size);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	error = readblock(vp, bp, numblks - 1);
 	bawrite(bp);
 	if (error != 0)
 		goto out;
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
 	 * operations on the filesystem. We would like to just release
 	 * the allocated buffers without writing them since they will
 	 * be filled in below once we are ready to go, but this upsets
 	 * the soft update code, so we go ahead and write the new buffers.
 	 *
 	 * Allocate all indirect blocks and mark all of them as not
 	 * needing to be copied.
 	 */
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 		if (error)
 			goto out;
 		bawrite(ibp);
 	}
 	/*
 	 * Allocate copies for the superblock and its summary information.
 	 */
 	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
 	    0, &nbp);
 	if (error)
 		goto out;
 	bawrite(nbp);
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	for (loc = 0; loc < len; loc++) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Allocate all cylinder group blocks.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
 	 * filesystem is still active, we hope that only a few
 	 * cylinder groups will change between now and when we
 	 * suspend operations. Thus, we will be able to quickly
 	 * touch up the few cylinder groups that changed during
 	 * the suspension period.
 	 */
 	len = howmany(fs->fs_ncg, NBBY);
 	MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO);
 	UFS_LOCK(ump);
 	fs->fs_active = space;
 	UFS_UNLOCK(ump);
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (error)
 			goto out;
 	}
 	/*
 	 * Change inode to snapshot type file.
 	 */
 	ip->i_flags |= SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * Ensure that the snapshot is completely on disk.
 	 * Since we have marked it as a snapshot it is safe to
 	 * unlock it as no process will be allowed to write to it.
 	 */
 	if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
 		goto out;
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * All allocations are done, so we can now snapshot the system.
 	 *
 	 * Recind nice scheduling while running with the filesystem suspended.
 	 */
 	if (td->td_proc->p_nice > 0) {
 		struct proc *p;
 
 		p = td->td_proc;
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		saved_nice = p->p_nice;
 		sched_nice(p, 0);
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 	}
 	/*
 	 * Suspend operation on filesystem.
 	 */
 	for (;;) {
 		vn_finished_write(wrtmp);
 		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
 			vn_start_write(NULL, &wrtmp, V_WAIT);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			goto out;
 		}
 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 			break;
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (ip->i_effnlink == 0) {
 		error = ENOENT;		/* Snapshot file unlinked */
 		goto out1;
 	}
 	if (collectsnapstats)
 		nanotime(&starttime);
 
 	/* The last block might have changed.  Copy it again to be sure. */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error != 0)
 		goto out1;
 	error = readblock(vp, bp, numblks - 1);
 	bp->b_flags |= B_VALIDSUSPWRT;
 	bawrite(bp);
 	if (error != 0)
 		goto out1;
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 			continue;
 		redo++;
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out1;
 		error = cgaccount(cg, vp, nbp, 2);
 		bawrite(nbp);
 		if (error)
 			goto out1;
 	}
 	/*
 	 * Grab a copy of the superblock and its summary information.
 	 * We delay writing it until the suspension is released below.
 	 */
 	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
 	    KERNCRED, &sbp);
 	if (error) {
 		brelse(sbp);
 		sbp = NULL;
 		goto out1;
 	}
 	loc = blkoff(fs, fs->fs_sblockloc);
 	copy_fs = (struct fs *)(sbp->b_data + loc);
 	bcopy(fs, copy_fs, fs->fs_sbsize);
 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 		copy_fs->fs_clean = 1;
 	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 	if (fs->fs_sbsize < size)
 		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
 	size = blkroundup(fs, fs->fs_cssize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	copy_fs->fs_csp = space;
 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 	space = (char *)space + fs->fs_cssize;
 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
 	i = fs->fs_frag - loc % fs->fs_frag;
 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 	if (len > 0) {
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 		    len, KERNCRED, &bp)) != 0) {
 			brelse(bp);
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		bcopy(bp->b_data, space, (u_int)len);
 		space = (char *)space + len;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	if (fs->fs_contigsumsize > 0) {
 		copy_fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 	/*
 	 * We must check for active files that have been unlinked
 	 * (e.g., with a zero link count). We have to expunge all
 	 * trace of these files from the snapshot so that they are
 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 	 * spec_strategy about writing on a suspended filesystem.
 	 * Note that we skip unlinked snapshot files as they will
 	 * be handled separately below.
 	 *
 	 * We also calculate the needed size for the snapshot list.
 	 */
 	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 loop:
 	MNT_VNODE_FOREACH(xvp, mp, mvp) {
 		VI_LOCK(xvp);
 		MNT_IUNLOCK(mp);
 		if ((xvp->v_iflag & VI_DOOMED) ||
 		    (xvp->v_usecount == 0 &&
 		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
 		    xvp->v_type == VNON ||
 		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * We can skip parent directory vnode because it must have
 		 * this snapshot file in it.
 		 */
 		if (xvp == nd.ni_dvp) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		vholdl(xvp);
-		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
+		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			vdrop(xvp);
 			goto loop;
 		}
 		VI_LOCK(xvp);
 		if (xvp->v_usecount == 0 &&
 		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
 			VI_UNLOCK(xvp);
 			VOP_UNLOCK(xvp, 0, td);
 			vdrop(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		VI_UNLOCK(xvp);
 		if (snapdebug)
 			vprint("ffs_snapshot: busy vnode", xvp);
 		if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
 		    vat.va_nlink > 0) {
 			VOP_UNLOCK(xvp, 0, td);
 			vdrop(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		xp = VTOI(xvp);
 		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 			VOP_UNLOCK(xvp, 0, td);
 			vdrop(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If there is a fragment, clear it here.
 		 */
 		blkno = 0;
 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 		if (loc < NDADDR) {
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
 				    DIP(xp, i_db[loc]), len, xp->i_number);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
 		}
 		snaplistsize += 1;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY);
 		if (blkno)
 			DIP_SET(xp, i_db[loc], blkno);
 		if (!error)
 			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
 			    xp->i_mode);
 		VOP_UNLOCK(xvp, 0, td);
 		vdrop(xvp);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			goto out1;
 		}
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	/*
 	 * If there already exist snapshots on this filesystem, grab a
 	 * reference to their shared lock. If this is the first snapshot
 	 * on this filesystem, we need to allocate a lock for the snapshots
 	 * to share. In either case, acquire the snapshot lock and give
 	 * up our original private lock.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn != NULL) {
 		xp = TAILQ_FIRST(&sn->sn_head);
 		VI_UNLOCK(devvp);
 		VI_LOCK(vp);
 		vp->v_vnlock = &sn->sn_lock;
 	} else {
 		VI_UNLOCK(devvp);
 		sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 		TAILQ_INIT(&sn->sn_head);
 		lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 		    LK_CANRECURSE | LK_NOSHARE);
 		VI_LOCK(vp);
 		vp->v_vnlock = &sn->sn_lock;
 		mp_fixme("si_snapdata setting is racey.");
 		devvp->v_rdev->si_snapdata = sn;
 		xp = NULL;
 	}
 	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 	    VI_MTX(vp), td);
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
 	 * to allocate the space for the list of preallocated snapshot blocks.
 	 * This list will be refined below, but this preliminary one will
 	 * keep us out of deadlock until the full one is ready.
 	 */
 	if (xp == NULL) {
 		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 		    M_UFSMNT, M_WAITOK);
 		blkp = &snapblklist[1];
 		*blkp++ = lblkno(fs, fs->fs_sblockloc);
 		blkno = fragstoblks(fs, fs->fs_csaddr);
 		for (cg = 0; cg < fs->fs_ncg; cg++) {
 			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
 				break;
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		}
 		len = howmany(fs->fs_cssize, fs->fs_bsize);
 		for (loc = 0; loc < len; loc++)
 			*blkp++ = blkno + loc;
 		for (; cg < fs->fs_ncg; cg++)
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		snapblklist[0] = blkp - snapblklist;
 		VI_LOCK(devvp);
 		if (sn->sn_blklist != NULL)
 			panic("ffs_snapshot: non-empty list");
 		sn->sn_blklist = snapblklist;
 		sn->sn_listsize = blkp - snapblklist;
 		VI_UNLOCK(devvp);
 	}
 	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	VI_LOCK(devvp);
 	fs->fs_snapinum[snaploc] = ip->i_number;
 	if (ip->i_nextsnap.tqe_prev != 0)
 		panic("ffs_snapshot: %d already on list", ip->i_number);
 	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 out1:
 	KASSERT((sn != NULL && sbp != NULL && error == 0) ||
 		(sn == NULL && sbp == NULL && error != 0),
 		("email phk@ and mckusick@"));
 	/*
 	 * Resume operation on filesystem.
 	 */
 	vfs_write_resume(vp->v_mount);
 	vn_start_write(NULL, &wrtmp, V_WAIT);
 	if (collectsnapstats && starttime.tv_sec > 0) {
 		nanotime(&endtime);
 		timespecsub(&endtime, &starttime);
 		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
 		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 	}
 	if (sbp == NULL)
 		goto out;
 	/*
 	 * Copy allocation information from all the snapshots in
 	 * this snapshot and then expunge them from its view.
 	 */
 	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
 		if (xp == ip)
 			break;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP);
 		if (error == 0 && xp->i_effnlink == 0) {
 			error = ffs_freefile(ump,
 					     copy_fs,
 					     vp,
 					     xp->i_number,
 					     xp->i_mode);
 		}
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Allocate space for the full list of preallocated snapshot blocks.
 	 */
 	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	ip->i_snapblklist = &snapblklist[1];
 	/*
 	 * Expunge the blocks used by the snapshots from the set of
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
 	else
 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	if (snaplistsize < ip->i_snapblklist - snapblklist)
 		panic("ffs_snapshot: list too small");
 	snaplistsize = ip->i_snapblklist - snapblklist;
 	snapblklist[0] = snaplistsize;
 	ip->i_snapblklist = 0;
 	/*
 	 * Write out the list of allocated blocks to the end of the snapshot.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)snapblklist;
 	aiov.iov_len = snaplistsize * sizeof(daddr_t);
 	auio.uio_resid = aiov.iov_len;;
 	auio.uio_offset = ip->i_size;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	/*
 	 * Write the superblock and its summary information
 	 * to the snapshot.
 	 */
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	space = copy_fs->fs_csp;
 	for (loc = 0; loc < len; loc++) {
 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			fs->fs_snapinum[snaploc] = 0;
 			FREE(snapblklist, M_UFSMNT);
 			goto done;
 		}
 		bcopy(space, nbp->b_data, fs->fs_bsize);
 		space = (char *)space + fs->fs_bsize;
 		bawrite(nbp);
 	}
 	/*
 	 * As this is the newest list, it is the most inclusive, so
 	 * should replace the previous list.
 	 */
 	VI_LOCK(devvp);
 	space = sn->sn_blklist;
 	sn->sn_blklist = snapblklist;
 	sn->sn_listsize = snaplistsize;
 	VI_UNLOCK(devvp);
 	if (space != NULL)
 		FREE(space, M_UFSMNT);
 	/*
 	 * If another process is currently writing the buffer containing
 	 * the inode for this snapshot then a deadlock can occur. Drop
 	 * the snapshot lock until the buffer has been written.
 	 */
 	VREF(vp);	/* Protect against ffs_snapgone() */
 	VOP_UNLOCK(vp, 0, td);
 	(void) bread(ip->i_devvp,
 		     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		     (int) fs->fs_bsize, NOCRED, &nbp);
 	brelse(nbp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (ip->i_effnlink == 0)
 		error = ENOENT;		/* Snapshot file unlinked */
 	else
 		vrele(vp);		/* Drop extra reference */
 done:
 	FREE(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
 out:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (saved_nice > 0) {
 		struct proc *p;
 
 		p = td->td_proc;
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		sched_nice(td->td_proc, saved_nice);
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(td->td_proc);
 	}
 	UFS_LOCK(ump);
 	if (fs->fs_active != 0) {
 		FREE(fs->fs_active, M_DEVBUF);
 		fs->fs_active = 0;
 	}
 	UFS_UNLOCK(ump);
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 	MNT_IUNLOCK(mp);
 	if (error)
 		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	if (error)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp, 0, td);
 	vrele(nd.ni_dvp);
 	vn_finished_write(wrtmp);
 	process_deferred_inactive(mp);
 	return (error);
 }
 
 /*
  * Copy a cylinder group map. All the unallocated blocks are marked
  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  * if they are later written. If passno is one, then this is a first
  * pass, so only setting needs to be done. If passno is 2, then this
  * is a revision to a previous pass which must be undone as the
  * replacement pass is done.
  */
 static int
 cgaccount(cg, vp, nbp, passno)
 	int cg;
 	struct vnode *vp;
 	struct buf *nbp;
 	int passno;
 {
 	struct buf *bp, *ibp;
 	struct inode *ip;
 	struct cg *cgp;
 	struct fs *fs;
 	ufs2_daddr_t base, numblks;
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, KERNCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return (EIO);
 	}
 	UFS_LOCK(ip->i_ump);
 	ACTIVESET(fs, cg);
 	UFS_UNLOCK(ip->i_ump);
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
 		    fs->fs_bsize - fs->fs_cgsize);
 	cgp = (struct cg *)nbp->b_data;
 	bqrelse(bp);
 	if (passno == 2)
 		nbp->b_flags |= B_VALIDSUSPWRT;
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	len = howmany(fs->fs_fpg, fs->fs_frag);
 	base = cgbase(fs, cg) / fs->fs_frag;
 	if (base + len >= numblks)
 		len = numblks - base - 1;
 	loc = 0;
 	if (base < NDADDR) {
 		for ( ; loc < NDADDR; loc++) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
 			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				DIP_SET(ip, i_db[loc], 0);
 			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				panic("ffs_snapshot: lost direct block");
 		}
 	}
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 	if (error) {
 		return (error);
 	}
 	indiroff = (base + loc - NDADDR) % NINDIR(fs);
 	for ( ; loc < len; loc++, indiroff++) {
 		if (indiroff >= NINDIR(fs)) {
 			if (passno == 2)
 				ibp->b_flags |= B_VALIDSUSPWRT;
 			bawrite(ibp);
 			error = UFS_BALLOC(vp,
 			    lblktosize(fs, (off_t)(base + loc)),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error) {
 				return (error);
 			}
 			indiroff = 0;
 		}
 		if (ip->i_ump->um_fstype == UFS1) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				panic("ffs_snapshot: lost indirect block");
 			continue;
 		}
 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 		else if (passno == 2 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 		else if (passno == 1 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			panic("ffs_snapshot: lost indirect block");
 	}
 	if (passno == 2)
 		ibp->b_flags |= B_VALIDSUSPWRT;
 	bdwrite(ibp);
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs1_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * or unlinked snapshots to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  *	level		indirection depth below this block; at level 0 the
  *			pointers are handed to acctfunc with rlbn as their
  *			starting logical block, otherwise with -1
  *	blkno		filesystem address of the indirect block to read
  *	lbn		logical block number of that indirect block in cancelvp
  *	rlbn		logical block number passed to ufs_getlbns() and, at
  *			level 0, to acctfunc for the first pointer
  *	remblks		number of blocks still to be accounted for
  *	blksperindir	number of blocks covered by each pointer in this block
  *
  * Returns 0 on success, or the first error from ufs_getlbns(),
  * readblock() or acctfunc.
  */
 static int
 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs1_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs1_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		/* A missing indirect block is only acceptable for BLK_NOCOPY. */
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs1: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num before it is used to index indirs[], so that a
 	 * short chain cannot cause an out-of-bounds read ahead of the panic.
 	 */
 	if (num < 2 || lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs1: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * Only the first "last" pointers are in use; cap at a full block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/* Work on a private copy so the buffer can be released right away. */
 	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Run snapshot accounting and then map accounting over the same range
  * of block pointers, stopping at the first failure.
  */
 static int
 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rv;
 
 	rv = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rv == 0)
 		rv = mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rv);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs1_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din1->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs1_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs1: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Release the blocks named by the pointers in [oldblkp, lastblkp) on
  * behalf of snapshot vp.  When lblkno is not -1 and a BLK_SNAP expunge
  * is in progress, the logical block number of each real block is also
  * appended to the inode's i_snapblklist.
  */
 static int
 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip;
 	ufs1_daddr_t daddr;
 	ino_t snapino;
 	int record;
 
 	snapip = VTOI(vp);
 	snapino = snapip->i_number;
 	/* A starting lblkno of -1 means "do not record block numbers". */
 	record = (lblkno != -1);
 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 		daddr = *oldblkp;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP) {
 				/* The marker stands for the block at lblkno. */
 				daddr = blkstofrags(fs, lblkno);
 			} else if (record && expungetype == BLK_SNAP) {
 				*snapip->i_snapblklist++ = lblkno;
 			}
 			ffs_blkfree(snapip->i_ump, fs, vp, daddr,
 			    fs->fs_bsize, snapino);
 		}
 	}
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs2_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  *	level		indirection depth below this block; at level 0 the
  *			pointers are handed to acctfunc with rlbn as their
  *			starting logical block, otherwise with -1
  *	blkno		filesystem address of the indirect block to read
  *	lbn		logical block number of that indirect block in cancelvp
  *	rlbn		logical block number passed to ufs_getlbns() and, at
  *			level 0, to acctfunc for the first pointer
  *	remblks		number of blocks still to be accounted for
  *	blksperindir	number of blocks covered by each pointer in this block
  *
  * Returns 0 on success, or the first error from ufs_getlbns(),
  * readblock() or acctfunc.
  */
 static int
 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs2_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs2_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		/* A missing indirect block is only acceptable for BLK_NOCOPY. */
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs2: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num before it is used to index indirs[], so that a
 	 * short chain cannot cause an out-of-bounds read ahead of the panic.
 	 */
 	if (num < 2 || lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs2: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * Only the first "last" pointers are in use; cap at a full block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/* Work on a private copy so the buffer can be released right away. */
 	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Run snapshot accounting and then map accounting over the same range
  * of block pointers, stopping at the first failure.
  */
 static int
 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rv;
 
 	rv = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rv == 0)
 		rv = mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rv);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs2_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din2->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs2_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs2: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Release the blocks named by the pointers in [oldblkp, lastblkp) on
  * behalf of snapshot vp.  When lblkno is not -1 and a BLK_SNAP expunge
  * is in progress, the logical block number of each real block is also
  * appended to the inode's i_snapblklist.
  */
 static int
 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip;
 	ufs2_daddr_t daddr;
 	ino_t snapino;
 	int record;
 
 	snapip = VTOI(vp);
 	snapino = snapip->i_number;
 	/* A starting lblkno of -1 means "do not record block numbers". */
 	record = (lblkno != -1);
 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 		daddr = *oldblkp;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP) {
 				/* The marker stands for the block at lblkno. */
 				daddr = blkstofrags(fs, lblkno);
 			} else if (record && expungetype == BLK_SNAP) {
 				*snapip->i_snapblklist++ = lblkno;
 			}
 			ffs_blkfree(snapip->i_ump, fs, vp, daddr,
 			    fs->fs_bsize, snapino);
 		}
 	}
 	return (0);
 }
 
 /*
  * Decrement extra reference on snapshot when last name is removed.
  * It will not be freed until the last open reference goes away.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 	struct inode *xp;
 	struct fs *fs;
 	int snaploc;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	/*
 	 * Find snapshot in incore list.
 	 */
 	xp = NULL;
 	sn = ip->i_devvp->v_rdev->si_snapdata;
 	if (sn != NULL)
 		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 			if (xp == ip)
 				break;
 	if (xp != NULL)
 		vrele(ITOV(ip));
 	else if (snapdebug)
 		printf("ffs_snapgone: lost snapshot vnode %d\n",
 		    ip->i_number);
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
 			break;
 	if (snaploc < FSMAXSNAP) {
 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 			if (fs->fs_snapinum[snaploc] == 0)
 				break;
 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 		}
 		fs->fs_snapinum[snaploc - 1] = 0;
 	}
 	UFS_UNLOCK(ump);
 }
 
 /*
  * Prepare a snapshot file for being removed.
  *
  * Takes the snapshot off the in-core list (if it is on it), switches
  * the vnode back from the shared snapshot lock to its private v_lock,
  * then walks the snapshot's block map clearing BLK_NOCOPY/BLK_SNAP
  * markers and offering claimed blocks to the remaining snapshots via
  * ffs_snapblkfree().  Finally clears SF_SNAPSHOT on the inode.
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct vnode *devvp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	ufs2_daddr_t numblks, blkno, dblk;
 	int error, loc, last;
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	devvp = ip->i_devvp;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
 	VI_LOCK(devvp);
 	if (ip->i_nextsnap.tqe_prev != 0) {
 		sn = devvp->v_rdev->si_snapdata;
 		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 		/* A zero tqe_prev is the "not on the list" marker tested above. */
 		ip->i_nextsnap.tqe_prev = 0;
 		VI_UNLOCK(devvp);
 		/*
 		 * Take the vnode's private lock first, then repoint v_vnlock
 		 * at it, and only afterwards release the shared snapshot lock.
 		 */
 		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
 		VI_LOCK(vp);
 		KASSERT(vp->v_vnlock == &sn->sn_lock,
 			("ffs_snapremove: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
 		VI_UNLOCK(vp);
 		VI_LOCK(devvp);
 		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Called with the devvp interlock held; no VI_UNLOCK follows
 		 * here, so it presumably drops the interlock itself.
 		 */
 		try_free_snapdata(devvp, td);
 	} else
 		VI_UNLOCK(devvp);
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 *
 	 * NOTE(review): the direct-block scan starts at index 1, not 0;
 	 * presumably block 0 never needs this treatment - confirm.
 	 */
 	for (blkno = 1; blkno < NDADDR; blkno++) {
 		dblk = DIP(ip, i_db[blkno]);
 		if (dblk == 0)
 			continue;
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP_SET(ip, i_db[blkno], 0);
 		/*
 		 * A pointer equal to its own logical block is a claimed
 		 * block; drop it from this inode only if another snapshot
 		 * takes it over (non-zero return).
 		 */
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 		     ip->i_number))) {
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 			    btodb(fs->fs_bsize));
 			DIP_SET(ip, i_db[blkno], 0);
 		}
 	}
 	/* Repeat for the indirectly mapped blocks, one indirect block at a time. */
 	numblks = howmany(ip->i_size, fs->fs_bsize);
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 		/* An indirect block that cannot be obtained is skipped silently. */
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		for (loc = 0; loc < last; loc++) {
 			/*
 			 * NOTE(review): both branches below compare dblk with
 			 * blkstofrags(fs, blkno), i.e. the first block mapped
 			 * by this indirect block, rather than blkno + loc -
 			 * confirm this is intended.
 			 */
 			if (ip->i_ump->um_fstype == UFS1) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == 0)
 					continue;
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
 				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 				     fs->fs_bsize, ip->i_number))) {
 					ip->i_din1->di_blocks -=
 					    btodb(fs->fs_bsize);
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				}
 				continue;
 			}
 			/* UFS2 layout: same logic with 64-bit block pointers. */
 			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 			if (dblk == 0)
 				continue;
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
 			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 			     fs->fs_bsize, ip->i_number))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			}
 		}
 		bawrite(ibp);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 #ifdef QUOTA
 	/*
 	 * Reenable disk quotas for ex-snapshot file.
 	 */
 	if (!getinoquota(ip))
 		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
 #endif
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
  * uncopied part of one of the snapshots. It will be freed if it is
  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
  * If a fragment is being freed, then all snapshots that care about
  * it must make a copy since a snapshot file can only claim full sized
  * blocks. Note that if more than one snapshot file maps the block,
  * we can pick one at random to claim it. Since none of the snapshots
  * can change, we are assured that they will all see the same unmodified
  * image. When deleting a snapshot file (see ffs_snapremove above), we
  * must push any of these claimed blocks to one of the other snapshots
  * that maps it. These claimed blocks are easily identified as they will
  * have a block number equal to their logical block number within the
  * snapshot. A copied block can never have this property because they
  * must always have been allocated from a BLK_NOCOPY location.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t blkno;
 	int indiroff = 0, error = 0, claimedblk = 0;
 	struct snapdata *sn;
 
 	/*
 	 * Arguments:
 	 *	fs	filesystem the block belongs to
 	 *	devvp	device vnode carrying the snapshot list
 	 *	bno	fragment address of the block being freed
 	 *	size	number of bytes being freed
 	 *	inum	inode that owned the block (debug output only)
 	 *
 	 * Returns 0 when the free may proceed, 1 when a snapshot claimed
 	 * the block, or an errno when a copy could not be made.
 	 */
 	lbn = fragstoblks(fs, bno);
 retry:
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	/*
 	 * LK_SLEEPFAIL makes the request fail if we had to sleep for the
 	 * lock; the snapshot data is then looked up afresh.
 	 */
 	if (lockmgr(&sn->sn_lock,
 		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		    VI_MTX(devvp), td) != 0)
 		goto retry;
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			td->td_pflags |= TDP_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 		}
 		/*
 		 * Check to see if block needs to be copied.
 		 */
 		if (blkno == 0) {
 			/*
 			 * A block that we map is being freed. If it has not
 			 * been claimed yet, we will claim or copy it (below).
 			 */
 			claimedblk = 1;
 		} else if (blkno == BLK_SNAP) {
 			/*
 			 * No previous snapshot claimed the block,
 			 * so it will be freed and become a BLK_NOCOPY
 			 * (don't care) for us.
 			 */
 			if (claimedblk)
 				panic("snapblkfree: inconsistent block type");
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			}
 			continue;
 		} else /* BLK_NOCOPY or default */ {
 			/*
 			 * If the snapshot has already copied the block
 			 * (default), or does not care about the block,
 			 * it is not needed.
 			 */
 			if (lbn >= NDADDR)
 				bqrelse(ibp);
 			continue;
 		}
 		/*
 		 * If this is a full size block, we will just grab it
 		 * and assign it to the snapshot inode. Otherwise we
 		 * will proceed to copy it. See explanation for this
 		 * routine as to why only a single snapshot needs to
 		 * claim this block.
 		 */
 		if (size == fs->fs_bsize) {
 #ifdef DEBUG
 			if (snapdebug)
 				printf("%s %d lbn %jd from inum %d\n",
 				    "Grabonremove: snapino", ip->i_number,
 				    (intmax_t)lbn, inum);
 #endif
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], bno);
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			}
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 			return (1);
 		}
 		if (lbn >= NDADDR)
 			bqrelse(ibp);
 		/*
 		 * Allocate the block into which to do the copy. Note that this
 		 * allocation will never require any additional allocations for
 		 * the snapshot inode.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 			    "Copyonremove: snapino ", ip->i_number,
 			    (intmax_t)lbn, "for inum", inum, size,
 			    (intmax_t)cbp->b_blkno);
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			break;
 		}
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT);
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 *
 	 * Release the snapshot lock through sn rather than through
 	 * vp->v_vnlock: if the snapshot list was empty the loop above
 	 * never ran and vp is still NULL.
 	 */
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  *
  * For every inode number listed in fs_snapinum[], fetch the vnode,
  * discard entries that are not usable snapshots, switch usable ones to
  * the shared snapshot lock and link them onto the device's snapshot
  * list.  Then read the block hints list from the last snapshot that
  * was attached and enable copy-on-write on the device vnode.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct vnode *devvp = ump->um_devvp;
 	struct fs *fs = ump->um_fs;
 	struct thread *td = curthread;
 	struct snapdata *sn;
 	struct vnode *vp;
 	struct vnode *lastvp;
 	struct inode *ip;
 	struct uio auio;
 	struct iovec aiov;
 	void *snapblklist;
 	char *reason;
 	daddr_t snaplistsize;
 	int error, snaploc, loc;
 
 	/*
 	 * XXX The following needs to be set before ffs_truncate or
 	 * VOP_READ can be called.
 	 */
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	/*
 	 * Process each snapshot listed in the superblock.
 	 */
 	vp = NULL;
 	lastvp = NULL;
 	sn = devvp->v_rdev->si_snapdata;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
 		    LK_EXCLUSIVE, &vp)) != 0){
 			printf("ffs_snapshot_mount: vget failed %d\n", error);
 			continue;
 		}
 		ip = VTOI(vp);
 		/*
 		 * Reject inodes without SF_SNAPSHOT, and snapshots whose size
 		 * is exactly the filesystem size in whole blocks (reported
 		 * below as "old format").
 		 */
 		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 				(void)ffs_syncvnode(vp, MNT_WAIT);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
 			vp = NULL;
 			/*
 			 * Close the gap in fs_snapinum[] and step snaploc
 			 * back so the entry moved into this slot is examined
 			 * on the next iteration.
 			 */
 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 				if (fs->fs_snapinum[loc] == 0)
 					break;
 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 			}
 			fs->fs_snapinum[loc - 1] = 0;
 			snaploc--;
 			continue;
 		}
 		/*
 		 * If there already exist snapshots on this filesystem, grab a
 		 * reference to their shared lock. If this is the first snapshot
 		 * on this filesystem, we need to allocate a lock for the
 		 * snapshots to share. In either case, acquire the snapshot
 		 * lock and give up our original private lock.
 		 */
 		VI_LOCK(devvp);
 		if (sn != NULL) {
 
 			VI_UNLOCK(devvp);
 			VI_LOCK(vp);
 			vp->v_vnlock = &sn->sn_lock;
 		} else {
 			VI_UNLOCK(devvp);
 			sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 			TAILQ_INIT(&sn->sn_head);
 			lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 			    LK_CANRECURSE | LK_NOSHARE);
 			VI_LOCK(vp);
 			vp->v_vnlock = &sn->sn_lock;
 			devvp->v_rdev->si_snapdata = sn;
 		}
 		/*
 		 * Both branches leave the vnode interlock held; LK_INTERLOCK
 		 * hands it to lockmgr while the shared lock is acquired.
 		 */
 		lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 		    VI_MTX(vp), td);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Link it onto the active snapshot list.
 		 */
 		VI_LOCK(devvp);
 		if (ip->i_nextsnap.tqe_prev != 0)
 			panic("ffs_snapshot_mount: %d already on list",
 			    ip->i_number);
 		else
 			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 		vp->v_vflag |= VV_SYSTEM;
 		VI_UNLOCK(devvp);
 		VOP_UNLOCK(vp, 0, td);
 		lastvp = vp;
 	}
 	vp = lastvp;
 	/*
 	 * No usable snapshots found.
 	 */
 	if (vp == NULL)
 		return;
 	/*
 	 * Allocate the space for the block hints list. We always want to
 	 * use the list from the newest snapshot.
 	 *
 	 * First read just the list length, stored at the offset of the
 	 * filesystem size rounded up to whole blocks.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)&snaplistsize;
 	aiov.iov_len = sizeof(snaplistsize);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset =
 	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		return;
 	}
 	/*
 	 * NOTE(review): snaplistsize comes straight from disk and is used
 	 * unvalidated as the allocation and read size - confirm that a
 	 * corrupt value cannot reach this point.
 	 */
 	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	/*
 	 * Read the whole list.  The offset is moved back by the size of the
 	 * length word, so the list presumably starts with that word.
 	 */
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = snapblklist;
 	aiov.iov_len = snaplistsize * sizeof (daddr_t);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset -= sizeof(snaplistsize);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		FREE(snapblklist, M_UFSMNT);
 		return;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	/* Publish the list and turn on copy-on-write for the device. */
 	VI_LOCK(devvp);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 	sn->sn_listsize = snaplistsize;
 	sn->sn_blklist = (daddr_t *)snapblklist;
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 }
 
 /*
  * Disassociate snapshot files when unmounting.
  *
  * Repeatedly takes the first inode off the device's snapshot list,
  * switches its vnode from the shared snapshot lock back to its private
  * v_lock, and drops the vnode reference for snapshots that still have
  * a link.  Once the list is empty the snapshot data is handed to
  * try_free_snapdata().
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 	struct snapdata *sn;
 	struct inode *xp;
 	struct vnode *vp;
 	struct thread *td = curthread;
 
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 		vp = ITOV(xp);
 		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 		/* A zero tqe_prev marks the inode as no longer on the list. */
 		xp->i_nextsnap.tqe_prev = 0;
 		/* LK_INTERLOCK passes the devvp interlock held above to lockmgr. */
 		lockmgr(&sn->sn_lock, 
 			LK_INTERLOCK | LK_EXCLUSIVE,
 			VI_MTX(devvp),
 			td);
 		VI_LOCK(vp);
 		lockmgr(&vp->v_lock,
 			LK_INTERLOCK | LK_EXCLUSIVE,
 			VI_MTX(vp), td);
 		/* Retake the vnode interlock to repoint v_vnlock. */
 		VI_LOCK(vp);
 		KASSERT(vp->v_vnlock == &sn->sn_lock,
 		("ffs_snapshot_unmount: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
 		VI_UNLOCK(vp);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 		/* Only snapshots that still have a link hold the extra reference. */
 		if (xp->i_effnlink > 0)
 			vrele(vp);
 		/* Re-read the snapshot data under the interlock before looping. */
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 	}
 	/*
 	 * Called with the devvp interlock held; no VI_UNLOCK follows, so it
 	 * presumably releases the interlock itself.
 	 */
 	try_free_snapdata(devvp, td);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 }
 
 /*
  * Check the buffer block to be belong to device buffer that shall be
  * locked after snaplk. devvp shall be locked on entry, and will be
  * leaved locked upon exit.
  */
 static int
 ffs_bp_snapblk(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snapdata *sn;
 	struct fs *fs;
 	ufs2_daddr_t lbn, *snapblklist;
 	int lower, upper, mid;
 
 	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
 	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
 		return (0);
 	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	upper = sn->sn_listsize - 1;
 	lower = 1;
 	while (lower <= upper) {
 		mid = (lower + upper) / 2;
 		if (snapblklist[mid] == lbn)
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	if (lower <= upper)
 		return (1);
 	return (0);
 }
 
 /*
  * Called for buffer bp on buffer object bo; does nothing unless bo has
  * more than dirtybufthresh dirty buffers.  Beyond that threshold it
  * either fsyncs the whole vnode without waiting, or picks one other
  * dirty buffer from bo and writes it out asynchronously.  When bp is a
  * block found in the snapshot hints list, only buffers that are also
  * in that list are eligible to be written.
  */
 void
 ffs_bdflush(bo, bp)
 	struct bufobj *bo;
 	struct buf *bp;
 {
 	struct thread *td;
 	struct vnode *vp, *devvp;
 	struct buf *nbp;
 	int bp_bdskip;
 
 	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
 		return;
 
 	td = curthread;
 	vp = bp->b_vp;
 	devvp = bo->__bo_vnode;
 	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
 
 	/* Is bp itself one of the blocks in the snapshot hints list? */
 	VI_LOCK(devvp);
 	bp_bdskip = ffs_bp_snapblk(devvp, bp);
 	if (bp_bdskip)
 		bdwriteskip++;
 	VI_UNLOCK(devvp);
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
 		/* Well past the threshold: push out the whole vnode. */
 		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
 		altbufferflushes++;
 	} else {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			/* Skip buffers being written in the background or locked. */
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/*
 			 * Don't countdeps with the bo lock
 			 * held.
 			 */
 			if (buf_countdeps(nbp, 0)) {
 				/* Retake the bo lock before resuming the list walk. */
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (bp_bdskip) {
 				VI_LOCK(devvp);
 				if (!ffs_bp_snapblk(vp, nbp)) {
 					/*
 					 * Not a hinted block: move on.  If the
 					 * bo mutex and the vnode interlock are
 					 * the same mutex, the interlock taken
 					 * above already serves as the bo lock;
 					 * otherwise swap one for the other.
 					 */
 					if (BO_MTX(bo) != VI_MTX(vp)) {
 						VI_UNLOCK(devvp);
 						BO_LOCK(bo);
 					}
 					BUF_UNLOCK(nbp);
 					continue;
 				}
 				VI_UNLOCK(devvp);
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			break;
 		}
 		/*
 		 * The loop exits via break with the bo lock already dropped;
 		 * only when it ran off the end of the list (nbp == NULL) is
 		 * the lock still held.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Check for need to copy block that is about to be written,
  * copying the block if necessary.
  *
  * devvp is the device vnode whose snapdata is consulted and bp is the
  * buffer that is about to be written.  Returns 0 on success (including
  * every case in which no copy is needed), otherwise the error reported
  * by UFS_BALLOC() or readblock().
  */
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snapdata *sn;
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp = 0;
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, error = 0;
 	int launched_async_io, prev_norunningbuf;
 	long saved_runningbufspace;
 
 	if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 		return (0);		/* Update on a snapshot file */
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
 	/*
 	 * First check to see if it is in the preallocated list.
 	 * By doing this check we avoid several potential deadlocks.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL ||
 	    TAILQ_EMPTY(&sn->sn_head)) {
 		VI_UNLOCK(devvp);
 		return (0);		/* No snapshot */
 	}
 	ip = TAILQ_FIRST(&sn->sn_head);
 	fs = ip->i_fs;
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	/*
 	 * Binary search of the list for lbn.  The search window starts at
 	 * index 1, not 0.
 	 * NOTE(review): slot 0 presumably holds something other than a
 	 * block number (e.g. the list length) - confirm.
 	 */
 	upper = sn->sn_listsize - 1;
 	lower = 1;
 	while (lower <= upper) {
 		mid = (lower + upper) / 2;
 		if (snapblklist[mid] == lbn)
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	/*
 	 * The loop can only end with lower <= upper through the break
 	 * above, i.e. lbn was found in the list and nothing has to be
 	 * copied.
 	 */
 	if (lower <= upper) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	launched_async_io = 0;
 	/* Remember the caller's TDP_NORUNNINGBUF bit so it can be restored. */
 	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
 	/*
 	 * Since I/O on bp isn't yet in progress and it may be blocked
 	 * for a long time waiting on snaplk, back it out of
 	 * runningbufspace, possibly waking other threads waiting for space.
 	 */
 	saved_runningbufspace = bp->b_runningbufspace;
 	if (saved_runningbufspace != 0)
 		runningbufwakeup(bp);
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 *
 	 * The devvp interlock is handed to lockmgr() via LK_INTERLOCK.
 	 * Whenever the request fails (LK_SLEEPFAIL is set) the interlock
 	 * is taken again and the snapdata is re-read before retrying,
 	 * since it may have changed or gone away in the meantime.
 	 */
 	while (lockmgr(&sn->sn_lock,
 		       LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		       VI_MTX(devvp), td) != 0) {
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL ||
 		    TAILQ_EMPTY(&sn->sn_head)) {
 			VI_UNLOCK(devvp);
 			/* Undo the runningbufspace back-out done above. */
 			if (saved_runningbufspace != 0) {
 				bp->b_runningbufspace = saved_runningbufspace;
 				atomic_add_int(&runningbufspace,
 					       bp->b_runningbufspace);
 			}
 			return (0);		/* Snapshot gone */
 		}
 	}
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in UFS_BALLOC.
 		 */
 		if (bp->b_vp == vp)
 			continue;
 		/*
 		 * Check to see if block needs to be copied. We do not have
 		 * to hold the snapshot lock while doing this lookup as it
 		 * will never require any additional allocations for the
 		 * snapshot inode.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			/*
 			 * TDP_COWINPROGRESS is set only around the
 			 * UFS_BALLOC() call; TDP_NORUNNINGBUF stays set
 			 * until it is restored at the end of the function.
 			 */
 			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 			bqrelse(ibp);
 		}
 #ifdef INVARIANTS
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		/* A non-zero entry means this snapshot needs no copy. */
 		if (blkno != 0)
 			continue;
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %d lbn %jd for ",
 			    ip->i_number, (intmax_t)lbn);
 			if (bp->b_vp == devvp)
 				printf("fs metadata");
 			else
 				printf("inum %d", VTOI(bp->b_vp)->i_number);
 			printf(" lblkno %jd to blkno %jd\n",
 			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 		}
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			else
 				launched_async_io = 1;
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 * If the read fails, a zero-filled block is written to the
 		 * snapshot instead and the scan stops with that error.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			else
 				launched_async_io = 1;
 			break;
 		}
 		/* Keep the first copy around as the source for later snapshots. */
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT);
 		else
 			launched_async_io = 1;
 	}
 	/*
 	 * NOTE(review): the lock was acquired through &sn->sn_lock but is
 	 * released through vp->v_vnlock; presumably every snapshot vnode's
 	 * v_vnlock points at that same shared lock - confirm.
 	 */
 	lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 	/* Put TDP_NORUNNINGBUF back the way the caller had it. */
 	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
 		prev_norunningbuf;
 	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
 		waitrunningbufspace();
 	/*
 	 * I/O on bp will now be started, so count it in runningbufspace.
 	 */
 	if (saved_runningbufspace != 0) {
 		bp->b_runningbufspace = saved_runningbufspace;
 		atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 	}
 	return (error);
 }
 
 /*
  * Synchronously read filesystem block lbn of the filesystem that vp
  * lives on into bp's data area, bypassing the buffer cache by issuing
  * a BIO_READ request directly.  The request covers bp->b_bcount bytes.
  * The result of biowait() is stored in bp->b_error and returned.
  * Much of this boiler-plate comes from bwrite().
  */
 static int
 readblock(struct vnode *vp, struct buf *bp, ufs2_daddr_t lbn)
 {
 	struct inode *ip;
 	struct bio *rdreq;
 
 	ip = VTOI(vp);
 	rdreq = g_alloc_bio();
 	rdreq->bio_cmd = BIO_READ;
 	rdreq->bio_done = NULL;
 	rdreq->bio_data = bp->b_data;
 	rdreq->bio_length = bp->b_bcount;
 	rdreq->bio_offset =
 	    dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 
 	g_io_request(rdreq, ip->i_devvp->v_bufobj.bo_private);
 	bp->b_error = biowait(rdreq, "snaprdb");
 	g_destroy_bio(rdreq);
 	return (bp->b_error);
 }
 
 /*
  * Process file deletes that were deferred by ufs_inactive() due to
  * the file system being suspended. Transfer IN_LAZYACCESS into
  * IN_MODIFIED for vnodes that were accessed during suspension.
  *
  * Walks every vnode of mp.  The mount interlock is held while moving
  * from vnode to vnode and dropped while an individual vnode is locked
  * and processed.
  */
 static void
 process_deferred_inactive(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 	struct inode *ip;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
  loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		/*
 		 * IN_LAZYACCESS is checked here without holding any
 		 * vnode lock, but this flag is set only while holding
 		 * vnode interlock.
 		 */
 		/*
 		 * Skip vnodes with nothing to do: typeless or doomed ones,
 		 * and those that have neither IN_LAZYACCESS set nor an
 		 * owed inactive call with a zero use count.
 		 */
 		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
 		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
 			((vp->v_iflag & VI_OWEINACT) == 0 ||
 			vp->v_usecount > 0))) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/*
 		 * Drop the mount interlock before taking the vnode lock.
 		 * The hold count is bumped first (presumably so vp cannot
 		 * be freed while we wait for its lock).
 		 */
 		MNT_IUNLOCK(mp);
 		vholdl(vp);
-		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
+		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 		if (error != 0) {
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			if (error == ENOENT)
 				continue;	/* vnode recycled */
 			/*
 			 * Any other failure: abandon this iteration and
 			 * rescan the mount's vnodes from the start.
 			 */
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
 		ip = VTOI(vp);
 		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
 			ip->i_flag &= ~IN_LAZYACCESS;
 			ip->i_flag |= IN_MODIFIED;
 		}
 		VI_LOCK(vp);
 		/*
 		 * Re-check under the interlock now that the vnode lock is
 		 * held; if no inactive call is owed any more, or the vnode
 		 * is in use, just release it.
 		 */
 		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 			VI_UNLOCK(vp);
 			VOP_UNLOCK(vp, 0, td);
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		
 		VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 			 ("process_deferred_inactive: "
 			  "recursed on VI_DOINGINACT"));
 		/* Mark the inactive call as in progress and no longer owed. */
 		vp->v_iflag |= VI_DOINGINACT;
 		vp->v_iflag &= ~VI_OWEINACT;
 		VI_UNLOCK(vp);
 		(void) VOP_INACTIVE(vp, td);
 		VI_LOCK(vp);
 		VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 			 ("process_deferred_inactive: lost VI_DOINGINACT"));
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 			 ("process_deferred_inactive: got VI_OWEINACT"));
 		vp->v_iflag &= ~VI_DOINGINACT;
 		VI_UNLOCK(vp);
 		VOP_UNLOCK(vp, 0, td);
 		vdrop(vp);
 		/* Re-take the mount interlock before advancing the iterator. */
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	vn_finished_secondary_write(mp);
 }
 
 /*
  * Try to free the snapdata associated with devvp.
  *
  * Nothing is torn down unless snapdata exists, its snapshot list is
  * empty and VV_COPYONWRITE is set on devvp.  Both paths give up the
  * devvp interlock: the bail-out path unlocks it directly, the teardown
  * path hands it to lockmgr() through LK_INTERLOCK.
  * NOTE(review): the caller presumably enters with the devvp interlock
  * held - confirm.
  */
 static void
 try_free_snapdata(struct vnode *devvp, struct thread *td)
 {
 	struct snapdata *sn = devvp->v_rdev->si_snapdata;
 	ufs2_daddr_t *blklist;
 
 	if (sn == NULL || !TAILQ_EMPTY(&sn->sn_head) ||
 	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
 		VI_UNLOCK(devvp);
 		return;
 	}
 
 	/* Detach the block list and the snapdata from the device. */
 	blklist = sn->sn_blklist;
 	sn->sn_blklist = NULL;
 	sn->sn_listsize = 0;
 	devvp->v_vflag &= ~VV_COPYONWRITE;
 	devvp->v_rdev->si_snapdata = NULL;
 
 	/* Drain the snapshot lock, then release and destroy it. */
 	lockmgr(&sn->sn_lock, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp), td);
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 	lockdestroy(&sn->sn_lock);
 	free(sn, M_UFSMNT);
 	if (blklist != NULL)
 		FREE(blklist, M_UFSMNT);
 }
 #endif
Index: head/sys/ufs/ffs/ffs_softdep.c
===================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	(revision 175201)
+++ head/sys/ufs/ffs/ffs_softdep.c	(revision 175202)
@@ -1,6302 +1,6302 @@
 /*-
  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * The soft updates code is derived from the appendix of a University
  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  * "Soft Updates: A Solution to the Metadata Update Problem in File
  * Systems", CSE-TR-254-95, August 1995).
  *
  * Further information about soft updates can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * For now we want the safety net that the DEBUG flag provides.
  */
 #ifndef DEBUG
 #define DEBUG
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kdb.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/softdep.h>
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <vm/vm.h>
 
 #include "opt_ffs.h"
 #include "opt_quota.h"
 
 #ifndef SOFTUPDATES
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 int
 softdep_flushfiles(struct mount *oldmnt, int flags, struct thread *td)
 {
 
 	panic("softdep_flushfiles called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there is nothing to set
  * up at mount time, so it always returns 0.
  */
 int
 softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
     struct ucred *cred)
 {
 
 	return (0);
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; nothing to initialize.
  */
 void
 softdep_initialize(void)
 {
 
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; nothing to tear down.
  */
 void
 softdep_uninitialize(void)
 {
 
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum)
 {
 
 	panic("softdep_setup_inomapdep called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_blkmapdep(struct buf *bp, struct mount *mp,
     ufs2_daddr_t newblkno)
 {
 
 	panic("softdep_setup_blkmapdep called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_allocdirect(struct inode *ip, ufs_lbn_t lbn,
     ufs2_daddr_t newblkno, ufs2_daddr_t oldblkno, long newsize,
     long oldsize, struct buf *bp)
 {
 
 	panic("softdep_setup_allocdirect called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_allocext(struct inode *ip, ufs_lbn_t lbn,
     ufs2_daddr_t newblkno, ufs2_daddr_t oldblkno, long newsize,
     long oldsize, struct buf *bp)
 {
 
 	panic("softdep_setup_allocext called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_allocindir_page(struct inode *ip, ufs_lbn_t lbn,
     struct buf *bp, int ptrno, ufs2_daddr_t newblkno,
     ufs2_daddr_t oldblkno, struct buf *nbp)
 {
 
 	panic("softdep_setup_allocindir_page called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
     struct buf *bp, int ptrno, ufs2_daddr_t newblkno)
 {
 
 	panic("softdep_setup_allocindir_meta called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_freeblocks(struct inode *ip, off_t length, int flags)
 {
 
 	panic("softdep_setup_freeblocks called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
 {
 
 	panic("softdep_freefile called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 int
 softdep_setup_directory_add(struct buf *bp, struct inode *dp,
     off_t diroffset, ino_t newinum, struct buf *newdirbp, int isnewblk)
 {
 
 	panic("softdep_setup_directory_add called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
     caddr_t oldloc, caddr_t newloc, int entrysize)
 {
 
 	panic("softdep_change_directoryentry_offset called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
     int isrmdir)
 {
 
 	panic("softdep_setup_remove called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_setup_directory_change(struct buf *bp, struct inode *dp,
     struct inode *ip, ino_t newinum, int isrmdir)
 {
 
 	panic("softdep_setup_directory_change called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_change_linkcnt(struct inode *ip)
 {
 
 	panic("softdep_change_linkcnt called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_load_inodeblock(struct inode *ip)
 {
 
 	panic("softdep_load_inodeblock called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 void
 softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
 {
 
 	panic("softdep_update_inodeblock called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there are no
  * dependencies to flush, so it always returns 0.
  *
  * vp is the "in_core" copy of the inode.
  */
 int
 softdep_fsync(struct vnode *vp)
 {
 
 	return (0);
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; does nothing.
  */
 void
 softdep_fsync_mountdev(struct vnode *vp)
 {
 
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there is no worklist,
  * so it reports zero items through countp and returns 0.
  */
 int
 softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
 {
 
 	*countp = 0;
 	return (0);
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there is no dependency
  * metadata to sync, so it always returns 0.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 	(void) vp;
 	return (0);
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  */
 int
 softdep_slowdown(struct vnode *vp)
 {
 
 	panic("softdep_slowdown called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; it must never be
  * reached, so it panics.
  *
  * ip is the inode with the zero effective link count.
  */
 void
 softdep_releasefile(struct inode *ip)
 {
 
 	panic("softdep_releasefile called");
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there is nothing to
  * clean up, so it always returns 0.
  */
 int
 softdep_request_cleanup(struct fs *fs, struct vnode *vp)
 {
 
 	return (0);
 }
 
 /*
  * Version of softdep_check_suspend() for kernels built without
  * SOFTUPDATES.  Decides whether more work is needed before the file
  * system can be suspended.  The softdep counters are ignored here.
  *
  * Must be entered with the devvp interlock held (asserted below); the
  * interlock is released before returning.  Returns 0 when nothing is
  * outstanding, EAGAIN when there is still I/O, dirty buffers or
  * secondary write activity.
  * NOTE(review): no MNT_IUNLOCK() is visible on the exit path, so the
  * mount interlock appears to still be held on return - presumably the
  * caller relies on that; confirm.
  */
 int
 softdep_check_suspend(struct mount *mp,
 		      struct vnode *devvp,
 		      int softdep_deps,
 		      int softdep_accdeps,
 		      int secondary_writes,
 		      int secondary_accwrites)
 {
 	struct bufobj *bo;
 	int error;
 	
 	/*
 	 * Silence unused-parameter warnings.  The first line ends in a
 	 * comma, so the two casts form one comma expression; the effect is
 	 * the same as two separate statements.
 	 */
 	(void) softdep_deps,
 	(void) softdep_accdeps;
 
 	ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
 	bo = &devvp->v_bufobj;
 
 	/*
 	 * Get the mount interlock while holding the devvp interlock, and
 	 * wait until no secondary writes are in progress.
 	 */
 	for (;;) {
 		if (!MNT_ITRYLOCK(mp)) {
 			/*
 			 * Could not get the mount interlock without
 			 * blocking: drop the devvp interlock, wait for the
 			 * mount interlock to become available, then retry
 			 * from the top.
 			 */
 			VI_UNLOCK(devvp);
 			MNT_ILOCK(mp);
 			MNT_IUNLOCK(mp);
 			VI_LOCK(devvp);
 			continue;
 		}
 		if (mp->mnt_secondary_writes != 0) {
 			/*
 			 * Sleep until woken on mnt_secondary_writes; PDROP
 			 * leaves the mount interlock released afterwards,
 			 * so the loop re-acquires everything.
 			 */
 			VI_UNLOCK(devvp);
 			msleep(&mp->mnt_secondary_writes,
 			       MNT_MTX(mp),
 			       (PUSER - 1) | PDROP, "secwr", 0);
 			VI_LOCK(devvp);
 			continue;
 		}
 		break;
 	}
 
 	/*
 	 * Reasons for needing more work before suspend:
 	 * - Dirty buffers on devvp.
 	 * - Secondary writes occurred after start of vnode sync loop
 	 */
 	error = 0;
 	if (bo->bo_numoutput > 0 ||
 	    bo->bo_dirty.bv_cnt > 0 ||
 	    secondary_writes != 0 ||
 	    mp->mnt_secondary_writes != 0 ||
 	    secondary_accwrites != mp->mnt_secondary_accwrites)
 		error = EAGAIN;
 	VI_UNLOCK(devvp);
 	return (error);
 }
 
 /*
  * Stub for kernels built without SOFTUPDATES; there are no soft
  * dependencies, so both counters are reported as zero.  mp is unused.
  */
 void
 softdep_get_depcounts(struct mount *mp, int *softdepactivep,
     int *softdepactiveaccp)
 {
 
 	(void) mp;
 	*softdepactiveaccp = 0;
 	*softdepactivep = 0;
 }
 
 #else
 /*
  * These definitions need to be adapted to the system to which
  * this file is being ported.
  */
 /*
  * malloc types defined for the softdep system.
  */
 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
 
 #define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
 
 #define	D_PAGEDEP	0
 #define	D_INODEDEP	1
 #define	D_NEWBLK	2
 #define	D_BMSAFEMAP	3
 #define	D_ALLOCDIRECT	4
 #define	D_INDIRDEP	5
 #define	D_ALLOCINDIR	6
 #define	D_FREEFRAG	7
 #define	D_FREEBLKS	8
 #define	D_FREEFILE	9
 #define	D_DIRADD	10
 #define	D_MKDIR		11
 #define	D_DIRREM	12
 #define	D_NEWDIRBLK	13
 #define	D_LAST		D_NEWDIRBLK
 
 /* 
  * translate from workitem type to memory type
  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
  *
  * The table has one slot per D_XXX value, D_PAGEDEP through D_LAST.
  * M_SAVEDINO has no D_XXX index and therefore no slot here.
  */
 static struct malloc_type *memtype[] = {
 	M_PAGEDEP,
 	M_INODEDEP,
 	M_NEWBLK,
 	M_BMSAFEMAP,
 	M_ALLOCDIRECT,
 	M_INDIRDEP,
 	M_ALLOCINDIR,
 	M_FREEFRAG,
 	M_FREEBLKS,
 	M_FREEFILE,
 	M_DIRADD,
 	M_MKDIR,
 	M_DIRREM,
 	M_NEWDIRBLK
 };
 
 /* Malloc type for a workitem type; the index is not range-checked. */
 #define DtoM(type) (memtype[type])
 
 /*
  * Names of malloc types.
  *
  * Map a workitem type (D_XXX) to the short description of its malloc
  * type.  D_LAST is itself a valid index into memtype[] (it aliases the
  * highest D_XXX value), so the bound check must be inclusive; any
  * value beyond it yields "???".
  */
 #define TYPENAME(type)  \
 	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
 /*
  * End system adaptation definitions.
  */
 
 /*
  * Forward declarations.
  */
 struct inodedep_hashhead;
 struct newblk_hashhead;
 struct pagedep_hashhead;
 
 /*
  * Internal function prototypes.
  */
 static	void softdep_error(char *, int);
 static	void drain_output(struct vnode *);
 static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
 static	void clear_remove(struct thread *);
 static	void clear_inodedeps(struct thread *);
 static	int flush_pagedep_deps(struct vnode *, struct mount *,
 	    struct diraddhd *);
 static	int flush_inodedep_deps(struct mount *, ino_t);
 static	int flush_deplist(struct allocdirectlst *, int, int *);
 static	int handle_written_filepage(struct pagedep *, struct buf *);
 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
 static	void handle_allocdirect_partdone(struct allocdirect *);
 static	void handle_allocindir_partdone(struct allocindir *);
 static	void initiate_write_filepage(struct pagedep *, struct buf *);
 static	void handle_written_mkdir(struct mkdir *, int);
 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 static	void handle_workitem_freefile(struct freefile *);
 static	void handle_workitem_remove(struct dirrem *, struct vnode *);
 static	struct dirrem *newdirrem(struct buf *, struct inode *,
 	    struct inode *, int, struct dirrem **);
 static	void free_diradd(struct diradd *);
 static	void free_allocindir(struct allocindir *, struct inodedep *);
 static	void free_newdirblk(struct newdirblk *);
 static	int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
 	    ufs2_daddr_t *);
 static	void deallocate_dependencies(struct buf *, struct inodedep *);
 static	void free_allocdirect(struct allocdirectlst *,
 	    struct allocdirect *, int);
 static	int check_inode_unwritten(struct inodedep *);
 static	int free_inodedep(struct inodedep *);
 static	void handle_workitem_freeblocks(struct freeblks *, int);
 static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 static	void setup_allocindir_phase2(struct buf *, struct inode *,
 	    struct allocindir *);
 static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 	    ufs2_daddr_t);
 static	void handle_workitem_freefrag(struct freefrag *);
 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
 static	void allocdirect_merge(struct allocdirectlst *,
 	    struct allocdirect *, struct allocdirect *);
 static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
 static	int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
 	    struct newblk **);
 static	int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
 static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
 	    struct inodedep **);
 static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 static	int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 	    struct mount *mp, int, struct pagedep **);
 static	void pause_timer(void *);
 static	int request_cleanup(struct mount *, int);
 static	int process_worklist_item(struct mount *, int);
 static	void add_to_worklist(struct worklist *);
 static	void softdep_flush(void);
 static	int softdep_speedup(void);
 
 /*
  * Exported softdep operations.
  */
 static	void softdep_disk_io_initiation(struct buf *);
 static	void softdep_disk_write_complete(struct buf *);
 static	void softdep_deallocate_dependencies(struct buf *);
 static	int softdep_count_dependencies(struct buf *bp, int);
 
 static struct mtx lk;
 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
 
 #define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
 #define ACQUIRE_LOCK(lk)		mtx_lock(lk)
 #define FREE_LOCK(lk)			mtx_unlock(lk)
 
 /*
  * Worklist queue management.
  * These routines require that the lock be held.
  */
 #ifndef /* NOT */ DEBUG
 #define WORKLIST_INSERT(head, item) do {	\
 	(item)->wk_state |= ONWORKLIST;		\
 	LIST_INSERT_HEAD(head, item, wk_list);	\
 } while (0)
 #define WORKLIST_REMOVE(item) do {		\
 	(item)->wk_state &= ~ONWORKLIST;	\
 	LIST_REMOVE(item, wk_list);		\
 } while (0)
 #else /* DEBUG */
 static	void worklist_insert(struct workhead *, struct worklist *);
 static	void worklist_remove(struct worklist *);
 
 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
 #define WORKLIST_REMOVE(item) worklist_remove(item)
 
 /*
  * Checked variant of WORKLIST_INSERT: put item at the head of the
  * given list and mark it ONWORKLIST.  Asserts that the softdep lock is
  * owned and panics if the item is already marked as being on a list.
  */
 static void
 worklist_insert(struct workhead *head, struct worklist *item)
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((item->wk_state & ONWORKLIST) != 0)
 		panic("worklist_insert: already on list");
 	item->wk_state |= ONWORKLIST;
 	LIST_INSERT_HEAD(head, item, wk_list);
 }
 
 /*
  * Checked variant of WORKLIST_REMOVE: unlink item from its list and
  * clear ONWORKLIST.  Asserts that the softdep lock is owned and panics
  * if the item is not marked as being on a list.
  */
 static void
 worklist_remove(struct worklist *item)
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	if (!(item->wk_state & ONWORKLIST))
 		panic("worklist_remove: not on list");
 	item->wk_state &= ~ONWORKLIST;
 	LIST_REMOVE(item, wk_list);
 }
 #endif /* DEBUG */
 
 /*
  * Routines for tracking and managing workitems.
  */
 static	void workitem_free(struct worklist *, int);
 static	void workitem_alloc(struct worklist *, int, struct mount *);
 
 #define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
 
 /*
  * Free a workitem of the given type.  The softdep lock must be owned.
  * The owning mount's softdep_deps count is decremented; when it drops
  * to zero and softdep_req is set, sleepers on softdep_deps are woken.
  * With DEBUG defined, panics if the item is still on a list or its
  * recorded type differs from the one passed in.
  */
 static void
 workitem_free(struct worklist *item, int type)
 {
 	struct ufsmount *ump;
 
 	mtx_assert(&lk, MA_OWNED);
 #ifdef DEBUG
 	if ((item->wk_state & ONWORKLIST) != 0)
 		panic("workitem_free: still on list");
 	if (item->wk_type != type)
 		panic("workitem_free: type mismatch");
 #endif
 	ump = VFSTOUFS(item->wk_mp);
 	ump->softdep_deps--;
 	if (ump->softdep_deps == 0 && ump->softdep_req)
 		wakeup(&ump->softdep_deps);
 	FREE(item, DtoM(type));
 }
 
 static void
 workitem_alloc(item, type, mp)
 	struct worklist *item;
 	int type;
 	struct mount *mp;
 {
 	item->wk_type = type;
 	item->wk_mp = mp;
 	item->wk_state = 0;
 	ACQUIRE_LOCK(&lk);
 	VFSTOUFS(mp)->softdep_deps++;
 	VFSTOUFS(mp)->softdep_accdeps++;
 	FREE_LOCK(&lk);
 }
 
 /*
  * Workitem queue management
  */
 static int max_softdeps;	/* maximum number of structs before slowdown */
 static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
 static int req_pending;
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 #define FLUSH_INODES		1
 static int req_clear_remove;	/* syncer process flush some freeblks */
 #define FLUSH_REMOVE		2
 #define FLUSH_REMOVE_WAIT	3
 /*
  * runtime statistics
  */
 static int stat_worklist_push;	/* number of worklist cleanups */
 static int stat_blk_limit_push;	/* number of times block limit neared */
 static int stat_ino_limit_push;	/* number of times inode limit neared */
 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
 
 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
 
 SYSCTL_DECL(_vfs_ffs);
 
 static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 	   &compute_summary_at_mount, 0, "Recompute summary at mount");
 
 static struct proc *softdepproc;
 static struct kproc_desc softdep_kp = {
 	"softdepflush",
 	softdep_flush,
 	&softdepproc
 };
 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)
 
 /*
  * Main loop of the "softdepflush" kernel process (registered through
  * softdep_kp).  Never returns.  Each pass:
  *  - services pending req_clear_inodedeps / req_clear_remove requests,
  *  - runs softdep_process_worklist() on every mount with MNT_SOFTDEP
  *    set that can be busied without waiting,
  *  - goes straight into another pass if any worklist items remain,
  *    otherwise sleeps on req_pending for at most hz ticks.
  */
 static void
 softdep_flush(void)
 {
 	struct mount *nmp;
 	struct mount *mp;
 	struct ufsmount *ump;
 	struct thread *td;
 	int remaining;
 	int vfslocked;
 
 	td = curthread;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	for (;;) {	
 		kproc_suspend_check(softdepproc);
 		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
 		ACQUIRE_LOCK(&lk);
 		/*
 		 * If requested, try removing inode or removal dependencies.
 		 */
 		if (req_clear_inodedeps) {
 			clear_inodedeps(td);
 			req_clear_inodedeps -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		if (req_clear_remove) {
 			clear_remove(td);
 			req_clear_remove -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		FREE_LOCK(&lk);
 		VFS_UNLOCK_GIANT(vfslocked);
 		/* Count of not-yet-started worklist items over all mounts. */
 		remaining = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
 				continue;
 			/*
 			 * NOTE(review): mountlist_mtx is passed to
 			 * vfs_busy() as the interlock and is re-taken
 			 * below, so it is presumably dropped by vfs_busy()
 			 * on success and kept on failure - confirm.
 			 */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 				continue;
 			vfslocked = VFS_LOCK_GIANT(mp);
 			softdep_process_worklist(mp, 0);
 			ump = VFSTOUFS(mp);
 			remaining += ump->softdep_on_worklist -
 				ump->softdep_on_worklist_inprogress;
 			VFS_UNLOCK_GIANT(vfslocked);
 			mtx_lock(&mountlist_mtx);
 			/*
 			 * Re-read the successor now that the list lock is
 			 * held again; the value fetched at the top of the
 			 * loop may be stale.
 			 */
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		/* Work is still queued: start another pass without sleeping. */
 		if (remaining)
 			continue;
 		ACQUIRE_LOCK(&lk);
 		if (!req_pending)
 			msleep(&req_pending, &lk, PVM, "sdflush", hz);
 		req_pending = 0;
 		FREE_LOCK(&lk);
 	}
 }
 
 /*
  * Flag a pending request, waking any sleeper on req_pending if one
  * was not already flagged, and then ask the syncer to speed up.
  * The softdep lock must be owned.  Returns whatever speedup_syncer()
  * returns.
  */
 static int
 softdep_speedup(void)
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	if (!req_pending) {
 		req_pending = 1;
 		wakeup(&req_pending);
 	}
 	return (speedup_syncer());
 }
 
 /*
  * Append an item to the tail of its mount's pending work queue.
  * The softdep lock must be owned by the caller.
  * This is the only routine that adds items to the list.
  * The following routine is the only one that removes items
  * and does so in order from first to last.
  */
 static void
 add_to_worklist(struct worklist *wk)
 {
 	struct ufsmount *ump;
 
 	mtx_assert(&lk, MA_OWNED);
 	ump = VFSTOUFS(wk->wk_mp);
 	if ((wk->wk_state & ONWORKLIST) != 0)
 		panic("add_to_worklist: already on list");
 	wk->wk_state |= ONWORKLIST;
 	/* The tail pointer is only meaningful while the list is non-empty. */
 	if (!LIST_EMPTY(&ump->softdep_workitem_pending))
 		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 	else
 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 	ump->softdep_worklist_tail = wk;
 	ump->softdep_on_worklist++;
 }
 
 /*
  * Process that runs once per second to handle items in the background queue.
  *
  * Note that we ensure that everything is done in the order in which they
  * appear in the queue. The code below depends on this property to ensure
  * that blocks of a file are freed before the inode itself is freed. This
  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  * until all the old ones have been purged from the dependency lists.
  *
  * mp is the mount whose worklist is processed.  When full is zero the
  * loop gives up as soon as time_second changes.  Returns the sum of the
  * counts reported by process_worklist_item(), or -1 when the loop was
  * cut short by that time limit.  If process_worklist_item() itself
  * returns -1 the loop stops and the sum collected so far is returned.
  */
 int 
 softdep_process_worklist(mp, full)
 	struct mount *mp;
 	int full;
 {
 	struct thread *td = curthread;
 	int cnt, matchcnt, loopcount;
 	struct ufsmount *ump;
 	long starttime;
 
 	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 	/*
 	 * NOTE(review): this spot used to carry a comment about recording
 	 * the caller's process identifier for request_cleanup; nothing in
 	 * this function records one.
 	 */
 	matchcnt = 0;
 	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(&lk);
 	loopcount = 1;
 	starttime = time_second;
 	while (ump->softdep_on_worklist > 0) {
 		if ((cnt = process_worklist_item(mp, 0)) == -1)
 			break;
 		else
 			matchcnt += cnt;
 		/*
 		 * If requested, try removing inode or removal dependencies.
 		 */
 		if (req_clear_inodedeps) {
 			clear_inodedeps(td);
 			req_clear_inodedeps -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		if (req_clear_remove) {
 			clear_remove(td);
 			req_clear_remove -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		/*
 		 * We do not generally want to stop for buffer space, but if
 		 * we are really being a buffer hog, we will stop and wait.
 		 * The softdep lock is dropped around bwillwrite().
 		 */
 		if (loopcount++ % 128 == 0) {
 			FREE_LOCK(&lk);
 			bwillwrite();
 			ACQUIRE_LOCK(&lk);
 		}
 		/*
 		 * Never allow processing to run for more than one
 		 * second. Otherwise the other mountpoints may get
 		 * excessively backlogged.
 		 */
 		if (!full && starttime != time_second) {
 			matchcnt = -1;
 			break;
 		}
 	}
 	FREE_LOCK(&lk);
 	return (matchcnt);
 }
 
 /*
  * Process one item on the worklist.
  *
  * Must be called with the softdep lock (lk) held. The lock is released
  * while the selected item is handled and is held again on return.
  *
  * When LK_NOWAIT is set in flags, a D_DIRREM item is only selected if
  * ffs_vget() with LK_NOWAIT returns its vnode; any other item type, or
  * any item when LK_NOWAIT is clear, is selected as soon as it is found
  * not to be marked INPROGRESS.
  *
  * Returns 1 when an item was processed, or -1 when nothing could be
  * processed (the calling thread has a copy-on-write in progress, or no
  * eligible item was found).
  */
 static int
 process_worklist_item(mp, flags)
 	struct mount *mp;
 	int flags;
 {
 	struct worklist *wk, *wkend;
 	struct ufsmount *ump;
 	struct vnode *vp;
 	int matchcnt = 0;
 
 	mtx_assert(&lk, MA_OWNED);
 	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to write as we may
 	 * recurse into the copy-on-write routine.
 	 */
 	if (curthread->td_pflags & TDP_COWINPROGRESS)
 		return (-1);
 	/*
 	 * Normally we just process each item on the worklist in order.
 	 * However, if we are in a situation where we cannot lock any
 	 * inodes, we have to skip over any dirrem requests whose
 	 * vnodes are resident and locked.
 	 */
 	ump = VFSTOUFS(mp);
 	vp = NULL;
 	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
 		/* Skip items marked INPROGRESS (set below while lk is dropped). */
 		if (wk->wk_state & INPROGRESS)
 			continue;
 		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
 			break;
 		/*
 		 * LK_NOWAIT dirrem: mark the item and count it as in
 		 * progress, then drop lk around the non-blocking vget.
 		 */
 		wk->wk_state |= INPROGRESS;
 		ump->softdep_on_worklist_inprogress++;
 		FREE_LOCK(&lk);
 		ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
 		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
 		ACQUIRE_LOCK(&lk);
 		wk->wk_state &= ~INPROGRESS;
 		ump->softdep_on_worklist_inprogress--;
 		/* A vnode was obtained, so this dirrem can be processed. */
 		if (vp != NULL)
 			break;
 	}
 	/* The scan ran off the end of the list: nothing eligible. */
 	if (wk == 0)
 		return (-1);
 	/*
 	 * Remove the item to be processed. If we are removing the last
 	 * item on the list, we need to recalculate the tail pointer.
 	 * As this happens rarely and usually when the list is short,
 	 * we just run down the list to find it rather than tracking it
 	 * in the above loop.
 	 */
 	WORKLIST_REMOVE(wk);
 	if (wk == ump->softdep_worklist_tail) {
 		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
 			if (LIST_NEXT(wkend, wk_list) == NULL)
 				break;
 		/* wkend is NULL here when the list has become empty. */
 		ump->softdep_worklist_tail = wkend;
 	}
 	ump->softdep_on_worklist -= 1;
 	/* The handlers below run without lk held. */
 	FREE_LOCK(&lk);
 	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 		panic("process_worklist_item: suspended filesystem");
 	matchcnt++;
 	/* Dispatch on the work item type. */
 	switch (wk->wk_type) {
 
 	case D_DIRREM:
 		/* removal of a directory entry */
 		handle_workitem_remove(WK_DIRREM(wk), vp);
 		break;
 
 	case D_FREEBLKS:
 		/* releasing blocks and/or fragments from a file */
 		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
 		break;
 
 	case D_FREEFRAG:
 		/* releasing a fragment when replaced as a file grows */
 		handle_workitem_freefrag(WK_FREEFRAG(wk));
 		break;
 
 	case D_FREEFILE:
 		/* releasing an inode when its link count drops to 0 */
 		handle_workitem_freefile(WK_FREEFILE(wk));
 		break;
 
 	default:
 		panic("%s_process_worklist: Unknown type %s",
 		    "softdep", TYPENAME(wk->wk_type));
 		/* NOTREACHED */
 	}
 	vn_finished_secondary_write(mp);
 	/* Re-take lk so the caller sees it held, as on entry. */
 	ACQUIRE_LOCK(&lk);
 	return (matchcnt);
 }
 
 /*
  * Move dependencies from one buffer to another.
  */
 void
 softdep_move_dependencies(oldbp, newbp)
 	struct buf *oldbp;
 	struct buf *newbp;
 {
 	struct worklist *wk, *wktail;
 
 	if (!LIST_EMPTY(&newbp->b_dep))
 		panic("softdep_move_dependencies: need merge code");
 	wktail = 0;
 	ACQUIRE_LOCK(&lk);
 	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 		LIST_REMOVE(wk, wk_list);
 		if (wktail == 0)
 			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 		else
 			LIST_INSERT_AFTER(wktail, wk, wk_list);
 		wktail = wk;
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Purge the work list of all items associated with a particular mount point.
  */
 int
 softdep_flushworklist(oldmnt, countp, td)
 	struct mount *oldmnt;
 	int *countp;
 	struct thread *td;
 {
 	struct vnode *devvp;
 	int count, error = 0;
 	struct ufsmount *ump;
 
 	/*
 	 * Alternately flush the block device associated with the mount
 	 * point and process any dependencies that the flushing
 	 * creates. We continue until no more worklist dependencies
 	 * are found.
 	 *
 	 * On return *countp holds the total number of items reported
 	 * processed by softdep_process_worklist(). The return value is
 	 * 0, or the error from VOP_FSYNC() that stopped the loop.
 	 */
 	*countp = 0;
 	ump = VFSTOUFS(oldmnt);
 	devvp = ump->um_devvp;
 	/* full == 1: the one-second time limit is not applied. */
 	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 		*countp += count;
 		/* Sync the device vnode under its exclusive lock. */
-		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
 		VOP_UNLOCK(devvp, 0, td);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 int
 softdep_waitidle(struct mount *mp)
 {
 	struct ufsmount *ump;
 	int error;
 	int i;
 
 	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(&lk);
 	for (i = 0; i < 10 && ump->softdep_deps; i++) {
 		ump->softdep_req = 1;
 		if (ump->softdep_on_worklist)
 			panic("softdep_waitidle: work added after flush.");
 		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
 	}
 	ump->softdep_req = 0;
 	FREE_LOCK(&lk);
 	error = 0;
 	if (i == 10) {
 		error = EBUSY;
 		printf("softdep_waitidle: Failed to flush worklist for %p\n",
 		    mp);
 	}
 
 	return (error);
 }
 
 /*
  * Flush all vnodes and worklist items associated with a specified mount point.
  */
 int
 softdep_flushfiles(oldmnt, flags, td)
 	struct mount *oldmnt;
 	int flags;
 	struct thread *td;
 {
 	int error, count, loopcnt;
 
 	error = 0;
 
 	/*
 	 * Alternately flush the vnodes associated with the mount
 	 * point and process any dependencies that the flushing
 	 * creates. In theory, this loop can happen at most twice,
 	 * but we give it a few extra just to be sure.
 	 */
 	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
 		/*
 		 * Do another flush in case any vnodes were brought in
 		 * as part of the cleanup operations.
 		 */
 		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
 			break;
 		if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
 		    count == 0)
 			break;
 	}
 	/*
 	 * If we are unmounting then it is an error to fail. If we
 	 * are simply trying to downgrade to read-only, then filesystem
 	 * activity can keep us busy forever, so we just fail with EBUSY.
 	 */
 	if (loopcnt == 0) {
 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 			panic("softdep_flushfiles: looping");
 		error = EBUSY;
 	}
 	if (!error)
 		error = softdep_waitidle(oldmnt);
 	return (error);
 }
 
 /*
  * Structure hashing.
  * 
  * There are three types of structures that can be looked up:
  *	1) pagedep structures identified by mount point, inode number,
  *	   and logical block.
  *	2) inodedep structures identified by mount point and inode number.
  *	3) newblk structures identified by mount point and
  *	   physical block number.
  *
  * The "pagedep" and "inodedep" dependency structures are hashed
  * separately from the file blocks and inodes to which they correspond.
  * This separation helps when the in-memory copy of an inode or
  * file block must be replaced. It also obviates the need to access
  * an inode or file page when simply updating (or de-allocating)
  * dependency structures. Lookup of newblk structures is needed to
  * find newly allocated blocks when trying to associate them with
  * their allocdirect or allocindir structure.
  *
  * The lookup routines optionally create and hash a new instance when
  * an existing entry is not found.
  */
 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
 #define NODELAY		0x0002	/* cannot do background work */
 
 /*
  * Structures and routines associated with pagedep caching.
  */
 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
 u_long	pagedep_hash;		/* size of hash table - 1 */
 /*
  * Select the hash chain for a <mount, inode number, logical block> triple:
  * the mount pointer shifted right by 13 is added to the inode number and
  * block number, and the sum is masked with pagedep_hash.
  */
 #define	PAGEDEP_HASH(mp, inum, lbn) \
 	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
 	    pagedep_hash])
 
 /*
  * Search one hash chain for the pagedep matching <ino, lbn, mp>.
  * The match, or NULL, is stored through pagedeppp. Returns 1 for a
  * match, except that a match which is not on a worklist is reported as
  * 0 when DEPALLOC is set in flags; returns 0 when nothing matches.
  */
 static int
 pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn,
     struct mount *mp, int flags, struct pagedep **pagedeppp)
 {
 	struct pagedep *pd;
 
 	LIST_FOREACH(pd, pagedephd, pd_hash) {
 		if (ino != pd->pd_ino || lbn != pd->pd_lbn ||
 		    mp != pd->pd_list.wk_mp)
 			continue;
 		*pagedeppp = pd;
 		if ((flags & DEPALLOC) != 0 &&
 		    (pd->pd_state & ONWORKLIST) == 0)
 			return (0);
 		return (1);
 	}
 	*pagedeppp = NULL;
 	return (0);
 }
 /*
  * Look up a pagedep. Return 1 if found, 0 if not found or found
  * when asked to allocate but not associated with any buffer.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in pagedeppp.
  * This routine must be called with the softdep lock (lk) held.
  */
 static int
 pagedep_lookup(struct inode *ip, ufs_lbn_t lbn, int flags,
     struct pagedep **pagedeppp)
 {
 	struct pagedep_hashhead *bucket;
 	struct pagedep *newpd;
 	struct mount *mp;
 	int found, slot;
 
 	mtx_assert(&lk, MA_OWNED);
 	mp = ITOV(ip)->v_mount;
 	bucket = PAGEDEP_HASH(mp, ip->i_number, lbn);
 
 	found = pagedep_find(bucket, ip->i_number, lbn, mp, flags, pagedeppp);
 	if (*pagedeppp != NULL || (flags & DEPALLOC) == 0)
 		return (found);
 	/*
 	 * Nothing hashed yet and the caller wants one created. The lock
 	 * is released around the allocation, so the chain is searched
 	 * again afterwards and the new structure is discarded if an
 	 * entry has appeared in the meantime.
 	 */
 	FREE_LOCK(&lk);
 	MALLOC(newpd, struct pagedep *, sizeof(struct pagedep),
 	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&newpd->pd_list, D_PAGEDEP, mp);
 	ACQUIRE_LOCK(&lk);
 	found = pagedep_find(bucket, ip->i_number, lbn, mp, flags, pagedeppp);
 	if (*pagedeppp != NULL) {
 		WORKITEM_FREE(newpd, D_PAGEDEP);
 		return (found);
 	}
 	/* Fill in the identity and empty lists, then publish the entry. */
 	newpd->pd_ino = ip->i_number;
 	newpd->pd_lbn = lbn;
 	LIST_INIT(&newpd->pd_dirremhd);
 	LIST_INIT(&newpd->pd_pendinghd);
 	for (slot = 0; slot < DAHASHSZ; slot++)
 		LIST_INIT(&newpd->pd_diraddhd[slot]);
 	LIST_INSERT_HEAD(bucket, newpd, pd_hash);
 	*pagedeppp = newpd;
 	return (0);
 }
 
 /*
  * Structures and routines associated with inodedep caching.
  */
 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
 static u_long	inodedep_hash;	/* size of hash table - 1 */
 static long	num_inodedep;	/* number of inodedep allocated */
 /*
  * Select the hash chain for a <filesystem, inode number> pair: the fs
  * pointer shifted right by 13 plus the inode number, masked with
  * inodedep_hash.
  */
 #define	INODEDEP_HASH(fs, inum) \
       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
 
 /*
  * Search one hash chain for the inodedep matching <fs, inum>.
  * Stores the match (or NULL) through inodedeppp and returns 1 if a
  * match was found, 0 otherwise.
  */
 static int
 inodedep_find(struct inodedep_hashhead *inodedephd, struct fs *fs,
     ino_t inum, struct inodedep **inodedeppp)
 {
 	struct inodedep *idp;
 
 	LIST_FOREACH(idp, inodedephd, id_hash) {
 		if (inum != idp->id_ino || fs != idp->id_fs)
 			continue;
 		*inodedeppp = idp;
 		return (1);
 	}
 	*inodedeppp = NULL;
 
 	return (0);
 }
 /*
  * Look up an inodedep. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in inodedeppp.
  * This routine must be called with the softdep lock (lk) held.
  */
 static int
 inodedep_lookup(mp, inum, flags, inodedeppp)
 	struct mount *mp;
 	ino_t inum;
 	int flags;
 	struct inodedep **inodedeppp;
 {
 	struct inodedep *inodedep;
 	struct inodedep_hashhead *inodedephd;
 	struct fs *fs;
 
 	mtx_assert(&lk, MA_OWNED);
 	fs = VFSTOUFS(mp)->um_fs;
 	inodedephd = INODEDEP_HASH(fs, inum);
 
 	/* Fast path: an entry is already hashed. */
 	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
 		return (1);
 	/* Not found and no allocation requested; *inodedeppp is NULL. */
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
 	 * If we are over our limit, try to improve the situation.
 	 * Callers passing NODELAY skip this step.
 	 */
 	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
 		request_cleanup(mp, FLUSH_INODES);
 	/*
 	 * The lock is released around the allocation, so the chain must
 	 * be searched again afterwards; if an entry has appeared, the
 	 * new structure is discarded and the existing one is returned.
 	 */
 	FREE_LOCK(&lk);
 	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
 		M_INODEDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
 		WORKITEM_FREE(inodedep, D_INODEDEP);
 		return (1);
 	}
 	/*
 	 * Initialize the new entry field by field.
 	 * NOTE(review): M_ZERO is not passed to MALLOC above, so
 	 * presumably every field that is read later must be set here.
 	 */
 	num_inodedep += 1;
 	inodedep->id_fs = fs;
 	inodedep->id_ino = inum;
 	inodedep->id_state = ALLCOMPLETE;
 	inodedep->id_nlinkdelta = 0;
 	inodedep->id_savedino1 = NULL;
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	inodedep->id_buf = NULL;
 	LIST_INIT(&inodedep->id_pendinghd);
 	LIST_INIT(&inodedep->id_inowait);
 	LIST_INIT(&inodedep->id_bufwait);
 	TAILQ_INIT(&inodedep->id_inoupdt);
 	TAILQ_INIT(&inodedep->id_newinoupdt);
 	TAILQ_INIT(&inodedep->id_extupdt);
 	TAILQ_INIT(&inodedep->id_newextupdt);
 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 	*inodedeppp = inodedep;
 	/* A freshly allocated entry is reported as "not found". */
 	return (0);
 }
 
 /*
  * Structures and routines associated with newblk caching.
  */
 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
 u_long	newblk_hash;		/* size of hash table - 1 */
 /*
  * Select the hash chain for a newblk: the fs pointer shifted right by 13
  * plus the second argument, masked with newblk_hash. The second argument
  * is named "inum" here, but newblk_lookup() passes the block number.
  */
 #define	NEWBLK_HASH(fs, inum) \
 	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
 
 /*
  * Search one hash chain for the newblk matching <fs, newblkno>.
  * Stores the match (or NULL) through newblkpp and returns 1 if a match
  * was found, 0 otherwise.
  */
 static int
 newblk_find(struct newblk_hashhead *newblkhd, struct fs *fs,
     ufs2_daddr_t newblkno, struct newblk **newblkpp)
 {
 	struct newblk *nb;
 
 	LIST_FOREACH(nb, newblkhd, nb_hash) {
 		if (newblkno != nb->nb_newblkno || fs != nb->nb_fs)
 			continue;
 		*newblkpp = nb;
 		return (1);
 	}
 	*newblkpp = NULL;
 	return (0);
 }
 
 /*
  * Look up a newblk. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in newblkpp.
  */
 static int
 newblk_lookup(struct fs *fs, ufs2_daddr_t newblkno, int flags,
     struct newblk **newblkpp)
 {
 	struct newblk_hashhead *bucket;
 	struct newblk *fresh;
 
 	bucket = NEWBLK_HASH(fs, newblkno);
 	if (newblk_find(bucket, fs, newblkno, newblkpp) != 0)
 		return (1);
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
 	 * Allocate with the lock released, then search the chain again
 	 * and discard the new structure if an entry has appeared.
 	 */
 	FREE_LOCK(&lk);
 	MALLOC(fresh, struct newblk *, sizeof(struct newblk),
 		M_NEWBLK, M_SOFTDEP_FLAGS);
 	ACQUIRE_LOCK(&lk);
 	if (newblk_find(bucket, fs, newblkno, newblkpp) != 0) {
 		FREE(fresh, M_NEWBLK);
 		return (1);
 	}
 	fresh->nb_state = 0;
 	fresh->nb_fs = fs;
 	fresh->nb_newblkno = newblkno;
 	LIST_INSERT_HEAD(bucket, fresh, nb_hash);
 	*newblkpp = fresh;
 	return (0);
 }
 
 /*
  * Executed during filesystem system initialization before
  * mounting any filesystems.
  */
 void
 softdep_initialize(void)
 {
 
 	LIST_INIT(&mkdirlisthd);
 
 	/* Size the dependency limit and hash tables from desiredvnodes. */
 	max_softdeps = desiredvnodes * 4;
 	pagedep_hashtbl =
 	    hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
 	inodedep_hashtbl =
 	    hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
 	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
 
 	/* Install the soft updates callbacks in the bioops table. */
 	bioops.io_start = softdep_disk_io_initiation;
 	bioops.io_complete = softdep_disk_write_complete;
 	bioops.io_deallocate = softdep_deallocate_dependencies;
 	bioops.io_countdeps = softdep_count_dependencies;
 }
 
 /*
  * Executed after all filesystems have been unmounted during
  * filesystem module unload.
  */
 void
 softdep_uninitialize(void)
 {
 
 	/* Release the three hash tables set up by softdep_initialize(). */
 	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
 	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
 	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
 }
 
 /*
  * Called at mount time to notify the dependency code that a
  * filesystem wishes to use it.
  */
 int
 softdep_mount(devvp, mp, fs, cred)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 	struct csum_total cstotal;
 	struct ufsmount *ump;
 	struct cg *cgp;
 	struct buf *bp;
 	int error, cyl;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
 		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
 			MNTK_SOFTDEP;
 		mp->mnt_noasync++;
 	}
 	MNT_IUNLOCK(mp);
 	ump = VFSTOUFS(mp);
 	LIST_INIT(&ump->softdep_workitem_pending);
 	ump->softdep_worklist_tail = NULL;
 	ump->softdep_on_worklist = 0;
 	ump->softdep_deps = 0;
 	/*
 	 * When doing soft updates, the counters in the
 	 * superblock may have gotten out of sync. Recomputation
 	 * can take a long time and can be deferred for background
 	 * fsck.  However, the old behavior of scanning the cylinder
 	 * groups and recalculating them at mount time is available
 	 * by setting vfs.ffs.compute_summary_at_mount to one.
 	 */
 	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 		return (0);
 	bzero(&cstotal, sizeof cstotal);
 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 		    fs->fs_cgsize, cred, &bp)) != 0) {
 			brelse(bp);
 			return (error);
 		}
 		cgp = (struct cg *)bp->b_data;
 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
 		brelse(bp);
 	}
 #ifdef DEBUG
 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 #endif
 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 	return (0);
 }
 
 /*
  * Protecting the freemaps (or bitmaps).
  * 
  * To eliminate the need to execute fsck before mounting a filesystem
  * after a power failure, one must (conservatively) guarantee that the
  * on-disk copy of the bitmaps never indicate that a live inode or block is
  * free.  So, when a block or inode is allocated, the bitmap should be
  * updated (on disk) before any new pointers.  When a block or inode is
  * freed, the bitmap should not be updated until all pointers have been
  * reset.  The latter dependency is handled by the delayed de-allocation
  * approach described below for block and inode de-allocation.  The former
  * dependency is handled by calling the following procedure when a block or
  * inode is allocated. When an inode is allocated an "inodedep" is created
  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
  * Each "inodedep" is also inserted into the hash indexing structure so
  * that any additional link additions can be made dependent on the inode
  * allocation.
  * 
  * The ufs filesystem maintains a number of free block counts (e.g., per
  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
  * in addition to the bitmaps.  These counts are used to improve efficiency
  * during allocation and therefore must be consistent with the bitmaps.
  * There is no convenient way to guarantee post-crash consistency of these
  * counts with simple update ordering, for two main reasons: (1) The counts
  * and bitmaps for a single cylinder group block are not in the same disk
  * sector.  If a disk write is interrupted (e.g., by power failure), one may
  * be written and the other not.  (2) Some of the counts are located in the
  * superblock rather than the cylinder group block. So, we focus our soft
  * updates implementation on protecting the bitmaps. When mounting a
  * filesystem, we recompute the auxiliary counts from the bitmaps.
  */
 
 /*
  * Called just after updating the cylinder group block to allocate an inode.
  */
 /*
  * bp:      buffer for the cylinder group block holding the inode map
  * ip:      inode related to the allocation
  * newinum: number of the inode being allocated
  */
 void
 softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum)
 {
 	struct bmsafemap *bmsafemap;
 	struct inodedep *inodedep;
 
 	/*
 	 * A brand new inode must not have a dependency yet; finding one
 	 * means something is seriously wrong. Otherwise the new inodedep
 	 * is hung off the buffer holding the cylinder group map the
 	 * inode was allocated from, with DEPCOMPLETE cleared.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), newinum,
 	    DEPALLOC | NODELAY, &inodedep) != 0)
 		panic("softdep_setup_inomapdep: dependency for new inode "
 		    "already exists");
 	inodedep->id_buf = bp;
 	inodedep->id_state &= ~DEPCOMPLETE;
 	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
 	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called just after updating the cylinder group block to
  * allocate block or fragment.
  */
 /*
  * bp:       buffer for the cylinder group block holding the block map
  * mp:       filesystem doing the allocation
  * newblkno: number of the newly allocated block
  */
 void
 softdep_setup_blkmapdep(struct buf *bp, struct mount *mp,
     ufs2_daddr_t newblkno)
 {
 	struct bmsafemap *bmsafemap;
 	struct newblk *newblk;
 	struct fs *fs;
 
 	fs = VFSTOUFS(mp)->um_fs;
 	/*
 	 * Record a dependency for the new block and attach it to the
 	 * buffer holding the cylinder group map it was allocated from.
 	 * An already existing newblk for this block is a fatal error.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
 	bmsafemap = bmsafemap_lookup(mp, bp);
 	newblk->nb_bmsafemap = bmsafemap;
 	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Find the bmsafemap associated with a cylinder group buffer.
  * If none exists, create one. The buffer must be locked when
  * this routine is called, and the softdep lock (lk) must be
  * held.
  */
 static struct bmsafemap *
 bmsafemap_lookup(mp, bp)
 	struct mount *mp;
 	struct buf *bp;
 {
 	struct bmsafemap *bmsafemap;
 	struct worklist *wk;
 
 	mtx_assert(&lk, MA_OWNED);
 	/* Return the D_BMSAFEMAP item already attached to the buffer, if any. */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
 		if (wk->wk_type == D_BMSAFEMAP)
 			return (WK_BMSAFEMAP(wk));
 	/*
 	 * None attached: allocate and initialize one with the lock
 	 * released, then attach it to the buffer under the lock.
 	 * NOTE(review): bp->b_dep is not searched again after the lock
 	 * is re-acquired; presumably the caller's lock on the buffer
 	 * prevents another bmsafemap from being attached meanwhile.
 	 */
 	FREE_LOCK(&lk);
 	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
 		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 	bmsafemap->sm_buf = bp;
 	LIST_INIT(&bmsafemap->sm_allocdirecthd);
 	LIST_INIT(&bmsafemap->sm_allocindirhd);
 	LIST_INIT(&bmsafemap->sm_inodedephd);
 	LIST_INIT(&bmsafemap->sm_newblkhd);
 	ACQUIRE_LOCK(&lk);
 	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 	return (bmsafemap);
 }
 
 /*
  * Direct block allocation dependencies.
  * 
  * When a new block is allocated, the corresponding disk locations must be
  * initialized (with zeros or new data) before the on-disk inode points to
  * them.  Also, the freemap from which the block was allocated must be
  * updated (on disk) before the inode's pointer. These two dependencies are
  * independent of each other and are needed for all file blocks and indirect
  * blocks that are pointed to directly by the inode.  Just before the
  * "in-core" version of the inode is updated with a newly allocated block
  * number, a procedure (below) is called to setup allocation dependency
  * structures.  These structures are removed when the corresponding
  * dependencies are satisfied or when the block allocation becomes obsolete
  * (i.e., the file is deleted, the block is de-allocated, or the block is a
  * fragment that gets upgraded).  All of these cases are handled in
  * procedures described later.
  * 
  * When a file extension causes a fragment to be upgraded, either to a larger
  * fragment or to a full block, the on-disk location may change (if the
  * previous fragment could not simply be extended). In this case, the old
  * fragment must be de-allocated, but not until after the inode's pointer has
  * been updated. In most cases, this is handled by later procedures, which
  * will construct a "freefrag" structure to be added to the workitem queue
  * when the inode update is complete (or obsolete).  The main exception to
  * this is when an allocation occurs while a pending allocation dependency
  * (for the same block pointer) remains.  This case is handled in the main
  * allocation dependency setup procedure by immediately freeing the
  * unreferenced fragments.
  */ 
 void 
 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;	/* inode to which block is being added */
 	ufs_lbn_t lbn;		/* block pointer within inode */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
 	long newsize;		/* size of new block */
 	long oldsize;		/* size of previous block */
 	struct buf *bp;		/* bp for allocated block */
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct bmsafemap *bmsafemap;
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct newblk *newblk;
 	struct mount *mp;
 
 	/*
 	 * Allocate and fill in the allocdirect before taking the softdep
 	 * lock. A freefrag for the old block is only created when the
 	 * block number actually changes.
 	 */
 	mp = UFSTOVFS(ip->i_ump);
 	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 	adp->ad_lbn = lbn;
 	adp->ad_newblkno = newblkno;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 	adp->ad_state = ATTACHED;
 	LIST_INIT(&adp->ad_newdirblk);
 	if (newblkno == oldblkno)
 		adp->ad_freefrag = NULL;
 	else
 		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 
 	ACQUIRE_LOCK(&lk);
 	if (lbn >= NDADDR) {
 		/* allocating an indirect block */
 		if (oldblkno != 0)
 			panic("softdep_setup_allocdirect: non-zero indir");
 	} else {
 		/*
 		 * Allocating a direct block.
 		 *
 		 * If we are allocating a directory block, then we must
 		 * allocate an associated pagedep to track additions and
 		 * deletions.
 		 */
 		if ((ip->i_mode & IFMT) == IFDIR &&
 		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	}
 	/*
 	 * The newblk for this block must already be hashed. Inherit its
 	 * bitmap dependency state, then discard it.
 	 */
 	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocdirect: lost block");
 	if (newblk->nb_state == DEPCOMPLETE) {
 		adp->ad_state |= DEPCOMPLETE;
 		adp->ad_buf = NULL;
 	} else {
 		/* Take the newblk's place on the bmsafemap's lists. */
 		bmsafemap = newblk->nb_bmsafemap;
 		adp->ad_buf = bmsafemap->sm_buf;
 		LIST_REMOVE(newblk, nb_deps);
 		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 	}
 	LIST_REMOVE(newblk, nb_hash);
 	FREE(newblk, M_NEWBLK);
 
 	/* Find or create the inodedep and attach the allocdirect to bp. */
 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 	adp->ad_inodedep = inodedep;
 	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newinoupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		/* An existing entry for the same lbn is merged away. */
 		if (oldadp != NULL && oldadp->ad_lbn == lbn)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(&lk);
 		return;
 	}
 	/* Find the first entry whose lbn is not below the new one. */
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_lbn >= lbn)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocdirect: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_lbn == lbn)
 		allocdirect_merge(adphead, adp, oldadp);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Replace an old allocdirect dependency with a newer one.
  * This routine must be called with the softdep lock (lk) held.
  */
 static void
 allocdirect_merge(adphead, newadp, oldadp)
 	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
 	struct allocdirect *newadp;	/* allocdirect being added */
 	struct allocdirect *oldadp;	/* existing allocdirect being checked */
 {
 	struct worklist *wk;
 	struct freefrag *freefrag;
 	struct newdirblk *newdirblk;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * Sanity check: the new dependency must replace exactly the block
 	 * and size that the old one installed, and must be for a direct
 	 * block. The panic message reports only the block numbers and
 	 * sizes, not the lbn.
 	 */
 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 	    newadp->ad_oldsize != oldadp->ad_newsize ||
 	    newadp->ad_lbn >= NDADDR)
 		panic("%s %jd != new %jd || old size %ld != new %ld",
 		    "allocdirect_merge: old blkno",
 		    (intmax_t)newadp->ad_oldblkno,
 		    (intmax_t)oldadp->ad_newblkno,
 		    newadp->ad_oldsize, oldadp->ad_newsize);
 	/* The new dependency takes over the old one's "previous" block. */
 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
 	newadp->ad_oldsize = oldadp->ad_oldsize;
 	/*
 	 * If the old dependency had a fragment to free or had never
 	 * previously had a block allocated, then the new dependency
 	 * can immediately post its freefrag and adopt the old freefrag.
 	 * This action is done by swapping the freefrag dependencies.
 	 * The new dependency gains the old one's freefrag, and the
 	 * old one gets the new one and then immediately puts it on
 	 * the worklist when it is freed by free_allocdirect. It is
 	 * not possible to do this swap when the old dependency had a
 	 * non-zero size but no previous fragment to free. This condition
 	 * arises when the new block is an extension of the old block.
 	 * Here, the first part of the fragment allocated to the new
 	 * dependency is part of the block currently claimed on disk by
 	 * the old dependency, so cannot legitimately be freed until the
 	 * conditions for the new dependency are fulfilled.
 	 */
 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 		freefrag = newadp->ad_freefrag;
 		newadp->ad_freefrag = oldadp->ad_freefrag;
 		oldadp->ad_freefrag = freefrag;
 	}
 	/*
 	 * If we are tracking a new directory-block allocation,
 	 * move it from the old allocdirect to the new allocdirect.
 	 * At most one newdirblk is expected on the old allocdirect.
 	 */
 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
 		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 			panic("allocdirect_merge: extra newdirblk");
 		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
 	}
 	/* The old dependency is now redundant. */
 	free_allocdirect(adphead, oldadp, 0);
 }
 		
 /*
  * Allocate a new freefrag structure if needed.
  */
 static struct freefrag *
 newfreefrag(struct inode *ip, ufs2_daddr_t blkno, long size)
 {
 	struct freefrag *ff;
 	struct fs *fs;
 
 	/* No previous block means there is nothing to free later. */
 	if (blkno == 0)
 		return (NULL);
 	fs = ip->i_fs;
 	/* The fragment run must not extend past the end of its block. */
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 	MALLOC(ff, struct freefrag *, sizeof(struct freefrag),
 		M_FREEFRAG, M_SOFTDEP_FLAGS);
 	workitem_alloc(&ff->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
 	ff->ff_inum = ip->i_number;
 	ff->ff_blkno = blkno;
 	ff->ff_fragsize = size;
 	return (ff);
 }
 
 /*
  * This workitem de-allocates fragments that were replaced during
  * file block allocation.
  */
 static void
 handle_workitem_freefrag(struct freefrag *freefrag)
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 	/* Give the replaced fragment back to the filesystem. */
 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 	    freefrag->ff_fragsize, freefrag->ff_inum);
 	/* The work item itself is released under the softdep lock. */
 	ACQUIRE_LOCK(&lk);
 	WORKITEM_FREE(freefrag, D_FREEFRAG);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Set up a dependency structure for an external attributes data block.
  * This routine follows much of the structure of softdep_setup_allocdirect.
  * See the description of softdep_setup_allocdirect above for details.
  *
  * ip is the inode owning the block, lbn the logical block number
  * (must be below NXADDR), newblkno/oldblkno the new and previous disk
  * addresses, newsize/oldsize the corresponding sizes, and bp the buffer
  * whose b_dep list receives the new allocdirect. The routine takes and
  * releases lk itself.
  */
 void 
 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct bmsafemap *bmsafemap;
 	struct inodedep *inodedep;
 	struct newblk *newblk;
 	struct mount *mp;
 
 	/*
 	 * Allocate and fill in the allocdirect before taking lk. It is
 	 * tagged EXTDATA in addition to ATTACHED.
 	 */
 	mp = UFSTOVFS(ip->i_ump);
 	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 	adp->ad_lbn = lbn;
 	adp->ad_newblkno = newblkno;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 	adp->ad_state = ATTACHED | EXTDATA;
 	LIST_INIT(&adp->ad_newdirblk);
 	/*
 	 * Only when the disk address changed is a freefrag created for
 	 * the old block (newfreefrag itself returns NULL for oldblkno 0).
 	 */
 	if (newblkno == oldblkno)
 		adp->ad_freefrag = NULL;
 	else
 		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 
 	ACQUIRE_LOCK(&lk);
 	/* A newblk for newblkno must already exist. */
 	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocext: lost block");
 
 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 	adp->ad_inodedep = inodedep;
 
 	/*
 	 * Take over the newblk's state: either it is already
 	 * DEPCOMPLETE, or the allocdirect replaces it on the bmsafemap
 	 * (recording that map's buffer). The newblk is then discarded.
 	 */
 	if (newblk->nb_state == DEPCOMPLETE) {
 		adp->ad_state |= DEPCOMPLETE;
 		adp->ad_buf = NULL;
 	} else {
 		bmsafemap = newblk->nb_bmsafemap;
 		adp->ad_buf = bmsafemap->sm_buf;
 		LIST_REMOVE(newblk, nb_deps);
 		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 	}
 	LIST_REMOVE(newblk, nb_hash);
 	FREE(newblk, M_NEWBLK);
 
 	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 	/*
 	 * NOTE(review): the test is lbn >= NXADDR while the message
 	 * prints ">"; the check also runs only after adp has been
 	 * linked onto bp->b_dep.
 	 */
 	if (lbn >= NXADDR)
 		panic("softdep_setup_allocext: lbn %lld > NXADDR",
 		    (long long)lbn);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newextupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		/* An existing entry for the same lbn is folded into adp. */
 		if (oldadp != NULL && oldadp->ad_lbn == lbn)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(&lk);
 		return;
 	}
 	/* Find the first entry whose lbn is not below the new one. */
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_lbn >= lbn)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocext: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_lbn == lbn)
 		allocdirect_merge(adphead, adp, oldadp);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Indirect block allocation dependencies.
  * 
  * The same dependencies that exist for a direct block also exist when
  * a new block is allocated and pointed to by an entry in a block of
  * indirect pointers. The undo/redo states described above are also
  * used here. Because an indirect block contains many pointers that
  * may have dependencies, a second copy of the entire in-memory indirect
  * block is kept. The buffer cache copy is always completely up-to-date.
  * The second copy, which is used only as a source for disk writes,
  * contains only the safe pointers (i.e., those that have no remaining
  * update dependencies). The second copy is freed when all pointers
  * are safe. The cache is not allowed to replace indirect blocks with
  * pending update dependencies. If a buffer containing an indirect
  * block with dependencies is written, these routines will mark it
  * dirty again. It can only be successfully written once all the
  * dependencies are removed. The ffs_fsync routine in conjunction with
  * softdep_sync_metadata work together to get all the dependencies
  * removed so that a file can be successfully written to disk. Three
  * procedures are used when setting up indirect block pointer
  * dependencies. The division is necessary because of the organization
  * of the "balloc" routine and because of the distinction between file
  * pages and file metadata blocks.
  */
 
 /*
  * Allocate and initialise an allocindir describing the pointer at
  * offset "ptrno" of an indirect block changing from "oldblkno" to
  * "newblkno". A freefrag for the old block (full block size) is
  * attached via newfreefrag, which yields NULL when oldblkno is zero.
  */
 static struct allocindir *
 newallocindir(ip, ptrno, newblkno, oldblkno)
 	struct inode *ip;	/* inode for file being extended */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 {
 	struct allocindir *newaip;
 
 	/* M_ZERO leaves every field not set below cleared. */
 	MALLOC(newaip, struct allocindir *, sizeof(*newaip), M_ALLOCINDIR,
 	    M_SOFTDEP_FLAGS | M_ZERO);
 	workitem_alloc(&newaip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
 
 	newaip->ai_state = ATTACHED;
 	newaip->ai_offset = ptrno;
 	newaip->ai_oldblkno = oldblkno;
 	newaip->ai_newblkno = newblkno;
 	newaip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
 
 	return (newaip);
 }
 
 /*
  * Called just before an indirect block pointer is set to reference a
  * newly allocated file page. Builds the allocindir for that pointer,
  * hangs it on the new page's buffer and finishes its setup against
  * the buffer of the indirect block.
  */
 void
 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 	struct inode *ip;	/* inode for file being extended */
 	ufs_lbn_t lbn;		/* allocated block number within file */
 	struct buf *bp;		/* buffer with indirect blk referencing page */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 	struct buf *nbp;	/* buffer holding allocated page */
 {
 	struct pagedep *pagedep;
 	struct allocindir *newaip;
 
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 
 	/* The allocindir is allocated before lk is taken. */
 	newaip = newallocindir(ip, ptrno, newblkno, oldblkno);
 
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * A directory page additionally needs a pagedep to track
 	 * additions and deletions. It is linked to the page's buffer
 	 * only when pagedep_lookup returns zero.
 	 */
 	if ((ip->i_mode & IFMT) == IFDIR) {
 		if (pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) {
 			WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
 		}
 	}
 	WORKLIST_INSERT(&nbp->b_dep, &newaip->ai_list);
 	setup_allocindir_phase2(bp, ip, newaip);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called just before an indirect block pointer is set to reference a
  * newly allocated indirect block. There is never a previous block in
  * this case, so the allocindir is created with an old block of zero.
  */
 void
 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 	struct buf *nbp;	/* newly allocated indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct buf *bp;		/* indirect block referencing allocated block */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 {
 	struct allocindir *newaip;
 
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 
 	/* Allocation happens before lk is taken. */
 	newaip = newallocindir(ip, ptrno, newblkno, 0);
 
 	/* Attach to the new block's buffer, then finish setup under lk. */
 	ACQUIRE_LOCK(&lk);
 	WORKLIST_INSERT(&nbp->b_dep, &newaip->ai_list);
 	setup_allocindir_phase2(bp, ip, newaip);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called to finish the allocation of the "aip" allocated
  * by one of the two routines above.
  *
  * Locking: lk must be held on entry (asserted) and is held again on
  * return, but it is dropped and retaken inside the loop, both to run
  * handle_workitem_freefrag and to allocate a new indirdep together
  * with its save buffer.
  *
  * The loop makes at most two useful passes. The first pass looks for
  * an indirdep already attached to bp; when none exists a candidate
  * (newindirdep) is built with lk released and the search is repeated.
  * On the repeat either the candidate is installed, or an indirdep that
  * appeared in the meantime is used and the candidate is discarded.
  */
 static void 
 setup_allocindir_phase2(bp, ip, aip)
 	struct buf *bp;		/* in-memory copy of the indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct allocindir *aip;	/* allocindir allocated by the above routines */
 {
 	struct worklist *wk;
 	struct indirdep *indirdep, *newindirdep;
 	struct bmsafemap *bmsafemap;
 	struct allocindir *oldaip;
 	struct freefrag *freefrag;
 	struct newblk *newblk;
 	ufs2_daddr_t blkno;
 
 	mtx_assert(&lk, MA_OWNED);
 	/* Indirect blocks are identified by a negative logical block number. */
 	if (bp->b_lblkno >= 0)
 		panic("setup_allocindir_phase2: not indir blk");
 	for (indirdep = NULL, newindirdep = NULL; ; ) {
 		/* Look for an indirdep already hanging off this buffer. */
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			if (wk->wk_type != D_INDIRDEP)
 				continue;
 			indirdep = WK_INDIRDEP(wk);
 			break;
 		}
 		/* None found: install the one built on the previous pass. */
 		if (indirdep == NULL && newindirdep) {
 			indirdep = newindirdep;
 			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 			newindirdep = NULL;
 		}
 		if (indirdep) {
 			/*
 			 * Transfer the newblk's state to the allocindir
 			 * and discard the newblk.
 			 */
 			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
 			    &newblk) == 0)
 				panic("setup_allocindir: lost block");
 			if (newblk->nb_state == DEPCOMPLETE) {
 				aip->ai_state |= DEPCOMPLETE;
 				aip->ai_buf = NULL;
 			} else {
 				bmsafemap = newblk->nb_bmsafemap;
 				aip->ai_buf = bmsafemap->sm_buf;
 				LIST_REMOVE(newblk, nb_deps);
 				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
 				    aip, ai_deps);
 			}
 			LIST_REMOVE(newblk, nb_hash);
 			FREE(newblk, M_NEWBLK);
 			aip->ai_indirdep = indirdep;
 			/*
 			 * Check to see if there is an existing dependency
 			 * for this block. If there is, merge the old
 			 * dependency into the new one.
 			 */
 			if (aip->ai_oldblkno == 0)
 				oldaip = NULL;
 			else
 
 				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
 					if (oldaip->ai_offset == aip->ai_offset)
 						break;
 			freefrag = NULL;
 			if (oldaip != NULL) {
 				if (oldaip->ai_newblkno != aip->ai_oldblkno)
 					panic("setup_allocindir_phase2: blkno");
 				/*
 				 * Inherit the older entry's old block and
 				 * freefrag; aip's own freefrag is set aside
 				 * and handled once lk has been dropped.
 				 */
 				aip->ai_oldblkno = oldaip->ai_oldblkno;
 				freefrag = aip->ai_freefrag;
 				aip->ai_freefrag = oldaip->ai_freefrag;
 				oldaip->ai_freefrag = NULL;
 				free_allocindir(oldaip, NULL);
 			}
 			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 			/*
 			 * Store the old block number at this offset in the
 			 * save buffer, using the pointer width of the
 			 * filesystem format.
 			 */
 			if (ip->i_ump->um_fstype == UFS1)
 				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
 				    [aip->ai_offset] = aip->ai_oldblkno;
 			else
 				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
 				    [aip->ai_offset] = aip->ai_oldblkno;
 			FREE_LOCK(&lk);
 			if (freefrag != NULL)
 				handle_workitem_freefrag(freefrag);
 		} else
 			FREE_LOCK(&lk);
 		/* lk is not held at this point on either path. */
 		if (newindirdep) {
 			/*
 			 * The candidate was not installed above; release
 			 * its save buffer and free it.
 			 */
 			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
 			brelse(newindirdep->ir_savebp);
 			ACQUIRE_LOCK(&lk);
 			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
 			if (indirdep)
 				break;
 			FREE_LOCK(&lk);
 		}
 		if (indirdep) {
 			/* Done; retake lk for the caller. */
 			ACQUIRE_LOCK(&lk);
 			break;
 		}
 		/*
 		 * No indirdep exists yet. Build a candidate, including a
 		 * save buffer holding a copy of bp's data, without holding
 		 * lk, then go around again.
 		 */
 		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
 			M_INDIRDEP, M_SOFTDEP_FLAGS);
 		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
 		    UFSTOVFS(ip->i_ump));
 		newindirdep->ir_state = ATTACHED;
 		if (ip->i_ump->um_fstype == UFS1)
 			newindirdep->ir_state |= UFS1FMT;
 		LIST_INIT(&newindirdep->ir_deplisthd);
 		LIST_INIT(&newindirdep->ir_donehd);
 		/*
 		 * b_blkno still equal to b_lblkno is taken to mean that the
 		 * buffer has not been mapped yet, so map it now.
 		 */
 		if (bp->b_blkno == bp->b_lblkno) {
 			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 			    NULL, NULL);
 			bp->b_blkno = blkno;
 		}
 		/* The save buffer is obtained from the device vnode. */
 		newindirdep->ir_savebp =
 		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 		BUF_KERNPROC(newindirdep->ir_savebp);
 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 		ACQUIRE_LOCK(&lk);
 	}
 }
 
 /*
  * Block de-allocation dependencies.
  * 
  * When blocks are de-allocated, the on-disk pointers must be nullified before
  * the blocks are made available for use by other files.  (The true
  * requirement is that old pointers must be nullified before new on-disk
  * pointers are set.  We chose this slightly more stringent requirement to
  * reduce complexity.) Our implementation handles this dependency by updating
  * the inode (or indirect block) appropriately but delaying the actual block
  * de-allocation (i.e., freemap and free space count manipulation) until
  * after the updated versions reach stable storage.  After the disk is
  * updated, the blocks can be safely de-allocated whenever it is convenient.
  * This implementation handles only the common case of reducing a file's
  * length to zero. Other cases are handled by the conventional synchronous
  * write approach.
  *
  * The ffs implementation with which we worked double-checks
  * the state of the block pointers and file size as it reduces
  * a file's length.  Some of this code is replicated here in our
  * soft updates implementation.  The freeblks->fb_chkcnt field is
  * used to transfer a part of this information to the procedure
  * that eventually de-allocates the blocks.
  *
  * This routine should be called from the routine that shortens
  * a file's length, before the inode's size or block pointers
  * are modified. It will save the block pointer information for
  * later release and zero the inode so that the calling routine
  * can release it.
  */
 void
 softdep_setup_freeblocks(ip, length, flags)
 	struct inode *ip;	/* The inode whose length is to be reduced */
 	off_t length;		/* The new length for the file */
 	int flags;		/* IO_EXT and/or IO_NORMAL */
 {
 	struct freeblks *freeblks;
 	struct inodedep *inodedep;
 	struct allocdirect *adp;
 	struct vnode *vp;
 	struct buf *bp;
 	struct fs *fs;
 	ufs2_daddr_t extblocks, datablocks;
 	struct mount *mp;
 	int i, delay, error;
 
 	fs = ip->i_fs;
 	mp = UFSTOVFS(ip->i_ump);
 	/* Only truncation to zero length is supported here. */
 	if (length != 0)
 		panic("softdep_setup_freeblocks: non-zero length");
 	/*
 	 * Allocate the freeblks work item and record the identity of
 	 * the inode being truncated.
 	 */
 	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
 		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
 	freeblks->fb_state = ATTACHED;
 	freeblks->fb_uid = ip->i_uid;
 	freeblks->fb_previousinum = ip->i_number;
 	freeblks->fb_devvp = ip->i_devvp;
 	/*
 	 * Split the inode's block count into the part used by the
 	 * extended attribute area (UFS2 only) and the rest.
 	 */
 	extblocks = 0;
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	datablocks = DIP(ip, i_blocks) - extblocks;
 	if ((flags & IO_NORMAL) == 0) {
 		freeblks->fb_oldsize = 0;
 		freeblks->fb_chkcnt = 0;
 	} else {
 		/*
 		 * Save the size and all direct and indirect block
 		 * pointers in the freeblks, zeroing them in the inode.
 		 */
 		freeblks->fb_oldsize = ip->i_size;
 		ip->i_size = 0;
 		DIP_SET(ip, i_size, 0);
 		freeblks->fb_chkcnt = datablocks;
 		for (i = 0; i < NDADDR; i++) {
 			freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
 			DIP_SET(ip, i_db[i], 0);
 		}
 		for (i = 0; i < NIADDR; i++) {
 			freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
 			DIP_SET(ip, i_ib[i], 0);
 		}
 		/*
 		 * If the file was removed, then the space being freed was
 		 * accounted for then (see softdep_releasefile()). If the
 		 * file is merely being truncated, then we account for it now.
 		 */
 		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 			UFS_LOCK(ip->i_ump);
 			fs->fs_pendingblocks += datablocks;
 			UFS_UNLOCK(ip->i_ump);
 		}
 	}
 	if ((flags & IO_EXT) == 0) {
 		freeblks->fb_oldextsize = 0;
 	} else {
 		/* Likewise for the extended attribute block pointers. */
 		freeblks->fb_oldextsize = ip->i_din2->di_extsize;
 		ip->i_din2->di_extsize = 0;
 		freeblks->fb_chkcnt += extblocks;
 		for (i = 0; i < NXADDR; i++) {
 			freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
 			ip->i_din2->di_extb[i] = 0;
 		}
 	}
 	/* The inode's block count drops by everything handed to freeblks. */
 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
 	/*
 	 * Push the zero'ed inode to its disk buffer so that we are free
 	 * to delete its dependencies below. Once the dependencies are gone
 	 * the buffer can be safely released.
 	 *
 	 * NOTE(review): on a bread error bp is released with brelse()
 	 * but is still written through and passed to bdwrite() below,
 	 * unless softdep_error does not return - confirm.
 	 */
 	if ((error = bread(ip->i_devvp,
 	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 		brelse(bp);
 		softdep_error("softdep_setup_freeblocks", error);
 	}
 	/* Copy the in-core dinode into its slot in the inode block. */
 	if (ip->i_ump->um_fstype == UFS1)
 		*((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 	else
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 	/*
 	 * Find and eliminate any inode dependencies.
 	 */
 	ACQUIRE_LOCK(&lk);
 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
 	 * Add the freeblks structure to the list of operations that
 	 * must await the zero'ed inode being written to disk. If we
 	 * still have a bitmap dependency (delay == 0), then the inode
 	 * has never been written to disk, so we can process the
 	 * freeblks below once we have deleted the dependencies.
 	 */
 	delay = (inodedep->id_state & DEPCOMPLETE);
 	if (delay)
 		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
 	/*
 	 * Because the file length has been truncated to zero, any
 	 * pending block allocation dependency structures associated
 	 * with this inode are obsolete and can simply be de-allocated.
 	 * We must first merge the two dependency lists to get rid of
 	 * any duplicate freefrag structures, then purge the merged list.
 	 * If we still have a bitmap dependency, then the inode has never
 	 * been written to disk, so we can free any fragments without delay.
 	 */
 	if (flags & IO_NORMAL) {
 		merge_inode_lists(&inodedep->id_newinoupdt,
 		    &inodedep->id_inoupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
 			free_allocdirect(&inodedep->id_inoupdt, adp, delay);
 	}
 	if (flags & IO_EXT) {
 		merge_inode_lists(&inodedep->id_newextupdt,
 		    &inodedep->id_extupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
 			free_allocdirect(&inodedep->id_extupdt, adp, delay);
 	}
 	FREE_LOCK(&lk);
 	bdwrite(bp);
 	/*
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 * Once they are all there, walk the list and get rid of
 	 * any dependencies.
 	 */
 	vp = ITOV(ip);
 	VI_LOCK(vp);
 	drain_output(vp);
 restart:
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		/*
 		 * Skip buffers of the area (BX_ALTDATA set or clear)
 		 * that the flags did not ask to be truncated.
 		 */
 		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & IO_NORMAL) == 0 &&
 		      (bp->b_xflags & BX_ALTDATA) == 0))
 			continue;
 		/*
 		 * The scan is restarted from the head both when the
 		 * buffer could not be obtained and after each buffer
 		 * has been processed, since the vnode interlock is
 		 * dropped while the buffer is handled.
 		 */
 		if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
 			goto restart;
 		VI_UNLOCK(vp);
 		ACQUIRE_LOCK(&lk);
 		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
 		deallocate_dependencies(bp, inodedep);
 		FREE_LOCK(&lk);
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 		VI_LOCK(vp);
 		goto restart;
 	}
 	VI_UNLOCK(vp);
 	ACQUIRE_LOCK(&lk);
 	/* Try to free the inodedep now; free_inodedep declines if busy. */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 
 	if(delay) {
 		freeblks->fb_state |= DEPCOMPLETE;
 		/*
 		 * If the inode with zeroed block pointers is now on disk
 		 * we can start freeing blocks. Add freeblks to the worklist
 		 * instead of calling  handle_workitem_freeblocks directly as
 		 * it is more likely that additional IO is needed to complete
 		 * the request here than in the !delay case.
 		 */
 		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 			add_to_worklist(&freeblks->fb_list);
 	}
 
 	FREE_LOCK(&lk);
 	/*
 	 * If the inode has never been written to disk (delay == 0),
 	 * then we can process the freeblks now that we have deleted
 	 * the dependencies.
 	 */
 	if (!delay)
 		handle_workitem_freeblocks(freeblks, 0);
 }
 
 /*
  * Reclaim any dependency structures from a buffer that is about to
  * be reallocated to a new vnode. The buffer must be locked, thus,
  * no I/O completion operations can occur while we are manipulating
  * its associated dependencies. The mutex is held so that other I/O's
  * associated with related dependencies do not occur.
  *
  * bp is the buffer whose b_dep list is drained. inodedep is the
  * inodedep of the owning inode, or NULL if there is none; directory
  * removals that must wait for the inode are queued on its id_bufwait
  * list. The caller must hold lk (asserted below).
  */
 static void
 deallocate_dependencies(bp, inodedep)
 	struct buf *bp;
 	struct inodedep *inodedep;
 {
 	struct worklist *wk;
 	struct indirdep *indirdep;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct dirrem *dirrem;
 	struct diradd *dap;
 	int i;
 
 	mtx_assert(&lk, MA_OWNED);
 	/* Each case removes the head item, so re-read the head each time. */
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		switch (wk->wk_type) {
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			/*
 			 * None of the indirect pointers will ever be visible,
 			 * so they can simply be tossed. GOINGAWAY ensures
 			 * that allocated pointers will be saved in the buffer
 			 * cache until they are freed. Note that they will
 			 * only be able to be found by their physical address
 			 * since the inode mapping the logical address will
 			 * be gone. The save buffer used for the safe copy
 			 * was allocated in setup_allocindir_phase2 using
 			 * the physical address so it could be used for this
 			 * purpose. Hence we swap the safe copy with the real
 			 * copy, allowing the safe copy to be freed and holding
 			 * on to the real copy for later use in indir_trunc.
 			 */
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("deallocate_dependencies: already gone");
 			indirdep->ir_state |= GOINGAWAY;
 			VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
 			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
 				free_allocindir(aip, inodedep);
 			if (bp->b_lblkno >= 0 ||
 			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 				panic("deallocate_dependencies: not indir");
 			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
 			    bp->b_bcount);
 			/* The indirdep now travels with the save buffer. */
 			WORKLIST_REMOVE(wk);
 			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
 			continue;
 
 		case D_PAGEDEP:
 			pagedep = WK_PAGEDEP(wk);
 			/*
 			 * None of the directory additions will ever be
 			 * visible, so they can simply be tossed.
 			 */
 			for (i = 0; i < DAHASHSZ; i++)
 				while ((dap =
 				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
 					free_diradd(dap);
 			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
 				free_diradd(dap);
 			/*
 			 * Copy any directory remove dependencies to the list
 			 * to be processed after the zero'ed inode is written.
 			 * If the inode has already been written, then they
 			 * can be dumped directly onto the work list.
 			 *
 			 * Every dirrem is unlinked as it is visited, so the
 			 * list is drained from its head instead of following
 			 * the link of an element that was just removed.
 			 */
 			while ((dirrem =
 			    LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 				LIST_REMOVE(dirrem, dm_next);
 				dirrem->dm_dirinum = pagedep->pd_ino;
 				if (inodedep == NULL ||
 				    (inodedep->id_state & ALLCOMPLETE) ==
 				     ALLCOMPLETE)
 					add_to_worklist(&dirrem->dm_list);
 				else
 					WORKLIST_INSERT(&inodedep->id_bufwait,
 					    &dirrem->dm_list);
 			}
 			if ((pagedep->pd_state & NEWBLOCK) != 0) {
 				/*
 				 * The newdirblk for this pagedep is looked
 				 * up on the inodedep's id_bufwait list, so
 				 * without an inodedep it cannot be found.
 				 */
 				if (inodedep == NULL)
 					panic("deallocate_dependencies: "
 					      "lost pagedep");
 				LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
 					if (wk->wk_type == D_NEWDIRBLK &&
 					    WK_NEWDIRBLK(wk)->db_pagedep ==
 					      pagedep)
 						break;
 				if (wk != NULL) {
 					WORKLIST_REMOVE(wk);
 					free_newdirblk(WK_NEWDIRBLK(wk));
 				} else
 					panic("deallocate_dependencies: "
 					      "lost pagedep");
 			}
 			WORKLIST_REMOVE(&pagedep->pd_list);
 			LIST_REMOVE(pagedep, pd_hash);
 			WORKITEM_FREE(pagedep, D_PAGEDEP);
 			continue;
 
 		case D_ALLOCINDIR:
 			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
 			continue;
 
 		case D_ALLOCDIRECT:
 		case D_INODEDEP:
 			panic("deallocate_dependencies: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 
 		default:
 			panic("deallocate_dependencies: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 }
 
 /*
  * Free an allocdirect. Generate a new freefrag work request if appropriate.
  * This routine must be called with splbio interrupts blocked.
  */
 static void
 free_allocdirect(adphead, adp, delay)
 	struct allocdirectlst *adphead;
 	struct allocdirect *adp;
 	int delay;
 {
 	struct newdirblk *newdirblk;
 	struct worklist *wk;
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((adp->ad_state & DEPCOMPLETE) == 0)
 		LIST_REMOVE(adp, ad_deps);
 	TAILQ_REMOVE(adphead, adp, ad_next);
 	if ((adp->ad_state & COMPLETE) == 0)
 		WORKLIST_REMOVE(&adp->ad_list);
 	if (adp->ad_freefrag != NULL) {
 		if (delay)
 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 			    &adp->ad_freefrag->ff_list);
 		else
 			add_to_worklist(&adp->ad_freefrag->ff_list);
 	}
 	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
 		if (!LIST_EMPTY(&adp->ad_newdirblk))
 			panic("free_allocdirect: extra newdirblk");
 		if (delay)
 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 			    &newdirblk->db_list);
 		else
 			free_newdirblk(newdirblk);
 	}
 	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 }
 
 /*
  * Free a newdirblk and clear the NEWBLOCK flag on its pagedep. The
  * pagedep itself is freed too when nothing else refers to it.
  * The caller must hold lk (asserted below).
  */
 static void
 free_newdirblk(newdirblk)
 	struct newdirblk *newdirblk;
 {
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	int i;
 
 	mtx_assert(&lk, MA_OWNED);
 
 	pagedep = newdirblk->db_pagedep;
 	pagedep->pd_state &= ~NEWBLOCK;
 	/*
 	 * While the pagedep is still on the directory buffer's
 	 * dependency chain, entries on pd_pendinghd may not have
 	 * reached the disk; they are left for the next write of the
 	 * pagedep. Once it is off that chain the pending entries are
 	 * all committed and can be released here.
 	 */
 	if ((pagedep->pd_state & ONWORKLIST) == 0) {
 		for (;;) {
 			dap = LIST_FIRST(&pagedep->pd_pendinghd);
 			if (dap == NULL)
 				break;
 			free_diradd(dap);
 		}
 	}
 	/*
 	 * Free the pagedep when every diradd hash chain is empty and
 	 * it is not on a work list.
 	 */
 	i = 0;
 	while (i < DAHASHSZ && LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 		i++;
 	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
 		LIST_REMOVE(pagedep, pd_hash);
 		WORKITEM_FREE(pagedep, D_PAGEDEP);
 	}
 	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 }
 
 /*
  * Prepare inode "ino" to be freed. A freefile work item is created;
  * it is processed at once when no inodedep stands in the way, and is
  * otherwise queued until the zero'ed inode has been written to disk.
  */
 void
 softdep_freefile(pvp, ino, mode)
 	struct vnode *pvp;
 	ino_t ino;
 	int mode;
 {
 	struct freefile *ffp;
 	struct inodedep *inodedep;
 	struct inode *ip;
 
 	ip = VTOI(pvp);
 
 	/*
 	 * Build the inode de-allocation dependency.
 	 */
 	MALLOC(ffp, struct freefile *, sizeof(*ffp), M_FREEFILE,
 	    M_SOFTDEP_FLAGS);
 	workitem_alloc(&ffp->fx_list, D_FREEFILE, pvp->v_mount);
 	ffp->fx_devvp = ip->i_devvp;
 	ffp->fx_oldinum = ino;
 	ffp->fx_mode = mode;
 
 	/* Count the pending inode unless that was already done. */
 	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 		UFS_LOCK(ip->i_ump);
 		ip->i_fs->fs_pendinginodes += 1;
 		UFS_UNLOCK(ip->i_ump);
 	}
 
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) != 0 &&
 	    !check_inode_unwritten(inodedep)) {
 		/*
 		 * An inodedep exists and the inode has been written at
 		 * some point, so the free has to wait on the inodedep.
 		 */
 		WORKLIST_INSERT(&inodedep->id_inowait, &ffp->fx_list);
 		FREE_LOCK(&lk);
 		ip->i_flag |= IN_MODIFIED;
 		return;
 	}
 	/*
 	 * Either there is no inodedep, meaning the zero'ed inode is
 	 * already on disk, or the allocated inode was never written
 	 * and so is still zero'ed on disk. Free the file immediately.
 	 */
 	FREE_LOCK(&lk);
 	handle_workitem_freefile(ffp);
 }
 
 /*
  * Check to see if an inode has never been written to disk. If
  * so free the inodedep and return success, otherwise return failure.
  * This routine must be called with lk held (asserted below).
  *
  * If we still have a bitmap dependency, then the inode has never
  * been written to disk. Drop the dependency as it is no longer
  * necessary since the inode is being deallocated. We set the
  * ALLCOMPLETE flags since the bitmap now properly shows that the
  * inode is not allocated. Even if the inode is actively being
  * written, it has been rolled back to its zero'ed state, so we
  * are ensured that a zero inode is what is on the disk. For short
  * lived files, this change will usually result in removing all the
  * dependencies from the inode so that it can be freed immediately.
  *
  * Returns 1 when the inodedep was freed and 0 when it was left alone.
  */
 static int
 check_inode_unwritten(inodedep)
 	struct inodedep *inodedep;
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * Leave the inodedep alone if its bitmap dependency is already
 	 * resolved (DEPCOMPLETE) or if anything is still attached to it.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
 	    !LIST_EMPTY(&inodedep->id_inowait) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    inodedep->id_nlinkdelta != 0)
 		return (0);
 
 	/*
 	 * Another process might be in initiate_write_inodeblock_ufs[12]
 	 * trying to allocate memory without holding "Softdep Lock".
 	 */
 	if ((inodedep->id_state & IOSTARTED) != 0 &&
 	    inodedep->id_savedino1 == NULL)
 		return (0);
 
 	/*
 	 * Mark the inodedep complete and detach it from the bitmap
 	 * dependency list, its buffer and any work list so that
 	 * free_inodedep will accept it.
 	 */
 	inodedep->id_state |= ALLCOMPLETE;
 	LIST_REMOVE(inodedep, id_deps);
 	inodedep->id_buf = NULL;
 	if (inodedep->id_state & ONWORKLIST)
 		WORKLIST_REMOVE(&inodedep->id_list);
 	/* Release the saved inode copy; free_inodedep requires it gone. */
 	if (inodedep->id_savedino1 != NULL) {
 		FREE(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 	}
 	/* Everything was checked above, so the free must succeed. */
 	if (free_inodedep(inodedep) == 0)
 		panic("check_inode_unwritten: busy inode");
 	return (1);
 }
 
 /*
  * Try to free an inodedep structure. It is freed only when it is idle:
  * off every work list, fully complete and with nothing attached.
  * Returns 1 if it was freed and 0 if it had to be kept.
  * The caller must hold lk (asserted below).
  */
 static int
 free_inodedep(inodedep)
 	struct inodedep *inodedep;
 {
 
 	mtx_assert(&lk, MA_OWNED);
 
 	/* Still queued somewhere, or not yet ALLCOMPLETE. */
 	if ((inodedep->id_state & ONWORKLIST) != 0)
 		return (0);
 	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return (0);
 
 	/* Work items are still waiting on this inode. */
 	if (!LIST_EMPTY(&inodedep->id_pendinghd) ||
 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
 	    !LIST_EMPTY(&inodedep->id_inowait))
 		return (0);
 
 	/* Allocation dependencies are still outstanding. */
 	if (!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt))
 		return (0);
 
 	/* A link count adjustment or a saved inode copy remains. */
 	if (inodedep->id_nlinkdelta != 0)
 		return (0);
 	if (inodedep->id_savedino1 != NULL)
 		return (0);
 
 	LIST_REMOVE(inodedep, id_hash);
 	WORKITEM_FREE(inodedep, D_INODEDEP);
 	num_inodedep -= 1;
 	return (1);
 }
 
 /*
  * This workitem routine performs the block de-allocation.
  * The workitem is added to the pending list after the updated
  * inode block has been written to disk.  As mentioned above,
  * checks regarding the number of blocks de-allocated (compared
  * to the number of blocks allocated for the file) are also
  * performed in this function.
  *
  * freeblks holds the saved block pointers and sizes. Of flags only
  * the LK_NOWAIT bit is examined; it is passed on to ffs_vget. The
  * freeblks structure is freed before returning.
  */
 static void
 handle_workitem_freeblocks(freeblks, flags)
 	struct freeblks *freeblks;
 	int flags;
 {
 	struct inode *ip;
 	struct vnode *vp;
 	struct fs *fs;
 	struct ufsmount *ump;
 	int i, nblocks, level, bsize;
 	ufs2_daddr_t bn, blocksreleased = 0;
 	int error, allerror = 0;
 	ufs_lbn_t baselbns[NIADDR], tmpval;
 	int fs_pendingblocks;
 
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	fs_pendingblocks = 0;
 	/*
 	 * baselbns[level] is the logical block number handed to
 	 * indir_trunc for each indirect level: NDADDR for the first,
 	 * then advanced by successive powers of NINDIR(fs).
 	 */
 	tmpval = 1;
 	baselbns[0] = NDADDR;
 	for (i = 1; i < NIADDR; i++) {
 		tmpval *= NINDIR(fs);
 		baselbns[i] = baselbns[i - 1] + tmpval;
 	}
 	/* Size of one full filesystem block, as computed by btodb. */
 	nblocks = btodb(fs->fs_bsize);
 	blocksreleased = 0;
 	/*
 	 * Release all extended attribute blocks or frags.
 	 * These count towards blocksreleased but not towards
 	 * fs_pendingblocks.
 	 */
 	if (freeblks->fb_oldextsize > 0) {
 		for (i = (NXADDR - 1); i >= 0; i--) {
 			if ((bn = freeblks->fb_eblks[i]) == 0)
 				continue;
 			bsize = sblksize(fs, freeblks->fb_oldextsize, i);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 			    freeblks->fb_previousinum);
 			blocksreleased += btodb(bsize);
 		}
 	}
 	/*
 	 * Release all data blocks or frags.
 	 */
 	if (freeblks->fb_oldsize > 0) {
 		/*
 		 * Indirect blocks first.
 		 * indir_trunc is called on each indirect block before
 		 * the block itself is freed; its last error, if any, is
 		 * remembered in allerror.
 		 */
 		for (level = (NIADDR - 1); level >= 0; level--) {
 			if ((bn = freeblks->fb_iblks[level]) == 0)
 				continue;
 			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
 			    level, baselbns[level], &blocksreleased)) != 0)
 				allerror = error;
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
 			    fs->fs_bsize, freeblks->fb_previousinum);
 			fs_pendingblocks += nblocks;
 			blocksreleased += nblocks;
 		}
 		/*
 		 * All direct blocks or frags.
 		 */
 		for (i = (NDADDR - 1); i >= 0; i--) {
 			if ((bn = freeblks->fb_dblks[i]) == 0)
 				continue;
 			bsize = sblksize(fs, freeblks->fb_oldsize, i);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 			    freeblks->fb_previousinum);
 			fs_pendingblocks += btodb(bsize);
 			blocksreleased += btodb(bsize);
 		}
 	}
 	/* Retire the pending-block count accumulated above in one step. */
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks -= fs_pendingblocks;
 	UFS_UNLOCK(ump);
 	/*
 	 * If we still have not finished background cleanup, then check
 	 * to see if the block count needs to be adjusted.
 	 * The correction is the difference between the expected count
 	 * (fb_chkcnt) and what was actually released, and is applied
 	 * only if the inode can be obtained with ffs_vget.
 	 */
 	if (freeblks->fb_chkcnt != blocksreleased &&
 	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
 	    ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
 	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
 		    freeblks->fb_chkcnt - blocksreleased);
 		ip->i_flag |= IN_CHANGE;
 		vput(vp);
 	}
 
 	/*
 	 * A count mismatch and any indir_trunc error are reported only
 	 * in INVARIANTS kernels; otherwise allerror is not acted upon.
 	 */
 #ifdef INVARIANTS
 	if (freeblks->fb_chkcnt != blocksreleased &&
 	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
 		printf("handle_workitem_freeblocks: block count\n");
 	if (allerror)
 		softdep_error("handle_workitem_freeblks", allerror);
 #endif /* INVARIANTS */
 
 	/* The work item is released while holding lk. */
 	ACQUIRE_LOCK(&lk);
 	WORKITEM_FREE(freeblks, D_FREEBLKS);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Release the blocks referenced from the indirect block stored at disk
  * address dbn, on behalf of the freeblks work item. If level is non-zero,
  * the pointers in this block themselves refer to indirect blocks, and
  * recursive calls to indir_trunc are used to cleanse them before they
  * are freed.
  *
  * Arguments:
  *	freeblks - work item supplying the mount point, device vnode and
  *		   the inode number (fb_previousinum) the blocks belonged to
  *	dbn	 - disk address of the indirect block to process
  *	level	 - 0 when the pointers in this block reference data blocks;
  *		   otherwise the depth still to descend
  *	lbn	 - passed through to recursive calls as lbn + (i * lbnadd);
  *		   presumably the first logical block mapped here (it is not
  *		   otherwise used in this routine)
  *	countp	 - running total, incremented by btodb(fs->fs_bsize) for
  *		   every block freed at this level
  *
  * Returns the bread() error if the block could not be read (nothing is
  * freed at this level in that case), otherwise the most recent non-zero
  * error returned by a recursive call, or 0.
  */
 static int
 indir_trunc(freeblks, dbn, level, lbn, countp)
 	struct freeblks *freeblks;
 	ufs2_daddr_t dbn;
 	int level;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t *countp;
 {
 	struct buf *bp;
 	struct fs *fs;
 	struct worklist *wk;
 	struct indirdep *indirdep;
 	struct ufsmount *ump;
 	ufs1_daddr_t *bap1 = 0;
 	ufs2_daddr_t nb, *bap2 = 0;
 	ufs_lbn_t lbnadd;
 	int i, nblocks, ufs1fmt;
 	int error, allerror = 0;
 	int fs_pendingblocks;
 
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	fs_pendingblocks = 0;
 	/*
 	 * lbnadd = NINDIR(fs) ** level: the stride added to lbn for each
 	 * successive pointer slot when recursing.
 	 */
 	lbnadd = 1;
 	for (i = level; i > 0; i--)
 		lbnadd *= NINDIR(fs);
 	/*
 	 * Get buffer of block pointers to be freed. This routine is not
 	 * called until the zero'ed inode has been written, so it is safe
 	 * to free blocks as they are encountered. Because the inode has
 	 * been zero'ed, calls to bmap on these blocks will fail. So, we
 	 * have to use the on-disk address and the block device for the
 	 * filesystem to look them up. If the file was deleted before its
 	 * indirect blocks were all written to disk, the routine that set
 	 * us up (deallocate_dependencies) will have arranged to leave
 	 * a complete copy of the indirect block in memory for our use.
 	 * Otherwise we have to read the blocks in from the disk.
 	 */
 #ifdef notyet
 	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
 	    GB_NOCREAT);
 #else
 	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
 #endif
 	ACQUIRE_LOCK(&lk);
 	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		/*
 		 * An in-memory copy with a dependency attached: it must be
 		 * the sole indirdep, must name this buffer as its saved
 		 * copy and must already be marked GOINGAWAY.
 		 */
 		if (wk->wk_type != D_INDIRDEP ||
 		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
 		    (indirdep->ir_state & GOINGAWAY) == 0)
 			panic("indir_trunc: lost indirdep");
 		WORKLIST_REMOVE(wk);
 		WORKITEM_FREE(indirdep, D_INDIRDEP);
 		if (!LIST_EMPTY(&bp->b_dep))
 			panic("indir_trunc: dangling dep");
 		ump->um_numindirdeps -= 1;
 		FREE_LOCK(&lk);
 	} else {
 #ifdef notyet
 		if (bp)
 			brelse(bp);
 #endif
 		/* No usable in-memory copy; read the block from disk. */
 		FREE_LOCK(&lk);
 		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
 		    NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 	}
 	/*
 	 * Recursively free indirect blocks.
 	 * The pointer array is viewed as 32-bit entries for UFS1 and as
 	 * 64-bit entries otherwise; only one of bap1/bap2 is used.
 	 */
 	if (ump->um_fstype == UFS1) {
 		ufs1fmt = 1;
 		bap1 = (ufs1_daddr_t *)bp->b_data;
 	} else {
 		ufs1fmt = 0;
 		bap2 = (ufs2_daddr_t *)bp->b_data;
 	}
 	nblocks = btodb(fs->fs_bsize);
 	/* Walk the pointer slots from last to first, skipping holes. */
 	for (i = NINDIR(fs) - 1; i >= 0; i--) {
 		if (ufs1fmt)
 			nb = bap1[i];
 		else
 			nb = bap2[i];
 		if (nb == 0)
 			continue;
 		if (level != 0) {
 			/*
 			 * Descend first; an error is remembered but the
 			 * block itself is still freed below.
 			 */
 			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
 			     level - 1, lbn + (i * lbnadd), countp)) != 0)
 				allerror = error;
 		}
 		ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
 		    freeblks->fb_previousinum);
 		fs_pendingblocks += nblocks;
 		*countp += nblocks;
 	}
 	/* Apply the accumulated pending-block adjustment in one step. */
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks -= fs_pendingblocks;
 	UFS_UNLOCK(ump);
 	/* The indirect block's contents are now stale; discard the buffer. */
 	bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	return (allerror);
 }
 
 /*
  * Tear down an allocindir dependency and release its memory.
  * The softdep lock (lk) must be held by the caller; this is asserted.
  *
  * The allocindir is unhooked from its ai_deps list when DEPCOMPLETE is
  * not yet set, from its worklist when ONWORKLIST is set, and always from
  * its ai_next list. Any attached freefrag is either queued on the general
  * worklist (inodedep == NULL) or parked on the inodedep's id_bufwait list.
  */
 static void
 free_allocindir(aip, inodedep)
 	struct allocindir *aip;
 	struct inodedep *inodedep;
 {
 	struct freefrag *ff;
 
 	mtx_assert(&lk, MA_OWNED);
 	if (!(aip->ai_state & DEPCOMPLETE))
 		LIST_REMOVE(aip, ai_deps);
 	if ((aip->ai_state & ONWORKLIST) != 0)
 		WORKLIST_REMOVE(&aip->ai_list);
 	LIST_REMOVE(aip, ai_next);
 	ff = aip->ai_freefrag;
 	if (ff != NULL && inodedep == NULL) {
 		/* No inodedep to wait on: hand it off for processing. */
 		add_to_worklist(&ff->ff_list);
 	} else if (ff != NULL) {
 		WORKLIST_INSERT(&inodedep->id_bufwait, &ff->ff_list);
 	}
 	WORKITEM_FREE(aip, D_ALLOCINDIR);
 }
 
 /*
  * Directory entry addition dependencies.
  * 
  * When adding a new directory entry, the inode (with its incremented link
  * count) must be written to disk before the directory entry's pointer to it.
  * Also, if the inode is newly allocated, the corresponding freemap must be
  * updated (on disk) before the directory entry's pointer. These requirements
  * are met via undo/redo on the directory entry's pointer, which consists
  * simply of the inode number.
  * 
  * As directory entries are added and deleted, the free space within a
  * directory block can become fragmented.  The ufs filesystem will compact
  * a fragmented directory block to make space for a new entry. When this
  * occurs, the offsets of previously added entries change. Any "diradd"
  * dependency structures corresponding to these entries must be updated with
  * the new offsets.
  */
 
 /*
  * This routine is called after the in-memory inode's link
  * count has been incremented, but before the directory entry's
  * pointer to the inode has been set.
  *
  * A diradd is allocated for the new entry and linked both into the
  * directory block's pagedep and into the inodedep of the new inode.
  * When newdirbp is non-NULL (a mkdir), two mkdir items are also created
  * to track the write of the new directory's body and the parent's link
  * count update. When the entry starts a newly allocated direct block
  * fragment, a newdirblk item is attached to the matching allocdirect.
  *
  * Returns 1 only when isnewblk is set, the entry lies beyond the direct
  * blocks (lbn >= NDADDR) and it sits at the start of a block; per the
  * comment in the body, such entries are not tracked and are instead
  * forced to disk (presumably by the caller acting on this return value).
  * Returns 0 in every other case.
  */
 int
 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for directory */
 	off_t diroffset;	/* offset of new entry in directory */
 	ino_t newinum;		/* inode referenced by new directory entry */
 	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
 	int isnewblk;		/* entry is in a newly allocated block */
 {
 	int offset;		/* offset of new entry within directory block */
 	ufs_lbn_t lbn;		/* block in directory containing new entry */
 	struct fs *fs;
 	struct diradd *dap;
 	struct allocdirect *adp;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct newdirblk *newdirblk = 0;
 	struct mkdir *mkdir1, *mkdir2;
 	struct mount *mp;
 
 	/*
 	 * Whiteouts have no dependencies.
 	 * A supplied new-directory buffer is still written (delayed).
 	 */
 	if (newinum == WINO) {
 		if (newdirbp != NULL)
 			bdwrite(newdirbp);
 		return (0);
 	}
 	mp = UFSTOVFS(dp->i_ump);
 	fs = dp->i_fs;
 	lbn = lblkno(fs, diroffset);
 	offset = blkoff(fs, diroffset);
 	/* All allocations are done before the softdep lock is taken. */
 	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
 		M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&dap->da_list, D_DIRADD, mp);
 	dap->da_offset = offset;
 	dap->da_newinum = newinum;
 	dap->da_state = ATTACHED;
 	/*
 	 * Pre-allocate the newdirblk under exactly the conditions in which
 	 * the isnewblk handling below may need it.
 	 */
 	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
 		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
 		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
 		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 	}
 	if (newdirbp == NULL) {
 		dap->da_state |= DEPCOMPLETE;
 		ACQUIRE_LOCK(&lk);
 	} else {
 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 		    M_SOFTDEP_FLAGS);
 		workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 		mkdir1->md_state = MKDIR_BODY;
 		mkdir1->md_diradd = dap;
 		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 		    M_SOFTDEP_FLAGS);
 		workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 		mkdir2->md_state = MKDIR_PARENT;
 		mkdir2->md_diradd = dap;
 		/*
 		 * Dependency on "." and ".." being written to disk.
 		 */
 		mkdir1->md_buf = newdirbp;
 		ACQUIRE_LOCK(&lk);
 		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
 		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
 		/* The lock is dropped around the delayed write. */
 		FREE_LOCK(&lk);
 		bdwrite(newdirbp);
 		/*
 		 * Dependency on link count increase for parent directory
 		 */
 		ACQUIRE_LOCK(&lk);
 		/*
 		 * If the parent has no inodedep, or it is fully complete,
 		 * the MKDIR_PARENT dependency is already satisfied.
 		 */
 		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
 		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 			dap->da_state &= ~MKDIR_PARENT;
 			WORKITEM_FREE(mkdir2, D_MKDIR);
 		} else {
 			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
 			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
 		}
 	}
 	/*
 	 * Link into parent directory pagedep to await its being written.
 	 * A zero return from pagedep_lookup is treated as "newly created",
 	 * in which case the pagedep is attached to the directory buffer.
 	 */
 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	dap->da_pagedep = pagedep;
 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 	    da_pdlist);
 	/*
 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
 	 * is not yet written. If it is written, do the post-inode write
 	 * processing to put it on the id_pendinghd list.
 	 */
 	(void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 		diradd_inode_written(dap, inodedep);
 	else
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	if (isnewblk) {
 		/*
 		 * Directories growing into indirect blocks are rare
 		 * enough and the frequency of new block allocation
 		 * in those cases even more rare, that we choose not
 		 * to bother tracking them. Rather we simply force the
 		 * new directory entry to disk.
 		 */
 		if (lbn >= NDADDR) {
 			FREE_LOCK(&lk);
 			/*
 			 * We only have a new allocation when at the
 			 * beginning of a new block, not when we are
 			 * expanding into an existing block.
 			 */
 			if (blkoff(fs, diroffset) == 0)
 				return (1);
 			return (0);
 		}
 		/*
 		 * We only have a new allocation when at the beginning
 		 * of a new fragment, not when we are expanding into an
 		 * existing fragment. Also, there is nothing to do if we
 		 * are already tracking this block.
 		 */
 		if (fragoff(fs, diroffset) != 0) {
 			/* No newdirblk was allocated on this path. */
 			FREE_LOCK(&lk);
 			return (0);
 		}
 		if ((pagedep->pd_state & NEWBLOCK) != 0) {
 			/* Already tracked: discard the spare newdirblk. */
 			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 			FREE_LOCK(&lk);
 			return (0);
 		}
 		/*
 		 * Find our associated allocdirect and have it track us.
 		 * The most recently queued allocdirect on id_newinoupdt
 		 * must be the one for this logical block.
 		 */
 		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
 			panic("softdep_setup_directory_add: lost inodedep");
 		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
 		if (adp == NULL || adp->ad_lbn != lbn)
 			panic("softdep_setup_directory_add: lost entry");
 		pagedep->pd_state |= NEWBLOCK;
 		newdirblk->db_pagedep = pagedep;
 		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
 	}
 	FREE_LOCK(&lk);
 	return (0);
 }
 
 /*
  * This procedure is called to change the offset of a directory
  * entry when compacting a directory block which must be owned
  * exclusively by the caller. Note that the actual entry movement
  * must be done in this procedure to ensure that no I/O completions
  * occur while the move is in progress.
  */
 void 
 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 	struct inode *dp;	/* inode for directory */
 	caddr_t base;		/* address of dp->i_offset */
 	caddr_t oldloc;		/* address of old directory location */
 	caddr_t newloc;		/* address of new directory location */
 	int entrysize;		/* size of directory entry */
 {
 	int offset, oldoffset, newoffset;
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	ufs_lbn_t lbn;
 
 	ACQUIRE_LOCK(&lk);
 	lbn = lblkno(dp->i_fs, dp->i_offset);
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
 		goto done;
 	oldoffset = offset + (oldloc - base);
 	newoffset = offset + (newloc - base);
 
 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
 		if (dap->da_offset != oldoffset)
 			continue;
 		dap->da_offset = newoffset;
 		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
 			break;
 		LIST_REMOVE(dap, da_pdlist);
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
 		    dap, da_pdlist);
 		break;
 	}
 	if (dap == NULL) {
 
 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
 			if (dap->da_offset == oldoffset) {
 				dap->da_offset = newoffset;
 				break;
 			}
 		}
 	}
 done:
 	bcopy(oldloc, newloc, entrysize);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Free a diradd dependency structure. This routine must be called
  * with splbio interrupts blocked.
  */
 static void
 free_diradd(dap)
 	struct diradd *dap;
 {
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct mkdir *mkdir, *nextmd;
 
 	mtx_assert(&lk, MA_OWNED);
 	WORKLIST_REMOVE(&dap->da_list);
 	LIST_REMOVE(dap, da_pdlist);
 	if ((dap->da_state & DIRCHG) == 0) {
 		pagedep = dap->da_pagedep;
 	} else {
 		dirrem = dap->da_previous;
 		pagedep = dirrem->dm_pagedep;
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
 	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 	    0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
 			if (mkdir->md_diradd != dap)
 				continue;
 			dap->da_state &= ~mkdir->md_state;
 			WORKLIST_REMOVE(&mkdir->md_list);
 			LIST_REMOVE(mkdir, md_mkdirs);
 			WORKITEM_FREE(mkdir, D_MKDIR);
 		}
 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 			panic("free_diradd: unfound ref");
 	}
 	WORKITEM_FREE(dap, D_DIRADD);
 }
 
 /*
  * Directory entry removal dependencies.
  * 
  * When removing a directory entry, the entry's inode pointer must be
  * zero'ed on disk before the corresponding inode's link count is decremented
  * (possibly freeing the inode for re-use). This dependency is handled by
  * updating the directory entry but delaying the inode count reduction until
  * after the directory block has been written to disk. After this point, the
  * inode count can be decremented whenever it is convenient.
  */
 
 /*
  * This routine should be called immediately after removing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will do this task when it is safe.
  */
 void 
 softdep_setup_remove(bp, dp, ip, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	struct dirrem *dirrem, *prevdirrem;
 
 	/*
 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to a zeroed entry until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set then we have deleted an entry that never made it to
 	 * disk. If the entry we deleted resulted from a name change,
 	 * then the old name still resides on disk. We cannot delete
 	 * its inode (returned to us in prevdirrem) until the zeroed
 	 * directory entry gets to disk. The new inode has never been
 	 * referenced on the disk, so can be deleted immediately.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 		    dm_next);
 		FREE_LOCK(&lk);
 	} else {
 		if (prevdirrem != NULL)
 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 			    prevdirrem, dm_next);
 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 		FREE_LOCK(&lk);
 		handle_workitem_remove(dirrem, NULL);
 	}
 }
 
 /*
  * Allocate a new dirrem if appropriate and return it along with
  * its associated pagedep. Called without a lock, returns with lock.
  *
  * On return *prevdirremp is NULL unless the removed entry had a pending
  * diradd that recorded a name change (DIRCHG), in which case it is the
  * dirrem describing the inode still referenced on disk. The returned
  * dirrem has COMPLETE set when a pending diradd for the same entry was
  * found (and freed), i.e. the entry never reached the disk.
  *
  * Panics if ip is NULL, or if a matching diradd is not ATTACHED or names
  * a different inode.
  */
 static long num_dirrem;		/* number of dirrem allocated */
 static struct dirrem *
 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 	struct dirrem **prevdirremp; /* previously referenced inode, if any */
 {
 	int offset;
 	ufs_lbn_t lbn;
 	struct diradd *dap;
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 
 	/*
 	 * Whiteouts have no deletion dependencies.
 	 */
 	if (ip == NULL)
 		panic("newdirrem: whiteout");
 	/*
 	 * If we are over our limit, try to improve the situation.
 	 * Limiting the number of dirrem structures will also limit
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (num_dirrem > max_softdeps / 2)
 		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
 	num_dirrem += 1;
 	/* The lock is dropped across the allocation. */
 	FREE_LOCK(&lk);
 	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
 		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
 	dirrem->dm_state = isrmdir ? RMDIR : 0;
 	dirrem->dm_oldinum = ip->i_number;
 	*prevdirremp = NULL;
 
 	ACQUIRE_LOCK(&lk);
 	lbn = lblkno(dp->i_fs, dp->i_offset);
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	dirrem->dm_pagedep = pagedep;
 	/*
 	 * Check for a diradd dependency for the same directory entry.
 	 * If present, then both dependencies become obsolete and can
 	 * be de-allocated. Check for an entry on both the pd_diraddhd
 	 * list and the pd_pendinghd list.
 	 */
 
 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 		if (dap->da_offset == offset)
 			break;
 	if (dap == NULL) {
 
 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 			if (dap->da_offset == offset)
 				break;
 		if (dap == NULL)
 			return (dirrem);
 	}
 	/*
 	 * Must be ATTACHED at this point.
 	 */
 	if ((dap->da_state & ATTACHED) == 0)
 		panic("newdirrem: not ATTACHED");
 	/*
 	 * Inode numbers are printed through uintmax_t with %ju so the
 	 * conversion matches the argument type regardless of ino_t's width
 	 * and never shows a large inode number as negative.
 	 */
 	if (dap->da_newinum != ip->i_number)
 		panic("newdirrem: inum %ju should be %ju",
 		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
 	/*
 	 * If we are deleting a changed name that never made it to disk,
 	 * then return the dirrem describing the previous inode (which
 	 * represents the inode currently referenced from this entry on disk).
 	 */
 	if ((dap->da_state & DIRCHG) != 0) {
 		*prevdirremp = dap->da_previous;
 		dap->da_state &= ~DIRCHG;
 		dap->da_pagedep = pagedep;
 	}
 	/*
 	 * We are deleting an entry that never made it to disk.
 	 * Mark it COMPLETE so we can delete its inode immediately.
 	 */
 	dirrem->dm_state |= COMPLETE;
 	free_diradd(dap);
 	return (dirrem);
 }
 
 /*
  * Directory entry change dependencies.
  * 
  * Changing an existing directory entry requires that an add operation
  * be completed first followed by a deletion. The semantics for the addition
  * are identical to the description of adding a new entry above except
  * that the rollback is to the old inode number rather than zero. Once
  * the addition dependency is completed, the removal is done as described
  * in the removal routine above.
  */
 
 /*
  * This routine should be called immediately after changing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will perform this task when it is safe.
  *
  * A dirrem is always created (via newdirrem) for the inode that the
  * entry used to reference. Unless the new inode number is WINO, a diradd
  * is also created for the new reference and linked to the pagedep and
  * to the new inode's inodedep.
  */
 void 
 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	ino_t newinum;		/* new inode number for changed entry */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	int offset;
 	struct diradd *dap = NULL;
 	struct dirrem *dirrem, *prevdirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct mount *mp;
 
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	mp = UFSTOVFS(dp->i_ump);
 
 	/*
 	 * Whiteouts do not need diradd dependencies.
 	 * For a whiteout dap stays NULL; the whiteout branch below returns
 	 * before dap is ever dereferenced.
 	 */
 	if (newinum != WINO) {
 		MALLOC(dap, struct diradd *, sizeof(struct diradd),
 		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
 		workitem_alloc(&dap->da_list, D_DIRADD, mp);
 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 		dap->da_offset = offset;
 		dap->da_newinum = newinum;
 	}
 
 	/*
 	 * Allocate a new dirrem and ACQUIRE_LOCK.
 	 * The pagedep is captured now, before dm_dirinum is assigned below.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 	pagedep = dirrem->dm_pagedep;
 	/*
 	 * The possible values for isrmdir:
 	 *	0 - non-directory file rename
 	 *	1 - directory rename within same directory
 	 *   inum - directory rename to new directory of given inode number
 	 * When renaming to a new directory, we are both deleting and
 	 * creating a new directory entry, so the link count on the new
 	 * directory should not change. Thus we do not need the followup
 	 * dirrem which is usually done in handle_workitem_remove. We set
 	 * the DIRCHG flag to tell handle_workitem_remove to skip the 
 	 * followup dirrem.
 	 */
 	if (isrmdir > 1)
 		dirrem->dm_state |= DIRCHG;
 
 	/*
 	 * Whiteouts have no additional dependencies,
 	 * so just put the dirrem on the correct list.
 	 */
 	if (newinum == WINO) {
 		if ((dirrem->dm_state & COMPLETE) == 0) {
 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 			    dm_next);
 		} else {
 			dirrem->dm_dirinum = pagedep->pd_ino;
 			add_to_worklist(&dirrem->dm_list);
 		}
 		FREE_LOCK(&lk);
 		return;
 	}
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to the previous inode until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set, then we have deleted an entry that never made it to disk.
 	 * If the entry we deleted resulted from a name change, then the old
 	 * inode reference still resides on disk. Any rollback that we do
 	 * needs to be to that old inode (returned to us in prevdirrem). If
 	 * the entry we deleted resulted from a create, then there is
 	 * no entry on the disk, so we want to roll back to zero rather
 	 * than the uncommitted inode. In either of the COMPLETE cases we
 	 * want to immediately free the unwritten and unreferenced inode.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		dap->da_previous = dirrem;
 	} else {
 		if (prevdirrem != NULL) {
 			dap->da_previous = prevdirrem;
 		} else {
 			/* Nothing on disk to roll back to: plain add. */
 			dap->da_state &= ~DIRCHG;
 			dap->da_pagedep = pagedep;
 		}
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
 	/*
 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
 	 * is not yet written. If it is written, do the post-inode write
 	 * processing to put it on the id_pendinghd list.
 	 * A zero return from inodedep_lookup (taken here to mean that no
 	 * inodedep existed before this call) is treated the same as an
 	 * inode that is already fully written.
 	 */
 	if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		dap->da_state |= COMPLETE;
 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 	} else {
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 		    dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Invoked whenever an inode's link count changes.
  * An inodedep is looked up (and created if absent) and made to record
  * the difference between the inode's link count and its effective link
  * count, so that new references to the inode cannot be committed to
  * disk before the updated inode has been written.
  * Panics if the link count is below the effective link count.
  */
 void
 softdep_change_linkcnt(ip)
 	struct inode *ip;	/* the inode with the increased link count */
 {
 	struct inodedep *inodedep;
 	struct mount *mp;
 
 	mp = UFSTOVFS(ip->i_ump);
 	ACQUIRE_LOCK(&lk);
 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if (ip->i_effnlink > ip->i_nlink)
 		panic("softdep_change_linkcnt: bad delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	FREE_LOCK(&lk);
 }
 
 /*
  * Invoked once both the effective link count and the reference count of
  * an inode have dropped to zero: no name in the filesystem refers to the
  * file and nothing has it open. Its space is accounted as pending here
  * and will be freed as soon as the remaining soft dependencies clear.
  */
 void
 softdep_releasefile(ip)
 	struct inode *ip;	/* inode with the zero effective link count */
 {
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	struct fs *fs;
 	int extblocks;
 
 	if (ip->i_effnlink > 0)
 		panic("softdep_releasefile: file still referenced");
 	/*
 	 * This can be reached several times while the on-disk link count
 	 * drains to zero; the space must be accounted for only once.
 	 */
 	if ((ip->i_flag & IN_SPACECOUNTED) != 0)
 		return;
 	/*
 	 * A snapshot has to be deactivated first, otherwise copy-on-write
 	 * activity could add blocks, and cleanup could remove blocks,
 	 * after they have been counted below.
 	 */
 	if (ip->i_flags & SF_SNAPSHOT)
 		ffs_snapremove(ITOV(ip));
 	ump = ip->i_ump;
 	/*
 	 * When an inodedep exists (an nlinkdelta is being tracked), note
 	 * on it as well that the space has been accounted for.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) != 0)
 		inodedep->id_state |= SPACECOUNTED;
 	FREE_LOCK(&lk);
 	fs = ip->i_fs;
 	/* Extended attribute blocks (UFS2 only) are left out of the count. */
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	else
 		extblocks = 0;
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
 	fs->fs_pendinginodes += 1;
 	UFS_UNLOCK(ump);
 	ip->i_flag |= IN_SPACECOUNTED;
 }
 
 /*
  * This workitem decrements the inode's link count.
  * If the link count reaches zero, the file is removed.
  *
  * Arguments:
  *	dirrem - the removal work item; dm_oldinum names the inode whose
  *		 link count is to be dropped
  *	xp     - if non-NULL, the vnode to operate on; otherwise the vnode
  *		 is obtained with ffs_vget() on dm_oldinum. In either case
  *		 the vnode is released with vput() before returning.
  *
  * For a plain file the link count drops by one and the dirrem is freed.
  * For a directory (RMDIR) the count drops by two, the directory is
  * truncated to zero length and, unless DIRCHG is set, the same dirrem
  * is re-targeted at the parent directory to drop the ".." reference.
  */
 static void 
 handle_workitem_remove(dirrem, xp)
 	struct dirrem *dirrem;
 	struct vnode *xp;
 {
 	struct thread *td = curthread;
 	struct inodedep *inodedep;
 	struct vnode *vp;
 	struct inode *ip;
 	ino_t oldinum;
 	int error;
 
 	/*
 	 * NOTE(review): on ffs_vget failure the error is reported and the
 	 * routine returns; the dirrem is neither freed nor requeued here.
 	 */
 	if ((vp = xp) == NULL &&
 	    (error = ffs_vget(dirrem->dm_list.wk_mp,
 	    dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
 		softdep_error("handle_workitem_remove: vget", error);
 		return;
 	}
 	ip = VTOI(vp);
 	ACQUIRE_LOCK(&lk);
 	if ((inodedep_lookup(dirrem->dm_list.wk_mp,
 	    dirrem->dm_oldinum, 0, &inodedep)) == 0)
 		panic("handle_workitem_remove: lost inodedep");
 	/*
 	 * Normal file deletion.
 	 */
 	if ((dirrem->dm_state & RMDIR) == 0) {
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (ip->i_nlink < ip->i_effnlink)
 			panic("handle_workitem_remove: bad file delta");
 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 		num_dirrem -= 1;
 		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(&lk);
 		vput(vp);
 		return;
 	}
 	/*
 	 * Directory deletion. Decrement reference count for both the
 	 * just deleted parent directory entry and the reference for ".".
 	 * Next truncate the directory to length zero. When the
 	 * truncation completes, arrange to have the reference count on
 	 * the parent decremented to account for the loss of "..".
 	 */
 	ip->i_nlink -= 2;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("handle_workitem_remove: bad dir delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	/* The softdep lock is not held across the truncation. */
 	FREE_LOCK(&lk);
 	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
 		softdep_error("handle_workitem_remove: truncate", error);
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * Rename a directory to a new parent. Since, we are both deleting
 	 * and creating a new directory entry, the link count on the new
 	 * directory should not change. Thus we skip the followup dirrem.
 	 */
 	if (dirrem->dm_state & DIRCHG) {
 		num_dirrem -= 1;
 		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(&lk);
 		vput(vp);
 		return;
 	}
 	/*
 	 * If the inodedep does not exist, then the zero'ed inode has
 	 * been written to disk. If the allocated inode has never been
 	 * written to disk, then the on-disk inode is zero'ed. In either
 	 * case we can remove the file immediately.
 	 *
 	 * The dirrem is recycled for the follow-up pass: its state is
 	 * cleared (so it is handled as a plain link-count decrement) and
 	 * its target becomes the parent directory (dm_dirinum).
 	 */
 	dirrem->dm_state = 0;
 	oldinum = dirrem->dm_oldinum;
 	dirrem->dm_oldinum = dirrem->dm_dirinum;
 	if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
 	    0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
 		/*
 		 * With a caller-supplied vnode the follow-up is queued on
 		 * the worklist; otherwise it is processed right here by
 		 * recursion, after the current vnode has been released.
 		 */
 		if (xp != NULL)
 			add_to_worklist(&dirrem->dm_list);
 		FREE_LOCK(&lk);
 		vput(vp);
 		if (xp == NULL)
 			handle_workitem_remove(dirrem, NULL);
 		return;
 	}
 	/*
 	 * Otherwise the follow-up waits on the inodedep's id_inowait list,
 	 * and an update of the inode is started.
 	 */
 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 	FREE_LOCK(&lk);
 	ip->i_flag |= IN_CHANGE;
 	ffs_update(vp, 0);
 	vput(vp);
 }
 
 /*
  * Inode de-allocation dependencies.
  * 
  * When an inode's link count is reduced to zero, it can be de-allocated. We
  * found it convenient to postpone de-allocation until after the inode is
  * written to disk with its new link count (zero).  At this point, all of the
  * on-disk inode's block pointers are nullified and, with careful dependency
  * list ordering, all dependencies related to the inode will be satisfied and
  * the corresponding dependency structures de-allocated.  So, if/when the
  * inode is reused, there will be no mixing of old dependencies with new
  * ones.  This artificial dependency is set up by the block de-allocation
  * procedure above (softdep_setup_freeblocks) and completed by the
  * following procedure.
  */
 static void 
 handle_workitem_freefile(freefile)
 	struct freefile *freefile;
 {
 	struct fs *fs;
 	struct inodedep *idp;
 	struct ufsmount *ump;
 	int error;
 
 	ump = VFSTOUFS(freefile->fx_list.wk_mp);
 	fs = ump->um_fs;
 #ifdef DEBUG
 	ACQUIRE_LOCK(&lk);
 	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 	FREE_LOCK(&lk);
 	if (error)
 		panic("handle_workitem_freefile: inodedep survived");
 #endif
 	UFS_LOCK(ump);
 	fs->fs_pendinginodes -= 1;
 	UFS_UNLOCK(ump);
 	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
 	    freefile->fx_oldinum, freefile->fx_mode)) != 0)
 		softdep_error("handle_workitem_freefile", error);
 	ACQUIRE_LOCK(&lk);
 	WORKITEM_FREE(freefile, D_FREEFILE);
 	FREE_LOCK(&lk);
 }
 
 
 /*
  * Helper for marker-based list traversal: detach the marker element
  * from the work list it sits on and hand back the element that
  * followed it (NULL if the marker was last).
  */
 static __inline struct worklist *
 markernext(struct worklist *marker)
 {
 	struct worklist *successor = LIST_NEXT(marker, wk_list);
 
 	LIST_REMOVE(marker, wk_list);
 	return (successor);
 }
 
 /*
  * Disk writes.
  * 
  * The dependency structures constructed above are most actively used when file
  * system blocks are written to disk.  No constraints are placed on when a
  * block can be written, but unsatisfied update dependencies are made safe by
  * modifying (or replacing) the source memory for the duration of the disk
  * write.  When the disk write completes, the memory block is again brought
  * up-to-date.
  *
  * In-core inode structure reclamation.
  * 
  * Because there are a finite number of "in-core" inode structures, they are
  * reused regularly.  By transferring all inode-related dependencies to the
  * in-memory inode block and indexing them separately (via "inodedep"s), we
  * can allow "in-core" inode structures to be reused at any time and avoid
  * any increase in contention.
  *
  * Called just before entering the device driver to initiate a new disk I/O.
  * The buffer must be locked, thus, no I/O completion operations can occur
  * while we are manipulating its associated dependencies.
  */
 static void 
 softdep_disk_io_initiation(bp)
 	struct buf *bp;		/* structure describing disk write to occur */
 {
 	struct worklist *wk;
 	struct worklist marker;
 	struct indirdep *indirdep;
 	struct inodedep *inodedep;
 
 	/*
 	 * We only care about write operations. There should never
 	 * be dependencies for reads.
 	 */
 	if (bp->b_iocmd != BIO_WRITE)
 		panic("softdep_disk_io_initiation: not write");
 
 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
 	PHOLD(curproc);			/* Don't swap out kernel stack */
 
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * Do any necessary pre-I/O processing.
 	 */
 	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 	     wk = markernext(&marker)) {
 		LIST_INSERT_AFTER(wk, &marker, wk_list);
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
 			continue;
 
 		case D_INODEDEP:
 			inodedep = WK_INODEDEP(wk);
 			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
 				initiate_write_inodeblock_ufs1(inodedep, bp);
 			else
 				initiate_write_inodeblock_ufs2(inodedep, bp);
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("disk_io_initiation: indirdep gone");
 			/*
 			 * If there are no remaining dependencies, this
 			 * will be writing the real pointers, so the
 			 * dependency can be freed.
 			 */
 			if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
 				struct buf *bp;
 
 				bp = indirdep->ir_savebp;
 				bp->b_flags |= B_INVAL | B_NOCACHE;
 				/* inline expand WORKLIST_REMOVE(wk); */
 				wk->wk_state &= ~ONWORKLIST;
 				LIST_REMOVE(wk, wk_list);
 				WORKITEM_FREE(indirdep, D_INDIRDEP);
 				FREE_LOCK(&lk);
 				brelse(bp);
 				ACQUIRE_LOCK(&lk);
 				continue;
 			}
 			/*
 			 * Replace up-to-date version with safe version.
 			 */
 			FREE_LOCK(&lk);
 			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
 			    M_INDIRDEP, M_SOFTDEP_FLAGS);
 			ACQUIRE_LOCK(&lk);
 			indirdep->ir_state &= ~ATTACHED;
 			indirdep->ir_state |= UNDONE;
 			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
 			    bp->b_bcount);
 			continue;
 
 		case D_MKDIR:
 		case D_BMSAFEMAP:
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			continue;
 
 		default:
 			panic("handle_disk_io_initiation: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	FREE_LOCK(&lk);
 	PRELE(curproc);			/* Allow swapout of kernel stack */
 }
 
 /*
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in a directory. The buffer must be locked,
  * thus, no I/O completion operations can occur while we are
  * manipulating its associated dependencies.
  */
 static void
 initiate_write_filepage(pagedep, bp)
 	struct pagedep *pagedep;
 	struct buf *bp;
 {
 	struct diradd *dap;
 	struct direct *ep;
 	int i;
 
 	if (pagedep->pd_state & IOSTARTED) {
 		/*
 		 * This can only happen if there is a driver that does not
 		 * understand chaining. Here biodone will reissue the call
 		 * to strategy for the incomplete buffers.
 		 */
 		printf("initiate_write_filepage: already started\n");
 		return;
 	}
 	pagedep->pd_state |= IOSTARTED;
 	for (i = 0; i < DAHASHSZ; i++) {
 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			if (ep->d_ino != dap->da_newinum)
 				panic("%s: dir inum %d != new %d",
 				    "initiate_write_filepage",
 				    ep->d_ino, dap->da_newinum);
 			if (dap->da_state & DIRCHG)
 				ep->d_ino = dap->da_previous->dm_oldinum;
 			else
 				ep->d_ino = 0;
 			dap->da_state &= ~ATTACHED;
 			dap->da_state |= UNDONE;
 		}
 	}
 }
 
 /*
  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found below.
  *
  * Called from softdep_disk_io_initiation() to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  *
  * The routine marks the inodedep IOSTARTED and then edits the copy of
  * the dinode inside the buffer:
  *  - if DEPCOMPLETE is not set, the whole dinode is saved in
  *    id_savedino1 and the buffer copy is zeroed except for di_gen;
  *  - otherwise di_size is remembered in id_savedsize and every
  *    allocdirect on id_inoupdt is marked UNDONE with its block pointer
  *    rolled back (direct pointers to ad_oldblkno, indirect pointers
  *    to zero), adjusting di_size to match.
  */
 static void 
 initiate_write_inodeblock_ufs1(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs1_dinode *dp;
 	struct ufs1_dinode *sip;
 	struct fs *fs;
 	ufs_lbn_t i;
 #ifdef INVARIANTS
 	ufs_lbn_t prevlbn = 0;
 #endif
 	int deplist;	/* bitmask of lbns seen; only filled in under INVARIANTS */
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs1: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	/* Locate this inode's dinode within the inode block. */
 	dp = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino1 != NULL)
 			panic("initiate_write_inodeblock_ufs1: I/O underway");
 		/* The lock is not held across the allocation. */
 		FREE_LOCK(&lk);
 		MALLOC(sip, struct ufs1_dinode *,
 		    sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(&lk);
 		inodedep->id_savedino1 = sip;
 		*inodedep->id_savedino1 = *dp;
 		/* Write an empty dinode, keeping only the generation number. */
 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
 		dp->di_gen = inodedep->id_savedino1->di_gen;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = 0;
 	if (TAILQ_EMPTY(&inodedep->id_inoupdt))
 		return;
 	/*
 	 * Set the dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		/* The list must be in strictly increasing lbn order. */
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		/* The in-memory dinode must currently hold the new pointer. */
 		if (adp->ad_lbn < NDADDR &&
 		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %d != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    dp->di_db[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_lbn >= NDADDR &&
 		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 			panic("%s: indirect pointer #%jd mismatch %d != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn - NDADDR,
 			    dp->di_ib[adp->ad_lbn - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		/* Indirect-pointer dependencies are handled at the end. */
 		if (adp->ad_lbn >= NDADDR)
 			break;
 		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		/*
 		 * Rolled back to a fragment: the file ends here on disk,
 		 * so clear every later direct and indirect pointer.
 		 */
 		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep1");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < NIADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep2");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan backwards for the last non-zero direct pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 }
 		
 /*
  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found above.
  *
  * Called from softdep_disk_io_initiation() to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  *
  * The routine marks the inodedep IOSTARTED and then edits the copy of
  * the dinode inside the buffer:
  *  - if DEPCOMPLETE is not set, the whole dinode is saved in
  *    id_savedino2 and the buffer copy is zeroed except for di_gen;
  *  - otherwise di_size and di_extsize are remembered in id_savedsize
  *    and id_savedextsize, and the allocdirects on id_extupdt (ext data)
  *    and then id_inoupdt (file data) are marked UNDONE with their
  *    block pointers rolled back, adjusting the sizes to match.
  */
 static void 
 initiate_write_inodeblock_ufs2(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs2_dinode *dp;
 	struct ufs2_dinode *sip;
 	struct fs *fs;
 	ufs_lbn_t i;
 #ifdef INVARIANTS
 	ufs_lbn_t prevlbn = 0;
 #endif
 	int deplist;	/* bitmask of lbns seen; only filled in under INVARIANTS */
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs2: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	/* Locate this inode's dinode within the inode block. */
 	dp = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino2 != NULL)
 			panic("initiate_write_inodeblock_ufs2: I/O underway");
 		/* The lock is not held across the allocation. */
 		FREE_LOCK(&lk);
 		MALLOC(sip, struct ufs2_dinode *,
 		    sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(&lk);
 		inodedep->id_savedino2 = sip;
 		*inodedep->id_savedino2 = *dp;
 		/* Write an empty dinode, keeping only the generation number. */
 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 		dp->di_gen = inodedep->id_savedino2->di_gen;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = dp->di_extsize;
 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 	    TAILQ_EMPTY(&inodedep->id_extupdt))
 		return;
 	/*
 	 * Set the ext data dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		/* The list must be in strictly increasing lbn order. */
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		/* The in-memory dinode must currently hold the new pointer. */
 		if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    (intmax_t)dp->di_extb[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the ext
 	 * data which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		/*
 		 * Rolled back to a fragment: the ext data ends here on
 		 * disk, so clear every later ext block pointer.
 		 */
 		dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep1");
 #endif /* INVARIANTS */
 			dp->di_extb[i] = 0;
 		}
 		/*
 		 * Unlike the file data loop below, do not return here:
 		 * the file data still has to be processed.  Clearing
 		 * lastadp skips the ext size adjustment that follows.
 		 */
 		lastadp = NULL;
 		break;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the ext
 	 * data, roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan backwards for the last non-zero ext block pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_extb[i] != 0)
 				break;
 		dp->di_extsize = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * Set the file data dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		/* The list must be in strictly increasing lbn order. */
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		/* The in-memory dinode must currently hold the new pointer. */
 		if (adp->ad_lbn < NDADDR &&
 		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    (intmax_t)dp->di_db[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_lbn >= NDADDR &&
 		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 			panic("%s indirect pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock:",
 			    (intmax_t)adp->ad_lbn - NDADDR,
 			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		/* Indirect-pointer dependencies are handled at the end. */
 		if (adp->ad_lbn >= NDADDR)
 			break;
 		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		/*
 		 * Rolled back to a fragment: the file ends here on disk,
 		 * so clear every later direct and indirect pointer.
 		 */
 		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep2");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < NIADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep3");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan backwards for the last non-zero direct pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 }
 
 /*
  * This routine is called during the completion interrupt
  * service routine for a disk write (from the procedure called
  * by the device driver to inform the filesystem caches of
  * a request completion).  It should be called early in this
  * procedure, before the block is made available to other
  * processes or other routines are called.
  *
  * Every work item is taken off bp->b_dep and handled according to
  * its type.  Items that still have work outstanding are gathered on
  * a local list and put back on bp->b_dep before returning.
  */
 static void 
 softdep_disk_write_complete(bp)
 	struct buf *bp;		/* describes the completed disk write */
 {
 	struct worklist *wk;
 	struct worklist *owk;		/* item handled on the previous pass */
 	struct workhead reattach;	/* items to put back on bp->b_dep */
 	struct newblk *newblk;
 	struct allocindir *aip;
 	struct allocdirect *adp;
 	struct indirdep *indirdep;
 	struct inodedep *inodedep;
 	struct bmsafemap *bmsafemap;
 
 	/*
 	 * If an error occurred while doing the write, then the data
 	 * has not hit the disk and the dependencies cannot be unrolled.
 	 */
 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
 		return;
 	LIST_INIT(&reattach);
 	/*
 	 * This lock must not be released anywhere in this code segment.
 	 */
 	ACQUIRE_LOCK(&lk);
 	owk = NULL;
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		/* Seeing the same item twice in a row means the list is bad. */
 		if (wk == owk)
 			panic("duplicate worklist: %p\n", wk);
 		owk = wk;
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			/* Non-zero return: keep the pagedep on the buffer. */
 			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_INODEDEP:
 			/* Non-zero return: keep the inodedep on the buffer. */
 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_BMSAFEMAP:
 			/*
 			 * Mark everything hanging off this bmsafemap as
 			 * DEPCOMPLETE, unhook it, and then free the
 			 * bmsafemap itself.
 			 */
 			bmsafemap = WK_BMSAFEMAP(wk);
 			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
 				newblk->nb_state |= DEPCOMPLETE;
 				newblk->nb_bmsafemap = NULL;
 				LIST_REMOVE(newblk, nb_deps);
 			}
 			while ((adp =
 			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
 				adp->ad_state |= DEPCOMPLETE;
 				adp->ad_buf = NULL;
 				LIST_REMOVE(adp, ad_deps);
 				handle_allocdirect_partdone(adp);
 			}
 			while ((aip =
 			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
 				aip->ai_state |= DEPCOMPLETE;
 				aip->ai_buf = NULL;
 				LIST_REMOVE(aip, ai_deps);
 				handle_allocindir_partdone(aip);
 			}
 			while ((inodedep =
 			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
 				inodedep->id_state |= DEPCOMPLETE;
 				LIST_REMOVE(inodedep, id_deps);
 				inodedep->id_buf = NULL;
 			}
 			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 			continue;
 
 		case D_ALLOCDIRECT:
 			adp = WK_ALLOCDIRECT(wk);
 			adp->ad_state |= COMPLETE;
 			handle_allocdirect_partdone(adp);
 			continue;
 
 		case D_ALLOCINDIR:
 			aip = WK_ALLOCINDIR(wk);
 			aip->ai_state |= COMPLETE;
 			handle_allocindir_partdone(aip);
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("disk_write_complete: indirdep gone");
 			/*
 			 * Copy the contents saved in ir_saveddata back
 			 * into the buffer and release the saved copy.
 			 */
 			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 			FREE(indirdep->ir_saveddata, M_INDIRDEP);
 			indirdep->ir_saveddata = 0;
 			indirdep->ir_state &= ~UNDONE;
 			indirdep->ir_state |= ATTACHED;
 			/*
 			 * Finish the allocindirs queued on ir_donehd.  Each
 			 * call must take its entry off the head of the
 			 * list or this loop would never terminate.
 			 */
 			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
 				handle_allocindir_partdone(aip);
 				if (aip == LIST_FIRST(&indirdep->ir_donehd))
 					panic("disk_write_complete: not gone");
 			}
 			/* The indirdep stays on the buffer, which is redirtied. */
 			WORKLIST_INSERT(&reattach, wk);
 			if ((bp->b_flags & B_DELWRI) == 0)
 				stat_indir_blk_ptrs++;
 			bdirty(bp);
 			continue;
 
 		default:
 			panic("handle_disk_write_complete: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	/*
 	 * Reattach any requests that must be redone.
 	 */
 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&bp->b_dep, wk);
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called from within softdep_disk_write_complete above. Note that
  * this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  */
 static void 
 handle_allocdirect_partdone(adp)
 	struct allocdirect *adp;	/* the completed allocdirect */
 {
 	struct allocdirectlst *listhead;
 	struct allocdirect *listadp;
 	struct inodedep *inodedep;
 	long bsize, delay;
 
 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	if (adp->ad_buf != NULL)
 		panic("handle_allocdirect_partdone: dangling dep");
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem. Thus, we cannot free any
 	 * allocdirects after one whose ad_oldblkno claims a fragment as
 	 * these blocks must be rolled back to zero before writing the inode.
 	 * We check the currently active set of allocdirects in id_inoupdt
 	 * or id_extupdt as appropriate.
 	 */
 	inodedep = adp->ad_inodedep;
 	bsize = inodedep->id_fs->fs_bsize;
 	if (adp->ad_state & EXTDATA)
 		listhead = &inodedep->id_extupdt;
 	else
 		listhead = &inodedep->id_inoupdt;
 	TAILQ_FOREACH(listadp, listhead, ad_next) {
 		/* found our block */
 		if (listadp == adp)
 			break;
 		/* continue if ad_oldlbn is not a fragment */
 		if (listadp->ad_oldsize == 0 ||
 		    listadp->ad_oldsize == bsize)
 			continue;
 		/* hit a fragment */
 		return;
 	}
 	/*
 	 * If we have reached the end of the current list without
 	 * finding the just finished dependency, then it must be
 	 * on the future dependency list. Future dependencies cannot
 	 * be freed until they are moved to the current list.
 	 */
 	if (listadp == NULL) {
 #ifdef DEBUG
 		if (adp->ad_state & EXTDATA)
 			listhead = &inodedep->id_newextupdt;
 		else
 			listhead = &inodedep->id_newinoupdt;
 		TAILQ_FOREACH(listadp, listhead, ad_next)
 			/* found our block */
 			if (listadp == adp)
 				break;
 		if (listadp == NULL)
 			panic("handle_allocdirect_partdone: lost dep");
 #endif /* DEBUG */
 		return;
 	}
 	/*
 	 * If we have found the just finished dependency, then free
 	 * it along with anything that follows it that is complete.
 	 * If the inode still has a bitmap dependency, then it has
 	 * never been written to disk, hence the on-disk inode cannot
 	 * reference the old fragment so we can free it without delay.
 	 */
 	delay = (inodedep->id_state & DEPCOMPLETE);
 	for (; adp; adp = listadp) {
 		listadp = TAILQ_NEXT(adp, ad_next);
 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 			return;
 		free_allocdirect(listhead, adp, delay);
 	}
 }
 
 /*
  * Called from within softdep_disk_write_complete above. Note that
  * this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  */
 static void
 handle_allocindir_partdone(aip)
 	struct allocindir *aip;		/* the completed allocindir */
 {
 	struct indirdep *indirdep;
 
 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	if (aip->ai_buf != NULL)
 		panic("handle_allocindir_partdone: dangling dependency");
 	indirdep = aip->ai_indirdep;
 	if (indirdep->ir_state & UNDONE) {
 		LIST_REMOVE(aip, ai_next);
 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 		return;
 	}
 	if (indirdep->ir_state & UFS1FMT)
 		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	else
 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	LIST_REMOVE(aip, ai_next);
 	if (aip->ai_freefrag != NULL)
 		add_to_worklist(&aip->ai_freefrag->ff_list);
 	WORKITEM_FREE(aip, D_ALLOCINDIR);
 }
 
 /*
  * Called from within softdep_disk_write_complete above to restore
  * in-memory inode block contents to their most up-to-date state. Note
  * that this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  *
  * Returns non-zero when the inodedep must remain attached to the
  * buffer (the caller then puts it back on the buffer's dependency
  * list), and zero otherwise.
  */
 static int 
 handle_written_inodeblock(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;		/* buffer containing the inode block */
 {
 	struct worklist *wk, *filefree;
 	struct allocdirect *adp, *nextadp;
 	struct ufs1_dinode *dp1 = NULL;
 	struct ufs2_dinode *dp2 = NULL;
 	int hadchanges, fstype;
 
 	if ((inodedep->id_state & IOSTARTED) == 0)
 		panic("handle_written_inodeblock: not started");
 	inodedep->id_state &= ~IOSTARTED;
 	/* Only one of dp1/dp2 is set, according to the filesystem format. */
 	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
 		fstype = UFS1;
 		dp1 = (struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 	} else {
 		fstype = UFS2;
 		dp2 = (struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 	}
 	/*
 	 * If we had to rollback the inode allocation because of
 	 * bitmaps being incomplete, then simply restore it.
 	 * Keep the block dirty so that it will not be reclaimed until
 	 * all associated dependencies have been cleared and the
 	 * corresponding updates written to disk.
 	 */
 	/*
 	 * NOTE(review): this tests and frees id_savedino1 for UFS2 as well,
 	 * which presumes id_savedino1 and id_savedino2 name the same
 	 * storage -- confirm against the inodedep definition.
 	 */
 	if (inodedep->id_savedino1 != NULL) {
 		if (fstype == UFS1)
 			*dp1 = *inodedep->id_savedino1;
 		else
 			*dp2 = *inodedep->id_savedino2;
 		FREE(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_inode_bitmap++;
 		bdirty(bp);
 		return (1);
 	}
 	inodedep->id_state |= COMPLETE;
 	/*
 	 * Roll forward anything that had to be rolled back before 
 	 * the inode could be updated.
 	 */
 	hadchanges = 0;
 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		if (fstype == UFS1) {
 			if (adp->ad_lbn < NDADDR) {
 				/* Direct pointer: must hold the rolled-back value. */
 				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 					panic("%s %s #%jd mismatch %d != %jd",
 					    "handle_written_inodeblock:",
 					    "direct pointer",
 					    (intmax_t)adp->ad_lbn,
 					    dp1->di_db[adp->ad_lbn],
 					    (intmax_t)adp->ad_oldblkno);
 				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
 			} else {
 				/* Indirect pointer: must have been zeroed. */
 				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
 					panic("%s: %s #%jd allocated as %d",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_lbn - NDADDR,
 					    dp1->di_ib[adp->ad_lbn - NDADDR]);
 				dp1->di_ib[adp->ad_lbn - NDADDR] =
 				    adp->ad_newblkno;
 			}
 		} else {
 			if (adp->ad_lbn < NDADDR) {
 				/* Direct pointer: must hold the rolled-back value. */
 				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 					panic("%s: %s #%jd %s %jd != %jd",
 					    "handle_written_inodeblock",
 					    "direct pointer",
 					    (intmax_t)adp->ad_lbn, "mismatch",
 					    (intmax_t)dp2->di_db[adp->ad_lbn],
 					    (intmax_t)adp->ad_oldblkno);
 				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
 			} else {
 				/* Indirect pointer: must have been zeroed. */
 				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
 					panic("%s: %s #%jd allocated as %jd",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_lbn - NDADDR,
 					    (intmax_t)
 					    dp2->di_ib[adp->ad_lbn - NDADDR]);
 				dp2->di_ib[adp->ad_lbn - NDADDR] =
 				    adp->ad_newblkno;
 			}
 		}
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	/*
 	 * Same roll forward for the ext data pointers.  This loop
 	 * dereferences dp2 unconditionally.
 	 * NOTE(review): presumably id_extupdt is always empty on UFS1,
 	 * where dp2 is NULL -- confirm.
 	 */
 	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
 			panic("%s: direct pointers #%jd %s %jd != %jd",
 			    "handle_written_inodeblock",
 			    (intmax_t)adp->ad_lbn, "mismatch",
 			    (intmax_t)dp2->di_extb[adp->ad_lbn],
 			    (intmax_t)adp->ad_oldblkno);
 		dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
 		stat_direct_blk_ptrs++;
 	/*
 	 * Reset the file size to its most up-to-date value.
 	 */
 	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
 		panic("handle_written_inodeblock: bad size");
 	if (fstype == UFS1) {
 		if (dp1->di_size != inodedep->id_savedsize) {
 			dp1->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 	} else {
 		if (dp2->di_size != inodedep->id_savedsize) {
 			dp2->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 		if (dp2->di_extsize != inodedep->id_savedextsize) {
 			dp2->di_extsize = inodedep->id_savedextsize;
 			hadchanges = 1;
 		}
 	}
 	/* -1 marks the saved sizes as consumed (checked by the panic above). */
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	/*
 	 * If there were any rollbacks in the inode block, then it must be
 	 * marked dirty so that it will eventually get written back in
 	 * its correct form.
 	 */
 	if (hadchanges)
 		bdirty(bp);
 	/*
 	 * Process any allocdirects that completed during the update.
 	 */
 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 		handle_allocdirect_partdone(adp);
 	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 		handle_allocdirect_partdone(adp);
 	/*
 	 * Process deallocations that were held pending until the
 	 * inode had been written to disk. Freeing of the inode
 	 * is delayed until after all blocks have been freed to
 	 * avoid creation of new <vfsid, inum, lbn> triples
 	 * before the old ones have been deleted.
 	 */
 	filefree = NULL;
 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		switch (wk->wk_type) {
 
 		case D_FREEFILE:
 			/*
 			 * We defer adding filefree to the worklist until
 			 * all other additions have been made to ensure
 			 * that it will be done after all the old blocks
 			 * have been freed.
 			 */
 			if (filefree != NULL)
 				panic("handle_written_inodeblock: filefree");
 			filefree = wk;
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
 			continue;
 
 		case D_DIRADD:
 			diradd_inode_written(WK_DIRADD(wk), inodedep);
 			continue;
 
 		case D_FREEBLKS:
 			/* Queue the freeblks only once it is ALLCOMPLETE. */
 			wk->wk_state |= COMPLETE;
 			if ((wk->wk_state  & ALLCOMPLETE) != ALLCOMPLETE)
 				continue;
 			 /* -- fall through -- */
 		case D_FREEFRAG:
 		case D_DIRREM:
 			add_to_worklist(wk);
 			continue;
 
 		case D_NEWDIRBLK:
 			free_newdirblk(WK_NEWDIRBLK(wk));
 			continue;
 
 		default:
 			panic("handle_written_inodeblock: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	if (filefree != NULL) {
 		/* The inodedep is expected to be freeable at this point. */
 		if (free_inodedep(inodedep) == 0)
 			panic("handle_written_inodeblock: live inodedep");
 		add_to_worklist(filefree);
 		return (0);
 	}
 
 	/*
 	 * If no outstanding dependencies, free it.
 	 */
 	/*
 	 * The list heads are only examined when free_inodedep() returns
 	 * zero, because of the short-circuit evaluation.
 	 */
 	if (free_inodedep(inodedep) ||
 	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
 	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
 		return (0);
 	return (hadchanges);
 }
 
 /*
  * Process a diradd entry after its dependent inode has been written.
  * The diradd is marked COMPLETE; if that makes it ALLCOMPLETE it is
  * moved to its pagedep's pending list.  In every case it is then
  * queued on the inodedep's pending list.
  * This routine must be called with splbio interrupts blocked.
  */
 static void
 diradd_inode_written(dap, inodedep)
 	struct diradd *dap;		/* directory addition being completed */
 	struct inodedep *inodedep;	/* inode whose block was written */
 {
 	struct pagedep *pd;
 
 	dap->da_state |= COMPLETE;
 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		/* A changed entry reaches its pagedep through the dirrem. */
 		pd = (dap->da_state & DIRCHG) != 0 ?
 		    dap->da_previous->dm_pagedep : dap->da_pagedep;
 		LIST_REMOVE(dap, da_pdlist);
 		LIST_INSERT_HEAD(&pd->pd_pendinghd, dap, da_pdlist);
 	}
 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 }
 
 /*
  * Handle the completion of a mkdir dependency.  The given type
  * (MKDIR_PARENT or MKDIR_BODY) is cleared from the associated diradd;
  * once neither remains the diradd becomes DEPCOMPLETE, and if it is
  * then ALLCOMPLETE it moves to its pagedep's pending list.  The mkdir
  * structure itself is always freed.
  */
 static void
 handle_written_mkdir(mkdir, type)
 	struct mkdir *mkdir;
 	int type;
 {
 	struct diradd *da;
 	struct pagedep *pd;
 
 	if (mkdir->md_state != type)
 		panic("handle_written_mkdir: bad type");
 
 	da = mkdir->md_diradd;
 	da->da_state &= ~type;
 	if ((da->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 		da->da_state |= DEPCOMPLETE;
 	if ((da->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		/* A changed entry reaches its pagedep through the dirrem. */
 		pd = (da->da_state & DIRCHG) != 0 ?
 		    da->da_previous->dm_pagedep : da->da_pagedep;
 		LIST_REMOVE(da, da_pdlist);
 		LIST_INSERT_HEAD(&pd->pd_pendinghd, da, da_pdlist);
 	}
 
 	LIST_REMOVE(mkdir, md_mkdirs);
 	WORKITEM_FREE(mkdir, D_MKDIR);
 }
 
 /*
  * Called from within softdep_disk_write_complete above.
  * A write operation was just completed. Removed inodes can
  * now be freed and associated block pointers may be committed.
  * Note that this routine is always called from interrupt level
  * with further splbio interrupts blocked.
  */
 static int 
 handle_written_filepage(pagedep, bp)
 	struct pagedep *pagedep;
 	struct buf *bp;		/* buffer containing the written page */
 {
 	struct dirrem *dirrem;
 	struct diradd *dap, *nextdap;
 	struct direct *ep;
 	int i, chgs;
 
 	if ((pagedep->pd_state & IOSTARTED) == 0)
 		panic("handle_written_filepage: not started");
 	pagedep->pd_state &= ~IOSTARTED;
 	/*
 	 * Process any directory removals that have been committed.
 	 */
 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 		LIST_REMOVE(dirrem, dm_next);
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
 	/*
 	 * Free any directory additions that have been committed.
 	 * If it is a newly allocated block, we have to wait until
 	 * the on-disk directory inode claims the new block.
 	 */
 	if ((pagedep->pd_state & NEWBLOCK) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap);
 	/*
 	 * Uncommitted directory entries must be restored.
 	 */
 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
 		     dap = nextdap) {
 			nextdap = LIST_NEXT(dap, da_pdlist);
 			if (dap->da_state & ATTACHED)
 				panic("handle_written_filepage: attached");
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			ep->d_ino = dap->da_newinum;
 			dap->da_state &= ~UNDONE;
 			dap->da_state |= ATTACHED;
 			chgs = 1;
 			/*
 			 * If the inode referenced by the directory has
 			 * been written out, then the dependency can be
 			 * moved to the pending list.
 			 */
 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 				LIST_REMOVE(dap, da_pdlist);
 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
 				    da_pdlist);
 			}
 		}
 	}
 	/*
 	 * If there were any rollbacks in the directory, then it must be
 	 * marked dirty so that its will eventually get written back in
 	 * its correct form.
 	 */
 	if (chgs) {
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_dir_entry++;
 		bdirty(bp);
 		return (1);
 	}
 	/*
 	 * If we are not waiting for a new directory block to be
 	 * claimed by its inode, then the pagedep will be freed.
 	 * Otherwise it will remain to track any new entries on
 	 * the page in case they are fsync'ed.
 	 */
 	if ((pagedep->pd_state & NEWBLOCK) == 0) {
 		LIST_REMOVE(pagedep, pd_hash);
 		WORKITEM_FREE(pagedep, D_PAGEDEP);
 	}
 	return (0);
 }
 
 /*
  * Writing back in-core inode structures.
  * 
  * The filesystem only accesses an inode's contents when it occupies an
  * "in-core" inode structure.  These "in-core" structures are separate from
  * the page frames used to cache inode blocks.  Only the latter are
  * transferred to/from the disk.  So, when the updated contents of the
  * "in-core" inode structure are copied to the corresponding in-memory inode
  * block, the dependencies are also transferred.  The following procedure is
  * called when copying a dirty "in-core" inode to a cached inode block.
  */
 
 /*
  * Called when an inode is loaded from disk.  The effective link count
  * starts out equal to the actual link count; if an inodedep exists for
  * the inode, its recorded link delta is subtracted so that the correct
  * effective link count is put back, and the SPACECOUNTED state is
  * mirrored into the in-core inode flags.
  */
 void 
 softdep_load_inodeblock(ip)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 
 	ip->i_effnlink = ip->i_nlink;
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * Only an existing inodedep can carry an alternate nlink count.
 	 */
 	if (inodedep_lookup(UFSTOVFS(ip->i_ump),
 	    ip->i_number, 0, &inodedep) != 0) {
 		ip->i_effnlink -= inodedep->id_nlinkdelta;
 		if ((inodedep->id_state & SPACECOUNTED) != 0)
 			ip->i_flag |= IN_SPACECOUNTED;
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * This routine is called just before the "in-core" inode
  * information is to be copied to the in-memory inode block.
  * Recall that an inode block contains several inodes. If
  * the force flag is set, then the dependencies will be
  * cleared so that the update can always be made. Note that
  * the buffer is locked when this routine is called, so we
  * will never be in the middle of writing the inode block
  * to disk.
  *
  * When "waitfor" is zero the routine returns as soon as the inode's
  * dependencies have been moved onto the buffer.  Otherwise, if the
  * inodedep is not yet DEPCOMPLETE, the buffer recorded in id_buf is
  * written synchronously; a write failure is reported through
  * softdep_error() rather than returned.
  */
 void 
 softdep_update_inodeblock(ip, bp, waitfor)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 	struct buf *bp;		/* the buffer containing the inode block */
 	int waitfor;		/* nonzero => update must be allowed */
 {
 	struct inodedep *inodedep;
 	struct worklist *wk;
 	struct mount *mp;
 	struct buf *ibp;
 	int error;
 
 	/*
 	 * If the effective link count is not equal to the actual link
 	 * count, then we must track the difference in an inodedep while
 	 * the inode is (potentially) tossed out of the cache. Otherwise,
 	 * if there is no existing inodedep, then there are no dependencies
 	 * to track.
 	 */
 	mp = UFSTOVFS(ip->i_ump);
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(&lk);
 		/* Without an inodedep the two link counts must agree. */
 		if (ip->i_effnlink != ip->i_nlink)
 			panic("softdep_update_inodeblock: bad link count");
 		return;
 	}
 	/* The recorded delta must match the in-core difference. */
 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
 		panic("softdep_update_inodeblock: bad delta");
 	/*
 	 * Changes have been initiated. Anything depending on these
 	 * changes cannot occur until this inode has been written.
 	 */
 	inodedep->id_state &= ~COMPLETE;
 	if ((inodedep->id_state & ONWORKLIST) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
 	/*
 	 * Any new dependencies associated with the incore inode must
 	 * now be moved to the list associated with the buffer holding
 	 * the in-memory copy of the inode. Once merged process any
 	 * allocdirects that are completed by the merger.
 	 */
 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
 	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
 	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
 	/*
 	 * Now that the inode has been pushed into the buffer, the
 	 * operations dependent on the inode being written to disk
 	 * can be moved to the id_bufwait so that they will be
 	 * processed when the buffer I/O completes.
 	 */
 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
 	}
 	/*
 	 * Newly allocated inodes cannot be written until the bitmap
 	 * that allocates them has been written (indicated by
 	 * DEPCOMPLETE being set in id_state). If we are doing a
 	 * forced sync (e.g., an fsync on a file), we force the bitmap
 	 * to be written so that the update can be done.
 	 */
 	if (waitfor == 0) {
 		FREE_LOCK(&lk);
 		return;
 	}
 retry:
 	/* lk is held here, both on first entry and when retrying. */
 	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
 		FREE_LOCK(&lk);
 		return;
 	}
 	ibp = inodedep->id_buf;
 	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
 	if (ibp == NULL) {
 		/*
 		 * If ibp came back as NULL, the dependency could have been
 		 * freed while we slept.  Look it up again, and check to see
 		 * that it has completed.
 		 */
 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 			goto retry;
 		FREE_LOCK(&lk);
 		return;
 	}
 	/* Drop lk before starting the synchronous write. */
 	FREE_LOCK(&lk);
 	if ((error = bwrite(ibp)) != 0)
 		softdep_error("softdep_update_inodeblock: bwrite", error);
 }
 
 /*
  * Merge a new inode dependency list (such as id_newinoupdt) into an
  * old inode dependency list (such as id_inoupdt).  Entries are placed
  * by ad_lbn; when both lists hold an entry for the same lbn the pair
  * is combined with allocdirect_merge().  The new list is empty on
  * return.  This routine must be called with splbio interrupts blocked.
  */
 static void
 merge_inode_lists(newlisthead, oldlisthead)
 	struct allocdirectlst *newlisthead;
 	struct allocdirectlst *oldlisthead;
 {
 	struct allocdirect *incoming, *cursor;
 
 	incoming = TAILQ_FIRST(newlisthead);
 	cursor = TAILQ_FIRST(oldlisthead);
 	while (cursor != NULL && incoming != NULL) {
 		if (cursor->ad_lbn >= incoming->ad_lbn) {
 			/* The head of the new list goes in front of cursor. */
 			TAILQ_REMOVE(newlisthead, incoming, ad_next);
 			TAILQ_INSERT_BEFORE(cursor, incoming, ad_next);
 			if (cursor->ad_lbn == incoming->ad_lbn) {
 				allocdirect_merge(oldlisthead, incoming,
 				    cursor);
 				cursor = incoming;
 			}
 			incoming = TAILQ_FIRST(newlisthead);
 		} else {
 			cursor = TAILQ_NEXT(cursor, ad_next);
 		}
 	}
 	/* Whatever remains on the new list is appended to the old one. */
 	for (;;) {
 		incoming = TAILQ_FIRST(newlisthead);
 		if (incoming == NULL)
 			break;
 		TAILQ_REMOVE(newlisthead, incoming, ad_next);
 		TAILQ_INSERT_TAIL(oldlisthead, incoming, ad_next);
 	}
 }
 
 /*
  * If we are doing an fsync, then we must ensure that any directory
  * entries for the inode have been written after the inode gets to disk.
  *
  * Each diradd found on the inodedep's id_pendinghd list is handled by
  * locking the parent directory, optionally flushing the parent inode,
  * and then synchronously writing the directory block that holds the
  * entry.  Returns 0 on success, or the first error reported by
  * ffs_vget, ffs_update, ffs_syncvnode, bread or bwrite.
  */
 int
 softdep_fsync(vp)
 	struct vnode *vp;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct worklist *wk;
 	struct diradd *dap;
 	struct mount *mp;
 	struct vnode *pvp;
 	struct inode *ip;
 	struct buf *bp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	int error, flushparent, pagedep_new_block;
 	ino_t parentino;
 	ufs_lbn_t lbn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	mp = vp->v_mount;
 	ACQUIRE_LOCK(&lk);
 	/* No inodedep: there is nothing to flush for this inode. */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(&lk);
 		return (0);
 	}
 	/* Every list other than id_pendinghd is expected to be empty here. */
 	if (!LIST_EMPTY(&inodedep->id_inowait) ||
 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
 		panic("softdep_fsync: pending ops");
 	for (error = 0, flushparent = 0; ; ) {
 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
 			break;
 		if (wk->wk_type != D_DIRADD)
 			panic("softdep_fsync: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 		dap = WK_DIRADD(wk);
 		/*
 		 * Flush our parent if this directory entry has a MKDIR_PARENT
 		 * dependency or is contained in a newly allocated block.
 		 */
 		if (dap->da_state & DIRCHG)
 			pagedep = dap->da_previous->dm_pagedep;
 		else
 			pagedep = dap->da_pagedep;
 		/* Copy what is needed from the pagedep before lk is dropped. */
 		parentino = pagedep->pd_ino;
 		lbn = pagedep->pd_lbn;
 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
 			panic("softdep_fsync: dirty");
 		if ((dap->da_state & MKDIR_PARENT) ||
 		    (pagedep->pd_state & NEWBLOCK))
 			flushparent = 1;
 		else
 			flushparent = 0;
 		/*
 		 * If we are being fsync'ed as part of vgone'ing this vnode,
 		 * then we will not be able to release and recover the
 		 * vnode below, so we just have to give up on writing its
 		 * directory entry out. It will eventually be written, just
 		 * not now, but then the user was not asking to have it
 		 * written, so we are not breaking any promises.
 		 */
 		if (vp->v_iflag & VI_DOOMED)
 			break;
 		/*
 		 * We prevent deadlock by always fetching inodes from the
 		 * root, moving down the directory tree. Thus, when fetching
 		 * our parent directory, we first try to get the lock. If
 		 * that fails, we must unlock ourselves before requesting
 		 * the lock on our parent. See the comment in ufs_lookup
 		 * for details on possible races.
 		 */
 		FREE_LOCK(&lk);
 		if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
 			VOP_UNLOCK(vp, 0, td);
 			error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (error != 0)
 				return (error);
 		}
 		/*
 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 		 * that are contained in direct blocks will be resolved by
 		 * doing a ffs_update. Pagedeps contained in indirect blocks
 		 * may require a complete sync'ing of the directory. So, we
 		 * try the cheap and fast ffs_update first, and if that fails,
 		 * then we do the slower ffs_syncvnode of the directory.
 		 */
 		if (flushparent) {
 			/* Nonzero while lk is held inside this block. */
 			int locked;
 
 			if ((error = ffs_update(pvp, 1)) != 0) {
 				vput(pvp);
 				return (error);
 			}
 			ACQUIRE_LOCK(&lk);
 			locked = 1;
 			/*
 			 * lk was dropped above, so look the inodedep and its
 			 * first pending diradd up again before testing NEWBLOCK.
 			 */
 			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
 				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
 					if (wk->wk_type != D_DIRADD)
 						panic("softdep_fsync: Unexpected type %s",
 						      TYPENAME(wk->wk_type));
 					dap = WK_DIRADD(wk);
 					if (dap->da_state & DIRCHG)
 						pagedep = dap->da_previous->dm_pagedep;
 					else
 						pagedep = dap->da_pagedep;
 					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
 					FREE_LOCK(&lk);
 					locked = 0;
 					if (pagedep_new_block &&
 					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
 						vput(pvp);
 						return (error);
 					}
 				}
 			}
 			if (locked)
 				FREE_LOCK(&lk);
 		}
 		/*
 		 * Flush directory page containing the inode's name.
 		 */
 		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
 		    &bp);
 		if (error == 0)
 			error = bwrite(bp);
 		else
 			brelse(bp);
 		vput(pvp);
 		if (error != 0)
 			return (error);
 		ACQUIRE_LOCK(&lk);
 		/* Stop once the inodedep is gone; otherwise look for more. */
 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 			break;
 	}
 	FREE_LOCK(&lk);
 	return (0);
 }
 
 /*
  * Flush all the dirty bitmaps associated with the block device
  * before flushing the rest of the dirty blocks so as to reduce
  * the number of dependencies that will have to be rolled back.
  */
 void
 softdep_fsync_mountdev(vp)
 	struct vnode *vp;
 {
 	struct buf *bp, *nbp;
 	struct worklist *wk;
 
 	if (!vn_isdisk(vp, NULL))
 		panic("softdep_fsync_mountdev: vnode not a disk");
 restart:
 	ACQUIRE_LOCK(&lk);
 	VI_LOCK(vp);
 	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 		/* 
 		 * If it is already scheduled, skip to the next buffer.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 			continue;
 
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("softdep_fsync_mountdev: not dirty");
 		/*
 		 * We are only interested in bitmaps with outstanding
 		 * dependencies.
 		 */
 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 		    wk->wk_type != D_BMSAFEMAP ||
 		    (bp->b_vflags & BV_BKGRDINPROG)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		VI_UNLOCK(vp);
 		FREE_LOCK(&lk);
 		bremfree(bp);
 		(void) bawrite(bp);
 		goto restart;
 	}
 	FREE_LOCK(&lk);
 	drain_output(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * This routine is called when we are trying to synchronously flush a
  * file. This routine must eliminate any filesystem metadata dependencies
  * so that the syncing routine can succeed by pushing the dirty blocks
  * associated with the file. If any I/O errors occur, they are returned.
  *
  * Returns 0 immediately when the vnode is not DOINGSOFTDEP, and 0 once
  * the dirty buffers have been walked in both passes.  On error, the
  * buffer currently being examined is started with bawrite() and the
  * error is returned.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 	struct pagedep *pagedep;
 	struct allocdirect *adp;
 	struct allocindir *aip;
 	struct buf *bp, *nbp;
 	struct worklist *wk;
 	int i, error, waitfor;
 
 	if (!DOINGSOFTDEP(vp))
 		return (0);
 	/*
 	 * Ensure that any direct block dependencies have been cleared.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
 		FREE_LOCK(&lk);
 		return (error);
 	}
 	FREE_LOCK(&lk);
 	/*
 	 * For most files, the only metadata dependencies are the
 	 * cylinder group maps that allocate their inode or blocks.
 	 * The block allocation dependencies can be found by traversing
 	 * the dependency lists for any buffers that remain on their
 	 * dirty buffer list. The inode allocation dependency will
 	 * be resolved when the inode is updated with MNT_WAIT.
 	 * This work is done in two passes. The first pass grabs most
 	 * of the buffers and begins asynchronously writing them. The
 	 * only way to wait for these asynchronous writes is to sleep
 	 * on the filesystem vnode which may stay busy for a long time
 	 * if the filesystem is active. So, instead, we make a second
 	 * pass over the dependencies blocking on each write. In the
 	 * usual case we will be blocking against a write that we
 	 * initiated, so when it is done the dependency will have been
 	 * resolved. Thus the second pass is expected to end quickly.
 	 */
 	waitfor = MNT_NOWAIT;
 
 top:
 	/*
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 */
 	VI_LOCK(vp);
 	drain_output(vp);
 	/* Retry from the list head until a dirty buffer is obtained. */
 	while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
 		bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
 		if (bp)
 			break;
 	}
 	VI_UNLOCK(vp);
 	if (bp == NULL)
 		return (0);
 loop:
 	/* While syncing snapshots, we must allow recursive lookups */
 	bp->b_lock.lk_flags |= LK_CANRECURSE;
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * As we hold the buffer locked, none of its dependencies
 	 * will disappear.
 	 *
 	 * Inside the switch, "continue" advances to the next dependency
 	 * with lk held, while "break" leaves the switch and falls into
 	 * loop_end, which is the error exit and expects lk released.
 	 */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_ALLOCDIRECT:
 			adp = WK_ALLOCDIRECT(wk);
 			if (adp->ad_state & DEPCOMPLETE)
 				continue;
 			nbp = adp->ad_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_ALLOCINDIR:
 			aip = WK_ALLOCINDIR(wk);
 			if (aip->ai_state & DEPCOMPLETE)
 				continue;
 			nbp = aip->ai_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_INDIRDEP:
 		restart:
 
 			/*
 			 * The list walk is started over after every write
 			 * and after every failed getdirtybuf(), so each
 			 * pass handles at most one incomplete allocindir.
 			 */
 			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
 				if (aip->ai_state & DEPCOMPLETE)
 					continue;
 				nbp = aip->ai_buf;
 				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
 				if (nbp == NULL)
 					goto restart;
 				FREE_LOCK(&lk);
 				if ((error = bwrite(nbp)) != 0) {
 					goto loop_end;
 				}
 				ACQUIRE_LOCK(&lk);
 				goto restart;
 			}
 			continue;
 
 		case D_INODEDEP:
 			if ((error = flush_inodedep_deps(wk->wk_mp,
 			    WK_INODEDEP(wk)->id_ino)) != 0) {
 				FREE_LOCK(&lk);
 				break;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			/*
 			 * We are trying to sync a directory that may
 			 * have dependencies on both its own metadata
 			 * and/or dependencies on the inodes of any
 			 * recently allocated files. We walk its diradd
 			 * lists pushing out the associated inode.
 			 */
 			pagedep = WK_PAGEDEP(wk);
 			for (i = 0; i < DAHASHSZ; i++) {
 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
 					continue;
 				if ((error =
 				    flush_pagedep_deps(vp, wk->wk_mp,
 						&pagedep->pd_diraddhd[i]))) {
 					FREE_LOCK(&lk);
 					goto loop_end;
 				}
 			}
 			continue;
 
 		case D_MKDIR:
 			/*
 			 * This case should never happen if the vnode has
 			 * been properly sync'ed. However, if this function
 			 * is used at a place where the vnode has not yet
 			 * been sync'ed, this dependency can show up. So,
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_MKDIR(wk)->md_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_BMSAFEMAP:
 			/*
 			 * This case should never happen if the vnode has
 			 * been properly sync'ed. However, if this function
 			 * is used at a place where the vnode has not yet
 			 * been sync'ed, this dependency can show up. So,
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_BMSAFEMAP(wk)->sm_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		default:
 			panic("softdep_sync_metadata: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	loop_end:
 		/* We reach here only in error and unlocked */
 		if (error == 0)
 			panic("softdep_sync_metadata: zero error");
 		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 		bawrite(bp);
 		return (error);
 	}
 	FREE_LOCK(&lk);
 	/*
 	 * Pick up the next dirty buffer before bp is handed to bawrite().
 	 */
 	VI_LOCK(vp);
 	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
 		nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
 		if (nbp)
 			break;
 	}
 	VI_UNLOCK(vp);
 	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 	bawrite(bp);
 	if (nbp != NULL) {
 		bp = nbp;
 		goto loop;
 	}
 	/*
 	 * The brief unlock is to allow any pent up dependency
 	 * processing to be done. Then proceed with the second pass.
 	 */
 	if (waitfor == MNT_NOWAIT) {
 		waitfor = MNT_WAIT;
 		goto top;
 	}
 
 	/*
 	 * If we have managed to get rid of all the dirty buffers,
 	 * then we are done. For certain directories and block
 	 * devices, we may need to do further work.
 	 *
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 */
 	VI_LOCK(vp);
 	drain_output(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Flush the dependencies associated with an inodedep.
  * Called with splbio blocked.
  *
  * Returns 0 when the inodedep no longer exists or both passes found
  * nothing left to flush, otherwise the error stored by flush_deplist().
  */
 static int
 flush_inodedep_deps(mp, ino)
 	struct mount *mp;
 	ino_t ino;
 {
 	struct inodedep *inodedep;
 	int error, waitfor;
 
 	/*
 	 * Two passes are made.  The first grabs most of the buffers and
 	 * starts writing them asynchronously.  The only way to wait for
 	 * those asynchronous writes is to sleep on the filesystem vnode,
 	 * which may stay busy for a long time if the filesystem is
 	 * active.  So, instead, the second pass goes over the
 	 * dependencies blocking on each write.  In the usual case we
 	 * block against a write that we initiated, so when it is done
 	 * the dependency will have been resolved and the second pass is
 	 * expected to end quickly.  The lock is released and retaken at
 	 * the top of each iteration to give any pending I/O a brief
 	 * window to complete.
 	 */
 	error = 0;
 	waitfor = MNT_NOWAIT;
 	while (error == 0) {
 		FREE_LOCK(&lk);
 		ACQUIRE_LOCK(&lk);
 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			return (0);
 		/* A nonzero result means work was done: scan again. */
 		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
 			continue;
 		if (waitfor == MNT_WAIT) {
 			/*
 			 * Pass 2 is finished.  Try freeing the inodedep
 			 * in case all dependencies have been removed.
 			 */
 			if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
 				(void) free_inodedep(inodedep);
 			return (0);
 		}
 		/* Pass 1 is finished; go around again for pass 2. */
 		waitfor = MNT_WAIT;
 	}
 	return (error);
 }
 
 /*
  * Flush an inode dependency list.
  * Called with splbio blocked.
  */
 static int
 flush_deplist(listhead, waitfor, errorp)
 	struct allocdirectlst *listhead;
 	int waitfor;
 	int *errorp;
 {
 	struct allocdirect *adp;
 	struct buf *bp;
 
 	mtx_assert(&lk, MA_OWNED);
 	TAILQ_FOREACH(adp, listhead, ad_next) {
 		if (adp->ad_state & DEPCOMPLETE)
 			continue;
 		bp = adp->ad_buf;
 		bp = getdirtybuf(bp, &lk, waitfor);
 		if (bp == NULL) {
 			if (waitfor == MNT_NOWAIT)
 				continue;
 			return (1);
 		}
 		FREE_LOCK(&lk);
 		if (waitfor == MNT_NOWAIT) {
 			bawrite(bp);
 		} else if ((*errorp = bwrite(bp)) != 0) {
 			ACQUIRE_LOCK(&lk);
 			return (1);
 		}
 		ACQUIRE_LOCK(&lk);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
  * Called with splbio blocked.
  *
  * The loop runs until the diradd list is empty.  Each step that does
  * I/O releases lk first; on success lk is retaken and the head of the
  * list is re-examined.  Every error exit leaves the loop with lk
  * released, so lk is retaken once at the bottom before the error is
  * returned.  Returns 0 on success.
  */
 static int
 flush_pagedep_deps(pvp, mp, diraddhdp)
 	struct vnode *pvp;
 	struct mount *mp;
 	struct diraddhd *diraddhdp;
 {
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	struct diradd *dap;
 	struct vnode *vp;
 	int error = 0;
 	struct buf *bp;
 	ino_t inum;
 	struct worklist *wk;
 
 	ump = VFSTOUFS(mp);
 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
 		/*
 		 * Flush ourselves if this directory entry
 		 * has a MKDIR_PARENT dependency.
 		 */
 		if (dap->da_state & MKDIR_PARENT) {
 			FREE_LOCK(&lk);
 			if ((error = ffs_update(pvp, 1)) != 0)
 				break;
 			ACQUIRE_LOCK(&lk);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			if (dap->da_state & MKDIR_PARENT)
 				panic("flush_pagedep_deps: MKDIR_PARENT");
 		}
 		/*
 		 * A newly allocated directory must have its "." and
 		 * ".." entries written out before its name can be
 		 * committed in its parent. We do not want or need
 		 * the full semantics of a synchronous ffs_syncvnode as
 		 * that may end up here again, once for each directory
 		 * level in the filesystem. Instead, we push the blocks
 		 * and wait for them to clear. We have to fsync twice
 		 * because the first call may choose to defer blocks
 		 * that still have dependencies, but deferral will
 		 * happen at most once.
 		 */
 		inum = dap->da_newinum;
 		if (dap->da_state & MKDIR_BODY) {
 			FREE_LOCK(&lk);
 			if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
 				break;
 			if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
 			    (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
 				vput(vp);
 				break;
 			}
 			VI_LOCK(vp);
 			drain_output(vp);
 			/*
 			 * If first block is still dirty with a D_MKDIR
 			 * dependency then it needs to be written now.
 			 */
 			for (;;) {
 				error = 0;
 				bp = gbincore(&vp->v_bufobj, 0);
 				if (bp == NULL)
 					break;	/* First block not present */
 				/*
 				 * The vnode interlock is passed to BUF_LOCK
 				 * (LK_INTERLOCK) and taken again right after.
 				 */
 				error = BUF_LOCK(bp,
 						 LK_EXCLUSIVE |
 						 LK_SLEEPFAIL |
 						 LK_INTERLOCK,
 						 VI_MTX(vp));
 				VI_LOCK(vp);
 				if (error == ENOLCK)
 					continue;	/* Slept, retry */
 				if (error != 0)
 					break;		/* Failed */
 				if ((bp->b_flags & B_DELWRI) == 0) {
 					BUF_UNLOCK(bp);
 					break;	/* Buffer not dirty */
 				}
 				for (wk = LIST_FIRST(&bp->b_dep);
 				     wk != NULL;
 				     wk = LIST_NEXT(wk, wk_list))
 					if (wk->wk_type == D_MKDIR)
 						break;
 				if (wk == NULL)
 					BUF_UNLOCK(bp);	/* Dependency gone */
 				else {
 					/*
 					 * D_MKDIR dependency remains,
 					 * must write buffer to stable
 					 * storage.
 					 */
 					VI_UNLOCK(vp);
 					bremfree(bp);
 					error = bwrite(bp);
 					VI_LOCK(vp);
 				}
 				break;
 			}
 			VI_UNLOCK(vp);
 			vput(vp);
 			if (error != 0)
 				break;	/* Flushing of first block failed */
 			ACQUIRE_LOCK(&lk);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			if (dap->da_state & MKDIR_BODY)
 				panic("flush_pagedep_deps: MKDIR_BODY");
 		}
 		/*
 		 * Flush the inode on which the directory entry depends.
 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
 		 * the only remaining dependency is that the updated inode
 		 * count must get pushed to disk. The inode has already
 		 * been pushed into its inode buffer (via VOP_UPDATE) at
 		 * the time of the reference count change. So we need only
 		 * locate that buffer, ensure that there will be no rollback
 		 * caused by a bitmap dependency, then write the inode buffer.
 		 */
 retry:
 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 			panic("flush_pagedep_deps: lost inode");
 		/*
 		 * If the inode still has bitmap dependencies,
 		 * push them to disk.
 		 */
 		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 			bp = inodedep->id_buf;
 			bp = getdirtybuf(bp, &lk, MNT_WAIT);
 			/* No buffer obtained: look the inodedep up again. */
 			if (bp == NULL)
 				goto retry;
 			FREE_LOCK(&lk);
 			if ((error = bwrite(bp)) != 0)
 				break;
 			ACQUIRE_LOCK(&lk);
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 		}
 		/*
 		 * If the inode is still sitting in a buffer waiting
 		 * to be written, push it to disk.
 		 */
 		FREE_LOCK(&lk);
 		if ((error = bread(ump->um_devvp,
 		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
 		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
 			brelse(bp);
 			break;
 		}
 		if ((error = bwrite(bp)) != 0)
 			break;
 		ACQUIRE_LOCK(&lk);
 		/*
 		 * If we have failed to get rid of all the dependencies
 		 * then something is seriously wrong.
 		 */
 		if (dap == LIST_FIRST(diraddhdp))
 			panic("flush_pagedep_deps: flush failed");
 	}
 	/* Error exits above left lk released; retake it for the caller. */
 	if (error)
 		ACQUIRE_LOCK(&lk);
 	return (error);
 }
 
 /*
  * A large burst of file addition or deletion activity can drive the
  * memory load excessively high. First attempt to slow things down
  * using the techniques below. If that fails, this routine requests
  * the offending operations to fall back to running synchronously
  * until the memory load returns to a reasonable level.
  */
 int
 softdep_slowdown(vp)
 	struct vnode *vp;
 {
 	int max_softdeps_hard;
 
 	ACQUIRE_LOCK(&lk);
 	max_softdeps_hard = max_softdeps * 11 / 10;
 	if (num_dirrem < max_softdeps_hard / 2 &&
 	    num_inodedep < max_softdeps_hard &&
 	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) {
 		FREE_LOCK(&lk);
   		return (0);
 	}
 	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
 		softdep_speedup();
 	stat_sync_limit_hit += 1;
 	FREE_LOCK(&lk);
 	return (1);
 }
 
 /*
  * Called by the allocation routines when they are about to fail
  * in the hope that we can free up some disk space.
  *
  * While blocks are still pending release and the free block count
  * has not risen above the target, one worklist item is processed per
  * iteration; when that is not possible, request_cleanup() is asked to
  * help.  Because this process holds inodes locked, we cannot handle
  * any remove requests that might block on a locked inode as that
  * could lead to deadlock.  The attempt is abandoned once time_second
  * passes the deadline computed at entry from tickdelay.
  *
  * Called with the UFS mount mutex held.  Returns 0 when the initial
  * ffs_update() fails or the deadline passes, 1 otherwise.
  */
 int
 softdep_request_cleanup(fs, vp)
 	struct fs *fs;
 	struct vnode *vp;
 {
 	struct ufsmount *ump;
 	long deadline;
 	ufs2_daddr_t target;
 	int error;
 
 	ump = VTOI(vp)->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	target = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
 	deadline = time_second + tickdelay;
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to update the vnode
 	 * as we may recurse into the copy-on-write routine.
 	 */
 	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) {
 		UFS_UNLOCK(ump);
 		error = ffs_update(vp, 1);
 		UFS_LOCK(ump);
 		if (error != 0)
 			return (0);
 	}
 	for (;;) {
 		if (fs->fs_pendingblocks <= 0 ||
 		    fs->fs_cstotal.cs_nbfree > target)
 			break;
 		if (time_second > deadline)
 			return (0);
 		/* Trade the UFS mutex for the softdep lock. */
 		UFS_UNLOCK(ump);
 		ACQUIRE_LOCK(&lk);
 		if (ump->softdep_on_worklist > 0 &&
 		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1)
 			stat_worklist_push += 1;
 		else
 			request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
 		FREE_LOCK(&lk);
 		UFS_LOCK(ump);
 	}
 	return (1);
 }
 
 /*
  * If memory utilization has gotten too high, deliberately slow things
  * down and speed up the I/O processing.
  *
  * Must be called with the softdep lock held.  Returns 0 when the
  * caller is not held up, 1 when it helped with the worklist or was
  * put to sleep.
  */
 extern struct thread *syncertd;
 static int
 request_cleanup(mp, resource)
 	struct mount *mp;
 	int resource;
 {
 	struct thread *self = curthread;
 	struct ufsmount *ump;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * We never hold up the filesystem syncer or buf daemon.
 	 */
 	if ((self->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) != 0)
 		return (0);
 	ump = VFSTOUFS(mp);
 	/*
 	 * When the work list has become backlogged, co-opt this process
 	 * to help clean up two entries.  Because this process may hold
 	 * inodes locked, we cannot handle any remove requests that might
 	 * block on a locked inode as that could lead to deadlock.
 	 * TDP_SOFTDEP is set meanwhile to avoid recursively processing
 	 * the worklist.
 	 */
 	if (ump->softdep_on_worklist > max_softdeps / 10) {
 		self->td_pflags |= TDP_SOFTDEP;
 		process_worklist_item(mp, LK_NOWAIT);
 		process_worklist_item(mp, LK_NOWAIT);
 		self->td_pflags &= ~TDP_SOFTDEP;
 		stat_worklist_push += 2;
 		return(1);
 	}
 	/*
 	 * Next, try to speed up the syncer process.  If that succeeds
 	 * the process may continue, except for FLUSH_REMOVE_WAIT.
 	 */
 	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
 		return(0);
 	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
 	 * with removal dependencies. We would like to do the cleanup
 	 * here, but we probably hold an inode locked at this point and
 	 * that might deadlock against one that we try to clean. So,
 	 * the best that we can do is request the syncer daemon to do
 	 * the cleanup for us.
 	 */
 	if (resource == FLUSH_INODES) {
 		stat_ino_limit_push += 1;
 		req_clear_inodedeps += 1;
 		stat_countp = &stat_ino_limit_hit;
 	} else if (resource == FLUSH_REMOVE ||
 	    resource == FLUSH_REMOVE_WAIT) {
 		stat_blk_limit_push += 1;
 		req_clear_remove += 1;
 		stat_countp = &stat_blk_limit_hit;
 	} else {
 		panic("request_cleanup: unknown type");
 	}
 	/*
 	 * Hopefully the syncer daemon will catch up and awaken us.
 	 * The pause timer is started if it is not already running, so
 	 * we proceed after a bounded wait in any case.
 	 */
 	proc_waiting += 1;
 	if (handle.callout == NULL)
 		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 	proc_waiting -= 1;
 	return (1);
 }
 
 /*
  * Callout handler for the timer that request_cleanup arms.  Bump the
  * statistic currently selected through stat_countp and wake one thread
  * sleeping on proc_waiting.  While sleepers remain the callout is armed
  * again; otherwise handle.callout is cleared so that the next sleeper
  * knows it has to start a new timer.
  */
 static void
 pause_timer(void *arg)
 {
 	int delay;
 
 	ACQUIRE_LOCK(&lk);
 	*stat_countp += 1;
 	wakeup_one(&proc_waiting);
 	if (proc_waiting <= 0) {
 		/* Nobody is left to wake: record that no timer is pending. */
 		handle.callout = NULL;
 	} else {
 		/* Never schedule fewer than two ticks ahead. */
 		delay = tickdelay > 2 ? tickdelay : 2;
 		handle = timeout(pause_timer, 0, delay);
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Flush out a directory with at least one removal dependency in an effort to
  * reduce the number of dirrem, freefile, and freeblks dependency structures.
  */
 static void
 clear_remove(td)
 	struct thread *td;
 {
 	struct pagedep_hashhead *pagedephd;
 	struct pagedep *pagedep;
 	static int next = 0;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, cnt;
 	ino_t ino;
 
 	mtx_assert(&lk, MA_OWNED);
 
 	for (cnt = 0; cnt < pagedep_hash; cnt++) {
 		pagedephd = &pagedep_hashtbl[next++];
 		if (next >= pagedep_hash)
 			next = 0;
 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 			if (LIST_EMPTY(&pagedep->pd_dirremhd))
 				continue;
 			mp = pagedep->pd_list.wk_mp;
 			ino = pagedep->pd_ino;
 			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 				continue;
 			FREE_LOCK(&lk);
 			if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
 				softdep_error("clear_remove: vget", error);
 				vn_finished_write(mp);
 				ACQUIRE_LOCK(&lk);
 				return;
 			}
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 				softdep_error("clear_remove: fsync", error);
 			VI_LOCK(vp);
 			drain_output(vp);
 			VI_UNLOCK(vp);
 			vput(vp);
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(&lk);
 			return;
 		}
 	}
 }
 
 /*
  * Clear out a block of dirty inodes in an effort to reduce
  * the number of inodedep dependency structures.
  */
 static void
 clear_inodedeps(td)
 	struct thread *td;
 {
 	struct inodedep_hashhead *inodedephd;
 	struct inodedep *inodedep;
 	static int next = 0;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fs *fs;
 	int error, cnt;
 	ino_t firstino, lastino, ino;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * Pick a random inode dependency to be cleared.
 	 * We will then gather up all the inodes in its block 
 	 * that have dependencies and flush them out.
 	 */
 	for (cnt = 0; cnt < inodedep_hash; cnt++) {
 		inodedephd = &inodedep_hashtbl[next++];
 		if (next >= inodedep_hash)
 			next = 0;
 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
 			break;
 	}
 	if (inodedep == NULL)
 		return;
 	fs = inodedep->id_fs;
 	mp = inodedep->id_list.wk_mp;
 	/*
 	 * Find the last inode in the block with dependencies.
 	 */
 	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
 		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 			break;
 	/*
 	 * Asynchronously push all but the last inode with dependencies.
 	 * Synchronously push the last inode with dependencies to ensure
 	 * that the inode block gets written to free up the inodedeps.
 	 */
 	for (ino = firstino; ino <= lastino; ino++) {
 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			continue;
 		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 			continue;
 		FREE_LOCK(&lk);
 		if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
 			softdep_error("clear_inodedeps: vget", error);
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(&lk);
 			return;
 		}
 		if (ino == lastino) {
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
 				softdep_error("clear_inodedeps: fsync1", error);
 		} else {
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 				softdep_error("clear_inodedeps: fsync2", error);
 			VI_LOCK(vp);
 			drain_output(vp);
 			VI_UNLOCK(vp);
 		}
 		vput(vp);
 		vn_finished_write(mp);
 		ACQUIRE_LOCK(&lk);
 	}
 }
 
 /*
  * Function to determine if the buffer has outstanding dependencies
  * that will cause a roll-back if the buffer is written. If wantcount
  * is set, return number of dependencies, otherwise just yes or no.
  *
  * bp:        buffer whose b_dep work list is examined.
  * wantcount: non-zero to count every dependency; zero to stop at the
  *            first one found, in which case the result is 0 or 1.
  *
  * lk is acquired for the duration of the scan and released before
  * returning.  An unexpected work item type on the list is fatal.
  */
 static int
 softdep_count_dependencies(struct buf *bp, int wantcount)
 {
 	struct worklist *wk;
 	struct inodedep *inodedep;
 	struct indirdep *indirdep;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	int i, retval;
 
 	retval = 0;
 	ACQUIRE_LOCK(&lk);
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_INODEDEP:
 			inodedep = WK_INODEDEP(wk);
 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 				/* bitmap allocation dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
 				/* direct block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
 				/* direct block pointer dependency (id_extupdt list) */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 
 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 				/* indirect block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			pagedep = WK_PAGEDEP(wk);
 			for (i = 0; i < DAHASHSZ; i++) {
 
 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 					/* directory entry dependency */
 					retval += 1;
 					if (!wantcount)
 						goto out;
 				}
 			}
 			continue;
 
 		case D_BMSAFEMAP:
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 		case D_MKDIR:
 			/* never a dependency on these blocks */
 			continue;
 
 		default:
 			/*
 			 * Report under this function's own name so that the
 			 * panic string can be found in the source.
 			 */
 			panic("softdep_count_dependencies: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 out:
 	FREE_LOCK(&lk);
 	return (retval);
 }
 
 /*
  * Acquire exclusive access to a buffer.
  * Must be called with a locked mtx parameter.
  * Return acquired buffer or NULL on failure.
  *
  * bp:      buffer to lock.
  * mtx:     mutex held by the caller; asserted owned on entry.  It may be
  *          dropped and re-taken on the MNT_WAIT paths below, and is held
  *          again on every return.
  * waitfor: MNT_WAIT permits sleeping for the buffer; with any other value
  *          the routine gives up immediately instead of sleeping.
  *
  * On success the buffer is locked, has B_DELWRI set, and bremfree() has
  * been called on it.  Whenever the routine had to sleep it returns NULL,
  * even if the buffer lock was obtained, because mtx was dropped in the
  * meantime.
  */
 static struct buf *
 getdirtybuf(bp, mtx, waitfor)
 	struct buf *bp;
 	struct mtx *mtx;
 	int waitfor;
 {
 	int error;
 
 	mtx_assert(mtx, MA_OWNED);
 	/* Fast path: try to take the buffer lock without sleeping. */
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 		/* Sleep for the buffer lock, handing mtx over as interlock. */
 		error = BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
 		/*
 		 * Even if we successfully acquire bp here, we have dropped
 		 * mtx, which may violate our guarantee.
 		 */
 		if (error == 0)
 			BUF_UNLOCK(bp);
 		else if (error != ENOLCK)
 			panic("getdirtybuf: inconsistent lock: %d", error);
 		mtx_lock(mtx);
 		return (NULL);
 	}
 	/* The buffer is locked, but a background write is in progress. */
 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 		if (mtx == &lk && waitfor == MNT_WAIT) {
 			/*
 			 * Caller holds the softdep lock: swap it for the
 			 * bufobj lock, re-check the flag under that lock,
 			 * and sleep with PDROP so only lk is re-taken.
 			 */
 			mtx_unlock(mtx);
 			BO_LOCK(bp->b_bufobj);
 			BUF_UNLOCK(bp);
 			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 				bp->b_vflags |= BV_BKGRDWAIT;
 				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
 				       PRIBIO | PDROP, "getbuf", 0);
 			} else
 				BO_UNLOCK(bp->b_bufobj);
 			mtx_lock(mtx);
 			return (NULL);
 		}
 		BUF_UNLOCK(bp);
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 		/*
 		 * The mtx argument must be bp->b_vp's mutex in
 		 * this case.
 		 */
 #ifdef	DEBUG_VFS_LOCKS
 		if (bp->b_vp->v_type != VCHR)
 			ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
 #endif
 		/* Sleep on mtx itself; msleep re-takes it before returning. */
 		bp->b_vflags |= BV_BKGRDWAIT;
 		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
 		return (NULL);
 	}
 	/* Only delayed-write buffers are of interest to the caller. */
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		BUF_UNLOCK(bp);
 		return (NULL);
 	}
 	bremfree(bp);
 	return (bp);
 }
 
 
 /*
  * Check if it is safe to suspend the file system now.  On entry,
  * the vnode interlock for devvp should be held.  Return 0 with
  * the mount interlock held if the file system can be suspended now,
  * otherwise return EAGAIN with the mount interlock held.
  *
  * mp:                  mount point being suspended.
  * devvp:               device vnode; its interlock is released before
  *                      returning.
  * softdep_deps,
  * softdep_accdeps:     caller-supplied dependency counts; compared below
  *                      against zero and against ump->softdep_accdeps.
  * secondary_writes,
  * secondary_accwrites: caller-supplied secondary write counts; compared
  *                      below against zero and against
  *                      mp->mnt_secondary_accwrites.
  */
 int
 softdep_check_suspend(struct mount *mp,
 		      struct vnode *devvp,
 		      int softdep_deps,
 		      int softdep_accdeps,
 		      int secondary_writes,
 		      int secondary_accwrites)
 {
 	struct bufobj *bo;
 	struct ufsmount *ump;
 	int error;
 
 	ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
 	ump = VFSTOUFS(mp);
 	bo = &devvp->v_bufobj;
 
 	/*
 	 * Collect lk and the mount interlock while already holding the
 	 * devvp interlock.  Each is only tried; on failure everything held
 	 * is released, the contended lock is waited for by taking and
 	 * dropping it, and the whole sequence restarts.
 	 */
 	for (;;) {
 		if (!TRY_ACQUIRE_LOCK(&lk)) {
 			VI_UNLOCK(devvp);
 			ACQUIRE_LOCK(&lk);
 			FREE_LOCK(&lk);
 			VI_LOCK(devvp);
 			continue;
 		}
 		if (!MNT_ITRYLOCK(mp)) {
 			FREE_LOCK(&lk);
 			VI_UNLOCK(devvp);
 			MNT_ILOCK(mp);
 			MNT_IUNLOCK(mp);
 			VI_LOCK(devvp);
 			continue;
 		}
 		if (mp->mnt_secondary_writes != 0) {
 			/*
 			 * Secondary writes are still in flight: sleep until
 			 * woken on mnt_secondary_writes.  PDROP leaves the
 			 * mount interlock released, so start over.
 			 */
 			FREE_LOCK(&lk);
 			VI_UNLOCK(devvp);
 			msleep(&mp->mnt_secondary_writes,
 			       MNT_MTX(mp),
 			       (PUSER - 1) | PDROP, "secwr", 0);
 			VI_LOCK(devvp);
 			continue;
 		}
 		break;
 	}
 
 	/*
 	 * Reasons for needing more work before suspend:
 	 * - Dirty buffers on devvp.
 	 * - Softdep activity occurred after start of vnode sync loop
 	 * - Secondary writes occurred after start of vnode sync loop
 	 */
 	error = 0;
 	if (bo->bo_numoutput > 0 ||
 	    bo->bo_dirty.bv_cnt > 0 ||
 	    softdep_deps != 0 ||
 	    ump->softdep_deps != 0 ||
 	    softdep_accdeps != ump->softdep_accdeps ||
 	    secondary_writes != 0 ||
 	    mp->mnt_secondary_writes != 0 ||
 	    secondary_accwrites != mp->mnt_secondary_accwrites)
 		error = EAGAIN;
 	/* Only lk and the devvp interlock are dropped; see the header. */
 	FREE_LOCK(&lk);
 	VI_UNLOCK(devvp);
 	return (error);
 }
 
 
 /*
  * Report the dependency counters kept for a file system: the number
  * of dependency structures at this moment (softdep_deps) and the
  * running total allocated (softdep_accdeps).  A later comparison
  * against fresh values shows whether softdep processing has occurred.
  * Both values are read under lk.
  */
 void
 softdep_get_depcounts(struct mount *mp, int *softdep_depsp,
     int *softdep_accdepsp)
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 
 	ACQUIRE_LOCK(&lk);
 	*softdep_depsp = ump->softdep_deps;
 	*softdep_accdepsp = ump->softdep_accdeps;
 	FREE_LOCK(&lk);
 }
 
 /*
  * Wait for pending output on a vnode to complete.
  * Must be called with vnode lock and interlock locked.
  *
  * XXX: Should just be a call to bufobj_wwait().
  */
 static void
 drain_output(vp)
 	struct vnode *vp;
 {
 	ASSERT_VOP_LOCKED(vp, "drain_output");
 	ASSERT_VI_LOCKED(vp, "drain_output");
 
 	while (vp->v_bufobj.bo_numoutput) {
 		vp->v_bufobj.bo_flag |= BO_WWAIT;
 		msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
 		    VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
 	}
 }
 
 /*
  * Called whenever a buffer that is being invalidated or reallocated
  * contains dependencies. This should only happen if an I/O error has
  * occurred. The routine is called with the buffer locked.
  */ 
 static void
 softdep_deallocate_dependencies(bp)
 	struct buf *bp;
 {
 
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		panic("softdep_deallocate_dependencies: dangling deps");
 	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
 	panic("softdep_deallocate_dependencies: unrecovered I/O error");
 }
 
 /*
  * Report an asynchronous write error seen by the soft updates code.
  * func is the prefix printed in front of the message (callers in this
  * file pass either a "function: operation" tag or a mount point name);
  * error is the error number to print.
  */
 static void
 softdep_error(char *func, int error)
 {
 
 	/* XXX should do something better! */
 	printf("%s: got error %d while accessing filesystem\n",
 	    func, error);
 }
 
 #endif /* SOFTUPDATES */
Index: head/sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c	(revision 175201)
+++ head/sys/ufs/ffs/ffs_vfsops.c	(revision 175202)
@@ -1,1864 +1,1864 @@
 /*-
  * Copyright (c) 1989, 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 #include "opt_quota.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/gjournal.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
 #include <vm/vm_page.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
 
 static int	ffs_reload(struct mount *, struct thread *);
 static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
 		    ufs2_daddr_t);
 static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
 static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
 static vfs_init_t ffs_init;
 static vfs_uninit_t ffs_uninit;
 static vfs_extattrctl_t ffs_extattrctl;
 static vfs_cmount_t ffs_cmount;
 static vfs_unmount_t ffs_unmount;
 static vfs_mount_t ffs_mount;
 static vfs_statfs_t ffs_statfs;
 static vfs_fhtovp_t ffs_fhtovp;
 static vfs_sync_t ffs_sync;
 
 /*
  * VFS operations vector for this file system.  Each member names the
  * routine that implements the corresponding vfs_* operation; the table
  * is handed to VFS_SET() below under the name "ufs".
  */
 static struct vfsops ufs_vfsops = {
 	.vfs_extattrctl =	ffs_extattrctl,
 	.vfs_fhtovp =		ffs_fhtovp,
 	.vfs_init =		ffs_init,
 	.vfs_mount =		ffs_mount,
 	.vfs_cmount =		ffs_cmount,
 	.vfs_quotactl =		ufs_quotactl,
 	.vfs_root =		ufs_root,
 	.vfs_statfs =		ffs_statfs,
 	.vfs_sync =		ffs_sync,
 	.vfs_uninit =		ffs_uninit,
 	.vfs_unmount =		ffs_unmount,
 	.vfs_vget =		ffs_vget,
 };
 
 VFS_SET(ufs_vfsops, ufs, 0);
 MODULE_VERSION(ufs, 1);
 
 static b_strategy_t ffs_geom_strategy;
 static b_write_t ffs_bufwrite;
 
 /*
  * Buffer operations installed by ffs_mountfs() on the device vnode's
  * buffer object (devvp->v_bufobj.bo_ops).  The bdflush hook depends on
  * the kernel configuration: ffs_bdflush is used unless NO_FFS_SNAPSHOT
  * is defined, in which case bufbdflush takes its place.
  */
 static struct buf_ops ffs_ops = {
 	.bop_name =	"FFS",
 	.bop_write =	ffs_bufwrite,
 	.bop_strategy =	ffs_geom_strategy,
 	.bop_sync =	bufsync,
 #ifdef NO_FFS_SNAPSHOT
 	.bop_bdflush =	bufbdflush,
 #else
 	.bop_bdflush =	ffs_bdflush,
 #endif
 };
 
 /*
  * NULL-terminated list of mount option names that ffs_mount() passes to
  * vfs_filteropt(); a non-zero result there fails the mount with EINVAL.
  * NOTE(review): ffs_mount() also looks up "noasync", "noatime",
  * "noclusterr", "noclusterw" and "ro", which are not listed here --
  * presumably vfs_filteropt() copes with those itself; confirm.
  */
 static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",
     "clusterw", "exec", "export", "force", "from", "multilabel", 
     "snapshot", "suid", "suiddir", "symfollow", "sync",
     "union", NULL };
 
 /*
  * Mount entry point: handles both a new mount and an update
  * (MNT_UPDATE) of an existing one.
  *
  * mp: mount point; the new options are read from mp->mnt_optnew.
  * td: calling thread, used for credential and privilege checks.
  *
  * Returns 0 on success or an errno value.  In order the routine:
  *  - validates the option list and creates the UMA zones on first use;
  *  - folds the recognised options into mp->mnt_flag;
  *  - for an update, performs the read-write -> read-only downgrade,
  *    the reload, the read-only -> read-write upgrade and the snapshot
  *    request as applicable;
  *  - looks up the device named by the "from" option, checks access to
  *    it, and either verifies that it is the mounted device (update)
  *    or calls ffs_mountfs() (new mount).
  */
 static int
 ffs_mount(struct mount *mp, struct thread *td)
 {
 	struct vnode *devvp;
 	struct ufsmount *ump = 0;
 	struct fs *fs;
 	int error, flags;
 	u_int mntorflags, mntandnotflags;
 	mode_t accessmode;
 	struct nameidata ndp;
 	char *fspec;
 
 	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
 		return (EINVAL);
 	/* The three zones are created together the first time through. */
 	if (uma_inode == NULL) {
 		uma_inode = uma_zcreate("FFS inode",
 		    sizeof(struct inode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		uma_ufs1 = uma_zcreate("FFS1 dinode",
 		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		uma_ufs2 = uma_zcreate("FFS2 dinode",
 		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 	}
 
 	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)
 		return (error);
 
 	/*
 	 * Collect the flag bits to set (mntorflags) and to clear
 	 * (mntandnotflags); both are applied under the mount interlock.
 	 */
 	mntorflags = 0;
 	mntandnotflags = 0;
 	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
 		mntorflags |= MNT_ACLS;
 
 	if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0)
 		mntorflags |= MNT_ASYNC;
 
 	if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0)
 		mntorflags |= MNT_FORCE;
 
 	if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0)
 		mntorflags |= MNT_MULTILABEL;
 
 	if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0)
 		mntandnotflags |= MNT_ASYNC;
 
 	if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0)
 		mntorflags |= MNT_NOATIME;
 
 	if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0)
 		mntorflags |= MNT_NOCLUSTERR;
 
 	if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0)
 		mntorflags |= MNT_NOCLUSTERW;
 
 	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0)
 		mntorflags |= MNT_SNAPSHOT;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags;
 	MNT_IUNLOCK(mp);
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOUFS(mp);
 		fs = ump->um_fs;
 		devvp = ump->um_devvp;
 		/* Downgrade: currently read-write and "ro" was requested. */
 		if (fs->fs_ronly == 0 &&
 		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			/*
 			 * Flush any dirty data.
 			 */
 			if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) {
 				vn_finished_write(mp);
 				return (error);
 			}
 			/*
 			 * Check for and optionally get rid of files open
 			 * for writing.
 			 */
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			if (mp->mnt_flag & MNT_SOFTDEP) {
 				error = softdep_flushfiles(mp, flags, td);
 			} else {
 				error = ffs_flushfiles(mp, flags, td);
 			}
 			if (error) {
 				vn_finished_write(mp);
 				return (error);
 			}
 			/* Leftover pending counts are reported, then zeroed. */
 			if (fs->fs_pendingblocks != 0 ||
 			    fs->fs_pendinginodes != 0) {
 				printf("%s: %s: blocks %jd files %d\n",
 				    fs->fs_fsmnt, "update error",
 				    (intmax_t)fs->fs_pendingblocks,
 				    fs->fs_pendinginodes);
 				fs->fs_pendingblocks = 0;
 				fs->fs_pendinginodes = 0;
 			}
 			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 				fs->fs_clean = 1;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
 				fs->fs_ronly = 0;
 				fs->fs_clean = 0;
 				vn_finished_write(mp);
 				return (error);
 			}
 			vn_finished_write(mp);
 			/*
 			 * Adjust the GEOM consumer's access counts by
 			 * (0, -1, 0); presumably (read, write, exclusive),
 			 * i.e. give up write access -- confirm against
 			 * g_access(9).
 			 */
 			DROP_GIANT();
 			g_topology_lock();
 			g_access(ump->um_cp, 0, -1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			fs->fs_ronly = 1;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		}
 		if ((mp->mnt_flag & MNT_RELOAD) &&
 		    (error = ffs_reload(mp, td)) != 0)
 			return (error);
 		/* Upgrade: currently read-only and "ro" was not requested. */
 		if (fs->fs_ronly &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp, 0, td);
 				return (error);
 			}
 			VOP_UNLOCK(devvp, 0, td);
 			/*
 			 * An unclean file system is upgraded only when forced,
 			 * or when it uses soft updates and is not marked as
 			 * needing fsck.
 			 */
 			fs->fs_flags &= ~FS_UNCLEAN;
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
 				if ((mp->mnt_flag & MNT_FORCE) ||
 				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
 				     (fs->fs_flags & FS_DOSOFTDEP))) {
 					printf("WARNING: %s was not %s\n",
 					   fs->fs_fsmnt, "properly dismounted");
 				} else {
 					printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 					    fs->fs_fsmnt);
 					return (EPERM);
 				}
 			}
 			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * If we're the root device, we may not have an E count
 			 * yet, get it now.
 			 */
 			if (ump->um_cp->ace == 0)
 				error = g_access(ump->um_cp, 0, 1, 1);
 			else
 				error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error)
 				return (error);
 			/*
 			 * NOTE(review): the access counts obtained just above
 			 * are not given back on the error returns that follow
 			 * -- confirm that this is intended.
 			 */
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			fs->fs_ronly = 0;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 			fs->fs_clean = 0;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
 				vn_finished_write(mp);
 				return (error);
 			}
 			/* check to see if we need to start softdep */
 			if ((fs->fs_flags & FS_DOSOFTDEP) &&
 			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
 				vn_finished_write(mp);
 				return (error);
 			}
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 			vn_finished_write(mp);
 		}
 		/*
 		 * Soft updates is incompatible with "async",
 		 * so if we are doing softupdates stop the user
 		 * from setting the async flag in an update.
 		 * Softdep_mount() clears it in an initial mount 
 		 * or ro->rw remount.
 		 */
 		if (mp->mnt_flag & MNT_SOFTDEP) {
 			/* XXX: Reset too late ? */
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_ASYNC;
 			MNT_IUNLOCK(mp);
 		}
 		/*
 		 * Keep MNT_ACLS flag if it is stored in superblock.
 		 */
 		if ((fs->fs_flags & FS_ACLS) != 0) {
 			/* XXX: Set too late ? */
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_ACLS;
 			MNT_IUNLOCK(mp);
 		}
 
 		/*
 		 * If this is a snapshot request, take the snapshot.
 		 */
 		if (mp->mnt_flag & MNT_SNAPSHOT)
 			return (ffs_snapshot(mp, fspec));
 	}
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
 	if ((error = namei(&ndp)) != 0)
 		return (error);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	accessmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accessmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/*
 		 * Update only
 		 *
 		 * If it's not the same vnode, or at least the same device
 		 * then it's not correct.
 		 */
 
 		if (devvp->v_rdev != ump->um_devvp->v_rdev)
 			error = EINVAL;	/* needs translation */
 		/* The looked-up vnode was only needed for the comparison. */
 		vput(devvp);
 		if (error)
 			return (error);
 	} else {
 		/*
 		 * New mount
 		 *
 		 * We need the name for the mount point (also used for
 		 * "last mounted on") copied in. If an error occurs,
 		 * the mount point is discarded by the upper level code.
 		 * Note that vfs_mount() populates f_mntonname for us.
 		 */
 		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
 			/* ffs_mountfs() has already unlocked devvp. */
 			vrele(devvp);
 			return (error);
 		}
 	}
 	vfs_mountedfrom(mp, fspec);
 	return (0);
 }
 
 /*
  * Compatibility with old mount system call.
  */
 
 static int
 ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
 {
 	struct ufs_args args;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 	error = copyin(data, &args, sizeof args);
 	if (error)
 		return (error);
 
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
  * the root filesystem and finding things to fix). The filesystem must
  * be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
  *	2) re-read superblock from disk.
  *	3) re-read summary information from disk.
  *	4) invalidate all inactive vnodes.
  *	5) invalidate all cached file data.
  *	6) re-read inode data for all active vnodes.
  */
 /*
  * mp: mount point to reload; EINVAL is returned unless MNT_RDONLY is set.
  * td: calling thread, passed on to the locking and invalidation calls.
  *
  * Returns 0 on success, EINVAL or EIO for the checks below, or the
  * error from bread().  Dirty buffers found while invalidating are fatal.
  */
 static int
 ffs_reload(struct mount *mp, struct thread *td)
 {
 	struct vnode *vp, *mvp, *devvp;
 	struct inode *ip;
 	void *space;
 	struct buf *bp;
 	struct fs *fs, *newfs;
 	struct ufsmount *ump;
 	ufs2_daddr_t sblockloc;
 	int i, blks, size, error;
 	int32_t *lp;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		return (EINVAL);
 	ump = VFSTOUFS(mp);
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOUFS(mp)->um_devvp;
-	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	if (vinvalbuf(devvp, 0, td, 0, 0) != 0)
 		panic("ffs_reload: dirty1");
 	VOP_UNLOCK(devvp, 0, td);
 
 	/*
 	 * Step 2: re-read superblock from disk.
 	 */
 	fs = VFSTOUFS(mp)->um_fs;
 	/*
 	 * NOTE(review): bp is not released on the bread() error returns in
 	 * this function -- confirm that bread() leaves nothing to release
 	 * when it fails.
 	 */
 	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
 	    NOCRED, &bp)) != 0)
 		return (error);
 	newfs = (struct fs *)bp->b_data;
 	/* Sanity-check the magic number and block size just read. */
 	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
 	     newfs->fs_magic != FS_UFS2_MAGIC) ||
 	    newfs->fs_bsize > MAXBSIZE ||
 	    newfs->fs_bsize < sizeof(struct fs)) {
 			brelse(bp);
 			return (EIO);		/* XXX needs translation */
 	}
 	/*
 	 * Copy pointer fields back into superblock before copying in	XXX
 	 * new superblock. These should really be in the ufsmount.	XXX
 	 * Note that important parameters (eg fs_ncg) are unchanged.
 	 */
 	newfs->fs_csp = fs->fs_csp;
 	newfs->fs_maxcluster = fs->fs_maxcluster;
 	newfs->fs_contigdirs = fs->fs_contigdirs;
 	newfs->fs_active = fs->fs_active;
 	/* The file system is still read-only. */
 	newfs->fs_ronly = 1;
 	/* Saved because the bcopy below overwrites fs->fs_sblockloc. */
 	sblockloc = fs->fs_sblockloc;
 	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
 	brelse(bp);
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
 	UFS_LOCK(ump);
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: reload pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	UFS_UNLOCK(ump);
 
 	/*
 	 * Step 3: re-read summary information from disk.
 	 */
 	blks = howmany(fs->fs_cssize, fs->fs_fsize);
 	space = fs->fs_csp;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		/* Full blocks, except possibly a shorter final read. */
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
 		    NOCRED, &bp);
 		if (error)
 			return (error);
 		bcopy(bp->b_data, space, (u_int)size);
 		space = (char *)space + size;
 		brelse(bp);
 	}
 	/*
 	 * We no longer know anything about clusters per cylinder group.
 	 */
 	if (fs->fs_contigsumsize > 0) {
 		lp = fs->fs_maxcluster;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 
 	/*
 	 * Walk every vnode on the mount.  The mount interlock is dropped
 	 * while a vnode is processed; if vget() fails the walk is aborted
 	 * and restarted from the top.
 	 */
 loop:
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		if (vp->v_iflag & VI_DOOMED) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		MNT_IUNLOCK(mp);
 		/*
 		 * Step 4: invalidate all cached file data.
 		 *
 		 * NOTE(review): the list in the function header numbers
 		 * this as item 5 and the inode re-read as item 6; the
 		 * step numbers here and below are one lower.
 		 */
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			goto loop;
 		}
 		if (vinvalbuf(vp, 0, td, 0, 0))
 			panic("ffs_reload: dirty2");
 		/*
 		 * Step 5: re-read inode data for all active vnodes.
 		 */
 		ip = VTOI(vp);
 		error =
 		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		    (int)fs->fs_bsize, NOCRED, &bp);
 		if (error) {
 			VOP_UNLOCK(vp, 0, td);
 			vrele(vp);
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			return (error);
 		}
 		ffs_load_inode(bp, ip, fs, ip->i_number);
 		ip->i_effnlink = ip->i_nlink;
 		brelse(bp);
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Possible superblock locations ordered from most to least likely.
  * ffs_mountfs() walks the array until it reaches an entry equal to -1,
  * so SBLOCKSEARCH is expected to end with that sentinel.
  */
 static int sblock_try[] = SBLOCKSEARCH;
 
 /*
  * Common code for mount and mountroot
  */
 static int
 ffs_mountfs(devvp, mp, td)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct fs *fs;
 	struct cdev *dev;
 	void *space;
 	ufs2_daddr_t sblockloc;
 	int error, i, blks, size, ronly;
 	int32_t *lp;
 	struct ucred *cred;
 	struct g_consumer *cp;
 	struct mount *nmp;
 
 	dev = devvp->v_rdev;
 	cred = td ? td->td_ucred : NOCRED;
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
 
 	/*
 	 * If we are a root mount, drop the E flag so fsck can do its magic.
 	 * We will pick it up again when we remount R/W.
 	 */
 	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
 		error = g_access(cp, 0, 0, -1);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	devvp->v_bufobj.bo_private = cp;
 	devvp->v_bufobj.bo_ops = &ffs_ops;
 
 	bp = NULL;
 	ump = NULL;
 	fs = NULL;
 	sblockloc = 0;
 	/*
 	 * Try reading the superblock in each of its possible locations.
 	 */
 	for (i = 0; sblock_try[i] != -1; i++) {
 		if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
 			error = EINVAL;
 			vfs_mount_error(mp,
 			    "Invalid sectorsize %d for superblock size %d",
 			    cp->provider->sectorsize, SBLOCKSIZE);
 			goto out;
 		}
 		if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE,
 		    cred, &bp)) != 0)
 			goto out;
 		fs = (struct fs *)bp->b_data;
 		sblockloc = sblock_try[i];
 		if ((fs->fs_magic == FS_UFS1_MAGIC ||
 		     (fs->fs_magic == FS_UFS2_MAGIC &&
 		      (fs->fs_sblockloc == sblockloc ||
 		       (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
 		    fs->fs_bsize <= MAXBSIZE &&
 		    fs->fs_bsize >= sizeof(struct fs))
 			break;
 		brelse(bp);
 		bp = NULL;
 	}
 	if (sblock_try[i] == -1) {
 		error = EINVAL;		/* XXX needs translation */
 		goto out;
 	}
 	fs->fs_fmod = 0;
 	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indicies */
 	fs->fs_flags &= ~FS_UNCLEAN;
 	if (fs->fs_clean == 0) {
 		fs->fs_flags |= FS_UNCLEAN;
 		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
 		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
 		     (fs->fs_flags & FS_DOSOFTDEP))) {
 			printf(
 "WARNING: %s was not properly dismounted\n",
 			    fs->fs_fsmnt);
 		} else {
 			printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 			    fs->fs_fsmnt);
 			error = EPERM;
 			goto out;
 		}
 		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
 		    (mp->mnt_flag & MNT_FORCE)) {
 			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
 			    (intmax_t)fs->fs_pendingblocks,
 			    fs->fs_pendinginodes);
 			fs->fs_pendingblocks = 0;
 			fs->fs_pendinginodes = 0;
 		}
 	}
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: mount pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	if ((fs->fs_flags & FS_GJOURNAL) != 0) {
 #ifdef UFS_GJOURNAL
 		/*
 		 * Get journal provider name.
 		 */
 		size = 1024;
 		mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK);
 		if (g_io_getattr("GJOURNAL::provider", cp, &size,
 		    mp->mnt_gjprovider) == 0) {
 			mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size,
 			    M_UFSMNT, M_WAITOK);
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_GJOURNAL;
 			MNT_IUNLOCK(mp);
 		} else {
 			printf(
 "WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n",
 			    mp->mnt_stat.f_mntonname);
 			free(mp->mnt_gjprovider, M_UFSMNT);
 			mp->mnt_gjprovider = NULL;
 		}
 #else
 		printf(
 "WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n",
 		    mp->mnt_stat.f_mntonname);
 #endif
 	} else {
 		mp->mnt_gjprovider = NULL;
 	}
 	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
 	ump->um_cp = cp;
 	ump->um_bo = &devvp->v_bufobj;
 	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_fstype = UFS1;
 		ump->um_balloc = ffs_balloc_ufs1;
 	} else {
 		ump->um_fstype = UFS2;
 		ump->um_balloc = ffs_balloc_ufs2;
 	}
 	ump->um_blkatoff = ffs_blkatoff;
 	ump->um_truncate = ffs_truncate;
 	ump->um_update = ffs_update;
 	ump->um_valloc = ffs_valloc;
 	ump->um_vfree = ffs_vfree;
 	ump->um_ifree = ffs_ifree;
 	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
 	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
 	if (fs->fs_sbsize < SBLOCKSIZE)
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	bp = NULL;
 	fs = ump->um_fs;
 	ffs_oldfscompat_read(fs, ump, sblockloc);
 	fs->fs_ronly = ronly;
 	size = fs->fs_cssize;
 	blks = howmany(size, fs->fs_fsize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	size += fs->fs_ncg * sizeof(u_int8_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	fs->fs_csp = space;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
 		    cred, &bp)) != 0) {
 			free(fs->fs_csp, M_UFSMNT);
 			goto out;
 		}
 		bcopy(bp->b_data, space, (u_int)size);
 		space = (char *)space + size;
 		brelse(bp);
 		bp = NULL;
 	}
 	if (fs->fs_contigsumsize > 0) {
 		fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 		space = lp;
 	}
 	size = fs->fs_ncg * sizeof(u_int8_t);
 	fs->fs_contigdirs = (u_int8_t *)space;
 	bzero(fs->fs_contigdirs, size);
 	fs->fs_active = NULL;
 	mp->mnt_data = ump;
 	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
 	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
 	nmp = NULL;
 	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || 
 	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
 		if (nmp)
 			vfs_rel(nmp);
 		vfs_getnewfsid(mp);
 	}
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	if ((fs->fs_flags & FS_MULTILABEL) != 0) {
 #ifdef MAC
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_MULTILABEL;
 		MNT_IUNLOCK(mp);
 #else
 		printf(
 "WARNING: %s: multilabel flag on fs but no MAC support\n",
 		    mp->mnt_stat.f_mntonname);
 #endif
 	}
 	if ((fs->fs_flags & FS_ACLS) != 0) {
 #ifdef UFS_ACL
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_ACLS;
 		MNT_IUNLOCK(mp);
 #else
 		printf(
 "WARNING: %s: ACLs flag on fs but no ACLs support\n",
 		    mp->mnt_stat.f_mntonname);
 #endif
 	}
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
 	ump->um_nindir = fs->fs_nindir;
 	ump->um_bptrtodb = fs->fs_fsbtodb;
 	ump->um_seqinc = fs->fs_frag;
 	for (i = 0; i < MAXQUOTAS; i++)
 		ump->um_quotas[i] = NULLVP;
 #ifdef UFS_EXTATTR
 	ufs_extattr_uepm_init(&ump->um_extattr);
 #endif
 	/*
 	 * Set FS local "last mounted on" information (NULL pad)
 	 */
 	bzero(fs->fs_fsmnt, MAXMNTLEN);
 	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
 
 	if( mp->mnt_flag & MNT_ROOTFS) {
 		/*
 		 * Root mount; update timestamp in mount structure.
 		 * this will be used by the common root mount code
 		 * to update the system clock.
 		 */
 		mp->mnt_time = fs->fs_time;
 	}
 
 	if (ronly == 0) {
 		if ((fs->fs_flags & FS_DOSOFTDEP) &&
 		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
 			free(fs->fs_csp, M_UFSMNT);
 			goto out;
 		}
 		if (fs->fs_snapinum[0] != 0)
 			ffs_snapshot_mount(mp);
 		fs->fs_fmod = 1;
 		fs->fs_clean = 0;
 		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
 	}
 	/*
 	 * Initialize filesystem stat information in mount struct.
 	 */
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART
 	/*
 	 *
 	 * Auto-starting does the following:
 	 *	- check for /.attribute in the fs, and extattr_start if so
 	 *	- for each file in .attribute, enable that file with
 	 * 	  an attribute of the same name.
 	 * Not clear how to report errors -- probably eat them.
 	 * This would all happen while the filesystem was busy/not
 	 * available, so would effectively be "atomic".
 	 */
 	(void) ufs_extattr_autostart(mp, td);
 #endif /* !UFS_EXTATTR_AUTOSTART */
 #endif /* !UFS_EXTATTR */
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 	return (0);
 out:
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp, td);
 		g_topology_unlock();
 		PICKUP_GIANT();
 	}
 	if (ump) {
 		mtx_destroy(UFS_MTX(ump));
 		if (mp->mnt_gjprovider != NULL) {
 			free(mp->mnt_gjprovider, M_UFSMNT);
 			mp->mnt_gjprovider = NULL;
 		}
 		free(ump->um_fs, M_UFSMNT);
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
 	return (error);
 }
 
 #include <sys/sysctl.h>
 static int bigcgs = 0;
 SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
 
 /*
  * Sanity checks for loading old filesystem superblocks.
  * See ffs_oldfscompat_write below for unwound actions.
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 static void
 ffs_oldfscompat_read(fs, ump, sblockloc)
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t sblockloc;
 {
 	off_t maxfilesize;
 
 	/*
 	 * If not yet done, update fs_flags location and value of fs_sblockloc.
 	 */
 	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		fs->fs_flags = fs->fs_old_flags;
 		fs->fs_old_flags |= FS_FLAGS_UPDATED;
 		fs->fs_sblockloc = sblockloc;
 	}
 	/*
 	 * If not yet done, update UFS1 superblock with new wider fields.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
 		fs->fs_maxbsize = fs->fs_bsize;
 		fs->fs_time = fs->fs_old_time;
 		fs->fs_size = fs->fs_old_size;
 		fs->fs_dsize = fs->fs_old_dsize;
 		fs->fs_csaddr = fs->fs_old_csaddr;
 		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
 		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
 		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
 		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC &&
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {
 		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
 		fs->fs_qbmask = ~fs->fs_bmask;
 		fs->fs_qfmask = ~fs->fs_fmask;
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
 		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
 		if (fs->fs_maxfilesize > maxfilesize)
 			fs->fs_maxfilesize = maxfilesize;
 	}
 	/* Compatibility for old filesystems */
 	if (fs->fs_avgfilesize <= 0)
 		fs->fs_avgfilesize = AVFILESIZ;
 	if (fs->fs_avgfpdir <= 0)
 		fs->fs_avgfpdir = AFPDIR;
 	if (bigcgs) {
 		fs->fs_save_cgsize = fs->fs_cgsize;
 		fs->fs_cgsize = fs->fs_bsize;
 	}
 }
 
 /*
  * Unwinding superblock updates for old filesystems.
  * See ffs_oldfscompat_read above for details.
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 static void
 ffs_oldfscompat_write(fs, ump)
 	struct fs *fs;
 	struct ufsmount *ump;
 {
 
 	/*
 	 * Copy back UFS2 updated fields that UFS1 inspects.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		fs->fs_old_time = fs->fs_time;
 		fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
 		fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
 		fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
 		fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
 		fs->fs_maxfilesize = ump->um_savedmaxfilesize;
 	}
 	if (bigcgs) {
 		fs->fs_cgsize = fs->fs_save_cgsize;
 		fs->fs_save_cgsize = 0;
 	}
 }
 
 /*
  * unmount system call
  */
 static int
 ffs_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, flags;
 
 	flags = 0;
 	if (mntflags & MNT_FORCE) {
 		flags |= FORCECLOSE;
 	}
 #ifdef UFS_EXTATTR
 	if ((error = ufs_extattr_stop(mp, td))) {
 		if (error != EOPNOTSUPP)
 			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
 			    error);
 	} else {
 		ufs_extattr_uepm_destroy(&ump->um_extattr);
 	}
 #endif
 	if (mp->mnt_flag & MNT_SOFTDEP) {
 		if ((error = softdep_flushfiles(mp, flags, td)) != 0)
 			return (error);
 	} else {
 		if ((error = ffs_flushfiles(mp, flags, td)) != 0)
 			return (error);
 	}
 	fs = ump->um_fs;
 	UFS_LOCK(ump);
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: unmount pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	UFS_UNLOCK(ump);
 	if (fs->fs_ronly == 0) {
 		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
 		error = ffs_sbupdate(ump, MNT_WAIT, 0);
 		if (error) {
 			fs->fs_clean = 0;
 			return (error);
 		}
 	}
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(ump->um_cp, td);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	vrele(ump->um_devvp);
 	mtx_destroy(UFS_MTX(ump));
 	if (mp->mnt_gjprovider != NULL) {
 		free(mp->mnt_gjprovider, M_UFSMNT);
 		mp->mnt_gjprovider = NULL;
 	}
 	free(fs->fs_csp, M_UFSMNT);
 	free(fs, M_UFSMNT);
 	free(ump, M_UFSMNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Flush out all the files in a filesystem.
  */
 int
 ffs_flushfiles(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	int error;
 
 	ump = VFSTOUFS(mp);
 #ifdef QUOTA
 	if (mp->mnt_flag & MNT_QUOTA) {
 		int i;
 		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
 		if (error)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
 			quotaoff(td, mp, i);
 		}
 		/*
 		 * Here we fall through to vflush again to ensure
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
 #endif
 	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
 	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
 		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
 			return (error);
 		ffs_snapshot_unmount(mp);
 		flags |= FORCECLOSE;
 		/*
 		 * Here we fall through to vflush again to ensure
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
         /*
 	 * Flush all the files.
 	 */
 	if ((error = vflush(mp, 0, flags, td)) != 0)
 		return (error);
 	/*
 	 * Flush filesystem metadata.
 	 */
-	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
 	VOP_UNLOCK(ump->um_devvp, 0, td);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 static int
 ffs_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	struct fs *fs;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
 		panic("ffs_statfs");
 	sbp->f_version = STATFS_VERSION;
 	sbp->f_bsize = fs->fs_fsize;
 	sbp->f_iosize = fs->fs_bsize;
 	sbp->f_blocks = fs->fs_dsize;
 	UFS_LOCK(ump);
 	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
 	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
 	    dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
 	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
 	UFS_UNLOCK(ump);
 	sbp->f_namemax = NAME_MAX;
 	return (0);
 }
 
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
  * initiate the writing of the super block if it has been modified.
  *
  * Note: we are always called with the filesystem marked `MPBUSY'.
  */
 static int
 ffs_sync(mp, waitfor, td)
 	struct mount *mp;
 	int waitfor;
 	struct thread *td;
 {
 	struct vnode *mvp, *vp, *devvp;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, count, wait, lockreq, allerror = 0;
 	int suspend;
 	int suspended;
 	int secondary_writes;
 	int secondary_accwrites;
 	int softdep_deps;
 	int softdep_accdeps;
 	struct bufobj *bo;
 
 	fs = ump->um_fs;
 	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
 		printf("fs = %s\n", fs->fs_fsmnt);
 		panic("ffs_sync: rofs mod");
 	}
 	/*
 	 * Write back each (modified) inode.
 	 */
 	wait = 0;
 	suspend = 0;
 	suspended = 0;
 	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
 	if (waitfor == MNT_SUSPEND) {
 		suspend = 1;
 		waitfor = MNT_WAIT;
 	}
 	if (waitfor == MNT_WAIT) {
 		wait = 1;
 		lockreq = LK_EXCLUSIVE;
 	}
 	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
 	MNT_ILOCK(mp);
 loop:
 	/* Grab snapshot of secondary write counts */
 	secondary_writes = mp->mnt_secondary_writes;
 	secondary_accwrites = mp->mnt_secondary_accwrites;
 
 	/* Grab snapshot of softdep dependency counts */
 	MNT_IUNLOCK(mp);
 	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
 	MNT_ILOCK(mp);
 
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		/*
 		 * Depend on the mntvnode_slock to keep things stable enough
 		 * for a quick test.  Since there might be hundreds of
 		 * thousands of vnodes, we cannot afford even a subroutine
 		 * call unless there's a good chance that we have work to do.
 		 */
 		VI_LOCK(vp);
 		if (vp->v_iflag & VI_DOOMED) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		ip = VTOI(vp);
 		if (vp->v_type == VNON || ((ip->i_flag &
 		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
 		    vp->v_bufobj.bo_dirty.bv_cnt == 0)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		MNT_IUNLOCK(mp);
 		if ((error = vget(vp, lockreq, td)) != 0) {
 			MNT_ILOCK(mp);
 			if (error == ENOENT || error == ENOLCK) {
 				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
 			allerror = error;
 		vput(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	/*
 	 * Force stale filesystem control information to be flushed.
 	 */
 	if (waitfor == MNT_WAIT) {
 		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
 			allerror = error;
 		/* Flushed work items may create new vnodes to clean */
 		if (allerror == 0 && count) {
 			MNT_ILOCK(mp);
 			goto loop;
 		}
 	}
 #ifdef QUOTA
 	qsync(mp);
 #endif
 	devvp = ump->um_devvp;
 	VI_LOCK(devvp);
 	bo = &devvp->v_bufobj;
 	if (waitfor != MNT_LAZY &&
 	    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
-		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK);
 		if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(devvp, 0, td);
 		if (allerror == 0 && waitfor == MNT_WAIT) {
 			MNT_ILOCK(mp);
 			goto loop;
 		}
 	} else if (suspend != 0) {
 		if (softdep_check_suspend(mp,
 					  devvp,
 					  softdep_deps,
 					  softdep_accdeps,
 					  secondary_writes,
 					  secondary_accwrites) != 0)
 			goto loop;	/* More work needed */
 		mtx_assert(MNT_MTX(mp), MA_OWNED);
 		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
 		MNT_IUNLOCK(mp);
 		suspended = 1;
 	} else
 		VI_UNLOCK(devvp);
 	/*
 	 * Write back modified superblock.
 	 */
 	if (fs->fs_fmod != 0 &&
 	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
 		allerror = error;
 	return (allerror);
 }
 
 int
 ffs_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct vnode *vp;
 	struct cdev *dev;
 	int error;
 	struct thread *td;
 
 	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	/*
 	 * We must promote to an exclusive lock for vnode creation.  This
 	 * can happen if lookup is passed LOCKSHARED.
  	 */
 	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
 		flags &= ~LK_TYPE_MASK;
 		flags |= LK_EXCLUSIVE;
 	}
 
 	/*
 	 * We do not lock vnode creation as it is believed to be too
 	 * expensive for such rare case as simultaneous creation of vnode
 	 * for same ino by different processes. We just allow them to race
 	 * and check later to decide who wins. Let the race begin!
 	 */
 
 	ump = VFSTOUFS(mp);
 	dev = ump->um_dev;
 	fs = ump->um_fs;
 
 	/*
 	 * If this MALLOC() is performed after the getnewvnode()
 	 * it might block, leaving a vnode with a NULL v_data to be
 	 * found by ffs_sync() if a sync happens to fire right then,
 	 * which will cause a panic because ffs_sync() blindly
 	 * dereferences vp->v_data (as well it should).
 	 */
 	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
 
 	/* Allocate a new vnode/inode. */
 	if (fs->fs_magic == FS_UFS1_MAGIC)
 		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
 	else
 		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
 	if (error) {
 		*vpp = NULL;
 		uma_zfree(uma_inode, ip);
 		return (error);
 	}
 	/*
 	 * FFS supports recursive and shared locking.
 	 */
 	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 	vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 	vp->v_data = ip;
 	vp->v_bufobj.bo_bsize = fs->fs_bsize;
 	ip->i_vnode = vp;
 	ip->i_ump = ump;
 	ip->i_fs = fs;
 	ip->i_dev = dev;
 	ip->i_number = ino;
 #ifdef QUOTA
 	{
 		int i;
 		for (i = 0; i < MAXQUOTAS; i++)
 			ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 
 	td = curthread;
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		uma_zfree(uma_inode, ip);
 		*vpp = NULL;
 		return (error);
 	}
 	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	/* Read in the disk contents for the inode, copy into the inode. */
 	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
 	    (int)fs->fs_bsize, NOCRED, &bp);
 	if (error) {
 		/*
 		 * The inode does not contain anything useful, so it would
 		 * be misleading to leave it on its hash chain. With mode
 		 * still zero, it will be unlinked and returned to the free
 		 * list by vput().
 		 */
 		brelse(bp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	if (ip->i_ump->um_fstype == UFS1)
 		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
 	else
 		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
 	ffs_load_inode(bp, ip, fs, ino);
 	if (DOINGSOFTDEP(vp))
 		softdep_load_inodeblock(ip);
 	else
 		ip->i_effnlink = ip->i_nlink;
 	bqrelse(bp);
 
 	/*
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
 		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
 	else
 		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
 	if (error) {
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * Finish inode initialization.
 	 */
 
 	/*
 	 * Set up a generation number for this inode if it does not
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
 		ip->i_gen = arc4random() / 2 + 1;
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			ip->i_flag |= IN_MODIFIED;
 			DIP_SET(ip, i_gen, ip->i_gen);
 		}
 	}
 	/*
 	 * Ensure that uid and gid are correct. This is a temporary
 	 * fix until fsck has been changed to do the update.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
 		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
 		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
 	}						/* XXX */
 
 #ifdef MAC
 	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
 		/*
 		 * If this vnode is already allocated, and we're running
 		 * multi-label, attempt to perform a label association
 		 * from the extended attributes on the inode.
 		 */
 		error = mac_vnode_associate_extattr(mp, vp);
 		if (error) {
 			/* ufs_inactive will release ip->i_devvp ref. */
 			vput(vp);
 			*vpp = NULL;
 			return (error);
 		}
 	}
 #endif
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
  * - call ffs_vget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the given client host has export rights and return
  *   those rights via. exflagsp and credanonp
  */
 static int
 ffs_fhtovp(mp, fhp, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	struct vnode **vpp;
 {
 	struct ufid *ufhp;
 	struct fs *fs;
 
 	ufhp = (struct ufid *)fhp;
 	fs = VFSTOUFS(mp)->um_fs;
 	if (ufhp->ufid_ino < ROOTINO ||
 	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
 		return (ESTALE);
 	return (ufs_fhtovp(mp, ufhp, vpp));
 }
 
 /*
  * Initialize the filesystem.
  */
 static int
 ffs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 
 	softdep_initialize();
 	return (ufs_init(vfsp));
 }
 
 /*
  * Undo the work of ffs_init().
  */
 static int
 ffs_uninit(vfsp)
 	struct vfsconf *vfsp;
 {
 	int ret;
 
 	ret = ufs_uninit(vfsp);
 	softdep_uninitialize();
 	return (ret);
 }
 
 /*
  * Write a superblock and associated information back to disk.
  */
 int
 ffs_sbupdate(mp, waitfor, suspended)
 	struct ufsmount *mp;
 	int waitfor;
 	int suspended;
 {
 	struct fs *fs = mp->um_fs;
 	struct buf *sbbp;
 	struct buf *bp;
 	int blks;
 	void *space;
 	int i, size, error, allerror = 0;
 
 	if (fs->fs_ronly == 1 &&
 	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != 
 	    (MNT_RDONLY | MNT_UPDATE))
 		panic("ffs_sbupdate: write read-only filesystem");
 	/*
 	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
 	 */
 	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
 	    0, 0, 0);
 	/*
 	 * First write back the summary information.
 	 */
 	blks = howmany(fs->fs_cssize, fs->fs_fsize);
 	space = fs->fs_csp;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
 		    size, 0, 0, 0);
 		bcopy(space, bp->b_data, (u_int)size);
 		space = (char *)space + size;
 		if (suspended)
 			bp->b_flags |= B_VALIDSUSPWRT;
 		if (waitfor != MNT_WAIT)
 			bawrite(bp);
 		else if ((error = bwrite(bp)) != 0)
 			allerror = error;
 	}
 	/*
 	 * Now write back the superblock itself. If any errors occurred
 	 * up to this point, then fail so that the superblock avoids
 	 * being written out as clean.
 	 */
 	if (allerror) {
 		brelse(sbbp);
 		return (allerror);
 	}
 	bp = sbbp;
 	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
 	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("%s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
 		fs->fs_sblockloc = SBLOCK_UFS1;
 	}
 	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
 	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("%s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
 		fs->fs_sblockloc = SBLOCK_UFS2;
 	}
 	fs->fs_fmod = 0;
 	fs->fs_time = time_second;
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
 	if (suspended)
 		bp->b_flags |= B_VALIDSUSPWRT;
 	if (waitfor != MNT_WAIT)
 		bawrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		allerror = error;
 	return (allerror);
 }
 
 static int
 ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
 	int attrnamespace, const char *attrname, struct thread *td)
 {
 
 #ifdef UFS_EXTATTR
 	return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname, td));
 #else
 	return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname, td));
 #endif
 }
 
 static void
 ffs_ifree(struct ufsmount *ump, struct inode *ip)
 {
 
 	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
 		uma_zfree(uma_ufs1, ip->i_din1);
 	else if (ip->i_din2 != NULL)
 		uma_zfree(uma_ufs2, ip->i_din2);
 	uma_zfree(uma_inode, ip);
 }
 
 static int dobkgrdwrite = 1;
 SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
     "Do background writes (honoring the BV_BKGRDWRITE flag)?");
 
 /*
  * Complete a background write started from bwrite.
  */
 static void
 ffs_backgroundwritedone(struct buf *bp)
 {
 	struct bufobj *bufobj;
 	struct buf *origbp;
 
 	/*
 	 * Find the original buffer that we are writing.
 	 */
 	bufobj = bp->b_bufobj;
 	BO_LOCK(bufobj);
 	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
 		panic("backgroundwritedone: lost buffer");
 	/* Grab an extra reference to be dropped by the bufdone() below. */
 	bufobj_wrefl(bufobj);
 	BO_UNLOCK(bufobj);
 	/*
 	 * Process dependencies then return any unfinished ones.
 	 */
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 #ifdef SOFTUPDATES
 	if (!LIST_EMPTY(&bp->b_dep))
 		softdep_move_dependencies(bp, origbp);
 #endif
 	/*
 	 * This buffer is marked B_NOCACHE so when it is released
 	 * by biodone it will be tossed.
 	 */
 	bp->b_flags |= B_NOCACHE;
 	bp->b_flags &= ~B_CACHE;
 	bufdone(bp);
 	BO_LOCK(bufobj);
 	/*
 	 * Clear the BV_BKGRDINPROG flag in the original buffer
 	 * and awaken it if it is waiting for the write to complete.
 	 * If BV_BKGRDINPROG is not set in the original buffer it must
 	 * have been released and re-instantiated - which is not legal.
 	 */
 	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
 	    ("backgroundwritedone: lost buffer2"));
 	origbp->b_vflags &= ~BV_BKGRDINPROG;
 	if (origbp->b_vflags & BV_BKGRDWAIT) {
 		origbp->b_vflags &= ~BV_BKGRDWAIT;
 		wakeup(&origbp->b_xflags);
 	}
 	BO_UNLOCK(bufobj);
 }
 
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here 
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  */
 static int
 ffs_bufwrite(struct buf *bp)
 {
 	int oldflags, s;
 	struct buf *newbp;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	oldflags = bp->b_flags;
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bufwrite: buffer is not busy???");
 	s = splbio();
 	/*
 	 * If a background write is already in progress, delay
 	 * writing this block if it is asynchronous. Otherwise
 	 * wait for the background write to complete.
 	 */
 	BO_LOCK(bp->b_bufobj);
 	if (bp->b_vflags & BV_BKGRDINPROG) {
 		if (bp->b_flags & B_ASYNC) {
 			BO_UNLOCK(bp->b_bufobj);
 			splx(s);
 			bdwrite(bp);
 			return (0);
 		}
 		bp->b_vflags |= BV_BKGRDWAIT;
 		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("bufwrite: still writing");
 	}
 	BO_UNLOCK(bp->b_bufobj);
 
 	/* Mark the buffer clean */
 	bundirty(bp);
 
 	/*
 	 * If this buffer is marked for background writing and we
 	 * do not have to wait for it, make a copy and write the
 	 * copy so as to leave this buffer ready for further use.
 	 *
 	 * This optimization eats a lot of memory.  If we have a page
 	 * or buffer shortfall we can't do it.
 	 */
 	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && 
 	    (bp->b_flags & B_ASYNC) &&
 	    !vm_page_count_severe() &&
 	    !buf_dirty_count_severe()) {
 		KASSERT(bp->b_iodone == NULL,
 		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
 
 		/* get a new block */
 		newbp = geteblk(bp->b_bufsize);
 
 		/*
 		 * set it to be identical to the old block.  We have to
 		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
 		 * to avoid confusing the splay tree and gbincore().
 		 */
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
 		newbp->b_lblkno = bp->b_lblkno;
 		newbp->b_xflags |= BX_BKGRDMARKER;
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags |= BV_BKGRDINPROG;
 		bgetvp(bp->b_vp, newbp);
 		BO_UNLOCK(bp->b_bufobj);
 		newbp->b_bufobj = &bp->b_vp->v_bufobj;
 		newbp->b_blkno = bp->b_blkno;
 		newbp->b_offset = bp->b_offset;
 		newbp->b_iodone = ffs_backgroundwritedone;
 		newbp->b_flags |= B_ASYNC;
 		newbp->b_flags &= ~B_INVAL;
 
 #ifdef SOFTUPDATES
 		/* move over the dependencies */
 		if (!LIST_EMPTY(&bp->b_dep))
 			softdep_move_dependencies(bp, newbp);
 #endif 
 
 		/*
 		 * Initiate write on the copy, release the original to
 		 * the B_LOCKED queue so that it cannot go away until
 		 * the background write completes. If not locked it could go
 		 * away and then be reconstituted while it was being written.
 		 * If the reconstituted buffer were written, we could end up
 		 * with two background copies being written at the same time.
 		 */
 		bqrelse(bp);
 		bp = newbp;
 	}
 
 	/* Let the normal bufwrite do the rest for us */
 	return (bufwrite(bp));
 }
 
 
 static void
 ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 {
 	struct vnode *vp;
 	int error;
 	struct buf *tbp;
 
 	vp = bo->__bo_vnode;
 	if (bp->b_iocmd == BIO_WRITE) {
 		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
 		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
 		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			panic("ffs_geom_strategy: bad I/O");
 		bp->b_flags &= ~B_VALIDSUSPWRT;
 		if ((vp->v_vflag & VV_COPYONWRITE) &&
 		    vp->v_rdev->si_snapdata != NULL) {
 			if ((bp->b_flags & B_CLUSTER) != 0) {
 				runningbufwakeup(bp);
 				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 					      b_cluster.cluster_entry) {
 					error = ffs_copyonwrite(vp, tbp);
 					if (error != 0 &&
 					    error != EOPNOTSUPP) {
 						bp->b_error = error;
 						bp->b_ioflags |= BIO_ERROR;
 						bufdone(bp);
 						return;
 					}
 				}
 				bp->b_runningbufspace = bp->b_bufsize;
 				atomic_add_int(&runningbufspace,
 					       bp->b_runningbufspace);
 			} else {
 				error = ffs_copyonwrite(vp, bp);
 				if (error != 0 && error != EOPNOTSUPP) {
 					bp->b_error = error;
 					bp->b_ioflags |= BIO_ERROR;
 					bufdone(bp);
 					return;
 				}
 			}
 		}
 #ifdef SOFTUPDATES
 		if ((bp->b_flags & B_CLUSTER) != 0) {
 			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 				      b_cluster.cluster_entry) {
 				if (!LIST_EMPTY(&tbp->b_dep))
 					buf_start(tbp);
 			}
 		} else {
 			if (!LIST_EMPTY(&bp->b_dep))
 				buf_start(bp);
 		}
 
 #endif
 	}
 	g_vfs_strategy(bo, bp);
 }
Index: head/sys/ufs/ufs/ufs_extattr.c
===================================================================
--- head/sys/ufs/ufs/ufs_extattr.c	(revision 175201)
+++ head/sys/ufs/ufs/ufs_extattr.c	(revision 175202)
@@ -1,1277 +1,1273 @@
 /*-
  * Copyright (c) 1999-2002 Robert N. M. Watson
  * Copyright (c) 2002-2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson for the TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * Support for filesystem extended attribute: UFS-specific support functions.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ufs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/lock.h>
 #include <sys/dirent.h>
 #include <sys/extattr.h>
 #include <sys/sysctl.h>
 
 #include <vm/uma.h>
 
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #ifdef UFS_EXTATTR
 
 static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute");
 
 static int ufs_extattr_sync = 0;
 SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync,
     0, "");
 
 static int	ufs_extattr_valid_attrname(int attrnamespace,
 		    const char *attrname);
 static int	ufs_extattr_enable_with_open(struct ufsmount *ump,
 		    struct vnode *vp, int attrnamespace, const char *attrname,
 		    struct thread *td);
 static int	ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
 		    const char *attrname, struct vnode *backing_vnode,
 		    struct thread *td);
 static int	ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
 		    const char *attrname, struct thread *td);
 static int	ufs_extattr_get(struct vnode *vp, int attrnamespace,
 		    const char *name, struct uio *uio, size_t *size,
 		    struct ucred *cred, struct thread *td);
 static int	ufs_extattr_set(struct vnode *vp, int attrnamespace,
 		    const char *name, struct uio *uio, struct ucred *cred,
 		    struct thread *td);
 static int	ufs_extattr_rm(struct vnode *vp, int attrnamespace,
 		    const char *name, struct ucred *cred, struct thread *td);
 
 /*
  * Take the per-FS extended attribute lock exclusively.  All attribute
  * operations on a mount serialize on this one lock.
  * XXX A single lock per-FS causes a lot of contention; really, this should
  * be far more fine-grained.
  */
 static void
 ufs_extattr_uepm_lock(struct ufsmount *ump, struct thread *td)
 {
 	int lkflags;
 
 	/* Ideally, LK_CANRECURSE would not be used, here. */
 	lkflags = LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE;
 	lockmgr(&ump->um_extattr.uepm_lock, lkflags, 0, td);
 }
 
 static void
 ufs_extattr_uepm_unlock(struct ufsmount *ump, struct thread *td)
 {
 
 	lockmgr(&ump->um_extattr.uepm_lock, LK_RELEASE, 0, td);
 }
 
 /*-
  * Report whether attrname can name an actual attribute: returns 1 if it
  * can, 0 if it cannot.  attrnamespace is not consulted.
  *
  * A name is currently rejected when it is:
  *	 a NULL pointer
  *	 the empty string (used to retrieve application attribute list)
  */
 static int
 ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
 {
 
 	return (attrname != NULL && attrname[0] != '\0');
 }
 
 /*
  * Locate an attribute given a name and mountpoint.
  * Must be holding uepm lock for the mount point.
  */
 static struct ufs_extattr_list_entry *
 ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
     const char *attrname)
 {
 	struct ufs_extattr_list_entry *search_attribute;
 
 	for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
 	    search_attribute != NULL;
 	    search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
 		if (!(strncmp(attrname, search_attribute->uele_attrname,
 		    UFS_EXTATTR_MAXEXTATTRNAME)) &&
 		    (attrnamespace == search_attribute->uele_attrnamespace)) {
 			return (search_attribute);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Set up the per-FS extended attribute state: clear the flags, create an
  * empty attribute list and the lock, then mark the structure initialized.
  * Extended attributes are not started here.
  */
 void
 ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
 {
 
 	/* Start from a clean slate; INITIALIZED is only set at the end. */
 	uepm->uepm_flags = 0;
 	LIST_INIT(&uepm->uepm_list);
 
 	/* XXX is PVFS right, here? */
 	lockinit(&uepm->uepm_lock, PVFS, "extattr", 0, 0);
 
 	uepm->uepm_flags = uepm->uepm_flags | UFS_EXTATTR_UEPM_INITIALIZED;
 }
 
 /*
  * Tear down the per-FS extended attribute state.  The structure must have
  * been initialized and EAs must already be stopped; either violation
  * panics.
  */
 void
 ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
 {
 
 	if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED) == 0)
 		panic("ufs_extattr_uepm_destroy: not initialized");
 	if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED) != 0)
 		panic("ufs_extattr_uepm_destroy: called while still started");
 
 	/*
 	 * Neither ordering of clearing the flag and destroying the lock is
 	 * obviously ideal; it should not matter as long as this is only
 	 * called during unmount, and with vfs_busy().
 	 */
 	uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
 	lockdestroy(&uepm->uepm_lock);
 }
 
 /*
  * Start extended attribute support on an FS.  Returns EOPNOTSUPP if the
  * per-mount state was never initialized, EBUSY if support is already
  * started, and 0 on success, in which case a reference to the calling
  * thread's credential is held in uepm_ucred.
  */
 int
 ufs_extattr_start(struct mount *mp, struct thread *td)
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	int error;
 
 	ufs_extattr_uepm_lock(ump, td);
 
 	if ((ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED) == 0) {
 		error = EOPNOTSUPP;
 	} else if ((ump->um_extattr.uepm_flags &
 	    UFS_EXTATTR_UEPM_STARTED) != 0) {
 		error = EBUSY;
 	} else {
 		ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
 		ump->um_extattr.uepm_ucred = crhold(td->td_ucred);
 		error = 0;
 	}
 
 	ufs_extattr_uepm_unlock(ump, td);
 
 	return (error);
 }
 
 #ifdef UFS_EXTATTR_AUTOSTART
 /*
  * Helper routine: given a locked parent directory and filename, return
  * the locked vnode of the inode associated with the name.  Will not
  * follow symlinks, may return any type of vnode.  Lock on parent will
  * be released even in the event of a failure.  In the event that the
  * target is the parent (i.e., "."), there will be two references and
  * one lock, requiring the caller to possibly special-case.
  *
  * lockparent selects what happens to the lock on start_dvp:
  *	UE_GETDIR_LOCKPARENT		LOCKPARENT is passed to ufs_lookup()
  *					and start_dvp is asserted to still be
  *					locked on return.
  *	UE_GETDIR_LOCKPARENT_DONT	start_dvp is unlocked here before
  *					returning, except on success when the
  *					target is start_dvp itself.
  *
  * Returns 0 with the result stored in *vp, or the error from copystr() or
  * ufs_lookup(); *vp is not written on error.
  */
 #define	UE_GETDIR_LOCKPARENT	1
 #define	UE_GETDIR_LOCKPARENT_DONT	2
 static int
 ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname,
     struct vnode **vp, struct thread *td)
 {
 	struct vop_cachedlookup_args vargs;
 	struct componentname cnp;
 	struct vnode *target_vp;
 	int error;
 
 	/* Build a single-component LOOKUP request by hand. */
 	bzero(&cnp, sizeof(cnp));
 	cnp.cn_nameiop = LOOKUP;
 	cnp.cn_flags = ISLASTCN;
 	if (lockparent == UE_GETDIR_LOCKPARENT)
 		cnp.cn_flags |= LOCKPARENT;
 	cnp.cn_lkflags = LK_EXCLUSIVE;
 	cnp.cn_thread = td;
 	cnp.cn_cred = td->td_ucred;
 	cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	cnp.cn_nameptr = cnp.cn_pnbuf;
 	error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN,
 	    (size_t *) &cnp.cn_namelen);
 	if (error) {
 		if (lockparent == UE_GETDIR_LOCKPARENT_DONT) {
 			VOP_UNLOCK(start_dvp, 0, td);
 		}
 		uma_zfree(namei_zone, cnp.cn_pnbuf);
 		printf("ufs_extattr_lookup: copystr failed\n");
 		return (error);
 	}
 	cnp.cn_namelen--;	/* trim nul termination */
 	/* Call ufs_lookup() directly with hand-built arguments. */
 	vargs.a_gen.a_desc = NULL;
 	vargs.a_dvp = start_dvp;
 	vargs.a_vpp = &target_vp;
 	vargs.a_cnp = &cnp;
 	error = ufs_lookup(&vargs);
 	/* The path buffer is not needed once the lookup has returned. */
 	uma_zfree(namei_zone, cnp.cn_pnbuf);
 	if (error) {
 		/*
 		 * Error condition, may have to release the lock on the parent
 		 * if ufs_lookup() didn't.
 		 */
 		if (lockparent == UE_GETDIR_LOCKPARENT_DONT)
 			VOP_UNLOCK(start_dvp, 0, td);
 
 		/*
 		 * Check that ufs_lookup() didn't release the lock when we
 		 * didn't want it to.
 		 */
 		if (lockparent == UE_GETDIR_LOCKPARENT)
 			ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup");
 
 		return (error);
 	}
 /*
 	if (target_vp == start_dvp)
 		panic("ufs_extattr_lookup: target_vp == start_dvp");
 */
 
 	/*
 	 * When the target is the parent itself the single lock is left in
 	 * place for the caller.
 	 */
 	if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT)
 		VOP_UNLOCK(start_dvp, 0, td);
 
 	if (lockparent == UE_GETDIR_LOCKPARENT)
 		ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup");
 
 	/* printf("ufs_extattr_lookup: success\n"); */
 	*vp = target_vp;
 	return (0);
 }
 #endif /* !UFS_EXTATTR_AUTOSTART */
 
 /*
  * Open the locked vnode vp for read/write and enable it as the backing
  * file for the attribute (attrnamespace, attrname) on ump.  vp must be
  * locked on entry and is always unlocked on return, whether or not the
  * call succeeds, so the caller needs to vrele() it rather than vput() it.
  * Returns 0 or the error from VOP_OPEN() / ufs_extattr_enable().
  */
 static int
 ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
     int attrnamespace, const char *attrname, struct thread *td)
 {
 	int rv;
 
 	rv = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL);
 	if (rv != 0) {
 		printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed "
 		    "with %d\n", rv);
 		VOP_UNLOCK(vp, 0, td);
 		return (rv);
 	}
 
 	/*
 	 * Count the open as a writer and take an extra reference before
 	 * handing vp to ufs_extattr_enable(), which records it as the
 	 * backing vnode.
 	 */
 	vp->v_writecount++;
 	vref(vp);
 	VOP_UNLOCK(vp, 0, td);
 
 	rv = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td);
 	if (rv == 0)
 		return (0);
 
 	/* Enabling failed: undo the open. */
 	vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 	return (rv);
 }
 
 #ifdef UFS_EXTATTR_AUTOSTART
 /*
  * Given a locked directory vnode, iterate over the names in the directory
  * and use ufs_extattr_lookup() to retrieve locked vnodes of potential
  * attribute files.  Then invoke ufs_extattr_enable_with_open() on each
  * to attempt to start the attribute.  Leaves the directory locked on
  * exit.
  */
 static int
 ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
     int attrnamespace, struct thread *td)
 {
 	struct vop_readdir_args vargs;
 	struct dirent *dp, *edp;
 	struct vnode *attr_vp;
 	struct uio auio;
 	struct iovec aiov;
 	char *dirbuf;
 	int error, eofflag = 0;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	MALLOC(dirbuf, char *, DIRBLKSIZ, M_TEMP, M_WAITOK);
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 
 	vargs.a_gen.a_desc = NULL;
 	vargs.a_vp = dvp;
 	vargs.a_uio = &auio;
 	vargs.a_cred = td->td_ucred;
 	vargs.a_eofflag = &eofflag;
 	vargs.a_ncookies = NULL;
 	vargs.a_cookies = NULL;
 
 	while (!eofflag) {
 		auio.uio_resid = DIRBLKSIZ;
 		aiov.iov_base = dirbuf;
 		aiov.iov_len = DIRBLKSIZ;
 		error = ufs_readdir(&vargs);
 		if (error) {
 			printf("ufs_extattr_iterate_directory: ufs_readdir "
 			    "%d\n", error);
 			return (error);
 		}
 
 		/*
 		 * XXXRW: While in UFS, we always get DIRBLKSIZ returns from
 		 * the directory code on success, on other file systems this
 		 * may not be the case.  For portability, we should check the
 		 * read length on return from ufs_readdir().
 		 */
 		edp = (struct dirent *)&dirbuf[DIRBLKSIZ];
 		for (dp = (struct dirent *)dirbuf; dp < edp; ) {
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 			dp->d_type = dp->d_namlen;
 			dp->d_namlen = 0;
 #else
 			dp->d_type = 0;
 #endif
 			if (dp->d_reclen == 0)
 				break;
 			error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT,
 			    dp->d_name, &attr_vp, td);
 			if (error) {
 				printf("ufs_extattr_iterate_directory: lookup "
 				    "%s %d\n", dp->d_name, error);
 			} else if (attr_vp == dvp) {
 				vrele(attr_vp);
 			} else if (attr_vp->v_type != VREG) {
 				vput(attr_vp);
 			} else {
 				error = ufs_extattr_enable_with_open(ump,
 				    attr_vp, attrnamespace, dp->d_name, td);
 				vrele(attr_vp);
 				if (error) {
 					printf("ufs_extattr_iterate_directory: "
 					    "enable %s %d\n", dp->d_name,
 					    error);
 				} else if (bootverbose) {
 					printf("UFS autostarted EA %s\n",
 					    dp->d_name);
 				}
 			}
 			dp = (struct dirent *) ((char *)dp + dp->d_reclen);
 			if (dp >= edp)
 				break;
 		}
 	}
 	FREE(dirbuf, M_TEMP);
 	
 	return (0);
 }
 
 /*
  * Auto-start of extended attributes, to be executed (optionally) at
  * mount-time.
  *
  * Looks up UFS_EXTATTR_FSROOTSUBDIR under the filesystem root; if it is a
  * directory, starts EA support and then tries to enable every attribute
  * file found in its UFS_EXTATTR_SUBDIR_SYSTEM and UFS_EXTATTR_SUBDIR_USER
  * sub-directories.  Returns the error from VFS_ROOT(), from the lookup of
  * the root sub-directory, or from ufs_extattr_start(); failures inside the
  * two sub-directories are masked and yield 0.
  */
 int
 ufs_extattr_autostart(struct mount *mp, struct thread *td)
 {
 	struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp;
 	int error;
 
 	/*
 	 * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
 	 * If so, automatically start EA's.
 	 */
 	error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp, td);
 	if (error) {
 		printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n",
 		    error);
 		return (error);
 	}
 
 	error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT,
 	    UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td);
 	if (error) {
 		/* rvp ref'd but now unlocked */
 		vrele(rvp);
 		return (error);
 	}
 	if (rvp == attr_dvp) {
 		/* Should never happen. */
 		vput(rvp);
 		vrele(attr_dvp);
 		return (EINVAL);
 	}
 	/* The lookup already unlocked rvp; only the reference remains. */
 	vrele(rvp);
 
 	if (attr_dvp->v_type != VDIR) {
 		/*
 		 * error is still 0 from the successful lookup, so this is
 		 * reported but not returned as a failure.
 		 */
 		printf("ufs_extattr_autostart: %s != VDIR\n",
 		    UFS_EXTATTR_FSROOTSUBDIR);
 		goto return_vput_attr_dvp;
 	}
 
 	error = ufs_extattr_start(mp, td);
 	if (error) {
 		printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n",
 		    error);
 		goto return_vput_attr_dvp;
 	}
 
 	/*
 	 * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
 	 * UFS_EXTATTR_SUBDIR_USER.  For each, iterate over the sub-directory,
 	 * and start with appropriate type.  Failures in either don't
 	 * result in an over-all failure.  attr_dvp is left locked to
 	 * be cleaned up on exit.
 	 */
 	error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT,
 	    UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td);
 	if (!error) {
 		error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
 		    attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td);
 		if (error)
 			printf("ufs_extattr_iterate_directory returned %d\n",
 			    error);
 		vput(attr_system_dvp);
 	}
 
 	error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT,
 	    UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td);
 	if (!error) {
 		error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
 		    attr_user_dvp, EXTATTR_NAMESPACE_USER, td);
 		if (error)
 			printf("ufs_extattr_iterate_directory returned %d\n",
 			    error);
 		vput(attr_user_dvp);
 	}
 
 	/* Mask startup failures in sub-directories. */
 	error = 0;
 
 return_vput_attr_dvp:
 	/* Drop both the lock and the reference on the root sub-directory. */
 	vput(attr_dvp);
 
 	return (error);
 }
 #endif /* !UFS_EXTATTR_AUTOSTART */
 
 /*
  * Stop extended attribute support on an FS: disable every attribute on
  * the per-mount list, clear the STARTED flag and drop the credential held
  * since start.  Returns EOPNOTSUPP if support was not started, else 0.
  */
 int
 ufs_extattr_stop(struct mount *mp, struct thread *td)
 {
 	struct ufs_extattr_list_entry *uele;
 	struct ufsmount *ump;
 	int error;
 
 	ump = VFSTOUFS(mp);
 	error = 0;
 
 	ufs_extattr_uepm_lock(ump, td);
 
 	if ((ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) == 0) {
 		error = EOPNOTSUPP;
 	} else {
 		/* Each disable removes the head entry from the list. */
 		for (;;) {
 			uele = LIST_FIRST(&ump->um_extattr.uepm_list);
 			if (uele == NULL)
 				break;
 			ufs_extattr_disable(ump, uele->uele_attrnamespace,
 			    uele->uele_attrname, td);
 		}
 
 		ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
 
 		crfree(ump->um_extattr.uepm_ucred);
 		ump->um_extattr.uepm_ucred = NULL;
 	}
 
 	ufs_extattr_uepm_unlock(ump, td);
 
 	return (error);
 }
 
 /*
  * Enable a named attribute on the specified filesystem; provide an
  * unlocked backing vnode to hold the attribute data.
  *
  * Reads the attribute file header from the start of backing_vnode,
  * validates its magic and version, and on success inserts a new entry at
  * the head of the per-mount attribute list.  Returns EINVAL for an invalid
  * name, a backing vnode that is not a regular file, or a short or invalid
  * header; EOPNOTSUPP if EA support is not started; EEXIST if the attribute
  * is already on the list; or the error from VOP_READ().  backing_vnode is
  * locked only for the duration of the read and is unlocked again on
  * return.
  */
 static int
 ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
     const char *attrname, struct vnode *backing_vnode, struct thread *td)
 {
 	struct ufs_extattr_list_entry *attribute;
 	struct iovec aiov;
 	struct uio auio;
 	int error = 0;
 
 	if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
 		return (EINVAL);
 	if (backing_vnode->v_type != VREG)
 		return (EINVAL);
 
 	/*
 	 * NOTE(review): with M_WAITOK the allocation presumably cannot
 	 * return NULL, which would make the check below dead code --
 	 * confirm against malloc(9).
 	 */
 	MALLOC(attribute, struct ufs_extattr_list_entry *,
 	    sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK);
 	if (attribute == NULL)
 		return (ENOMEM);
 
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
 		error = EOPNOTSUPP;
 		goto free_exit;
 	}
 
 	if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
 		error = EEXIST;
 		goto free_exit;
 	}
 
 	/*
 	 * NOTE(review): strncpy() leaves the destination unterminated when
 	 * attrname is UFS_EXTATTR_MAXEXTATTRNAME bytes or longer -- confirm
 	 * how uele_attrname is sized and how callers bound the name.
 	 */
 	strncpy(attribute->uele_attrname, attrname,
 	    UFS_EXTATTR_MAXEXTATTRNAME);
 	attribute->uele_attrnamespace = attrnamespace;
 	bzero(&attribute->uele_fileheader,
 	    sizeof(struct ufs_extattr_fileheader));
 	
 	attribute->uele_backing_vnode = backing_vnode;
 
 	/* Read the file header from offset 0 straight into the entry. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (caddr_t) &attribute->uele_fileheader;
 	aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
 	auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
 	auio.uio_offset = (off_t) 0;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 
-	vn_lock(backing_vnode, LK_SHARED | LK_RETRY, td);
+	vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
 	error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
 	    ump->um_extattr.uepm_ucred);
 
 	if (error)
 		goto unlock_free_exit;
 
 	/* A short read means the file cannot hold a complete header. */
 	if (auio.uio_resid != 0) {
 		printf("ufs_extattr_enable: malformed attribute header\n");
 		error = EINVAL;
 		goto unlock_free_exit;
 	}
 
 	if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
 		printf("ufs_extattr_enable: invalid attribute header magic\n");
 		error = EINVAL;
 		goto unlock_free_exit;
 	}
 
 	if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
 		printf("ufs_extattr_enable: incorrect attribute header "
 		    "version\n");
 		error = EINVAL;
 		goto unlock_free_exit;
 	}
 
 	ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable");
 	LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute,
 	    uele_entries);
 
 	VOP_UNLOCK(backing_vnode, 0, td);
 	return (0);
 
 unlock_free_exit:
 	VOP_UNLOCK(backing_vnode, 0, td);
 
 free_exit:
 	FREE(attribute, M_UFS_EXTATTR);
 	return (error);
 }
 
 /*
  * Disable a single named extended attribute on an FS: remove its entry
  * from the per-mount list, close the backing vnode and free the entry.
  * Returns EINVAL for an invalid name, ENOATTR if the attribute is not on
  * the list, and otherwise the result of vn_close().
  */
 static int
 ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct ufs_extattr_list_entry *uele;
 	int error = 0;
 
 	if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
 		return (EINVAL);
 
 	uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
 	if (!uele)
 		return (ENOATTR);
 
 	LIST_REMOVE(uele, uele_entries);
 
 	/*
 	 * NOTE(review): the backing vnode is locked and immediately
 	 * unlocked, presumably to wait out any holder of its lock before
 	 * the close -- confirm.
 	 */
-	vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY,
-	    td);
+	vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
 	ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable");
 	VOP_UNLOCK(uele->uele_backing_vnode, 0, td);
 	error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE,
 	    td->td_ucred, td);
 
 	/* The entry is freed whether or not the close succeeded. */
 	FREE(uele, M_UFS_EXTATTR);
 
 	return (error);
 }
 
 /*
  * VFS call to manage extended attributes in UFS.  If filename_vp is
  * non-NULL, it must be passed in locked, and regardless of errors in
  * processing, will be unlocked.
  */
 int
 ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
     int attrnamespace, const char *attrname, struct thread *td)
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	int error;
 
 	/*
 	 * Processes with privilege, but in jail, are not allowed to
 	 * configure extended attributes.
 	 */
 	error = priv_check(td, PRIV_UFS_EXTATTRCTL);
 	if (error) {
 		if (filename_vp != NULL)
 			VOP_UNLOCK(filename_vp, 0, td);
 		return (error);
 	}
 
 	switch(cmd) {
 	case UFS_EXTATTR_CMD_START:
 		if (filename_vp != NULL) {
 			VOP_UNLOCK(filename_vp, 0, td);
 			return (EINVAL);
 		}
 		if (attrname != NULL)
 			return (EINVAL);
 
 		error = ufs_extattr_start(mp, td);
 
 		return (error);
 		
 	case UFS_EXTATTR_CMD_STOP:
 		if (filename_vp != NULL) {
 			VOP_UNLOCK(filename_vp, 0, td);
 			return (EINVAL);
 		}
 		if (attrname != NULL)
 			return (EINVAL);
 
 		error = ufs_extattr_stop(mp, td);
 
 		return (error);
 
 	case UFS_EXTATTR_CMD_ENABLE:
 
 		if (filename_vp == NULL)
 			return (EINVAL);
 		if (attrname == NULL) {
 			VOP_UNLOCK(filename_vp, 0, td);
 			return (EINVAL);
 		}
 
 		/*
 		 * ufs_extattr_enable_with_open() will always unlock the
 		 * vnode, regardless of failure.
 		 */
 		ufs_extattr_uepm_lock(ump, td);
 		error = ufs_extattr_enable_with_open(ump, filename_vp,
 		    attrnamespace, attrname, td);
 		ufs_extattr_uepm_unlock(ump, td);
 
 		return (error);
 
 	case UFS_EXTATTR_CMD_DISABLE:
 
 		if (filename_vp != NULL) {
 			VOP_UNLOCK(filename_vp, 0, td);
 			return (EINVAL);
 		}
 		if (attrname == NULL)
 			return (EINVAL);
 
 		ufs_extattr_uepm_lock(ump, td);
 		error = ufs_extattr_disable(ump, attrnamespace, attrname,
 		    td);
 		ufs_extattr_uepm_unlock(ump, td);
 
 		return (error);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.  Wraps
  * ufs_extattr_get() in the per-mount attribute lock and returns its
  * result.
  */
 int
 ufs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct ufsmount *ump;
 	int rv;
 
 	ump = VFSTOUFS(ap->a_vp->v_mount);
 
 	ufs_extattr_uepm_lock(ump, ap->a_td);
 	rv = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
 	    ap->a_uio, ap->a_size, ap->a_cred, ap->a_td);
 	ufs_extattr_uepm_unlock(ump, ap->a_td);
 
 	return (rv);
 }
 
 /*
  * Real work associated with retrieving a named attribute--assumes that
  * the attribute lock has already been grabbed.
  *
  * Reads the per-inode data header for vp from the attribute's backing
  * file and, if the attribute is defined for the current inode generation,
  * reports its length through *size (when size is non-NULL) and copies up
  * to uio->uio_resid bytes of data into uio (when uio is non-NULL).
  *
  * Returns EOPNOTSUPP if EA support is not started, EINVAL for an empty
  * name, ENOATTR if the attribute is not enabled or not defined for this
  * inode, ENXIO for a non-zero uio offset or an inconsistent stored length,
  * or an error from extattr_check_cred() / VOP_READ().
  */
 static int
 ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
     struct uio *uio, size_t *size, struct ucred *cred, struct thread *td)
 {
 	struct ufs_extattr_list_entry *attribute;
 	struct ufs_extattr_header ueh;
 	struct iovec local_aiov;
 	struct uio local_aio;
 	struct mount *mp = vp->v_mount;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct inode *ip = VTOI(vp);
 	off_t base_offset;
 	size_t len, old_len;
 	int error = 0;
 
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
 		return (EOPNOTSUPP);
 
 	if (strlen(name) == 0)
 		return (EINVAL);
 
 	error = extattr_check_cred(vp, attrnamespace, cred, td, IREAD);
 	if (error)
 		return (error);
 
 	attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
 	if (!attribute)
 		return (ENOATTR);
 
 	/*
 	 * Allow only offsets of zero to encourage the read/replace
 	 * extended attribute semantic.  Otherwise we can't guarantee
 	 * atomicity, as we don't provide locks for extended attributes.
 	 */
 	if (uio != NULL && uio->uio_offset != 0)
 		return (ENXIO);
 
 	/*
 	 * Find base offset of header in file based on file header size, and
 	 * data header size + maximum data size, indexed by inode number.
 	 */
 	base_offset = sizeof(struct ufs_extattr_fileheader) +
 	    ip->i_number * (sizeof(struct ufs_extattr_header) +
 	    attribute->uele_fileheader.uef_size);
 
 	/*
 	 * Read in the data header to see if the data is defined, and if so
 	 * how much.
 	 */
 	bzero(&ueh, sizeof(struct ufs_extattr_header));
 	local_aiov.iov_base = (caddr_t) &ueh;
 	local_aiov.iov_len = sizeof(struct ufs_extattr_header);
 	local_aio.uio_iov = &local_aiov;
 	local_aio.uio_iovcnt = 1;
 	local_aio.uio_rw = UIO_READ;
 	local_aio.uio_segflg = UIO_SYSSPACE;
 	local_aio.uio_td = td;
 	local_aio.uio_offset = base_offset;
 	local_aio.uio_resid = sizeof(struct ufs_extattr_header);
 	
 	/*
 	 * Acquire locks.
 	 *
 	 * Don't need to get a lock on the backing file if the getattr is
 	 * being applied to the backing file, as the lock is already held.
 	 */
 	if (attribute->uele_backing_vnode != vp)
-		vn_lock(attribute->uele_backing_vnode, LK_SHARED |
-		    LK_RETRY, td);
+		vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
 
 	error = VOP_READ(attribute->uele_backing_vnode, &local_aio,
 	    IO_NODELOCKED, ump->um_extattr.uepm_ucred);
 	if (error)
 		goto vopunlock_exit;
 
 	/* Defined? */
 	if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) {
 		error = ENOATTR;
 		goto vopunlock_exit;
 	}
 
 	/* Valid for the current inode generation? */
 	if (ueh.ueh_i_gen != ip->i_gen) {
 		/*
 		 * The inode itself has a different generation number
 		 * than the attribute data.  For now, the best solution
 		 * is to coerce this to undefined, and let it get cleaned
 		 * up by the next write or extattrctl clean.
 		 *
 		 * The message below says "inode number", but the two values
 		 * printed are the stored and current generation numbers.
 		 */
 		printf("ufs_extattr_get (%s): inode number inconsistency (%d, %jd)\n",
 		    mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen);
 		error = ENOATTR;
 		goto vopunlock_exit;
 	}
 
 	/* Local size consistency check. */
 	if (ueh.ueh_len > attribute->uele_fileheader.uef_size) {
 		error = ENXIO;
 		goto vopunlock_exit;
 	}
 
 	/* Return full data size if caller requested it. */
 	if (size != NULL)
 		*size = ueh.ueh_len;
 
 	/* Return data if the caller requested it. */
 	if (uio != NULL) {
 		/* Allow for offset into the attribute data. */
 		uio->uio_offset = base_offset + sizeof(struct
 		    ufs_extattr_header);
 
 		/*
 		 * Figure out maximum to transfer -- use buffer size and
 		 * local data limit.
 		 */
 		len = MIN(uio->uio_resid, ueh.ueh_len);
 		old_len = uio->uio_resid;
 		uio->uio_resid = len;
 
 		error = VOP_READ(attribute->uele_backing_vnode, uio,
 		    IO_NODELOCKED, ump->um_extattr.uepm_ucred);
 		if (error)
 			goto vopunlock_exit;
 
 		/*
 		 * Restore the caller's residual: the original count minus
 		 * the bytes actually transferred.
 		 */
 		uio->uio_resid = old_len - (len - uio->uio_resid);
 	}
 
 vopunlock_exit:
 
 	/* Hide the backing-file offset from the caller again. */
 	if (uio != NULL)
 		uio->uio_offset = 0;
 
 	if (attribute->uele_backing_vnode != vp)
 		VOP_UNLOCK(attribute->uele_backing_vnode, 0, td);
 
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.  Wraps ufs_extattr_rm() in
  * the per-mount attribute lock and returns its result.
  */
 int
 ufs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct ufsmount *ump;
 	int rv;
 
 	ump = VFSTOUFS(ap->a_vp->v_mount);
 
 	ufs_extattr_uepm_lock(ump, ap->a_td);
 	rv = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
 	    ap->a_cred, ap->a_td);
 	ufs_extattr_uepm_unlock(ump, ap->a_td);
 
 	return (rv);
 }
 
 /*
  * Vnode operation to set a named attribute.  Wraps ufs_extattr_set() in
  * the per-mount attribute lock.  Returns EINVAL when no uio is supplied,
  * otherwise the result of ufs_extattr_set().
  */
 int
 ufs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct mount *mp = ap->a_vp->v_mount;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	int error;
 
 	/*
 	 * XXX: No longer a supported way to delete extended attributes.
 	 *
 	 * Reject the request before taking the per-mount lock so that this
 	 * early return cannot leave the lock held.
 	 */
 	if (ap->a_uio == NULL)
 		return (EINVAL);
 
 	ufs_extattr_uepm_lock(ump, ap->a_td);
 
 	error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
 	    ap->a_uio, ap->a_cred, ap->a_td);
 
 	ufs_extattr_uepm_unlock(ump, ap->a_td);
 
 	return (error);
 }
 
 /*
  * Real work associated with setting a vnode's extended attributes;
  * assumes that the attribute lock has already been grabbed.
  *
  * Writes a data header (length, in-use flag, inode generation) for vp's
  * slot in the attribute's backing file, followed by the caller's data
  * from uio.  uio must be non-NULL; it is dereferenced unconditionally.
  *
  * Returns EROFS on a read-only mount, EOPNOTSUPP if EA support is not
  * started, EINVAL for an invalid name, ENOATTR if the attribute is not
  * enabled, ENXIO for a non-zero offset, an over-long value or a short
  * header write, or an error from extattr_check_cred() / VOP_WRITE().
  */
 static int
 ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
     struct uio *uio, struct ucred *cred, struct thread *td)
 {
 	struct ufs_extattr_list_entry *attribute;
 	struct ufs_extattr_header ueh;
 	struct iovec local_aiov;
 	struct uio local_aio;
 	struct mount *mp = vp->v_mount;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct inode *ip = VTOI(vp);
 	off_t base_offset;
 	int error = 0, ioflag;
 
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
 		return (EOPNOTSUPP);
 	if (!ufs_extattr_valid_attrname(attrnamespace, name))
 		return (EINVAL);
 
 	error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE);
 	if (error)
 		return (error);
 
 	attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
 	if (!attribute)
 		return (ENOATTR);
 
 	/*
 	 * Early rejection of invalid offsets/length.
 	 * Reject: any offset but 0 (replace)
 	 *	 Any size greater than attribute size limit
  	 */
 	if (uio->uio_offset != 0 ||
 	    uio->uio_resid > attribute->uele_fileheader.uef_size)
 		return (ENXIO);
 
 	/*
 	 * Find base offset of header in file based on file header size, and
 	 * data header size + maximum data size, indexed by inode number.
 	 */
 	base_offset = sizeof(struct ufs_extattr_fileheader) +
 	    ip->i_number * (sizeof(struct ufs_extattr_header) +
 	    attribute->uele_fileheader.uef_size);
 
 	/*
 	 * Write out a data header for the data.
 	 */
 	ueh.ueh_len = uio->uio_resid;
 	ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE;
 	ueh.ueh_i_gen = ip->i_gen;
 	local_aiov.iov_base = (caddr_t) &ueh;
 	local_aiov.iov_len = sizeof(struct ufs_extattr_header);
 	local_aio.uio_iov = &local_aiov;
 	local_aio.uio_iovcnt = 1;
 	local_aio.uio_rw = UIO_WRITE;
 	local_aio.uio_segflg = UIO_SYSSPACE;
 	local_aio.uio_td = td;
 	local_aio.uio_offset = base_offset;
 	local_aio.uio_resid = sizeof(struct ufs_extattr_header);
 
 	/*
 	 * Acquire locks.
 	 *
 	 * Don't need to get a lock on the backing file if the setattr is
 	 * being applied to the backing file, as the lock is already held.
 	 */
 	if (attribute->uele_backing_vnode != vp)
-		vn_lock(attribute->uele_backing_vnode, 
-		    LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
 
 	/* The debug.ufs_extattr_sync sysctl makes the writes synchronous. */
 	ioflag = IO_NODELOCKED;
 	if (ufs_extattr_sync)
 		ioflag |= IO_SYNC;
 	error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
 	    ump->um_extattr.uepm_ucred);
 	if (error)
 		goto vopunlock_exit;
 
 	/* A partially written header is treated as a failure. */
 	if (local_aio.uio_resid != 0) {
 		error = ENXIO;
 		goto vopunlock_exit;
 	}
 
 	/*
 	 * Write out user data.
 	 */
 	uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
 
 	ioflag = IO_NODELOCKED;
 	if (ufs_extattr_sync)
 		ioflag |= IO_SYNC;
 	error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
 	    ump->um_extattr.uepm_ucred);
 
 vopunlock_exit:
 	/* Hide the backing-file offset from the caller again. */
 	uio->uio_offset = 0;
 
 	if (attribute->uele_backing_vnode != vp)
 		VOP_UNLOCK(attribute->uele_backing_vnode, 0, td);
 
 	return (error);
 }
 
 /*
  * Real work associated with removing an extended attribute from a vnode.
  * Assumes the attribute lock has already been grabbed.
  *
  * The attribute's per-inode header is read from the backing file; if it is
  * marked in use and matches the inode's generation, the header is rewritten
  * with its flags and length cleared.
  *
  * Returns 0 on success, or:
  *	EROFS		the mount is read-only
  *	EOPNOTSUPP	extended attributes are not started on this mount
  *	EINVAL		the attribute name is not valid for the namespace
  *	ENOATTR		no such attribute is configured, or it is not defined
  *			(or is stale) for this inode
  *	ENXIO		the header write was short
  * or any error returned by extattr_check_cred(), VOP_READ() or VOP_WRITE().
  */
 static int
 ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
     struct ucred *cred, struct thread *td)
 {
 	struct ufs_extattr_list_entry *attribute;
 	struct ufs_extattr_header ueh;
 	struct iovec local_aiov;
 	struct uio local_aio;
 	struct mount *mp = vp->v_mount;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct inode *ip = VTOI(vp);
 	off_t base_offset;
 	int error = 0, ioflag;
 
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
 		return (EOPNOTSUPP);
 	if (!ufs_extattr_valid_attrname(attrnamespace, name))
 		return (EINVAL);
 
 	error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE);
 	if (error)
 		return (error);
 
 	attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
 	if (!attribute)
 		return (ENOATTR);
 
 	/*
 	 * Find base offset of header in file based on file header size, and
 	 * data header size + maximum data size, indexed by inode number.
 	 */
 	base_offset = sizeof(struct ufs_extattr_fileheader) +
 	    ip->i_number * (sizeof(struct ufs_extattr_header) +
 	    attribute->uele_fileheader.uef_size);
 
 	/*
 	 * Check to see if currently defined.
 	 */
 	bzero(&ueh, sizeof(struct ufs_extattr_header));
 
 	local_aiov.iov_base = (caddr_t) &ueh;
 	local_aiov.iov_len = sizeof(struct ufs_extattr_header);
 	local_aio.uio_iov = &local_aiov;
 	local_aio.uio_iovcnt = 1;
 	local_aio.uio_rw = UIO_READ;
 	local_aio.uio_segflg = UIO_SYSSPACE;
 	local_aio.uio_td = td;
 	local_aio.uio_offset = base_offset;
 	local_aio.uio_resid = sizeof(struct ufs_extattr_header);
 
 	/*
 	 * Don't need to get the lock on the backing vnode if the vnode we're
 	 * modifying is it, as we already hold the lock.
 	 */
 	if (attribute->uele_backing_vnode != vp)
-		vn_lock(attribute->uele_backing_vnode,
-		    LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
 
 	error = VOP_READ(attribute->uele_backing_vnode, &local_aio,
 	    IO_NODELOCKED, ump->um_extattr.uepm_ucred);
 	if (error)
 		goto vopunlock_exit;
 
 	/* Defined? */
 	if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) {
 		error = ENOATTR;
 		goto vopunlock_exit;
 	}
 
 	/* Valid for the current inode generation? */
 	if (ueh.ueh_i_gen != ip->i_gen) {
 		/*
 		 * The inode itself has a different generation number than
 		 * the attribute data.  For now, the best solution is to
 		 * coerce this to undefined, and let it get cleaned up by
 		 * the next write or extattrctl clean.
 		 */
 		printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n",
 		    mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen);
 		error = ENOATTR;
 		goto vopunlock_exit;
 	}
 
 	/* Flag it as not in use. */
 	ueh.ueh_flags = 0;
 	ueh.ueh_len = 0;
 
 	/* Re-arm the uio: the read above consumed it. */
 	local_aiov.iov_base = (caddr_t) &ueh;
 	local_aiov.iov_len = sizeof(struct ufs_extattr_header);
 	local_aio.uio_iov = &local_aiov;
 	local_aio.uio_iovcnt = 1;
 	local_aio.uio_rw = UIO_WRITE;
 	local_aio.uio_segflg = UIO_SYSSPACE;
 	local_aio.uio_td = td;
 	local_aio.uio_offset = base_offset;
 	local_aio.uio_resid = sizeof(struct ufs_extattr_header);
 
 	ioflag = IO_NODELOCKED;
 	if (ufs_extattr_sync)
 		ioflag |= IO_SYNC;
 	error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
 	    ump->um_extattr.uepm_ucred);
 	if (error)
 		goto vopunlock_exit;
 
 	/* A short write leaves the header in an unknown state. */
 	if (local_aio.uio_resid != 0)
 		error = ENXIO;
 
 vopunlock_exit:
 	/*
 	 * Only drop the backing vnode lock if it was taken above.  When the
 	 * backing vnode is vp itself, the lock belongs to the caller and
 	 * must still be held on return.
 	 */
 	if (attribute->uele_backing_vnode != vp)
 		VOP_UNLOCK(attribute->uele_backing_vnode, 0, td);
 
 	return (error);
 }
 
 /*
  * Called by UFS when an inode is no longer active and should have its
  * attributes stripped.
  */
 void
 ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td)
 {
 	struct ufs_extattr_list_entry *uele;
 	struct mount *mp = vp->v_mount;
 	struct ufsmount *ump = VFSTOUFS(mp);
 
 	/*
 	 * In that case, we cannot lock. We should not have any active vnodes
 	 * on the fs if this is not yet initialized but is going to be, so
 	 * this can go unlocked.
 	 */
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
 		return;
 
 	ufs_extattr_uepm_lock(ump, td);
 
 	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
 		ufs_extattr_uepm_unlock(ump, td);
 		return;
 	}
 
 	LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
 		ufs_extattr_rm(vp, uele->uele_attrnamespace,
 		    uele->uele_attrname, NULL, td);
 
 	ufs_extattr_uepm_unlock(ump, td);
 }
 
 #endif /* !UFS_EXTATTR */
Index: head/sys/ufs/ufs/ufs_lookup.c
===================================================================
--- head/sys/ufs/ufs/ufs_lookup.c	(revision 175201)
+++ head/sys/ufs/ufs/ufs_lookup.c	(revision 175202)
@@ -1,1253 +1,1253 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.15 (Berkeley) 6/16/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ffs_broken_fixme.h"
 #include "opt_ufs.h"
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/namei.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 
 /*
  * When dirchk is non-zero, ufs_lookup() runs the full ufs_dirbadentry()
  * validation on every directory entry it examines.  It defaults to on for
  * DIAGNOSTIC kernels and off otherwise.
  */
 #ifdef DIAGNOSTIC
 static int	dirchk = 1;
 #else
 static int	dirchk = 0;
 #endif
 
 /* Export dirchk read-write as "dircheck" under the _debug sysctl node. */
 SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
 
 /* true if old FS format, i.e. the mount's mnt_maxsymlinklen is <= 0 */
 #define OFSFMT(vp)	((vp)->v_mount->mnt_maxsymlinklen <= 0)
 
 /*
  * Convert a component of a pathname into a pointer to a locked inode.
  * This is a very central and rather complicated routine.
  * If the filesystem is not maintained in a strict tree hierarchy,
  * this can result in a deadlock situation (see comments in code below).
  *
  * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
  * on whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it and the target of the pathname
  * exists, lookup returns both the target and its parent directory locked.
  * When creating or renaming and LOCKPARENT is specified, the target may
  * not be ".".  When deleting and LOCKPARENT is specified, the target may
  * be ".", but the caller must check to ensure it does a vrele and vput
  * instead of two vputs.
  *
  * This routine is actually used as VOP_CACHEDLOOKUP method, and the
  * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP
  * method.
  *
  * vfs_cache_lookup() performs the following for us:
  *	check that it is a directory
  *	check accessibility of directory
  *	check for modification attempts on read-only mounts
  *	if name found in cache
  *	    if at end of path and deleting or creating
  *		drop it
  *	     else
  *		return name.
  *	return VOP_CACHEDLOOKUP()
  *
  * Overall outline of ufs_lookup:
  *
  *	search for name in directory, to found or notfound
  * notfound:
  *	if creating, return locked directory, leaving info on available slots
  *	else return error
  * found:
  *	if at end of path and deleting, return information to allow delete
  *	if at end of path and rewriting (RENAME and LOCKPARENT), lock target
  *	  inode and return info to allow rewrite
  *	if not at end, add name to cache; if at end and neither creating
  *	  nor deleting, add name to cache
  */
 /*
  * Return values, as implemented below:
  *	0		the name was found; *vpp holds the resulting vnode
  *	EJUSTRETURN	the name was not found, but the operation (CREATE,
  *			RENAME, or DELETE of a whiteout with DOWHITEOUT) is on
  *			the last component and may proceed; dp->i_offset,
  *			dp->i_count and dp->i_endoff describe the slot to use
  *	ENOENT		the name was not found
  *	EISDIR		RENAME of "."
  *	EPERM		DELETE refused by the sticky-directory check
  * or any error returned by UFS_BLKATOFF(), VOP_ACCESS() or VFS_VGET().
  */
 int
 ufs_lookup(ap)
 	struct vop_cachedlookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vdp;		/* vnode for directory being searched */
 	struct inode *dp;		/* inode for directory being searched */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct direct *ep;		/* the current directory entry */
 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
 	enum {NONE, COMPACT, FOUND} slotstatus;
 	doff_t slotoffset;		/* offset of area with free space */
 	int slotsize;			/* size of area at slotoffset */
 	int slotfreespace;		/* amount of space free in slot */
 	int slotneeded;			/* size of the entry we're seeking */
 	int numdirpasses;		/* strategy for directory search */
 	doff_t endsearch;		/* offset to end directory search */
 	doff_t prevoff;			/* prev entry dp->i_offset */
 	struct vnode *pdp;		/* saved dp during symlink work */
 	struct vnode *tdp;		/* returned by VFS_VGET */
 	doff_t enduseful;		/* pointer past last used dir slot */
 	u_long bmask;			/* block offset mask */
 	int namlen, error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	struct thread *td = cnp->cn_thread;
 	ino_t saved_ino;
 
 	bp = NULL;
 	slotoffset = -1;
 /*
  *  XXX there was a soft-update diff about this I couldn't merge.
  * I think this was the equiv.
  */
 	*vpp = NULL;
 
 	vdp = ap->a_dvp;
 	dp = VTOI(vdp);
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 *
 	 * Suppress search for slots unless creating
 	 * file and at end of pathname, in which case
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 *
 	 * Starting with slotstatus == FOUND is what suppresses the slot
 	 * search: the search code below only runs while it is != FOUND.
 	 */
 	slotstatus = FOUND;
 	slotfreespace = slotsize = slotneeded = 0;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN)) {
 		slotstatus = NONE;
 		slotneeded = DIRECTSIZ(cnp->cn_namelen);
 	}
 	bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 
 #ifdef UFS_DIRHASH
 	/*
 	 * Use dirhash for fast operations on large directories. The logic
 	 * to determine whether to hash the directory is contained within
 	 * ufsdirhash_build(); a zero return means that it decided to hash
 	 * this directory and it successfully built up the hash table.
 	 */
 	if (ufsdirhash_build(dp) == 0) {
 		/* Look for a free slot if needed. */
 		enduseful = dp->i_size;
 		if (slotstatus != FOUND) {
 			slotoffset = ufsdirhash_findfree(dp, slotneeded,
 			    &slotsize);
 			if (slotoffset >= 0) {
 				slotstatus = COMPACT;
 				enduseful = ufsdirhash_enduseful(dp);
 				if (enduseful < 0)
 					enduseful = dp->i_size;
 			}
 		}
 		/* Look up the component. */
 		numdirpasses = 1;
 		entryoffsetinblock = 0; /* silence compiler warning */
 		switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
 		    &dp->i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
 		case 0:
 			/* Hit: jump into the linear-search match handling. */
 			ep = (struct direct *)((char *)bp->b_data +
 			    (dp->i_offset & bmask));
 			goto foundentry;
 		case ENOENT:
 			dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ);
 			goto notfound;
 		default:
 			/* Something failed; just do a linear search. */
 			break;
 		}
 	}
 #endif /* UFS_DIRHASH */
 	/*
 	 * If there is cached information on a previous search of
 	 * this directory, pick up where we last left off.
 	 * We cache only lookups as these are the most common
 	 * and have the greatest payoff. Caching CREATE has little
 	 * benefit as it usually must search the entire directory
 	 * to determine that the entry does not exist. Caching the
 	 * location of the last DELETE or RENAME has not reduced
 	 * profiling time and hence has been removed in the interest
 	 * of simplicity.
 	 */
 	if (nameiop != LOOKUP || dp->i_diroff == 0 ||
 	    dp->i_diroff >= dp->i_size) {
 		entryoffsetinblock = 0;
 		dp->i_offset = 0;
 		numdirpasses = 1;
 	} else {
 		/*
 		 * Resume at the cached offset.  The block only has to be
 		 * read here when that offset is not block-aligned; an aligned
 		 * offset is read by the search loop itself.
 		 */
 		dp->i_offset = dp->i_diroff;
 		if ((entryoffsetinblock = dp->i_offset & bmask) &&
 		    (error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)))
 			return (error);
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 	prevoff = dp->i_offset;
 	endsearch = roundup2(dp->i_size, DIRBLKSIZ);
 	enduseful = 0;
 
 searchloop:
 	while (dp->i_offset < endsearch) {
 		/*
 		 * If necessary, get the next directory block.
 		 */
 		if ((dp->i_offset & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			error =
 			    UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp);
 			if (error)
 				return (error);
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * If still looking for a slot, and at a DIRBLKSIZE
 		 * boundary, have to start looking for free space again.
 		 */
 		if (slotstatus == NONE &&
 		    (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) {
 			slotoffset = -1;
 			slotfreespace = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 * Full validation checks are slow, so we only check
 		 * enough to ensure forward progress through the
 		 * directory. Complete checks can be run by patching
 		 * "dirchk" to be true.
 		 */
 		ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock);
 		if (ep->d_reclen == 0 || ep->d_reclen >
 		    DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
 		    (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
 			int i;
 
 			/* Report it, then skip the rest of this DIRBLKSIZ chunk. */
 			ufs_dirbad(dp, dp->i_offset, "mangled entry");
 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
 			dp->i_offset += i;
 			entryoffsetinblock += i;
 			continue;
 		}
 
 		/*
 		 * If an appropriate sized slot has not yet been found,
 		 * check to see if one is available. Also accumulate space
 		 * in the current block so that we can determine if
 		 * compaction is viable.
 		 */
 		if (slotstatus != FOUND) {
 			int size = ep->d_reclen;
 
 			if (ep->d_ino != 0)
 				size -= DIRSIZ(OFSFMT(vdp), ep);
 			if (size > 0) {
 				if (size >= slotneeded) {
 					slotstatus = FOUND;
 					slotoffset = dp->i_offset;
 					slotsize = ep->d_reclen;
 				} else if (slotstatus == NONE) {
 					slotfreespace += size;
 					if (slotoffset == -1)
 						slotoffset = dp->i_offset;
 					if (slotfreespace >= slotneeded) {
 						slotstatus = COMPACT;
 						slotsize = dp->i_offset +
 						      ep->d_reclen - slotoffset;
 					}
 				}
 			}
 		}
 
 		/*
 		 * Check for a name match.
 		 */
 		if (ep->d_ino) {
 			/*
 			 * For old-format directories on little-endian hosts
 			 * the name length is taken from the d_type field.
 			 */
 #			if (BYTE_ORDER == LITTLE_ENDIAN)
 				if (OFSFMT(vdp))
 					namlen = ep->d_type;
 				else
 					namlen = ep->d_namlen;
 #			else
 				namlen = ep->d_namlen;
 #			endif
 			if (namlen == cnp->cn_namelen &&
 				(cnp->cn_nameptr[0] == ep->d_name[0]) &&
 			    !bcmp(cnp->cn_nameptr, ep->d_name,
 				(unsigned)namlen)) {
 #ifdef UFS_DIRHASH
 foundentry:
 #endif
 				/*
 				 * Save directory entry's inode number and
 				 * reclen in ndp->ni_ufs area, and release
 				 * directory buffer.
 				 * (In this code the values are stored in
 				 * dp->i_ino and dp->i_reclen.)
 				 *
 				 * A whiteout entry is instead treated as
 				 * "not found": its slot is recorded for
 				 * reuse, ISWHITEOUT is set, and any second
 				 * search pass is cancelled.
 				 */
 				if (vdp->v_mount->mnt_maxsymlinklen > 0 &&
 				    ep->d_type == DT_WHT) {
 					slotstatus = FOUND;
 					slotoffset = dp->i_offset;
 					slotsize = ep->d_reclen;
 					dp->i_reclen = slotsize;
 					enduseful = dp->i_size;
 					ap->a_cnp->cn_flags |= ISWHITEOUT;
 					numdirpasses--;
 					goto notfound;
 				}
 				dp->i_ino = ep->d_ino;
 				dp->i_reclen = ep->d_reclen;
 				goto found;
 			}
 		}
 		prevoff = dp->i_offset;
 		dp->i_offset += ep->d_reclen;
 		entryoffsetinblock += ep->d_reclen;
 		if (ep->d_ino)
 			enduseful = dp->i_offset;
 	}
 notfound:
 	/*
 	 * If we started in the middle of the directory and failed
 	 * to find our target, we must check the beginning as well.
 	 */
 	if (numdirpasses == 2) {
 		numdirpasses--;
 		dp->i_offset = 0;
 		endsearch = dp->i_diroff;
 		goto searchloop;
 	}
 	if (bp != NULL)
 		brelse(bp);
 	/*
 	 * If creating, and at end of pathname and current
 	 * directory has not been removed, then can consider
 	 * allowing file to be created.
 	 */
 	if ((nameiop == CREATE || nameiop == RENAME ||
 	     (nameiop == DELETE &&
 	      (ap->a_cnp->cn_flags & DOWHITEOUT) &&
 	      (ap->a_cnp->cn_flags & ISWHITEOUT))) &&
 	    (flags & ISLASTCN) && dp->i_effnlink != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread);
 		if (error)
 			return (error);
 		/*
 		 * Return an indication of where the new directory
 		 * entry should be put.  If we didn't find a slot,
 		 * then set dp->i_count to 0 indicating
 		 * that the new slot belongs at the end of the
 		 * directory. If we found a slot, then the new entry
 		 * can be put in the range from dp->i_offset to
 		 * dp->i_offset + dp->i_count.
 		 */
 		if (slotstatus == NONE) {
 			dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ);
 			dp->i_count = 0;
 			enduseful = dp->i_offset;
 		} else if (nameiop == DELETE) {
 			/*
 			 * Deleting a whiteout: i_count is set the same way
 			 * as for a found entry being deleted (distance back
 			 * to the previous entry, or 0 at a block boundary).
 			 */
 			dp->i_offset = slotoffset;
 			if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
 				dp->i_count = 0;
 			else
 				dp->i_count = dp->i_offset - prevoff;
 		} else {
 			dp->i_offset = slotoffset;
 			dp->i_count = slotsize;
 			if (enduseful < slotoffset + slotsize)
 				enduseful = slotoffset + slotsize;
 		}
 		dp->i_endoff = roundup2(enduseful, DIRBLKSIZ);
 		/*
 		 * We return with the directory locked, so that
 		 * the parameters we set up above will still be
 		 * valid if we actually decide to do a direnter().
 		 * We return ni_vp == NULL to indicate that the entry
 		 * does not currently exist; we leave a pointer to
 		 * the (locked) directory inode in ndp->ni_dvp.
 		 * The pathname buffer is saved so that the name
 		 * can be obtained later.
 		 *
 		 * NB - if the directory is unlocked, then this
 		 * information cannot be used.
 		 */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 * *vpp is still NULL here.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(vdp, *vpp, cnp);
 	return (ENOENT);
 
 found:
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
 	 * Check that directory length properly reflects presence
 	 * of this entry.
 	 */
 	if (dp->i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) {
 		ufs_dirbad(dp, dp->i_offset, "i_size too small");
 		dp->i_size = dp->i_offset + DIRSIZ(OFSFMT(vdp), ep);
 		DIP_SET(dp, i_size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	/* ep points into bp's data and is not used past this point. */
 	brelse(bp);
 
 	/*
 	 * Found component in pathname.
 	 * If the final component of path name, save information
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
 		dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1);
 
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		/*
 		 * Write access to directory required to delete files.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread);
 		if (error)
 			return (error);
 		/*
 		 * Return pointer to current entry in dp->i_offset,
 		 * and distance past previous entry (if there
 		 * is a previous entry in this block) in dp->i_count.
 		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
 		 */
 		if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
 		/* Deleting ".": return the directory itself with an extra ref. */
 		if (dp->i_number == dp->i_ino) {
 			VREF(vdp);
 			*vpp = vdp;
 			return (0);
 		}
 		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino,
 		    LK_EXCLUSIVE, &tdp)) != 0)
 			return (error);
 		/*
 		 * If directory is "sticky", then user must own
 		 * the directory, or the file in it, else she
 		 * may not delete it (unless she's root). This
 		 * implements append-only directories.
 		 */
 		if ((dp->i_mode & ISVTX) &&
 		    VOP_ACCESS(vdp, VADMIN, cred, cnp->cn_thread) &&
 		    VOP_ACCESS(tdp, VADMIN, cred, cnp->cn_thread)) {
 			vput(tdp);
 			return (EPERM);
 		}
 		*vpp = tdp;
 		return (0);
 	}
 
 	/*
 	 * If rewriting (RENAME), return the inode and the
 	 * information required to rewrite the present directory
 	 * Must get inode of directory entry to verify it's a
 	 * regular file, or empty directory.
 	 */
 	if (nameiop == RENAME && (flags & ISLASTCN)) {
 		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)))
 			return (error);
 		/*
 		 * Careful about locking second inode.
 		 * This can only occur if the target is ".".
 		 */
 		if (dp->i_number == dp->i_ino)
 			return (EISDIR);
 		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino,
 		    LK_EXCLUSIVE, &tdp)) != 0)
 			return (error);
 		*vpp = tdp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to ensure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 * There is a potential race condition here if both the current
 	 * and parent directories are removed before the VFS_VGET for the
 	 * inode associated with ".." returns.  We hope that this occurs
 	 * infrequently since we cannot avoid this race condition without
 	 * implementing a sophisticated deadlock detection algorithm.
 	 * Note also that this simple deadlock detection scheme will not
 	 * work if the filesystem has any hard links other than ".."
 	 * that point backwards in the directory structure.
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
 		/*
 		 * Copy the inode number before unlocking, then relock the
 		 * directory whether or not VFS_VGET() succeeded.
 		 */
 		saved_ino = dp->i_ino;
 		VOP_UNLOCK(pdp, 0, td);	/* race to get the inode */
 		error = VFS_VGET(pdp->v_mount, saved_ino,
 		    cnp->cn_lkflags, &tdp);
-		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
 		if (error)
 			return (error);
 		*vpp = tdp;
 	} else if (dp->i_number == dp->i_ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
 		*vpp = vdp;
 	} else {
 		error = VFS_VGET(pdp->v_mount, dp->i_ino,
 		    cnp->cn_lkflags, &tdp);
 		if (error)
 			return (error);
 		*vpp = tdp;
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 void
 ufs_dirbad(ip, offset, how)
 	struct inode *ip;
 	doff_t offset;
 	char *how;
 {
 	struct mount *mp;
 
 	mp = ITOV(ip)->v_mount;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		panic("ufs_dirbad: %s: bad dir ino %lu at offset %ld: %s",
 		    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
 	else
 		(void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
 		    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
 }
 
 /*
  * Sanity-check a single directory entry.  Returns 0 if the entry looks
  * valid and 1 otherwise.  An entry is rejected when:
  *	its record length is not a multiple of 4
  *	it extends past the end of its DIRBLKSIZ block
  *	its record length is too small to contain the entry
  *	its name is longer than MAXNAMLEN
  *	it is in use (d_ino != 0) and its name contains a NUL before the
  *	  advertised length, or is not NUL-terminated at that length
  */
 int
 ufs_dirbadentry(dp, ep, entryoffsetinblock)
 	struct vnode *dp;
 	struct direct *ep;
 	int entryoffsetinblock;
 {
 	int idx, namlen;
 
 	/*
 	 * For old-format directories on little-endian hosts the name length
 	 * is taken from the d_type field.
 	 */
 #	if (BYTE_ORDER == LITTLE_ENDIAN)
 		namlen = OFSFMT(dp) ? ep->d_type : ep->d_namlen;
 #	else
 		namlen = ep->d_namlen;
 #	endif
 
 	/* Structural checks on the record itself. */
 	if ((ep->d_reclen & 0x3) != 0 ||
 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
 	    ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > MAXNAMLEN) {
 		printf("First bad\n");
 		return (1);
 	}
 
 	/* The name of an unused entry is not examined. */
 	if (ep->d_ino == 0)
 		return (0);
 
 	/* No embedded NUL is allowed within the advertised name length. */
 	for (idx = 0; idx < namlen; idx++) {
 		if (ep->d_name[idx] == '\0') {
 			printf("Second bad\n");
 			return (1);
 		}
 	}
 
 	/* The byte following the name must be the terminating NUL. */
 	return (ep->d_name[idx] != '\0');
 }
 
 /*
  * Fill in a new directory entry after a call to namei.  The name comes
  * from the componentname cnp that namei left behind, and the entry is
  * made to refer to inode ip.  The result is written to *newdirp; d_reclen
  * is not set here.
  */
 void
 ufs_makedirentry(ip, cnp, newdirp)
 	struct inode *ip;
 	struct componentname *cnp;
 	struct direct *newdirp;
 {
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & SAVENAME) == 0)
 		panic("ufs_makedirentry: missing name");
 #endif
 	newdirp->d_ino = ip->i_number;
 	newdirp->d_namlen = cnp->cn_namelen;
 	/* Copy the name together with its terminating NUL. */
 	bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1);
 	if (OFSFMT(ITOV(ip))) {
 		/*
 		 * Old format: no file type is recorded.  On little-endian
 		 * hosts the name length is moved into d_type and d_namlen
 		 * is cleared.
 		 */
 #		if (BYTE_ORDER == LITTLE_ENDIAN)
 			newdirp->d_type = newdirp->d_namlen;
 			newdirp->d_namlen = 0;
 #		else
 			newdirp->d_type = 0;
 #		endif
 	} else
 		newdirp->d_type = IFTODT(ip->i_mode);
 }
 
 /*
  * Write a directory entry after a call to namei, using the parameters
  * that it left in nameidata. The argument dirp is the new directory
  * entry contents. Dvp is a pointer to the directory to be written,
  * which was left locked by namei. Remaining parameters (dp->i_offset, 
  * dp->i_count) indicate how the space for the new entry is to be obtained.
  * Non-null bp indicates that a directory is being created (for the
  * soft dependency code).
  */
 int
 ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 	struct vnode *dvp;
 	struct vnode *tvp;
 	struct direct *dirp;
 	struct componentname *cnp;
 	struct buf *newdirbp;
 {
 	struct ucred *cr;
 	struct thread *td;
 	int newentrysize;
 	struct inode *dp;
 	struct buf *bp;
 	u_int dsize;
 	struct direct *ep, *nep;
 	int error, ret, blkoff, loc, spacefree, flags, namlen;
 	char *dirbuf;
 
 	td = curthread;	/* XXX */
 	cr = td->td_ucred;
 
 	dp = VTOI(dvp);
 	newentrysize = DIRSIZ(OFSFMT(dvp), dirp);
 
 	if (dp->i_count == 0) {
 		/*
 		 * If dp->i_count is 0, then namei could find no
 		 * space in the directory. Here, dp->i_offset will
 		 * be on a directory block boundary and we will write the
 		 * new entry into a fresh block.
 		 */
 		if (dp->i_offset & (DIRBLKSIZ - 1))
 			panic("ufs_direnter: newblk");
 		flags = BA_CLRBUF;
 		if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp))
 			flags |= IO_SYNC;
 #ifdef QUOTA
 		if ((error = getinoquota(dp)) != 0) {
 			if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
 				bdwrite(newdirbp);
 			return (error);
 		}
 #endif
 		if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ,
 		    cr, flags, &bp)) != 0) {
 			if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
 				bdwrite(newdirbp);
 			return (error);
 		}
 		dp->i_size = dp->i_offset + DIRBLKSIZ;
 		DIP_SET(dp, i_size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
 		vnode_pager_setsize(dvp, (u_long)dp->i_size);
 		dirp->d_reclen = DIRBLKSIZ;
 		blkoff = dp->i_offset &
 		    (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1);
 		bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize);
 #ifdef UFS_DIRHASH
 		if (dp->i_dirhash != NULL) {
 			ufsdirhash_newblk(dp, dp->i_offset);
 			ufsdirhash_add(dp, dirp, dp->i_offset);
 			ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
 			    dp->i_offset);
 		}
 #endif
 		if (DOINGSOFTDEP(dvp)) {
 			/*
 			 * Ensure that the entire newly allocated block is a
 			 * valid directory so that future growth within the
 			 * block does not have to ensure that the block is
 			 * written before the inode.
 			 */
 			blkoff += DIRBLKSIZ;
 			while (blkoff < bp->b_bcount) {
 				((struct direct *)
 				   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
 				blkoff += DIRBLKSIZ;
 			}
 			if (softdep_setup_directory_add(bp, dp, dp->i_offset,
 			    dirp->d_ino, newdirbp, 1) == 0) {
 				bdwrite(bp);
 				return (UFS_UPDATE(dvp, 0));
 			}
 			/* We have just allocated a directory block in an
 			 * indirect block. Rather than tracking when it gets
 			 * claimed by the inode, we simply do a VOP_FSYNC
 			 * now to ensure that it is there (in case the user
 			 * does a future fsync). Note that we have to unlock
 			 * the inode for the entry that we just entered, as
 			 * the VOP_FSYNC may need to lock other inodes which
 			 * can lead to deadlock if we also hold a lock on
 			 * the newly entered node.
 			 */
 			if ((error = bwrite(bp)))
 				return (error);
 			if (tvp != NULL)
 				VOP_UNLOCK(tvp, 0, td);
 			error = VOP_FSYNC(dvp, MNT_WAIT, td);
 			if (tvp != NULL)
-				vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td);
+				vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
 			return (error);
 		}
 		if (DOINGASYNC(dvp)) {
 			bdwrite(bp);
 			return (UFS_UPDATE(dvp, 0));
 		}
 		error = bwrite(bp);
 		ret = UFS_UPDATE(dvp, 1);
 		if (error == 0)
 			return (ret);
 		return (error);
 	}
 
 	/*
 	 * If dp->i_count is non-zero, then namei found space for the new
 	 * entry in the range dp->i_offset to dp->i_offset + dp->i_count
 	 * in the directory. To use this space, we may have to compact
 	 * the entries located there, by copying them together towards the
 	 * beginning of the block, leaving the free space in one usable
 	 * chunk at the end.
 	 */
 
 	/*
 	 * Increase size of directory if entry eats into new space.
 	 * This should never push the size past a new multiple of
 	 * DIRBLKSIZE.
 	 *
 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
 	 */
 	if (dp->i_offset + dp->i_count > dp->i_size) {
 		dp->i_size = dp->i_offset + dp->i_count;
 		DIP_SET(dp, i_size, dp->i_size);
 	}
 	/*
 	 * Get the block containing the space for the new directory entry.
 	 */
 	error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp);
 	if (error) {
 		if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
 			bdwrite(newdirbp);
 		return (error);
 	}
 	/*
 	 * Find space for the new entry. In the simple case, the entry at
 	 * offset base will have the space. If it does not, then namei
 	 * arranged that compacting the region dp->i_offset to
 	 * dp->i_offset + dp->i_count would yield the space.
 	 */
 	ep = (struct direct *)dirbuf;
 	dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0;
 	spacefree = ep->d_reclen - dsize;
 	for (loc = ep->d_reclen; loc < dp->i_count; ) {
 		nep = (struct direct *)(dirbuf + loc);
 
 		/* Trim the existing slot (NB: dsize may be zero). */
 		ep->d_reclen = dsize;
 		ep = (struct direct *)((char *)ep + dsize);
 
 		/* Read nep->d_reclen now as the bcopy() may clobber it. */
 		loc += nep->d_reclen;
 		if (nep->d_ino == 0) {
 			/*
 			 * A mid-block unused entry. Such entries are
 			 * never created by the kernel, but fsck_ffs
 			 * can create them (and it doesn't fix them).
 			 *
 			 * Add up the free space, and initialise the
 			 * relocated entry since we don't bcopy it.
 			 */
 			spacefree += nep->d_reclen;
 			ep->d_ino = 0;
 			dsize = 0;
 			continue;
 		}
 		dsize = DIRSIZ(OFSFMT(dvp), nep);
 		spacefree += nep->d_reclen - dsize;
 #ifdef UFS_DIRHASH
 		if (dp->i_dirhash != NULL)
 			ufsdirhash_move(dp, nep,
 			    dp->i_offset + ((char *)nep - dirbuf),
 			    dp->i_offset + ((char *)ep - dirbuf));
 #endif
 		if (DOINGSOFTDEP(dvp))
 			softdep_change_directoryentry_offset(dp, dirbuf,
 			    (caddr_t)nep, (caddr_t)ep, dsize); 
 		else
 			bcopy((caddr_t)nep, (caddr_t)ep, dsize);
 	}
 	/*
 	 * Here, `ep' points to a directory entry containing `dsize' in-use
 	 * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
 	 * then the entry is completely unused (dsize == 0). The value
 	 * of ep->d_reclen is always indeterminate.
 	 *
 	 * Update the pointer fields in the previous entry (if any),
 	 * copy in the new entry, and write out the block.
 	 */
 #	if (BYTE_ORDER == LITTLE_ENDIAN)
 		if (OFSFMT(dvp))
 			namlen = ep->d_type;
 		else
 			namlen = ep->d_namlen;
 #	else
 		namlen = ep->d_namlen;
 #	endif
 	if (ep->d_ino == 0 ||
 	    (ep->d_ino == WINO && namlen == dirp->d_namlen &&
 	     bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
 		if (spacefree + dsize < newentrysize)
 			panic("ufs_direnter: compact1");
 		dirp->d_reclen = spacefree + dsize;
 	} else {
 		if (spacefree < newentrysize)
 			panic("ufs_direnter: compact2");
 		dirp->d_reclen = spacefree;
 		ep->d_reclen = dsize;
 		ep = (struct direct *)((char *)ep + dsize);
 	}
 #ifdef UFS_DIRHASH
 	if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
 	    dirp->d_reclen == spacefree))
 		ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf));
 #endif
 	bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize);
 #ifdef UFS_DIRHASH
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_checkblock(dp, dirbuf -
 		    (dp->i_offset & (DIRBLKSIZ - 1)),
 		    dp->i_offset & ~(DIRBLKSIZ - 1));
 #endif
 
 	if (DOINGSOFTDEP(dvp)) {
 		(void) softdep_setup_directory_add(bp, dp,
 		    dp->i_offset + (caddr_t)ep - dirbuf,
 		    dirp->d_ino, newdirbp, 0);
 		bdwrite(bp);
 	} else {
 		if (DOINGASYNC(dvp)) {
 			bdwrite(bp);
 			error = 0;
 		} else {
 			error = bwrite(bp);
 		}
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * If all went well, and the directory can be shortened, proceed
 	 * with the truncation. Note that we have to unlock the inode for
 	 * the entry that we just entered, as the truncation may need to
 	 * lock other inodes which can lead to deadlock if we also hold a
 	 * lock on the newly entered node.
 	 */
 	if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
 		if (tvp != NULL)
 			VOP_UNLOCK(tvp, 0, td);
 #ifdef UFS_DIRHASH
 		if (dp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(dp, dp->i_endoff);
 #endif
 		(void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff,
 		    IO_NORMAL | IO_SYNC, cr, td);
 		if (tvp != NULL)
-			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td);
+			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
 	}
 	return (error);
 }
 
 /*
  * Remove a directory entry after a call to namei, using
  * the parameters which it left in nameidata. The entry
  * dp->i_offset contains the offset into the directory of the
  * entry to be eliminated.  The dp->i_count field contains the
  * size of the previous record in the directory.  If this
  * is 0, the first entry is being deleted, so we need only
  * zero the inode number to mark the entry as free.  If the
  * entry is not the first in the directory, we must reclaim
  * the space of the now empty record by adding the record size
  * to the size of the previous entry.
  *
  * Parameters:
  *	dvp	- vnode of the directory that holds the entry.
  *	ip	- inode the entry refers to; may be NULL, in which case
  *		  no link counts are adjusted.
  *	flags	- if DOWHITEOUT is set the entry is converted into a
  *		  whiteout instead of being freed.
  *	isrmdir	- handed through unchanged to softdep_setup_remove().
  *
  * Returns 0 on success, otherwise the error reported by
  * UFS_BLKATOFF() or bwrite().
  */
 int
 ufs_dirremove(dvp, ip, flags, isrmdir)
 	struct vnode *dvp;
 	struct inode *ip;
 	int flags;
 	int isrmdir;
 {
 	struct inode *dp;
 	struct direct *ep;
 	struct buf *bp;
 	int error;
 
 	dp = VTOI(dvp);
 
 	if (flags & DOWHITEOUT) {
 		/*
 		 * Whiteout entry: set d_ino to WINO.
 		 * The entry itself (at dp->i_offset) is rewritten in place,
 		 * so no space is reclaimed and dirhash is not touched.
 		 */
 		if ((error =
 		    UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0)
 			return (error);
 		ep->d_ino = WINO;
 		ep->d_type = DT_WHT;
 		goto out;
 	}
 
 	/*
 	 * Map the block at the *previous* entry (i_offset - i_count); when
 	 * i_count is zero this is the entry being removed itself.
 	 */
 	if ((error = UFS_BLKATOFF(dvp,
 	    (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0)
 		return (error);
 #ifdef UFS_DIRHASH
 	/*
 	 * Remove the dirhash entry. This is complicated by the fact
 	 * that `ep' is the previous entry when dp->i_count != 0.
 	 */
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_remove(dp, (dp->i_count == 0) ? ep :
 		   (struct direct *)((char *)ep + ep->d_reclen), dp->i_offset);
 #endif
 	if (dp->i_count == 0) {
 		/*
 		 * First entry in block: set d_ino to zero.
 		 */
 		ep->d_ino = 0;
 	} else {
 		/*
 		 * Collapse new free space into previous entry.
 		 */
 		ep->d_reclen += dp->i_reclen;
 	}
 #ifdef UFS_DIRHASH
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_checkblock(dp, (char *)ep -
 		    ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)),
 		    dp->i_offset & ~(DIRBLKSIZ - 1));
 #endif
 out:
 	/*
 	 * Both the whiteout path and the normal path arrive here with `bp'
 	 * holding the modified directory block, which still has to be
 	 * written out.
 	 */
 	if (DOINGSOFTDEP(dvp)) {
 		/*
 		 * Only the effective link count is dropped in this branch;
 		 * i_nlink is not modified here.
 		 */
 		if (ip) {
 			ip->i_effnlink--;
 			softdep_change_linkcnt(ip);
 			softdep_setup_remove(bp, dp, ip, isrmdir);
 		}
 		/* Write synchronously only when softdep_slowdown() asks for it. */
 		if (softdep_slowdown(dvp)) {
 			error = bwrite(bp);
 		} else {
 			bdwrite(bp);
 			error = 0;
 		}
 	} else {
 		if (ip) {
 			ip->i_effnlink--;
 			ip->i_nlink--;
 			DIP_SET(ip, i_nlink, ip->i_nlink);
 			ip->i_flag |= IN_CHANGE;
 		}
 		/*
 		 * Whiteouts and removal of the first entry (i_count == 0)
 		 * are always written synchronously; only an async mount
 		 * removing a non-first entry gets a delayed write.
 		 */
 		if (flags & DOWHITEOUT)
 			error = bwrite(bp);
 		else if (DOINGASYNC(dvp) && dp->i_count != 0) {
 			bdwrite(bp);
 			error = 0;
 		} else
 			error = bwrite(bp);
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * If the last named reference to a snapshot goes away,
 	 * drop its snapshot reference so that it will be reclaimed
 	 * when last open reference goes away.
 	 */
 #if defined(FFS) || defined(IFS)
 	if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_effnlink == 0)
 		ffs_snapgone(ip);
 #endif
 	return (error);
 }
 
 /*
  * Rewrite an existing directory entry to point at the inode
  * supplied.  The parameters describing the directory entry are
  * set up by a call to namei.
  *
  * Parameters:
  *	dp	- inode of the directory; dp->i_offset locates the entry.
  *	oip	- inode the entry currently refers to; it loses one link.
  *	newinum	- inode number stored into the entry.
  *	newtype	- d_type stored into the entry (skipped for the old
  *		  directory format, see OFSFMT()).
  *	isrmdir	- handed through to softdep_setup_directory_change().
  *
  * Returns 0 on success, otherwise the error from UFS_BLKATOFF()
  * or bwrite().
  */
 int
 ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
 	struct inode *dp, *oip;
 	ino_t newinum;
 	int newtype;
 	int isrmdir;
 {
 	struct buf *bp;
 	struct direct *ep;
 	struct vnode *vdp = ITOV(dp);
 	int error;
 
 	error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
 	if (error)
 		return (error);
 	ep->d_ino = newinum;
 	if (!OFSFMT(vdp))
 		ep->d_type = newtype;
 	/* The effective link count drops in both the softdep and plain case. */
 	oip->i_effnlink--;
 	if (DOINGSOFTDEP(vdp)) {
 		/* i_nlink is not modified in this branch. */
 		softdep_change_linkcnt(oip);
 		softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
 		bdwrite(bp);
 	} else {
 		oip->i_nlink--;
 		DIP_SET(oip, i_nlink, oip->i_nlink);
 		oip->i_flag |= IN_CHANGE;
 		/* Delayed write on async mounts, synchronous otherwise. */
 		if (DOINGASYNC(vdp)) {
 			bdwrite(bp);
 			error = 0;
 		} else {
 			error = bwrite(bp);
 		}
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * If the last named reference to a snapshot goes away,
 	 * drop its snapshot reference so that it will be reclaimed
 	 * when last open reference goes away.
 	 */
 #if defined(FFS) || defined(IFS)
 	if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0)
 		ffs_snapgone(oip);
 #endif
 	return (error);
 }
 
 /*
  * Decide whether a directory contains nothing but "." and "..".
  * The inode supplied must be locked by the caller.
  *
  * Each entry header is read into a struct dirtemplate, which is not
  * exactly the right type but is closer to what we need than a full
  * struct direct.
  *
  * Returns 1 when the directory is empty and 0 otherwise; a read error,
  * a short read or a zero-length record is also reported as 0.
  *
  * NB: does not handle corrupted directories.
  */
 int
 ufs_dirempty(ip, parentino, cred)
 	struct inode *ip;
 	ino_t parentino;
 	struct ucred *cred;
 {
 	struct dirtemplate dbuf;
 	struct direct *dp;
 	doff_t off;
 	int error, count, namlen;
 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
 
 	dp = (struct direct *)&dbuf;
 	off = 0;
 	while (off < ip->i_size) {
 		error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ,
 		    off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred,
 		    NOCRED, &count, (struct thread *)0);
 		/*
 		 * MINDIRSIZ bytes were requested, so a non-zero residual
 		 * can only mean we ran off the end of the file.
 		 */
 		if (error != 0 || count != 0)
 			return (0);
 		/* A zero record length would make us spin forever. */
 		if (dp->d_reclen == 0)
 			return (0);
 		/* Free slots and whiteouts do not count as content. */
 		if (dp->d_ino != 0 && dp->d_ino != WINO) {
 #			if (BYTE_ORDER == LITTLE_ENDIAN)
 				if (OFSFMT(ITOV(ip)))
 					namlen = dp->d_type;
 				else
 					namlen = dp->d_namlen;
 #			else
 				namlen = dp->d_namlen;
 #			endif
 			/* Anything other than "." or ".." means not empty. */
 			if (namlen > 2 || dp->d_name[0] != '.')
 				return (0);
 			/*
 			 * namlen is now at most 2: accept "." naming this
 			 * directory, or ".." naming the expected parent.
 			 */
 			if (!(namlen == 1 && dp->d_ino == ip->i_number) &&
 			    !(dp->d_name[1] == '.' && dp->d_ino == parentino))
 				return (0);
 		}
 		off += dp->d_reclen;
 	}
 	return (1);
 }
 
 /*
  * Check if source directory is in the path of the target directory.
  * Target is supplied locked, source is unlocked.
  * The target is always vput before returning.
  */
 int
 ufs_checkpath(source, target, cred)
 	struct inode *source, *target;
 	struct ucred *cred;
 {
 	struct vnode *vp;
 	int error, namlen;
 	ino_t rootino;
 	struct dirtemplate dirbuf;
 
 	vp = ITOV(target);
 	if (target->i_number == source->i_number) {
 		error = EEXIST;
 		goto out;
 	}
 	rootino = ROOTINO;
 	error = 0;
 	if (target->i_number == rootino)
 		goto out;
 
 	for (;;) {
 		if (vp->v_type != VDIR) {
 			error = ENOTDIR;
 			break;
 		}
 		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf,
 			sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
 			IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0,
 			(struct thread *)0);
 		if (error != 0)
 			break;
 #		if (BYTE_ORDER == LITTLE_ENDIAN)
 			if (OFSFMT(vp))
 				namlen = dirbuf.dotdot_type;
 			else
 				namlen = dirbuf.dotdot_namlen;
 #		else
 			namlen = dirbuf.dotdot_namlen;
 #		endif
 		if (namlen != 2 ||
 		    dirbuf.dotdot_name[0] != '.' ||
 		    dirbuf.dotdot_name[1] != '.') {
 			error = ENOTDIR;
 			break;
 		}
 		if (dirbuf.dotdot_ino == source->i_number) {
 			error = EINVAL;
 			break;
 		}
 		if (dirbuf.dotdot_ino == rootino)
 			break;
 		vput(vp);
 		error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino,
 		    LK_EXCLUSIVE, &vp);
 		if (error) {
 			vp = NULL;
 			break;
 		}
 	}
 
 out:
 	if (error == ENOTDIR)
 		printf("checkpath: .. not a directory\n");
 	if (vp != NULL)
 		vput(vp);
 	return (error);
 }
Index: head/sys/ufs/ufs/ufs_quota.c
===================================================================
--- head/sys/ufs/ufs/ufs_quota.c	(revision 175201)
+++ head/sys/ufs/ufs/ufs_quota.c	(revision 175202)
@@ -1,1449 +1,1447 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Robert Elz at The University of Melbourne.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_quota.c	8.5 (Berkeley) 5/20/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 
 /*
  * Policy knob, exported as a read-write integer sysctl under
  * _security_bsd: when non-zero, getquota() skips the privilege check
  * for quotas that belong to other users or groups.
  */
 static int unprivileged_get_quota = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW,
     &unprivileged_get_quota, 0,
     "Unprivileged processes may retrieve quotas for other uids and gids");
 
 static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries");
 
 /*
  * Quota name to error message mapping.
  * Indexed by quota type; used when printing warnings to the user.
  */
 static char *quotatypes[] = INITQFNAMES;
 
 /* Forward declarations for the file-local helpers. */
 static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *);
 static int chkiqchg(struct inode *, int, struct ucred *, int, int *);
 static int dqget(struct vnode *,
 	u_long, struct ufsmount *, int, struct dquot **);
 static int dqsync(struct vnode *, struct dquot *);
 static void dqflush(struct vnode *);
 static int quotaoff1(struct thread *td, struct mount *mp, int type);
 static int quotaoff_inchange(struct thread *td, struct mount *mp, int type);
 
 /* Consistency checks that are only compiled into DIAGNOSTIC kernels. */
 #ifdef DIAGNOSTIC
 static void dqref(struct dquot *);
 static void chkdquot(struct inode *);
 #endif
 
 /*
  * Set up the quotas for an inode.
  *
  * This routine completely defines the semantics of quotas.
  * If other criterion want to be used to establish quotas, the
  * MAXQUOTAS value in quotas.h should be increased, and the
  * additional dquots set up here.
  */
 int
 getinoquota(ip)
 	struct inode *ip;
 {
 	struct ufsmount *ump;
 	struct vnode *vp;
 	int error;
 
 	vp = ITOV(ip);
 
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * snapshot and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	/*
 	 * XXX: Turn off quotas for files with a negative UID or GID.
 	 * This prevents the creation of 100GB+ quota files.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return (0);
 	ump = VFSTOUFS(vp->v_mount);
 	/*
 	 * Set up the user quota based on file uid.
 	 * EINVAL means that quotas are not enabled.
 	 */
 	if ((error =
 		dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
 	    error != EINVAL)
 		return (error);
 	/*
 	 * Set up the group quota based on file gid.
 	 * EINVAL means that quotas are not enabled.
 	 */
 	if ((error =
 		dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
 	    error != EINVAL)
 		return (error);
 	return (0);
 }
 
 /*
  * Update disk usage, and take corrective action.
  *
  * Parameters:
  *	ip	- inode whose attached dquots are charged.
  *	change	- number of blocks to add (positive) or release (negative).
  *	cred	- credential used for the privilege check and for deciding
  *		  whether the user gets a message.
  *	flags	- FORCE disables limit enforcement; CHOWN suppresses the
  *		  DIAGNOSTIC dquot sanity check.
  *
  * Returns 0 on success or the error from chkdqchg() when a limit would
  * be exceeded; in the latter case charges already applied to earlier
  * quota types are rolled back.
  */
 int
 chkdq(ip, change, cred, flags)
 	struct inode *ip;
 	ufs2_daddr_t change;
 	struct ucred *cred;
 	int flags;
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
 	struct vnode *vp = ITOV(ip);
 	int i, error, warn, do_check;
 
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * snapshot and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	/*
 	 * XXX: Turn off quotas for files with a negative UID or GID.
 	 * This prevents the creation of 100GB+ quota files.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return (0);
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
 		chkdquot(ip);
 #endif
 	if (change == 0)
 		return (0);
 	if (change < 0) {
 		/*
 		 * Releasing blocks never fails: lower the usage of every
 		 * attached dquot, clamping at zero, and clear DQ_BLKS.
 		 */
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
 			DQI_LOCK(dq);
 			DQI_WAIT(dq, PINOD+1, "chkdq1");
 			ncurblocks = dq->dq_curblocks + change;
 			if (ncurblocks >= 0)
 				dq->dq_curblocks = ncurblocks;
 			else
 				dq->dq_curblocks = 0;
 			dq->dq_flags &= ~DQ_BLKS;
 			dq->dq_flags |= DQ_MOD;
 			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
 	/*
 	 * Limits are enforced only when the caller did not pass FORCE and
 	 * priv_check_cred() returns non-zero for PRIV_VFS_EXCEEDQUOTA.
 	 */
 	if ((flags & FORCE) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
 		do_check = 1;
 	else
 		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "chkdq2");
 		if (do_check) {
 			/*
 			 * chkdqchg() is entered with dq locked; on failure it
 			 * has already unlocked dq before returning.
 			 */
 			error = chkdqchg(ip, change, cred, i, &warn);
 			if (error) {
 				/*
 				 * Roll back user quota changes when
 				 * group quota failed.
 				 */
 				while (i > 0) {
 					--i;
 					dq = ip->i_dquot[i];
 					if (dq == NODQUOT)
 						continue;
 					DQI_LOCK(dq);
 					DQI_WAIT(dq, PINOD+1, "chkdq3");
 					ncurblocks = dq->dq_curblocks - change;
 					if (ncurblocks >= 0)
 						dq->dq_curblocks = ncurblocks;
 					else
 						dq->dq_curblocks = 0;
 					dq->dq_flags &= ~DQ_BLKS;
 					dq->dq_flags |= DQ_MOD;
 					DQI_UNLOCK(dq);
 				}
 				return (error);
 			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
 		    dq->dq_curblocks < dq->dq_bsoftlimit)
 			dq->dq_btime = time_second +
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i];
 		dq->dq_curblocks += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		/* The warning is printed only after the dquot lock is dropped. */
 		if (warn)
 			uprintf("\n%s: warning, %s %s\n",
 				ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 				quotatypes[i], "disk quota exceeded");
 	}
 	return (0);
 }
 
 /*
  * Decide whether a block allocation of `change' blocks may proceed
  * against the dquot of the given type, telling the user when it may not.
  *
  * The dquot is expected to be locked by the caller.  When the request
  * is refused (EDQUOT) the dquot is unlocked here before returning; when
  * it is allowed (0) the dquot is left locked.  *warn is set to 1 when
  * the owner has just crossed the soft limit.
  */
 static int
 chkdqchg(ip, change, cred, type, warn)
 	struct inode *ip;
 	ufs2_daddr_t change;
 	struct ucred *cred;
 	int type;
 	int *warn;
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
 	int notify;
 
 	dq = ip->i_dquot[type];
 	ncurblocks = dq->dq_curblocks + change;
 
 	/*
 	 * Hard limit: reaching it is never allowed.  The message is printed
 	 * once (tracked by DQ_BLKS) and only to the owner of the file.
 	 */
 	if (dq->dq_bhardlimit && ncurblocks >= dq->dq_bhardlimit) {
 		notify = (dq->dq_flags & DQ_BLKS) == 0 &&
 		    ip->i_uid == cred->cr_uid;
 		if (notify)
 			dq->dq_flags |= DQ_BLKS;
 		DQI_UNLOCK(dq);
 		if (notify)
 			uprintf("\n%s: write failed, %s disk limit reached\n",
 			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 		return (EDQUOT);
 	}
 
 	/* No soft limit, or still below it: nothing more to check. */
 	if (!dq->dq_bsoftlimit || ncurblocks < dq->dq_bsoftlimit)
 		return (0);
 
 	/*
 	 * The soft limit is being crossed right now: start the grace
 	 * period and ask the caller to warn the owner.
 	 */
 	if (dq->dq_curblocks < dq->dq_bsoftlimit) {
 		dq->dq_btime = time_second +
 		    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type];
 		if (ip->i_uid == cred->cr_uid)
 			*warn = 1;
 		return (0);
 	}
 
 	/* Already over the soft limit; fine while the grace period lasts. */
 	if (time_second <= dq->dq_btime)
 		return (0);
 
 	/* Grace period expired: refuse, with a one-time message to the owner. */
 	notify = (dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid;
 	if (notify)
 		dq->dq_flags |= DQ_BLKS;
 	DQI_UNLOCK(dq);
 	if (notify)
 		uprintf("\n%s: write failed, %s %s\n",
 		    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 		    quotatypes[type],
 		    "disk quota exceeded for too long");
 	return (EDQUOT);
 }
 
 /*
  * Check the inode limit, applying corrective action.
  */
 int
 chkiq(ip, change, cred, flags)
 	struct inode *ip;
 	int change;
 	struct ucred *cred;
 	int flags;
 {
 	struct dquot *dq;
 	ino_t ncurinodes;
 	int i, error, warn, do_check;
 
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
 		chkdquot(ip);
 #endif
 	if (change == 0)
 		return (0);
 	if (change < 0) {
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
 			DQI_LOCK(dq);
 			DQI_WAIT(dq, PINOD+1, "chkiq1");
 			ncurinodes = dq->dq_curinodes + change;
 			/* XXX: ncurinodes is unsigned */
 			if (dq->dq_curinodes != 0 && ncurinodes >= 0)
 				dq->dq_curinodes = ncurinodes;
 			else
 				dq->dq_curinodes = 0;
 			dq->dq_flags &= ~DQ_INODS;
 			dq->dq_flags |= DQ_MOD;
 			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
 	if ((flags & FORCE) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
 		do_check = 1;
 	else
 		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "chkiq2");
 		if (do_check) {
 			error = chkiqchg(ip, change, cred, i, &warn);
 			if (error) {
 				/*
 				 * Roll back user quota changes when
 				 * group quota failed.
 				 */
 				while (i > 0) {
 					--i;
 					dq = ip->i_dquot[i];
 					if (dq == NODQUOT)
 						continue;
 					DQI_LOCK(dq);
 					DQI_WAIT(dq, PINOD+1, "chkiq3");
 					ncurinodes = dq->dq_curinodes - change;
 					/* XXX: ncurinodes is unsigned */
 					if (dq->dq_curinodes != 0 &&
 					    ncurinodes >= 0)
 						dq->dq_curinodes = ncurinodes;
 					else
 						dq->dq_curinodes = 0;
 					dq->dq_flags &= ~DQ_INODS;
 					dq->dq_flags |= DQ_MOD;
 					DQI_UNLOCK(dq);
 				}
 				return (error);
 			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
 		    dq->dq_curinodes < dq->dq_isoftlimit)
 			dq->dq_itime = time_second +
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i];
 		dq->dq_curinodes += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		if (warn)
 			uprintf("\n%s: warning, %s %s\n",
 				ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 				quotatypes[i], "inode quota exceeded");
 	}
 	return (0);
 }
 
 /*
  * Decide whether an allocation of `change' inodes may proceed against
  * the dquot of the given type, telling the user when it may not.
  *
  * The dquot is expected to be locked by the caller.  When the request
  * is refused (EDQUOT) the dquot is unlocked here before returning; when
  * it is allowed (0) the dquot is left locked.  *warn is set to 1 when
  * the owner has just crossed the soft limit.
  */
 static int
 chkiqchg(ip, change, cred, type, warn)
 	struct inode *ip;
 	int change;
 	struct ucred *cred;
 	int type;
 	int *warn;
 {
 	struct dquot *dq;
 	ino_t ncurinodes;
 	int notify;
 
 	dq = ip->i_dquot[type];
 	ncurinodes = dq->dq_curinodes + change;
 
 	/*
 	 * Hard limit: reaching it is never allowed.  The message is printed
 	 * once (tracked by DQ_INODS) and only to the owner of the file.
 	 */
 	if (dq->dq_ihardlimit && ncurinodes >= dq->dq_ihardlimit) {
 		notify = (dq->dq_flags & DQ_INODS) == 0 &&
 		    ip->i_uid == cred->cr_uid;
 		if (notify)
 			dq->dq_flags |= DQ_INODS;
 		DQI_UNLOCK(dq);
 		if (notify)
 			uprintf("\n%s: write failed, %s inode limit reached\n",
 			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 		return (EDQUOT);
 	}
 
 	/* No soft limit, or still below it: nothing more to check. */
 	if (!dq->dq_isoftlimit || ncurinodes < dq->dq_isoftlimit)
 		return (0);
 
 	/*
 	 * The soft limit is being crossed right now: start the grace
 	 * period and ask the caller to warn the owner.
 	 */
 	if (dq->dq_curinodes < dq->dq_isoftlimit) {
 		dq->dq_itime = time_second +
 		    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type];
 		if (ip->i_uid == cred->cr_uid)
 			*warn = 1;
 		return (0);
 	}
 
 	/* Already over the soft limit; fine while the grace period lasts. */
 	if (time_second <= dq->dq_itime)
 		return (0);
 
 	/* Grace period expired: refuse, with a one-time message to the owner. */
 	notify = (dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid;
 	if (notify)
 		dq->dq_flags |= DQ_INODS;
 	DQI_UNLOCK(dq);
 	if (notify)
 		uprintf("\n%s: write failed, %s %s\n",
 			ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 			quotatypes[type],
 			"inode quota exceeded for too long");
 	return (EDQUOT);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Sanity check for DIAGNOSTIC kernels: on a filesystem with quotas
  * enabled, a file whose size changes must have a dquot attached for
  * every active quota type.  Panics when one is missing.
  */
 static void
 chkdquot(ip)
 	struct inode *ip;
 {
 	struct ufsmount *ump;
 	struct vnode *vp;
 	int type;
 
 	vp = ITOV(ip);
 	ump = VFSTOUFS(vp->v_mount);
 
 	/*
 	 * System files (currently snapshots and quota files) are exempt
 	 * from disk quotas.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return;
 	/*
 	 * XXX: Files with a negative UID or GID are exempt as well; this
 	 * keeps the quota files from growing past 100GB.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return;
 
 	UFS_LOCK(ump);
 	for (type = 0; type < MAXQUOTAS; type++) {
 		/*
 		 * Only quota types that are fully set up (quota vnode
 		 * present, neither opening nor closing) must have a dquot.
 		 */
 		if (ump->um_quotas[type] != NULLVP &&
 		    (ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) == 0 &&
 		    ip->i_dquot[type] == NODQUOT) {
 			UFS_UNLOCK(ump);
 			vprint("chkdquot: missing dquot", vp);
 			panic("chkdquot: missing dquot");
 		}
 	}
 	UFS_UNLOCK(ump);
 }
 #endif
 
 /*
  * Code to process quotactl commands.
  */
 
 /*
  * Q_QUOTAON - set up a quota file for a particular filesystem.
  *
  * Parameters:
  *	td	- calling thread; must pass the PRIV_UFS_QUOTAON check.
  *	mp	- mount point on which quotas are being enabled.
  *	type	- quota type, used as index into the per-mount quota arrays.
  *	fname	- user-space pointer to the path of the quota file.
  *
  * Returns 0 on success; EACCES if the file is not a regular file;
  * EALREADY if a quotaon/quotaoff for this type is already in progress;
  * otherwise the error from priv_check(), vn_open() or getinoquota().
  */
 int
 quotaon(td, mp, type, fname)
 	struct thread *td;
 	struct mount *mp;
 	int type;
 	void *fname;
 {
 	struct ufsmount *ump;
 	struct vnode *vp, **vpp;
 	struct vnode *mvp;
 	struct dquot *dq;
 	int error, flags, vfslocked;
 	struct nameidata nd;
 
 	error = priv_check(td, PRIV_UFS_QUOTAON);
 	if (error)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 	dq = NODQUOT;
 
 	/* Open the quota file for reading and writing. */
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, fname, td);
 	flags = FREAD | FWRITE;
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	VOP_UNLOCK(vp, 0, td);
 	if (vp->v_type != VREG) {
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EACCES);
 	}
 
 	/*
 	 * Claim this quota type: both QTF_OPENING and QTF_CLOSING are set
 	 * while the old quota file (if any) is torn down and the new one
 	 * is installed.
 	 */
 	UFS_LOCK(ump);
 	if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
 		UFS_UNLOCK(ump);
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EALREADY);
 	}
 	ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_QUOTA;
 	MNT_IUNLOCK(mp);
 	UFS_UNLOCK(ump);
 
 	/* Shut down a previously configured, different quota file first. */
 	vpp = &ump->um_quotas[type];
 	if (*vpp != vp)
 		quotaoff1(td, mp, type);
 
 	/* Mark the quota file as a system file and install it. */
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_vflag |= VV_SYSTEM;
 	VOP_UNLOCK(vp, 0, td);
 	*vpp = vp;
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * Save the credential of the process that turned on quotas.
 	 * Set up the time limits for this quota.
 	 * The defaults are overridden by the positive time values stored
 	 * in the dquot for id 0, when that dquot can be obtained.
 	 */
 	ump->um_cred[type] = crhold(td->td_ucred);
 	ump->um_btime[type] = MAX_DQ_TIME;
 	ump->um_itime[type] = MAX_IQ_TIME;
 	if (dqget(NULLVP, 0, ump, type, &dq) == 0) {
 		if (dq->dq_btime > 0)
 			ump->um_btime[type] = dq->dq_btime;
 		if (dq->dq_itime > 0)
 			ump->um_itime[type] = dq->dq_itime;
 		dqrele(NULLVP, dq);
 	}
 	/*
 	 * Allow the getdq from getinoquota below to read the quota
 	 * from file.
 	 */
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_CLOSING;
 	UFS_UNLOCK(ump);
 	/*
 	 * Search vnodes associated with this mount point,
 	 * adding references to quota file being opened.
 	 * NB: only need to add dquot's for inodes being modified.
 	 *
 	 * `error' is still 0 from the successful vn_open() above and only
 	 * changes when getinoquota() fails.  A failed vget() restarts the
 	 * whole scan.
 	 */
 	MNT_ILOCK(mp);
 again:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		MNT_IUNLOCK(mp);
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto again;
 		}
 		/* Skip vnodes that are not open for writing. */
 		if (vp->v_type == VNON || vp->v_writecount == 0) {
 			VOP_UNLOCK(vp, 0, td);
 			vrele(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		error = getinoquota(VTOI(vp));
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
 		if (error) {
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			break;
 		}
 	}
 	MNT_IUNLOCK(mp);
 
 	/* Undo everything if any inode could not get its dquots. */
         if (error)
 		quotaoff_inchange(td, mp, type);
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_OPENING;
 	KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0,
 		("quotaon: leaking flags"));
 	UFS_UNLOCK(ump);
 
 	return (error);
 }
 
 /*
  * Main code to turn off disk quotas for a filesystem. Does not change
  * flags.
  *
  * The caller must already have set QTF_CLOSING for this quota type
  * (asserted below).  Returns 0 when no quota file is configured for
  * the type, otherwise the result of closing the quota vnode.
  */
 static int
 quotaoff1(td, mp, type)
 	struct thread *td;
 	struct mount *mp;
 	int type;
 {
 	struct vnode *vp;
 	struct vnode *qvp, *mvp;
 	struct ufsmount *ump;
 	struct dquot *dq;
 	struct inode *ip;
 	struct ucred *cr;
 	int vfslocked;
 	int error;
 
 	ump = VFSTOUFS(mp);
 
 	/* Snapshot the quota vnode and credential under the UFS lock. */
 	UFS_LOCK(ump);
 	KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0,
 		("quotaoff1: flags are invalid"));
 	if ((qvp = ump->um_quotas[type]) == NULLVP) {
 		UFS_UNLOCK(ump);
 		return (0);
 	}
 	cr = ump->um_cred[type];
 	UFS_UNLOCK(ump);
 	
 	/*
 	 * Search vnodes associated with this mount point,
 	 * deleting any references to quota file being closed.
 	 * A failed vget() restarts the whole scan.
 	 */
 	MNT_ILOCK(mp);
 again:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		MNT_IUNLOCK(mp);
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto again;
 		}
 		/* Detach the dquot from the inode before releasing it. */
 		ip = VTOI(vp);
 		dq = ip->i_dquot[type];
 		ip->i_dquot[type] = NODQUOT;
 		dqrele(vp, dq);
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 
 	dqflush(qvp);
 	/* Clear um_quotas before closing the quota vnode to prevent
 	 * access to the closed vnode from dqget/dqsync
 	 */
 	UFS_LOCK(ump);
 	ump->um_quotas[type] = NULLVP;
 	ump->um_cred[type] = NOCRED;
 	UFS_UNLOCK(ump);
 
 	/* The quota file stops being a system file, then is closed. */
 	vfslocked = VFS_LOCK_GIANT(qvp->v_mount);
-	vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY);
 	qvp->v_vflag &= ~VV_SYSTEM;
 	VOP_UNLOCK(qvp, 0, td);
 	error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* Drop the credential reference that was saved in um_cred[]. */
 	crfree(cr);
 
 	return (error);
 }
 
 /*
  * Turn quotas off for one type and tidy up the bookkeeping afterwards.
  *
  * The caller has already validated ump->um_qflags and set QTF_CLOSING
  * to mark the operation as in progress.  Once quotaoff1() has done the
  * work, QTF_CLOSING is cleared again, and MNT_QUOTA is removed from the
  * mount flags when no quota type is left with a quota vnode.
  *
  * Returns the result of quotaoff1().
  */
 int
 quotaoff_inchange(td, mp, type)
 	struct thread *td;
 	struct mount *mp;
 	int type;
 {
 	struct ufsmount *ump;
 	int error, qt, in_use;
 
 	error = quotaoff1(td, mp, type);
 
 	ump = VFSTOUFS(mp);
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_CLOSING;
 
 	/* Is any quota type still backed by a quota file? */
 	in_use = 0;
 	for (qt = 0; qt < MAXQUOTAS; qt++) {
 		if (ump->um_quotas[qt] != NULLVP) {
 			in_use = 1;
 			break;
 		}
 	}
 	if (!in_use) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~MNT_QUOTA;
 		MNT_IUNLOCK(mp);
 	}
 	UFS_UNLOCK(ump);
 
 	return (error);
 }
 
 /*
  * Q_QUOTAOFF - turn off disk quotas for a filesystem.
  *
  * Requires PRIV_UFS_QUOTAOFF.  Returns EALREADY when a quotaon or
  * quotaoff for this type is already in progress, otherwise the result
  * of quotaoff_inchange().
  */
 int
 quotaoff(td, mp, type)
 	struct thread *td;
 	struct mount *mp;
 	int type;
 {
 	struct ufsmount *ump;
 	int error, busy;
 
 	error = priv_check(td, PRIV_UFS_QUOTAOFF);
 	if (error != 0)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 
 	/* Claim the quota type unless somebody else is already changing it. */
 	UFS_LOCK(ump);
 	busy = (ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0;
 	if (!busy)
 		ump->um_qflags[type] |= QTF_CLOSING;
 	UFS_UNLOCK(ump);
 	if (busy)
 		return (EALREADY);
 
 	return (quotaoff_inchange(td, mp, type));
 }
 
 /*
  * Q_GETQUOTA - return current values in a dqblk structure.
  *
  * A caller may always read its own user quota and the quota of a group
  * it is a member of.  Reading anybody else's quota needs
  * PRIV_VFS_GETQUOTA unless the unprivileged_get_quota knob is set.
  *
  * Returns EINVAL for an unknown quota type, otherwise the error from
  * priv_check(), dqget() or copyout(), or 0 on success.
  */
 int
 getquota(td, mp, id, type, addr)
 	struct thread *td;
 	struct mount *mp;
 	u_long id;
 	int type;
 	void *addr;
 {
 	struct dquot *dq;
 	int error, own;
 
 	/* Work out whether the requested quota belongs to the caller. */
 	if (type == USRQUOTA)
 		own = (td->td_ucred->cr_uid == id);
 	else if (type == GRPQUOTA)
 		own = groupmember(id, td->td_ucred);
 	else
 		return (EINVAL);
 
 	if (!own && !unprivileged_get_quota) {
 		error = priv_check(td, PRIV_VFS_GETQUOTA);
 		if (error != 0)
 			return (error);
 	}
 
 	dq = NODQUOT;
 	error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq);
 	if (error != 0)
 		return (error);
 
 	/* Hand the quota block to user space, then drop our reference. */
 	error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk));
 	dqrele(NULLVP, dq);
 	return (error);
 }
 
 /*
  * Q_SETQUOTA - assign an entire dqblk structure.
  */
 int
 setquota(td, mp, id, type, addr)
 	struct thread *td;
 	struct mount *mp;
 	u_long id;
 	int type;
 	void *addr;
 {
 	struct dquot *dq;
 	struct dquot *ndq;
 	struct ufsmount *ump;
 	struct dqblk newlim;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_SETQUOTA);
 	if (error)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 	error = copyin(addr, &newlim, sizeof (struct dqblk));
 	if (error)
 		return (error);
 
 	ndq = NODQUOT;
 	ump = VFSTOUFS(mp);
 
 	error = dqget(NULLVP, id, ump, type, &ndq);
 	if (error)
 		return (error);
 	dq = ndq;
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+1, "setqta");
 	/*
 	 * Copy all but the current values.
 	 * Reset time limit if previously had no soft limit or were
 	 * under it, but now have a soft limit and are over it.
 	 */
 	newlim.dqb_curblocks = dq->dq_curblocks;
 	newlim.dqb_curinodes = dq->dq_curinodes;
 	if (dq->dq_id != 0) {
 		newlim.dqb_btime = dq->dq_btime;
 		newlim.dqb_itime = dq->dq_itime;
 	}
 	if (newlim.dqb_bsoftlimit &&
 	    dq->dq_curblocks >= newlim.dqb_bsoftlimit &&
 	    (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
 		newlim.dqb_btime = time_second + ump->um_btime[type];
 	if (newlim.dqb_isoftlimit &&
 	    dq->dq_curinodes >= newlim.dqb_isoftlimit &&
 	    (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
 		newlim.dqb_itime = time_second + ump->um_itime[type];
 	dq->dq_dqb = newlim;
 	if (dq->dq_curblocks < dq->dq_bsoftlimit)
 		dq->dq_flags &= ~DQ_BLKS;
 	if (dq->dq_curinodes < dq->dq_isoftlimit)
 		dq->dq_flags &= ~DQ_INODS;
 	if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
 	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
 		dq->dq_flags |= DQ_FAKE;
 	else
 		dq->dq_flags &= ~DQ_FAKE;
 	dq->dq_flags |= DQ_MOD;
 	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
 
 /*
  * Q_SETUSE - set current inode and block usage.
  */
 int
 setuse(td, mp, id, type, addr)
 	struct thread *td;
 	struct mount *mp;
 	u_long id;
 	int type;
 	void *addr;
 {
 	struct dquot *dq;
 	struct ufsmount *ump;
 	struct dquot *ndq;
 	struct dqblk usage;
 	int error;
 
 	error = priv_check(td, PRIV_UFS_SETUSE);
 	if (error)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 	error = copyin(addr, &usage, sizeof (struct dqblk));
 	if (error)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 	ndq = NODQUOT;
 
 	error = dqget(NULLVP, id, ump, type, &ndq);
 	if (error)
 		return (error);
 	dq = ndq;
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+1, "setuse");
 	/*
 	 * Reset time limit if have a soft limit and were
 	 * previously under it, but are now over it.
 	 */
 	if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit &&
 	    usage.dqb_curblocks >= dq->dq_bsoftlimit)
 		dq->dq_btime = time_second + ump->um_btime[type];
 	if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit &&
 	    usage.dqb_curinodes >= dq->dq_isoftlimit)
 		dq->dq_itime = time_second + ump->um_itime[type];
 	dq->dq_curblocks = usage.dqb_curblocks;
 	dq->dq_curinodes = usage.dqb_curinodes;
 	if (dq->dq_curblocks < dq->dq_bsoftlimit)
 		dq->dq_flags &= ~DQ_BLKS;
 	if (dq->dq_curinodes < dq->dq_isoftlimit)
 		dq->dq_flags &= ~DQ_INODS;
 	dq->dq_flags |= DQ_MOD;
 	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
 
 /*
  * Q_SYNC - sync quota files to disk.
  *
  * Walks every vnode of the mount point and calls dqsync() on each dquot
  * attached to its inode.  Always returns 0; errors from vget() only
  * cause the vnode to be skipped (or the scan to be restarted), and the
  * result of dqsync() is not examined.
  */
 int
 qsync(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct thread *td = curthread;		/* XXX */
 	struct vnode *vp, *mvp;
 	struct dquot *dq;
 	int i, error;
 
 	/*
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
 	UFS_LOCK(ump);
 	for (i = 0; i < MAXQUOTAS; i++)
 		if (ump->um_quotas[i] != NULLVP)
 			break;
 	UFS_UNLOCK(ump);
 	if (i == MAXQUOTAS)
 		return (0);
 	/*
 	 * Search vnodes associated with this mount point,
 	 * synchronizing any modified dquot structures.
 	 *
 	 * The mount interlock is held at the top of every iteration and
 	 * is dropped while an individual vnode is being processed.
 	 */
 	MNT_ILOCK(mp);
 again:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		MNT_IUNLOCK(mp);
 		/* Skip vnodes that have no type. */
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/* LK_INTERLOCK: vget() is entered with the vnode interlock held. */
 		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 		if (error) {
 			MNT_ILOCK(mp);
 			/* On ENOENT abandon this pass and rescan from the start. */
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 				goto again;
 			}
 			continue;
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dq = VTOI(vp)->i_dquot[i];
 			if (dq != NODQUOT)
 				dqsync(vp, dq);
 		}
 		vput(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Code pertaining to management of the in-core dquot data structures.
  */
 /*
  * Select the hash chain for a (quota vnode, id) pair.  dqhash is the
  * mask value filled in by hashinit() in dqinit().  Both macro arguments
  * are parenthesized so that arbitrary expressions may be passed.
  */
 #define DQHASH(dqvp, id) \
 	(&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + (id)) & dqhash])
 static LIST_HEAD(dqhash, dquot) *dqhashtbl;
 static u_long dqhash;
 
 /*
  * Dquot free list.
  */
 #define	DQUOTINC	5	/* minimum free dquots desired */
 static TAILQ_HEAD(dqfreelist, dquot) dqfreelist;
 static long numdquot, desireddquot = DQUOTINC;
 
 /*
  * Lock to protect quota hash, dq free list and dq_cnt ref counters of
  * _all_ dqs.
  */
 struct mtx dqhlock;
 
 #define	DQH_LOCK()	mtx_lock(&dqhlock)
 #define	DQH_UNLOCK()	mtx_unlock(&dqhlock)
 
 static struct dquot *dqhashfind(struct dqhash *dqh, u_long id,
 	struct vnode *dqvp);
 
 /*
  * Initialize the quota system: set up the empty dquot free list, the
  * dquot hash table (sized from desiredvnodes) and the mutex guarding
  * both of them.
  */
 void
 dqinit()
 {
 
 	TAILQ_INIT(&dqfreelist);
 	dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
 	mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF);
 }
 
 /*
  * Shut down the quota system: release the hash table, free every dquot
  * that is still sitting on the free list and finally destroy the
  * global hash lock.
  */
 void
 dquninit()
 {
 	struct dquot *fdq;
 
 	hashdestroy(dqhashtbl, M_DQUOT, dqhash);
 	/* Always take the head; each pass unlinks and frees it. */
 	for (fdq = TAILQ_FIRST(&dqfreelist); fdq != NULL;
 	    fdq = TAILQ_FIRST(&dqfreelist)) {
 		TAILQ_REMOVE(&dqfreelist, fdq, dq_freelist);
 		mtx_destroy(&fdq->dq_lock);
 		free(fdq, M_DQUOT);
 	}
 	mtx_destroy(&dqhlock);
 }
 
 /*
  * Look up the dquot for (id, dqvp) on hash chain dqh.  The caller must
  * own dqhlock.  On a hit the dquot gains a reference (and leaves the
  * free list if it had none) and is returned; otherwise NODQUOT.
  */
 static struct dquot *
 dqhashfind(dqh, id, dqvp)
 	struct dqhash *dqh;
 	u_long id;
 	struct vnode *dqvp;
 {
 	struct dquot *cand;
 
 	mtx_assert(&dqhlock, MA_OWNED);
 	LIST_FOREACH(cand, dqh, dq_hash) {
 		if (cand->dq_id == id &&
 		    cand->dq_ump->um_quotas[cand->dq_type] == dqvp) {
 			/*
 			 * An unreferenced entry is parked on the free
 			 * list; pull it off before handing it out.
 			 */
 			if (cand->dq_cnt == 0)
 				TAILQ_REMOVE(&dqfreelist, cand, dq_freelist);
 			DQREF(cand);
 			return (cand);
 		}
 	}
 	return (NODQUOT);
 }
 
 /*
  * Obtain a dquot structure for the specified identifier and quota file
  * reading the information from the file if necessary.
  *
  * vp   - vnode on whose behalf the lookup is done, or NULLVP.  When it
  *        is not NULLVP and *dqp is already set, nothing is done.
  * id   - identifier to look up; ids that are negative when cast to int
  *        are rejected.
  * ump  - UFS mount whose quota file um_quotas[type] is consulted.
  * type - index into the per-mount quota arrays.
  * dqp  - receives the referenced dquot on success.  It is set to
  *        NODQUOT on every failure except the negative-id check.
  *
  * Returns 0 on success; EINVAL for a negative id or when the quota file
  * is absent or being closed; EUSERS when no dquot can be allocated or
  * recycled; EIO when a dquot found in the cache has no dq_ump after the
  * wait; otherwise the error from VOP_READ().
  */
 static int
 dqget(vp, id, ump, type, dqp)
 	struct vnode *vp;
 	u_long id;
 	struct ufsmount *ump;
 	int type;
 	struct dquot **dqp;
 {
-	struct thread *td = curthread;		/* XXX */
 	struct dquot *dq, *dq1;
 	struct dqhash *dqh;
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
 	int vfslocked, dqvplocked, error;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (vp != NULLVP)
 		ASSERT_VOP_ELOCKED(vp, "dqget");
 #endif
 
 	/* The caller already has a dquot for this vnode. */
 	if (vp != NULLVP && *dqp != NODQUOT) {
 		return (0);
 	}
 
 	/*
 	 * XXX: Disallow negative id values to prevent the
 	 * creation of 100GB+ quota data files.
 	 */
 	if ((int)id < 0)
 		return (EINVAL);
 
 	/* Take a reference on the quota vnode while the mount is locked. */
 	UFS_LOCK(ump);
 	dqvp = ump->um_quotas[type];
 	if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) {
 		*dqp = NODQUOT;
 		UFS_UNLOCK(ump);
 		return (EINVAL);
 	}
 	vref(dqvp);
 	UFS_UNLOCK(ump);
 	error = 0;
 	dqvplocked = 0;
 
 	/*
 	 * Check the cache first.
 	 */
 	dqh = DQHASH(dqvp, id);
 	DQH_LOCK();
 	dq = dqhashfind(dqh, id, dqvp);
 	if (dq != NULL) {
 		DQH_UNLOCK();
 		/*
 		 * Common exit for every cache hit (also reached by goto
 		 * from the rechecks below): wait on the dquot, then drop
 		 * the quota vnode reference or lock.
 		 */
 hfound:		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "dqget");
 		DQI_UNLOCK(dq);
 		/* dq_ump is cleared when the read below fails or by dqflush(). */
 		if (dq->dq_ump == NULL) {
 			dqrele(vp, dq);
 			dq = NODQUOT;
 			error = EIO;
 		}
 		*dqp = dq;
 		vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 		if (dqvplocked)
 			vput(dqvp);
 		else
 			vrele(dqvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 
 	/*
 	 * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock here
 	 * since new dq will appear on the hash chain DQ_LOCKed.
 	 */
 	if (vp != dqvp) {
 		DQH_UNLOCK();
-		vn_lock(dqvp, LK_SHARED | LK_RETRY, td);
+		vn_lock(dqvp, LK_SHARED | LK_RETRY);
 		dqvplocked = 1;
 		DQH_LOCK();
 		/*
 		 * Recheck the cache after sleep for quota vnode lock.
 		 */
 		dq = dqhashfind(dqh, id, dqvp);
 		if (dq != NULL) {
 			DQH_UNLOCK();
 			goto hfound;
 		}
 	}
 
 	/*
 	 * Not in cache, allocate a new one or take it from the
 	 * free list.
 	 */
 	if (TAILQ_FIRST(&dqfreelist) == NODQUOT &&
 	    numdquot < MAXQUOTAS * desiredvnodes)
 		desireddquot += DQUOTINC;
 	if (numdquot < desireddquot) {
 		/* Count the new dquot before dropping the lock to allocate. */
 		numdquot++;
 		DQH_UNLOCK();
 		dq1 = (struct dquot *)malloc(sizeof *dq, M_DQUOT,
 		    M_WAITOK | M_ZERO);
 		mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF);
 		DQH_LOCK();
 		/*
 		 * Recheck the cache after sleep for memory.
 		 */
 		dq = dqhashfind(dqh, id, dqvp);
 		if (dq != NULL) {
 			/* Somebody else inserted it; undo our allocation. */
 			numdquot--;
 			DQH_UNLOCK();
 			mtx_destroy(&dq1->dq_lock);
 			free(dq1, M_DQUOT);
 			goto hfound;
 		}
 		dq = dq1;
 	} else {
 		if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) {
 			DQH_UNLOCK();
 			tablefull("dquot");
 			*dqp = NODQUOT;
 			vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 			if (dqvplocked)
 				vput(dqvp);
 			else
 				vrele(dqvp);
 			VFS_UNLOCK_GIANT(vfslocked);
 			return (EUSERS);
 		}
 		if (dq->dq_cnt || (dq->dq_flags & DQ_MOD))
 			panic("dqget: free dquot isn't");
 		TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
 		/* A recycled dquot that still has a mount is still hashed. */
 		if (dq->dq_ump != NULL)
 			LIST_REMOVE(dq, dq_hash);
 	}
 
 	/*
 	 * Dq is put into hash already locked to prevent parallel
 	 * usage while it is being read from file.
 	 */
 	dq->dq_flags = DQ_LOCK;
 	dq->dq_id = id;
 	dq->dq_type = type;
 	dq->dq_ump = ump;
 	LIST_INSERT_HEAD(dqh, dq, dq_hash);
 	DQREF(dq);
 	DQH_UNLOCK();
 
 	/* Read this id's dqblk record; records are indexed by id. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = &dq->dq_dqb;
 	aiov.iov_len = sizeof (struct dqblk);
 	auio.uio_resid = sizeof (struct dqblk);
 	auio.uio_offset = (off_t)id * sizeof (struct dqblk);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = (struct thread *)0;
 
 	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
 	/* Nothing was read and no error: treat the record as all zeroes. */
 	if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
 		bzero(&dq->dq_dqb, sizeof(struct dqblk));
 	if (dqvplocked)
 		vput(dqvp);
 	else
 		vrele(dqvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * I/O error in reading quota file, release
 	 * quota structure and reflect problem to caller.
 	 */
 	if (error) {
 		DQH_LOCK();
 		dq->dq_ump = NULL;
 		LIST_REMOVE(dq, dq_hash);
 		DQH_UNLOCK();
 		DQI_LOCK(dq);
 		if (dq->dq_flags & DQ_WANT)
 			wakeup(dq);
 		dq->dq_flags = 0;
 		DQI_UNLOCK(dq);
 		dqrele(vp, dq);
 		*dqp = NODQUOT;
 		return (error);
 	}
 	DQI_LOCK(dq);
 	/*
 	 * Check for no limit to enforce.
 	 * Initialize time values if necessary.
 	 */
 	if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
 	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
 		dq->dq_flags |= DQ_FAKE;
 	if (dq->dq_id != 0) {
 		if (dq->dq_btime == 0) {
 			dq->dq_btime = time_second + ump->um_btime[type];
 			if (dq->dq_bsoftlimit &&
 			    dq->dq_curblocks >= dq->dq_bsoftlimit)
 				dq->dq_flags |= DQ_MOD;
 		}
 		if (dq->dq_itime == 0) {
 			dq->dq_itime = time_second + ump->um_itime[type];
 			if (dq->dq_isoftlimit &&
 			    dq->dq_curinodes >= dq->dq_isoftlimit)
 				dq->dq_flags |= DQ_MOD;
 		}
 	}
 	DQI_WAKEUP(dq);
 	DQI_UNLOCK(dq);
 	*dqp = dq;
 	return (0);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Take one more reference on a dquot by bumping its dq_cnt counter.
  * Only compiled as a function when DIAGNOSTIC is defined.
  */
 static void
 dqref(dq)
 	struct dquot *dq;
 {
 
 	dq->dq_cnt += 1;
 }
 #endif
 
 /*
  * Drop one reference to a dquot.  NODQUOT is accepted and ignored.
  * When the reference being dropped looks like the last one, the dquot
  * is first written back through dqsync() and then, if it really ended
  * up unreferenced, appended to the free list.
  */
 void
 dqrele(vp, dq)
 	struct vnode *vp;
 	struct dquot *dq;
 {
 
 	if (dq == NODQUOT)
 		return;
 
 	DQH_LOCK();
 	if (dq->dq_cnt <= 1) {
 		/*
 		 * Possibly the final reference: sync with the hash lock
 		 * dropped, then re-examine the count under the lock.
 		 */
 		DQH_UNLOCK();
 		(void) dqsync(vp, dq);
 		DQH_LOCK();
 		dq->dq_cnt--;
 		if (!(dq->dq_cnt > 0))
 			TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist);
 	} else {
 		/* Other holders remain; just drop our reference. */
 		dq->dq_cnt--;
 	}
 	DQH_UNLOCK();
 }
 
 /*
  * Update the disk quota in the quota file.
  *
  * vp - vnode on whose behalf the sync is done (may be NULL); when it is
  *      the quota vnode itself, that vnode is not locked again here.
  * dq - dquot to write back; passing NODQUOT panics.
  *
  * Returns 0 when the dquot has no mount or is not marked DQ_MOD,
  * otherwise the result of VOP_WRITE(), with a short write reported
  * as EIO.
  */
 static int
 dqsync(vp, dq)
 	struct vnode *vp;
 	struct dquot *dq;
 {
-	struct thread *td = curthread;		/* XXX */
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
 	int vfslocked, error;
 	struct mount *mp;
 	struct ufsmount *ump;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (vp != NULL)
 		ASSERT_VOP_ELOCKED(vp, "dqsync");
 #endif
 
 	mp = NULL;
 	error = 0;
 	if (dq == NODQUOT)
 		panic("dqsync: dquot");
 	/* A dquot without a mount has nothing to be written to. */
 	if ((ump = dq->dq_ump) == NULL)
 		return (0);
 	UFS_LOCK(ump);
 	if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP)
 		panic("dqsync: file");
 	vref(dqvp);
 	UFS_UNLOCK(ump);
 
 	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	/* Cheap early exit when the dquot is clean. */
 	DQI_LOCK(dq);
 	if ((dq->dq_flags & DQ_MOD) == 0) {
 		DQI_UNLOCK(dq);
 		vrele(dqvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (0);
 	}
 	DQI_UNLOCK(dq);
 
 	(void) vn_start_secondary_write(dqvp, &mp, V_WAIT);
 	if (vp != dqvp)
-		vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
 
 	VFS_UNLOCK_GIANT(vfslocked);
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+2, "dqsync");
 	/* Recheck: the dquot may have been written while we slept. */
 	if ((dq->dq_flags & DQ_MOD) == 0)
 		goto out;
 	dq->dq_flags |= DQ_LOCK;
 	DQI_UNLOCK(dq);
 
 	/* Write this id's dqblk record; records are indexed by id. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = &dq->dq_dqb;
 	aiov.iov_len = sizeof (struct dqblk);
 	auio.uio_resid = sizeof (struct dqblk);
 	auio.uio_offset = (off_t)dq->dq_id * sizeof (struct dqblk);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = (struct thread *)0;
 	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
 	VFS_UNLOCK_GIANT(vfslocked);
 	/* A short write without an error code is reported as EIO. */
 	if (auio.uio_resid && error == 0)
 		error = EIO;
 
 	DQI_LOCK(dq);
 	DQI_WAKEUP(dq);
 	dq->dq_flags &= ~DQ_MOD;
 	/* Entered with the dquot lock held on both paths. */
 out:	DQI_UNLOCK(dq);
 	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	/* vput() also unlocks; dqvp was only locked when vp != dqvp. */
 	if (vp != dqvp)
 		vput(dqvp);
 	else
 		vrele(dqvp);
 	vn_finished_secondary_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Flush all entries from the cache for a particular vnode.
  */
 static void
 dqflush(vp)
 	struct vnode *vp;
 {
 	struct dquot *dq, *nextdq;
 	struct dqhash *dqh;
 
 	/*
 	 * Move all dquot's that used to refer to this quota
 	 * file off their hash chains (they will eventually
 	 * fall off the head of the free list and be re-used).
 	 */
 	DQH_LOCK();
 	for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) {
 		for (dq = LIST_FIRST(dqh); dq; dq = nextdq) {
 			nextdq = LIST_NEXT(dq, dq_hash);
 			if (dq->dq_ump->um_quotas[dq->dq_type] != vp)
 				continue;
 			if (dq->dq_cnt)
 				panic("dqflush: stray dquot");
 			LIST_REMOVE(dq, dq_hash);
 			dq->dq_ump = (struct ufsmount *)0;
 		}
 	}
 	DQH_UNLOCK();
 }
Index: head/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- head/sys/ufs/ufs/ufs_vnops.c	(revision 175201)
+++ head/sys/ufs/ufs/ufs_vnops.c	(revision 175202)
@@ -1,2517 +1,2517 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.27 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 #include "opt_quota.h"
 #include "opt_suiddir.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/refcount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/conf.h>
 #include <sys/acl.h>
 #include <sys/jail.h>
 
 #include <machine/mutex.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <sys/file.h>		/* XXX */
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <fs/fifofs/fifo.h>
 
 #include <ufs/ufs/acl.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
 #ifdef UFS_GJOURNAL
 #include <ufs/ufs/gjournal.h>
 #endif
 
 #include <ufs/ffs/ffs_extern.h>
 
 /*
  * Forward declarations for the file-local vnode operations and their
  * helper routines.
  */
 static vop_access_t	ufs_access;
 static vop_advlock_t	ufs_advlock;
 static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *);
 static vop_close_t	ufs_close;
 static vop_create_t	ufs_create;
 static vop_getattr_t	ufs_getattr;
 static vop_link_t	ufs_link;
 static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *);
 static vop_mkdir_t	ufs_mkdir;
 static vop_mknod_t	ufs_mknod;
 static vop_open_t	ufs_open;
 static vop_pathconf_t	ufs_pathconf;
 static vop_print_t	ufs_print;
 static vop_readlink_t	ufs_readlink;
 static vop_remove_t	ufs_remove;
 static vop_rename_t	ufs_rename;
 static vop_rmdir_t	ufs_rmdir;
 static vop_setattr_t	ufs_setattr;
 static vop_strategy_t	ufs_strategy;
 static vop_symlink_t	ufs_symlink;
 static vop_whiteout_t	ufs_whiteout;
 static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 
 /*
  * A virgin directory (no blushing please).
  *
  * Each template holds two entries, "." and "..".  The first entry is
  * given the value 12 and the second DIRBLKSIZ - 12, so the two add up
  * to DIRBLKSIZ.
  * NOTE(review): the field order is presumably (ino, reclen, [type,]
  * namlen, name) -- confirm against the struct definitions.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, DT_DIR, 1, ".",
 	0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
 };
 /* Variant without the DT_DIR member; presumably the older layout. */
 static struct odirtemplate omastertemplate = {
 	0, 12, 1, ".",
 	0, DIRBLKSIZ - 12, 2, ".."
 };
 
 static void
 ufs_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 		goto out;
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
 	else if (((vp->v_mount->mnt_kern_flag &
 		    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
 		    (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
 	else if (ip->i_flag & IN_ACCESS)
 		ip->i_flag |= IN_LAZYACCESS;
 	vfs_timestamp(&ts);
 	if (ip->i_flag & IN_ACCESS) {
 		DIP_SET(ip, i_atime, ts.tv_sec);
 		DIP_SET(ip, i_atimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_UPDATE) {
 		DIP_SET(ip, i_mtime, ts.tv_sec);
 		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
 		ip->i_modrev++;
 	}
 	if (ip->i_flag & IN_CHANGE) {
 		DIP_SET(ip, i_ctime, ts.tv_sec);
 		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 	}
 
  out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Update the inode timestamps of vp.  Wrapper that takes the vnode
  * interlock around ufs_itimes_locked().
  */
 void
 ufs_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ufs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	error =
 	    ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	return (0);
 }
 
 /*
  * Mknod vnode call
  *
  * Creates the inode through ufs_makeinode(), records the device number
  * if one was supplied, then discards the new vnode and fetches it again
  * by inode number.  On success *a_vpp is the vnode returned by
  * VFS_VGET(); if that call fails *a_vpp is set to NULL and the error is
  * returned.
  */
 /* ARGSUSED */
 static int
 ufs_mknod(ap)
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp = ap->a_vpp;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	ip = VTOI(*vpp);
 	/* Mark all three timestamps for update. */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	if (vap->va_rdev != VNOVAL) {
 		/*
 		 * Want to be able to use this to make badblock
 		 * inodes, so don't truncate the dev number.
 		 */
 		DIP_SET(ip, i_rdev, vap->va_rdev);
 	}
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.  XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		*vpp = NULL;
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Open called.
  *
  * Character and block device vnodes are refused with EOPNOTSUPP.  A
  * file flagged APPEND may not be opened for writing unless O_APPEND is
  * given as well (EPERM).  Otherwise the VM object backing the vnode is
  * set up and 0 is returned.
  */
 /* ARGSUSED */
 static int
 ufs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp;
 	struct inode *ip;
 
 	vp = ap->a_vp;
 	switch (vp->v_type) {
 	case VCHR:
 	case VBLK:
 		return (EOPNOTSUPP);
 	default:
 		break;
 	}
 
 	ip = VTOI(vp);
 	/* Write access to an append-only file requires O_APPEND. */
 	if ((ip->i_flags & APPEND) != 0 &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Bring the inode times up to date, but only while the vnode's use
  * count is above one.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 ufs_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	/* The use count is examined under the vnode interlock. */
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Access vnode call: decide whether a_cred may access a_vp in the way
  * described by a_mode.
  *
  * Returns EROFS for write requests on directories, symlinks and regular
  * files of a read-only mount, EPERM for write requests on inodes with
  * IMMUTABLE or SF_SNAPSHOT set, and otherwise the verdict of vaccess()
  * or, when the mount has MNT_ACLS and an ACL could be read, of
  * vaccess_acl_posix1e().
  */
 static int
 ufs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	mode_t mode = ap->a_mode;
 	int error;
 #ifdef UFS_ACL
 	struct acl *acl;
 #endif
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (mode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* If immutable bit set, nobody gets to write it. */
 	if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & MNT_ACLS) != 0) {
 		acl = uma_zalloc(acl_zone, M_WAITOK);
 		error = VOP_GETACL(vp, ACL_TYPE_ACCESS, acl, ap->a_cred,
 		    ap->a_td);
 		switch (error) {
 		case EOPNOTSUPP:
 			/* No ACL support for this object: plain mode check. */
 			error = vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 			    ip->i_gid, ap->a_mode, ap->a_cred, NULL);
 			break;
 		case 0:
 			error = vaccess_acl_posix1e(vp->v_type, ip->i_uid,
 			    ip->i_gid, acl, ap->a_mode, ap->a_cred, NULL);
 			break;
 		default:
 			printf(
 "ufs_access(): Error retrieving ACL on object (%d).\n",
 			    error);
 			/*
 			 * XXX: Fall back until debugged.  Should
 			 * eventually possibly log an error, and return
 			 * EPERM for safety.
 			 */
 			error = vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 			    ip->i_gid, ap->a_mode, ap->a_cred, NULL);
 		}
 		uma_zfree(acl_zone, acl);
 	} else
 #endif /* !UFS_ACL */
 		/*
 		 * With UFS_ACL this statement is the else branch of the
 		 * MNT_ACLS test above; without it, it is unconditional.
 		 */
 		error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 		    ap->a_mode, ap->a_cred, NULL);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 ufs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	if (ip->i_ump->um_fstype == UFS1) {
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 	} else {
 		vap->va_atime.tv_sec = ip->i_din2->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
 	}
 	VI_UNLOCK(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ip->i_dev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	if (ip->i_ump->um_fstype == UFS1) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec;
 		vap->va_birthtime.tv_sec = 0;
 		vap->va_birthtime.tv_nsec = 0;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks);
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec;
 		vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks);
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_type = IFTOVT(ip->i_mode);
 	vap->va_filerev = ip->i_modrev;
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  *
  * Applies, in this order, whichever of the following a_vap requests:
  * file flags, owner/group, size (truncate), timestamps and mode.  The
  * first failing step ends the call with its error.  Requests naming an
  * attribute that cannot be set are rejected with EINVAL up front.
  */
 static int
 ufs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = ap->a_td;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	/*
 	 * Mark for update the file's access time for vfs_mark_atime().
 	 * We are doing this here to avoid some of the checks done
 	 * below -- this operation is done by request of the kernel and
 	 * should bypass some security checks.  Things like read-only
 	 * checks get handled by other levels (e.g., ffs_update()).
 	 */
 	if (vap->va_vaflags & VA_MARK_ATIME) {
 		ip->i_flag |= IN_ACCESS;
 		return (0);
 	}
 	if (vap->va_flags != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the security.jail.chflags_allowed sysctl
 		 * is non-zero; otherwise, they behave like unprivileged
 		 * processes.
 		 */
 		/* priv_check_cred() returns 0 when the privilege is held. */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 			if (ip->i_flags
 			    & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			/* Snapshot flag cannot be set or cleared */
 			if (((vap->va_flags & SF_SNAPSHOT) != 0 &&
 			     (ip->i_flags & SF_SNAPSHOT) == 0) ||
 			    ((vap->va_flags & SF_SNAPSHOT) == 0 &&
 			     (ip->i_flags & SF_SNAPSHOT) != 0))
 				return (EPERM);
 			ip->i_flags = vap->va_flags;
 			DIP_SET(ip, i_flags, vap->va_flags);
 		} else {
 			/*
 			 * Unprivileged: refuse when a system flag is set or
 			 * when anything outside UF_SETTABLE is requested.
 			 */
 			if (ip->i_flags
 			    & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    (vap->va_flags & UF_SETTABLE) != vap->va_flags)
 				return (EPERM);
 			ip->i_flags &= SF_SETTABLE;
 			ip->i_flags |= (vap->va_flags & UF_SETTABLE);
 			DIP_SET(ip, i_flags, ip->i_flags);
 		}
 		ip->i_flag |= IN_CHANGE;
 		/* The new flags forbid further changes: stop here. */
 		if (vap->va_flags & (IMMUTABLE | APPEND))
 			return (0);
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * XXX most of the following special cases should be in
 		 * callers instead of in N filesystems.  The VDIR check
 		 * mostly already is.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			/*
 			 * Truncation should have an effect in these cases.
 			 * Disallow it if the filesystem is read-only or
 			 * the file is being snapshotted.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((ip->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL,
 		    cred, td)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0)
 			return (EPERM);
 		/*
 		 * From utimes(2):
 		 * If times is NULL, ... The caller must be the owner of
 		 * the file, have permission to write the file, or be the
 		 * super-user.
 		 * If times is non-NULL, ... The caller must be the owner of
 		 * the file or be the super-user.
 		 *
 		 * Possibly for historical reasons, try to use VADMIN in
 		 * preference to VWRITE for a NULL timestamp.  This means we
 		 * will return EACCES in preference to EPERM if neither
 		 * check succeeds.
 		 */
 		if (vap->va_vaflags & VA_UTIMES_NULL) {
 			error = VOP_ACCESS(vp, VADMIN, cred, td);
 			if (error)
 				error = VOP_ACCESS(vp, VWRITE, cred, td);
 		} else
 			error = VOP_ACCESS(vp, VADMIN, cred, td);
 		if (error)
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL)
 			ip->i_flag |= IN_ACCESS;
 		if (vap->va_mtime.tv_sec != VNOVAL)
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (vap->va_birthtime.tv_sec != VNOVAL &&
 		    ip->i_ump->um_fstype == UFS2)
 			ip->i_flag |= IN_MODIFIED;
 		/*
 		 * ufs_itimes() stamps the marked fields with the current
 		 * time; the caller-supplied values are stored afterwards.
 		 */
 		ufs_itimes(vp);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			DIP_SET(ip, i_atime, vap->va_atime.tv_sec);
 			DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec);
 			DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec);
 		}
 		if (vap->va_birthtime.tv_sec != VNOVAL &&
 		    ip->i_ump->um_fstype == UFS2) {
 			ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = UFS_UPDATE(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/* A snapshot file may not gain write or execute bits. */
 		if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode &
 		   (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH)))
 			return (EPERM);
 		error = ufs_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Replace the permission bits (ALLPERMS) of a file's mode.
  * The inode must be locked by the caller.
  *
  * Returns 0 on success, the VOP_ACCESS() or PRIV_VFS_SETGID privilege
  * error on a failed check, or EFTYPE when the sticky bit is requested on
  * a non-directory without PRIV_VFS_STICKYFILE.
  */
 static int
 ufs_chmod(vp, mode, cred, td)
 	struct vnode *vp;
 	int mode;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *node;
 	int rc;
 
 	node = VTOI(vp);
 
 	/* Changing permissions requires VADMIN on the file. */
 	rc = VOP_ACCESS(vp, VADMIN, cred, td);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Two requests need extra privilege: the sticky bit on anything
 	 * that is not a directory, and the setgid bit when the caller is
 	 * not a member of the file's group.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT) != 0 &&
 	    priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0) != 0)
 		return (EFTYPE);
 
 	if (!groupmember(node->i_gid, cred) && (mode & ISGID) != 0) {
 		rc = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (rc != 0)
 			return (rc);
 	}
 
 	/* Keep the non-permission bits, swap in the new permission bits. */
 	node->i_mode = (node->i_mode & ~ALLPERMS) | (mode & ALLPERMS);
 	DIP_SET(node, i_mode, node->i_mode);
 	node->i_flag |= IN_CHANGE;
 	return (0);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  *
  * A uid or gid equal to VNOVAL means "leave unchanged" and is replaced by
  * the inode's current value.  Returns 0 on success, otherwise the error
  * from the VADMIN access check, the PRIV_VFS_CHOWN privilege check or
  * (when QUOTA is configured) the quota routines.  On a quota failure the
  * original owner and group are written back before returning.
  */
 static int
 ufs_chown(vp, uid, gid, cred, td)
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 #ifdef QUOTA
 	int i;
 	ufs2_daddr_t change;
 #endif
 
 	/* VNOVAL selects the current owner/group, i.e. no change. */
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, must possess VADMIN for that
 	 * file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if ((uid != ip->i_uid || 
 	    (gid != ip->i_gid && !groupmember(gid, cred))) &&
 	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
 		return (error);
 	/* Remember the old ids so they can be restored on failure. */
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) != 0)
 		return (error);
 	/*
 	 * Release the dquot reference for any id that is not changing, so
 	 * that the chkdq()/chkiq() calls below only touch the id(s) being
 	 * changed.
 	 */
 	if (ouid == uid) {
 		dqrele(vp, ip->i_dquot[USRQUOTA]);
 		ip->i_dquot[USRQUOTA] = NODQUOT;
 	}
 	if (ogid == gid) {
 		dqrele(vp, ip->i_dquot[GRPQUOTA]);
 		ip->i_dquot[GRPQUOTA] = NODQUOT;
 	}
 	/*
 	 * Apply a negative delta of the inode's block count and of one
 	 * inode to the remaining (old) quota records, then drop every
 	 * dquot reference held by the inode.
 	 */
 	change = DIP(ip, i_blocks);
 	(void) chkdq(ip, -change, cred, CHOWN);
 	(void) chkiq(ip, -1, cred, CHOWN);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		dqrele(vp, ip->i_dquot[i]);
 		ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 	/* Install the new ids in both the in-core and on-disk inode. */
 	ip->i_gid = gid;
 	DIP_SET(ip, i_gid, gid);
 	ip->i_uid = uid;
 	DIP_SET(ip, i_uid, uid);
 #ifdef QUOTA
 	/*
 	 * Look up the quota records for the new ids and apply the positive
 	 * deltas.  If both checks succeed, continue at "good"; if only the
 	 * inode check fails, the block delta is reversed with FORCE.
 	 */
 	if ((error = getinoquota(ip)) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		if ((error = chkdq(ip, change, cred, CHOWN)) == 0) {
 			if ((error = chkiq(ip, 1, cred, CHOWN)) == 0)
 				goto good;
 			else
 				(void) chkdq(ip, -change, cred, CHOWN|FORCE);
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dqrele(vp, ip->i_dquot[i]);
 			ip->i_dquot[i] = NODQUOT;
 		}
 	}
 	/*
 	 * Failure path: put the original ids back and re-apply the deltas
 	 * to the old quota records with FORCE, then return the error that
 	 * was recorded above.
 	 */
 	ip->i_gid = ogid;
 	DIP_SET(ip, i_gid, ogid);
 	ip->i_uid = ouid;
 	DIP_SET(ip, i_uid, ouid);
 	if (getinoquota(ip) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		(void) chkdq(ip, change, cred, FORCE|CHOWN);
 		(void) chkiq(ip, 1, cred, FORCE|CHOWN);
 		(void) getinoquota(ip);
 	}
 	return (error);
 good:
 	/* Re-acquire the full set of quota records; failure here is fatal. */
 	if (getinoquota(ip))
 		panic("ufs_chown: lost quota");
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
 	/*
 	 * When the owner or group actually changed, clear the setuid and
 	 * setgid bits unless the credential holds PRIV_VFS_RETAINSUGID.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			DIP_SET(ip, i_mode, ip->i_mode);
 		}
 	}
 	return (0);
 }
 
 /*
  * Remove vnode call: delete the directory entry for a_vp from a_dvp.
  *
  * Fails with EPERM when the file has NOUNLINK, IMMUTABLE or APPEND set,
  * or when the directory has APPEND set.  Otherwise the return value is
  * whatever ufs_dirremove() returned.
  */
 static int
 ufs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct inode *ip;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	int error;
 	struct thread *td;
 
 	td = curthread;
 	ip = VTOI(vp);
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(dvp)->i_flags & APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	/*
 	 * The steps below run whether or not ufs_dirremove() succeeded.
 	 * Once no links remain, mark the vnode VV_NOSYNC.
 	 */
 	if (ip->i_nlink <= 0)
 		vp->v_vflag |= VV_NOSYNC;
 	if ((ip->i_flags & SF_SNAPSHOT) != 0) {
 		/*
 		 * Avoid deadlock where another thread is trying to
 		 * update the inodeblock for dvp and is waiting on
 		 * snaplk.  Temporary unlock the vnode lock for the
 		 * unlinked file and sync the directory.  This should
 		 * allow vput() of the directory to not block later on
 		 * while holding the snapshot vnode locked, assuming
 		 * that the directory hasn't been unlinked too.
 		 */
 		VOP_UNLOCK(vp, 0, td);
 		(void) VOP_FSYNC(dvp, MNT_WAIT, td);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 out:
 	return (error);
 }
 
 /*
  * link vnode call
  */
 static int
 ufs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip;
 	struct direct newdir;
 	int error;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_link: no name");
 #endif
 	if (tdvp->v_mount != vp->v_mount) {
 		error = EXDEV;
 		goto out;
 	}
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
 		softdep_change_linkcnt(ip);
 	error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
 	if (!error) {
 		ufs_makedirentry(ip, cnp, &newdir);
 		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
 	}
 
 	if (error) {
 		ip->i_effnlink--;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 	}
 out:
 	return (error);
 }
 
 /*
  * whiteout vnode call
  */
 static int
 ufs_whiteout(ap)
 	struct vop_whiteout_args /* {
 		struct vnode *a_dvp;
 		struct componentname *a_cnp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct direct newdir;
 	int error = 0;
 
 	switch (ap->a_flags) {
 	case LOOKUP:
 		/* 4.4 format directories support whiteout operations */
 		if (dvp->v_mount->mnt_maxsymlinklen > 0)
 			return (0);
 		return (EOPNOTSUPP);
 
 	case CREATE:
 		/* create a new directory whiteout */
 #ifdef INVARIANTS
 		if ((cnp->cn_flags & SAVENAME) == 0)
 			panic("ufs_whiteout: missing name");
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		newdir.d_ino = WINO;
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
 		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
 		break;
 
 	case DELETE:
 		/* remove an existing directory whiteout */
 #ifdef INVARIANTS
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		cnp->cn_flags &= ~DOWHITEOUT;
 		error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
 		break;
 	default:
 		panic("ufs_whiteout: unknown op");
 	}
 	return (error);
 }
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  *
  * Exit paths in this function:
  *   abortit - early failure before any inode was modified; releases all
  *             four vnodes handed in.
  *   bad     - failure after step 1; releases the target vnode (if any)
  *             and the target directory, then falls into "out".
  *   out     - undoes the step 1 link count bump on the source.
  */
 static int
 ufs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct inode *ip, *xp, *dp;
 	struct direct newdir;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0, ioflag;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 abortit:
 		/*
 		 * tdvp is vrele()d rather than vput() when it is the same
 		 * vnode as tvp, because tvp is vput() right afterwards.
 		 */
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto abortit;
 	}
 
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  Temporarily just warn if they do.
 	 */
 	if (fvp == tvp) {
 		printf("ufs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto abortit;
 	}
 
 	/*
 	 * Lock the source vnode.  Each failure between here and step 1
 	 * unlocks it again before jumping to abortit.
 	 */
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0)
+	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto abortit;
 	dp = VTOI(fdvp);
 	ip = VTOI(fvp);
 	if (ip->i_nlink >= LINK_MAX) {
 		VOP_UNLOCK(fvp, 0, td);
 		error = EMLINK;
 		goto abortit;
 	}
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (dp->i_flags & APPEND)) {
 		VOP_UNLOCK(fvp, 0, td);
 		error = EPERM;
 		goto abortit;
 	}
 	if ((ip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
 		    (ip->i_flag & IN_RENAME)) {
 			VOP_UNLOCK(fvp, 0, td);
 			error = EINVAL;
 			goto abortit;
 		}
 		/*
 		 * Mark the directory as being renamed and remember the
 		 * inode number of its current parent.
 		 */
 		ip->i_flag |= IN_RENAME;
 		oldparent = dp->i_number;
 		doingdirectory = 1;
 	}
 	/*
 	 * Drop the reference on the source directory; step 3 takes a new
 	 * one with VREF() before calling relookup().
 	 */
 	vrele(fdvp);
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTOI(tdvp);
 	xp = NULL;
 	if (tvp)
 		xp = VTOI(tvp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_change_linkcnt(ip);
 	if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
 				       DOINGASYNC(fvp)))) != 0) {
 		VOP_UNLOCK(fvp, 0, td);
 		goto bad;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to checkpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0, td);
 	/*
 	 * newparent stays 0 when the target directory is the old parent;
 	 * the write-access result above is only acted on when a directory
 	 * is moving to a new parent.
 	 */
 	if (oldparent != dp->i_number)
 		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		error = ufs_checkpath(ip, dp, tcnp->cn_cred);
 		if (error)
 			goto out;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
 		VREF(tdvp);
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		vrele(tdvp);
 		/* Refresh dp/xp from the vnodes produced by relookup(). */
 		dp = VTOI(tdvp);
 		xp = NULL;
 		if (tvp)
 			xp = VTOI(tvp);
 	}
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (xp == NULL) {
 		if (dp->i_dev != ip->i_dev)
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Account for ".." in new directory.
 		 * When source and destination have the same
 		 * parent we don't fool with the link count.
 		 */
 		if (doingdirectory && newparent) {
 			if ((nlink_t)dp->i_nlink >= LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
 			dp->i_effnlink++;
 			dp->i_nlink++;
 			DIP_SET(dp, i_nlink, dp->i_nlink);
 			dp->i_flag |= IN_CHANGE;
 			if (DOINGSOFTDEP(tdvp))
 				softdep_change_linkcnt(dp);
 			error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
 						   DOINGASYNC(tdvp)));
 			if (error)
 				goto bad;
 		}
 		ufs_makedirentry(ip, tcnp, &newdir);
 		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
 		if (error) {
 			/* Undo the ".." link count bump made just above. */
 			if (doingdirectory && newparent) {
 				dp->i_effnlink--;
 				dp->i_nlink--;
 				DIP_SET(dp, i_nlink, dp->i_nlink);
 				dp->i_flag |= IN_CHANGE;
 				if (DOINGSOFTDEP(tdvp))
 					softdep_change_linkcnt(dp);
 				(void)UFS_UPDATE(tdvp, 1);
 			}
 			goto bad;
 		}
 		vput(tdvp);
 	} else {
 		if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (xp->i_number == ip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
 		 * must possess VADMIN for the parent directory, or the
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
 		if ((dp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((xp->i_mode&IFMT) == IFDIR) {
 			if ((xp->i_effnlink > 2) ||
 			    !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		/*
 		 * Point the existing target entry at the source inode.  The
 		 * last argument is newparent for a directory that is
 		 * changing parent, otherwise the doingdirectory flag.
 		 */
 		error = ufs_dirrewrite(dp, xp, ip->i_number,
 		    IFTODT(ip->i_mode),
 		    (doingdirectory && newparent) ? newparent : doingdirectory);
 		if (error)
 			goto bad;
 		if (doingdirectory) {
 			if (!newparent) {
 				dp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
 					softdep_change_linkcnt(dp);
 			}
 			xp->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
 				softdep_change_linkcnt(xp);
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
 			 * Truncate inode. The only stuff left in the directory
 			 * is "." and "..". The "." reference is inconsequential
 			 * since we are quashing it. We have removed the "."
 			 * reference and the reference in the parent directory,
 			 * but there may be other hard links. The soft
 			 * dependency code will arrange to do these operations
 			 * after the parent directory entry has been deleted on
 			 * disk, so when running with that code we avoid doing
 			 * them now.
 			 */
 			if (!newparent) {
 				dp->i_nlink--;
 				DIP_SET(dp, i_nlink, dp->i_nlink);
 				dp->i_flag |= IN_CHANGE;
 			}
 			xp->i_nlink--;
 			DIP_SET(xp, i_nlink, xp->i_nlink);
 			xp->i_flag |= IN_CHANGE;
 			ioflag = IO_NORMAL;
 			if (!DOINGASYNC(tvp))
 				ioflag |= IO_SYNC;
 			if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
 			    tcnp->cn_cred, tcnp->cn_thread)) != 0)
 				goto bad;
 		}
 		vput(tdvp);
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * 3) Unlink the source.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	if ((fcnp->cn_flags & SAVESTART) == 0)
 		panic("ufs_rename: lost from startdir");
 	VREF(fdvp);
 	error = relookup(fdvp, &fvp, fcnp);
 	if (error == 0)
 		vrele(fdvp);
 	if (fvp != NULL) {
 		xp = VTOI(fvp);
 		dp = VTOI(fdvp);
 	} else {
 		/*
 		 * From name has disappeared.  IN_RENAME is not sufficient
 		 * to protect against directory races due to timing windows,
 		 * so we have to remove the panic.  XXX the only real way
 		 * to solve this issue is at a much higher level.  By the
 		 * time we hit ufs_rename() it's too late.
 		 */
 #if 0
 		if (doingdirectory)
 			panic("ufs_rename: lost dir entry");
 #endif
 		vrele(ap->a_fvp);
 		return (0);
 	}
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
 	 * is a directory then it cannot have been rmdir'ed; the IN_RENAME
 	 * flag ensures that it cannot be moved by another rename or removed
 	 * by a rmdir.
 	 */
 	if (xp != ip) {
 		/*
 		 * From name resolves to a different inode.  IN_RENAME is
 		 * not sufficient protection against timing window races
 		 * so we can't panic here.  XXX the only real way
 		 * to solve this issue is at a much higher level.  By the
 		 * time we hit ufs_rename() it's too late.
 		 */
 #if 0
 		if (doingdirectory)
 			panic("ufs_rename: lost dir entry");
 #endif
 	} else {
 		/*
 		 * If the source is a directory with a
 		 * new parent, the link count of the old
 		 * parent directory must be decremented
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
 			/* The return value of this rewrite is not checked. */
 			xp->i_offset = mastertemplate.dot_reclen;
 			ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
 			cache_purge(fdvp);
 		}
 		error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
 		xp->i_flag &= ~IN_RENAME;
 	}
 	if (dp)
 		vput(fdvp);
 	if (xp)
 		vput(fvp);
 	/* Release the reference on the source vnode originally passed in. */
 	vrele(ap->a_fvp);
 	return (error);
 
 bad:
 	/* Release the target vnode (if any) and the target directory. */
 	if (xp)
 		vput(ITOV(xp));
 	vput(ITOV(dp));
 out:
 	/*
 	 * Back out step 1: clear IN_RENAME and, if the source can be locked
 	 * again, drop the extra link count before releasing it.
 	 */
 	if (doingdirectory)
 		ip->i_flag &= ~IN_RENAME;
-	if (vn_lock(fvp, LK_EXCLUSIVE, td) == 0) {
+	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
 		ip->i_effnlink--;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		ip->i_flag &= ~IN_RENAME;
 		if (DOINGSOFTDEP(fvp))
 			softdep_change_linkcnt(ip);
 		vput(fvp);
 	} else
 		vrele(fvp);
 	return (error);
 }
 
 /*
  * Mkdir system call
  *
  * Allocates a new inode with UFS_VALLOC(), sets its owner, group and mode
  * (merging the parent's default ACL when UFS_ACL is configured and the
  * mount has MNT_ACLS), writes a first directory block holding "." and
  * "..", and finally enters the name in the parent with ufs_direnter().
  *
  * On success *a_vpp is set to the new vnode and 0 is returned.  On a
  * failure after the parent's link count was raised, control reaches
  * "bad", which lowers the parent's count again, zeroes the new inode's
  * link count and vput()s it.
  */
 static int
 ufs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct buf *bp;
 	struct dirtemplate dirtemplate, *dtp;
 	struct direct newdir;
 #ifdef UFS_ACL
 	struct acl *acl, *dacl;
 #endif
 	int error, dmode;
 	long blkoff;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	/* The new directory's ".." adds a link to the parent. */
 	if ((nlink_t)dp->i_nlink >= LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	/* Only the low nine permission bits of the request are used. */
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ufs_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	/* The new directory takes the parent's group. */
 	ip->i_gid = dp->i_gid;
 	DIP_SET(ip, i_gid, dp->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 			DIP_SET(ip, i_uid, dp->i_uid);
 #ifdef QUOTA
 			if (dp->i_uid != cnp->cn_cred->cr_uid) {
 				/*
 				 * Make sure the correct user gets charged
 				 * for the space.
 				 * Make a dummy credential for the victim.
 				 * XXX This seems to never be accessed out of
 				 * our context so a stack variable is ok.
 				 */
 				refcount_init(&ucred.cr_ref, 1);
 				ucred.cr_uid = ip->i_uid;
 				ucred.cr_ngroups = 1;
 				ucred.cr_groups[0] = dp->i_gid;
 				ucp = &ucred;
 			}
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 #ifdef QUOTA
 		/* Quota failure: free the just-allocated inode and bail. */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			UFS_VFREE(tvp, ip->i_number, dmode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/* Quota failure: free the just-allocated inode and bail. */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		UFS_VFREE(tvp, ip->i_number, dmode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 #ifdef UFS_ACL
 	/*
 	 * acl/dacl stay NULL unless a non-empty default ACL is found on the
 	 * parent; a non-NULL acl later triggers the VOP_SETACL() calls.
 	 */
 	acl = dacl = NULL;
 	if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) {
 		acl = uma_zalloc(acl_zone, M_WAITOK);
 		dacl = uma_zalloc(acl_zone, M_WAITOK);
 
 		/*
 		 * Retrieve default ACL from parent, if any.
 		 */
 		error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred,
 		    cnp->cn_thread);
 		switch (error) {
 		case 0:
 			/*
 			 * Retrieved a default ACL, so merge mode and ACL if
 			 * necessary.  If the ACL is empty, fall through to
 			 * the "not defined or available" case.
 			 */
 			if (acl->acl_cnt != 0) {
 				dmode = acl_posix1e_newfilemode(dmode, acl);
 				ip->i_mode = dmode;
 				DIP_SET(ip, i_mode, dmode);
 				*dacl = *acl;
 				ufs_sync_acl_from_inode(ip, acl);
 				break;
 			}
 			/* FALLTHROUGH */
 	
 		case EOPNOTSUPP:
 			/*
 			 * Just use the mode as-is.
 			 */
 			ip->i_mode = dmode;
 			DIP_SET(ip, i_mode, dmode);
 			uma_zfree(acl_zone, acl);
 			uma_zfree(acl_zone, dacl);
 			dacl = acl = NULL;
 			break;
 		
 		default:
 			/* Any other error aborts the mkdir entirely. */
 			UFS_VFREE(tvp, ip->i_number, dmode);
 			vput(tvp);
 			uma_zfree(acl_zone, acl);
 			uma_zfree(acl_zone, dacl);
 			return (error);
 		}
 	} else {
 #endif /* !UFS_ACL */
 		ip->i_mode = dmode;
 		DIP_SET(ip, i_mode, dmode);
 #ifdef UFS_ACL
 	}
 #endif
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	/* A new directory starts with a link count of 2. */
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
 	if (DOINGSOFTDEP(tvp))
 		softdep_change_linkcnt(ip);
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Bump link count in parent directory to reflect work done below.
 	 * Should be done before reference is created so cleanup is
 	 * possible if we crash.
 	 */
 	dp->i_effnlink++;
 	dp->i_nlink++;
 	DIP_SET(dp, i_nlink, dp->i_nlink);
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
 		softdep_change_linkcnt(dp);
 	error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	if (acl != NULL) {
 		/*
 		 * XXX: If we abort now, will Soft Updates notify the extattr
 		 * code that the EAs for the file need to be released?
 		 */
 		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred,
 		    cnp->cn_thread);
 		if (error == 0)
 			error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl,
 			    cnp->cn_cred, cnp->cn_thread);
 		switch (error) {
 		case 0:
 			break;
 
 		case EOPNOTSUPP:
 			/*
 			 * XXX: This should not happen, as EOPNOTSUPP above
 			 * was supposed to free acl.
 			 */
 			printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 			/*
 			panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()");
 			 */
 			break;
 
 		default:
 			/* Free here and NULL out so "bad" does not free again. */
 			uma_zfree(acl_zone, acl);
 			uma_zfree(acl_zone, dacl);
 			dacl = acl = NULL;
 			goto bad;
 		}
 		uma_zfree(acl_zone, acl);
 		uma_zfree(acl_zone, dacl);
 		dacl = acl = NULL;
 	}
 #endif /* !UFS_ACL */
 
 	/*
 	 * Initialize directory with "." and ".." from static template.
 	 */
 	if (dvp->v_mount->mnt_maxsymlinklen > 0)
 		dtp = &mastertemplate;
 	else
 		dtp = (struct dirtemplate *)&omastertemplate;
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
 	    BA_CLRBUF, &bp)) != 0)
 		goto bad;
 	ip->i_size = DIRBLKSIZ;
 	DIP_SET(ip, i_size, DIRBLKSIZ);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	vnode_pager_setsize(tvp, (u_long)ip->i_size);
 	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
 	if (DOINGSOFTDEP(tvp)) {
 		/*
 		 * Ensure that the entire newly allocated block is a
 		 * valid directory so that future growth within the
 		 * block does not have to ensure that the block is
 		 * written before the inode.
 		 */
 		blkoff = DIRBLKSIZ;
 		while (blkoff < bp->b_bcount) {
 			((struct direct *)
 			   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
 			blkoff += DIRBLKSIZ;
 		}
 	}
 	if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) |
 				       DOINGASYNC(tvp)))) != 0) {
 		/* The buffer is still written out; its result is ignored. */
 		(void)bwrite(bp);
 		goto bad;
 	}
 	/*
 	 * Directory set up, now install its entry in the parent directory.
 	 *
 	 * If we are not doing soft dependencies, then we must write out the
 	 * buffer containing the new directory body before entering the new 
 	 * name in the parent. If we are doing soft dependencies, then the
 	 * buffer containing the new directory body will be passed to and
 	 * released in the soft dependency code after the code has attached
 	 * an appropriate ordering dependency to the buffer which ensures that
 	 * the buffer is written before the new name is written in the parent.
 	 */
 	if (DOINGASYNC(dvp))
 		bdwrite(bp);
 	else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
 		goto bad;
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
 	
 	/*
 	 * Despite its name, this label is also reached on success; the
 	 * error value selects between returning the vnode and cleanup.
 	 */
 bad:
 	if (error == 0) {
 		*ap->a_vpp = tvp;
 	} else {
 #ifdef UFS_ACL
 		if (acl != NULL)
 			uma_zfree(acl_zone, acl);
 		if (dacl != NULL)
 			uma_zfree(acl_zone, dacl);
 #endif
 		/* Take back the link added to the parent for "..". */
 		dp->i_effnlink--;
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(dvp))
 			softdep_change_linkcnt(dp);
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
 		 */
 		ip->i_effnlink = 0;
 		ip->i_nlink = 0;
 		DIP_SET(ip, i_nlink, 0);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tvp))
 			softdep_change_linkcnt(ip);
 		vput(tvp);
 	}
 out:
 	return (error);
 }
 
 /*
  * Rmdir system call.
  */
 static int
 ufs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error, ioflag;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Do not remove a directory that is in the process of being renamed.
 	 * Verify the directory is empty (and valid). Rmdir ".." will not be
 	 * valid since ".." will contain a reference to the current directory
 	 * and thus be non-empty. Do not allow the removal of mounted on
 	 * directories (this can happen when an NFS exported filesystem
 	 * tries to remove a locally mounted on directory).
 	 */
 	error = 0;
 	if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) {
 		error = EINVAL;
 		goto out;
 	}
 	if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	if (vp->v_mountedhere != 0) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	dp->i_effnlink--;
 	ip->i_effnlink--;
 	if (DOINGSOFTDEP(vp)) {
 		softdep_change_linkcnt(dp);
 		softdep_change_linkcnt(ip);
 	}
 	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error) {
 		dp->i_effnlink++;
 		ip->i_effnlink++;
 		if (DOINGSOFTDEP(vp)) {
 			softdep_change_linkcnt(dp);
 			softdep_change_linkcnt(ip);
 		}
 		goto out;
 	}
 	cache_purge(dvp);
 	/*
 	 * Truncate inode. The only stuff left in the directory is "." and
 	 * "..". The "." reference is inconsequential since we are quashing
 	 * it. The soft dependency code will arrange to do these operations
 	 * after the parent directory entry has been deleted on disk, so
 	 * when running with that code we avoid doing them now.
 	 */
 	if (!DOINGSOFTDEP(vp)) {
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		ioflag = IO_NORMAL;
 		if (!DOINGASYNC(vp))
 			ioflag |= IO_SYNC;
 		error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
 		    cnp->cn_thread);
 	}
 	cache_purge(vp);
 #ifdef UFS_DIRHASH
 	/* Kill any active hash; i_effnlink == 0, so it will not come back. */
 	if (ip->i_dirhash != NULL)
 		ufsdirhash_free(ip);
 #endif
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  *
  * Creates the link inode with ufs_makeinode(), then stores the target
  * string.  A target shorter than the mount's mnt_maxsymlinklen is copied
  * into the inode's SHORTLINK area; anything longer is written as file
  * data with vn_rdwr().  If that write fails the new vnode is vput() and
  * the error returned.
  */
 static int
 ufs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *vp;
 	struct inode *ip;
 	int error, len;
 
 	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp);
 	if (error != 0)
 		return (error);
 
 	vp = *vpp;
 	len = strlen(ap->a_target);
 
 	/* Short target: keep it inside the inode, no data block needed. */
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, SHORTLINK(ip), len);
 		ip->i_size = len;
 		DIP_SET(ip, i_size, len);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		return (0);
 	}
 
 	/* Long target: write it out as the link's file contents. */
 	error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
 	    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 	    ap->a_cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0);
 	if (error != 0)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Vnode op for reading directories.
  *
  * The routine below assumes that the on-disk format of a directory
  * is the same as that defined by <sys/dirent.h>. If the on-disk
  * format changes, then it will be necessary to do a conversion
  * from the on-disk format that read returns to the format defined
  * by <sys/dirent.h>.
  *
  * The request is shortened so that it ends on a DIRBLKSIZ boundary and
  * never returns a partial entry; the bytes trimmed off ("lost") are
  * added back to uio_resid before returning.  EINVAL is returned when
  * nothing is left after trimming.
  *
  * When a_ncookies is non-NULL the starting offset is first rounded down
  * to a DIRBLKSIZ boundary and, after a successful read, an M_TEMP array
  * holding the offset following each returned entry is handed back
  * through a_cookies/a_ncookies.  A zero-length record found while
  * building that array yields EIO and no cookies.
  *
  * If a_eofflag is non-NULL it is set to whether the resulting offset is
  * at or beyond the directory's i_size.
  */
 int
 ufs_readdir(
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap)
 {
 	struct uio *uio = ap->a_uio;
 	int error;
 	size_t count, lost;
 	off_t off;
 
 	if (ap->a_ncookies != NULL)
 		/*
 		 * Ensure that the block is aligned.  The caller can use
 		 * the cookies to determine where in the block to start.
 		 */
 		uio->uio_offset &= ~(DIRBLKSIZ - 1);
 	off = uio->uio_offset;
 	count = uio->uio_resid;
 	/* Make sure we don't return partial entries. */
 	if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1)))
 		return (EINVAL);
 	count -= (uio->uio_offset + count) & (DIRBLKSIZ -1);
 	lost = uio->uio_resid - count;
 	uio->uio_resid = count;
 	uio->uio_iov->iov_len = count;
 #	if (BYTE_ORDER == LITTLE_ENDIAN)
 		if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) {
 			error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
 		} else {
 			/*
 			 * NOTE(review): presumably the old directory
 			 * format; each entry's d_namlen and d_type are
 			 * exchanged in a kernel bounce buffer before the
 			 * data is copied out to the caller.
 			 */
 			struct dirent *dp, *edp;
 			struct uio auio;
 			struct iovec aiov;
 			caddr_t dirbuf;
 			int readcnt;
 			u_char tmp;
 
 			auio = *uio;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			aiov.iov_len = count;
 			MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK);
 			aiov.iov_base = dirbuf;
 			error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
 			if (error == 0) {
 				readcnt = count - auio.uio_resid;
 				edp = (struct dirent *)&dirbuf[readcnt];
 				for (dp = (struct dirent *)dirbuf; dp < edp; ) {
 					tmp = dp->d_namlen;
 					dp->d_namlen = dp->d_type;
 					dp->d_type = tmp;
 					if (dp->d_reclen > 0) {
 						dp = (struct dirent *)
 						    ((char *)dp + dp->d_reclen);
 					} else {
 						error = EIO;
 						break;
 					}
 				}
 				/* Only copy out when every record was sane. */
 				if (dp >= edp)
 					error = uiomove(dirbuf, readcnt, uio);
 			}
 			FREE(dirbuf, M_TEMP);
 		}
 #	else
 		error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
 #	endif
 	if (!error && ap->a_ncookies != NULL) {
 		struct dirent* dpStart;
 		struct dirent* dpEnd;
 		struct dirent* dp;
 		int ncookies;
 		u_long *cookies;
 		u_long *cookiep;
 
 		if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 			panic("ufs_readdir: unexpected uio from NFS server");
 		/* The entries just read sit directly behind iov_base. */
 		dpStart = (struct dirent *)
 		    ((char *)uio->uio_iov->iov_base - (uio->uio_offset - off));
 		dpEnd = (struct dirent *) uio->uio_iov->iov_base;
 		for (dp = dpStart, ncookies = 0;
 		     dp < dpEnd;
 		     dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) {
 			/*
 			 * A zero-length record would never advance dp and
 			 * this walk would spin forever; fail the same way
 			 * the byte-swapping path above does.
 			 */
 			if (dp->d_reclen == 0) {
 				error = EIO;
 				break;
 			}
 			ncookies++;
 		}
 		if (error == 0) {
 			MALLOC(cookies, u_long *, ncookies * sizeof(u_long),
 			    M_TEMP, M_WAITOK);
 			for (dp = dpStart, cookiep = cookies;
 			     dp < dpEnd;
 			     dp = (struct dirent *)
 				((caddr_t) dp + dp->d_reclen)) {
 				off += dp->d_reclen;
 				*cookiep++ = (u_long) off;
 			}
 			*ap->a_ncookies = ncookies;
 			*ap->a_cookies = cookies;
 		}
 	}
 	/* Give back the bytes trimmed off the end of the request. */
 	uio->uio_resid += lost;
 	if (ap->a_eofflag)
 	    *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Return the target name of a symbolic link.
  *
  * Links stored inside the inode are copied out of SHORTLINK() with
  * uiomove(); all others are read like regular file data via VOP_READ().
  */
 static int
 ufs_readlink(
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap)
 {
 	struct vnode *linkvp = ap->a_vp;
 	struct inode *linkip = VTOI(linkvp);
 	doff_t linklen = linkip->i_size;
 
 	/*
 	 * A link at least mnt_maxsymlinklen long that owns disk blocks
 	 * lives in file data.  The i_blocks test keeps links without any
 	 * blocks on the in-inode path (XXX - for old fastlink support).
 	 */
 	if (linklen >= linkvp->v_mount->mnt_maxsymlinklen &&
 	    DIP(linkip, i_blocks) != 0)
 		return (VOP_READ(linkvp, ap->a_uio, 0, ap->a_cred));
 	return (uiomove(SHORTLINK(linkip), linklen, ap->a_uio));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ufs_bmaparray() operation may not
  * deadlock on memory.  See ufs_bmap() for details.
  */
 static int
 ufs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	int error;
 
 	ip = VTOI(vp);
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (error);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = ip->i_umbufobj;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Print the inode number and device name of a vnode's inode, followed
  * by the fifo details when the vnode is a fifo.  Always returns 0.
  */
 static int
 ufs_print(
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap)
 {
 	struct inode *ip = VTOI(ap->a_vp);
 
 	printf("\tino %lu, on dev %s", (u_long)ip->i_number,
 	    devtoname(ip->i_dev));
 	if (ap->a_vp->v_type == VFIFO)
 		fifo_printinfo(ap->a_vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * While holding the vnode interlock, refresh the inode times if the
  * vnode has more than one user, then hand the close to fifo_specops.
  */
 static int
 ufsfifo_close(
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap)
 {
 	struct vnode *fifovp = ap->a_vp;
 
 	VI_LOCK(fifovp);
 	if (fifovp->v_usecount > 1)
 		ufs_itimes_locked(fifovp);
 	VI_UNLOCK(fifovp);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * The fifo kqfilter routine is tried first; only when it reports an
  * error is the request passed on to vfs_kqfilter().
  */
 static int
 ufsfifo_kqfilter(
 	struct vop_kqfilter_args *ap)
 {
 
 	if (fifo_specops.vop_kqfilter(ap) == 0)
 		return (0);
 	return (vfs_kqfilter(ap));
 }
 
 /*
  * Return POSIX pathconf information applicable to ufs filesystems.
  */
 static int
 ufs_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	int error;
 
 	error = 0;
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_PATH_MAX:
 		*ap->a_retval = PATH_MAX;
 		break;
 	case _PC_PIPE_BUF:
 		*ap->a_retval = PIPE_BUF;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 	case _PC_ACL_EXTENDED:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 	case _PC_ACL_PATH_MAX:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 #else
 		*ap->a_retval = 3;
 #endif
 		break;
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 	case _PC_ASYNC_IO:
 		/* _PC_ASYNC_IO should have been handled by upper layers. */
 		KASSERT(0, ("_PC_ASYNC_IO should not get here"));
 		error = EINVAL;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Advisory record locking support.
  *
  * Forwards the request to lf_advlock() together with the inode's lock
  * list head and its current size.
  */
 static int
 ufs_advlock(
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap)
 {
 	struct inode *ip;
 
 	ip = VTOI(ap->a_vp);
 	return (lf_advlock(ap, &ip->i_lockf, ip->i_size));
 }
 
 /*
  * Finish setting up the vnode that belongs to a freshly read inode.
  *
  * The vnode type is derived from the inode mode; fifos are switched to
  * the supplied fifoops vector, the root inode is flagged VV_ROOT, and
  * the inode's modification revision is seeded.  The vnode is asserted
  * to be locked.  mntp is not used.  Always returns 0 and stores the
  * vnode back through vpp.
  */
 int
 ufs_vinit(
 	struct mount *mntp,
 	struct vop_vector *fifoops,
 	struct vnode **vpp)
 {
 	struct vnode *vp = *vpp;
 	struct inode *ip = VTOI(vp);
 
 	vp->v_type = IFTOVT(ip->i_mode);
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 	ASSERT_VOP_LOCKED(vp, "ufs_vinit");
 	if (ip->i_number == ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 	ip->i_modrev = init_va_filerev();
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Allocate a new inode and enter it in directory dvp under the name
  * described by cnp.
  * Vnode dvp must be locked.
  *
  * mode carries the file type and permission bits; IFREG is assumed when
  * no type bits are set.  The new inode inherits the directory's group.
  *
  * On success 0 is returned and *vpp is the new vnode.  On failure an
  * errno is returned, *vpp is NULL, and the new vnode (if one had been
  * allocated) has already been released with vput().
  */
 static int
 ufs_makeinode(mode, dvp, vpp, cnp)
 	int mode;
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 {
 	struct inode *ip, *pdir;
 	struct direct newdir;
 	struct vnode *tvp;
 #ifdef UFS_ACL
 	struct acl *acl;
 #endif
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_makeinode: no name");
 #endif
 	*vpp = NULL;
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)
 		return (error);
 	ip = VTOI(tvp);
 	/* The group always comes from the parent directory. */
 	ip->i_gid = pdir->i_gid;
 	DIP_SET(ip, i_gid, pdir->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 			mode &= ~07111;
 #ifdef QUOTA
 			/*
 			 * Make sure the correct user gets charged
 			 * for the space.
 			 * Quickly knock up a dummy credential for the victim.
 			 * XXX This seems to never be accessed out of our
 			 * context so a stack variable is ok.
 			 */
 			refcount_init(&ucred.cr_ref, 1);
 			ucred.cr_uid = ip->i_uid;
 			ucred.cr_ngroups = 1;
 			ucred.cr_groups[0] = pdir->i_gid;
 			ucp = &ucred;
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 
 #ifdef QUOTA
 		/* Charge one inode to the chosen owner; undo on failure. */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			UFS_VFREE(tvp, ip->i_number, mode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/* Charge one inode to the caller; undo the allocation on failure. */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		UFS_VFREE(tvp, ip->i_number, mode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 #ifdef UFS_ACL
 	/*
 	 * acl stays non-NULL past this block only when the parent has a
 	 * non-empty default ACL that must be applied to the new file below.
 	 */
 	acl = NULL;
 	if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) {
 		acl = uma_zalloc(acl_zone, M_WAITOK);
 
 		/*
 		 * Retrieve default ACL for parent, if any.
 		 */
 		error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred,
 		    cnp->cn_thread);
 		switch (error) {
 		case 0:
 			/*
 			 * Retrieved a default ACL, so merge mode and ACL if
 			 * necessary.
 			 */
 			if (acl->acl_cnt != 0) {
 				/*
 				 * Two possible ways for default ACL to not
 				 * be present.  First, the EA can be
 				 * undefined, or second, the default ACL can
 				 * be blank.  If it's blank, fall through to
 				 * the it's not defined case.
 				 */
 				mode = acl_posix1e_newfilemode(mode, acl);
 				ip->i_mode = mode;
 				DIP_SET(ip, i_mode, mode);
 				ufs_sync_acl_from_inode(ip, acl);
 				break;
 			}
 			/* FALLTHROUGH */
 	
 		case EOPNOTSUPP:
 			/*
 			 * Just use the mode as-is.
 			 */
 			ip->i_mode = mode;
 			DIP_SET(ip, i_mode, mode);
 			uma_zfree(acl_zone, acl);
 			acl = NULL;
 			break;
 	
 		default:
 			/* Any other error aborts the create. */
 			UFS_VFREE(tvp, ip->i_number, mode);
 			vput(tvp);
 			uma_zfree(acl_zone, acl);
 			acl = NULL;
 			return (error);
 		}
 	} else {
 #endif
 		ip->i_mode = mode;
 		DIP_SET(ip, i_mode, mode);
 #ifdef UFS_ACL
 	}
 #endif
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_effnlink = 1;
 	ip->i_nlink = 1;
 	DIP_SET(ip, i_nlink, 1);
 	if (DOINGSOFTDEP(tvp))
 		softdep_change_linkcnt(ip);
 	/*
 	 * Drop the set-gid bit unless the caller is a member of the file's
 	 * group or passes the PRIV_VFS_SETGID privilege check.
 	 */
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
 		ip->i_mode &= ~ISGID;
 		DIP_SET(ip, i_mode, ip->i_mode);
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp)));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	if (acl != NULL) {
 		/*
 		 * XXX: If we abort now, will Soft Updates notify the extattr
 		 * code that the EAs for the file need to be released?
 		 */
 		error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred,
 		    cnp->cn_thread);
 		switch (error) {
 		case 0:
 			break;
 
 		case EOPNOTSUPP:
 			/*
 			 * XXX: This should not happen, as EOPNOTSUPP above was
 			 * supposed to free acl.
 			 */
 			printf("ufs_makeinode: VOP_GETACL() but no "
 			    "VOP_SETACL()\n");
 			/* panic("ufs_makeinode: VOP_GETACL() but no "
 			    "VOP_SETACL()"); */
 			break;
 
 		default:
 			uma_zfree(acl_zone, acl);
 			goto bad;
 		}
 		uma_zfree(acl_zone, acl);
 	}
 #endif /* UFS_ACL */
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
 	if (error)
 		goto bad;
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 * The link count is forced to zero before the vnode is released.
 	 */
 	ip->i_effnlink = 0;
 	ip->i_nlink = 0;
 	DIP_SET(ip, i_nlink, 0);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(tvp))
 		softdep_change_linkcnt(ip);
 	vput(tvp);
 	return (error);
 }
 
 /*
  * Global vfs data structures for ufs.
  *
  * ufs_vnodeops is the operation vector for ufs vnodes; operations not
  * listed fall back to default_vnodeops.  The entries set to VOP_PANIC
  * (fsync, read, reallocblks, write) have no implementation at this
  * layer.
  * NOTE(review): presumably they are supplied by a filesystem-specific
  * vector layered on top of this one -- confirm against the users of
  * ufs_vnodeops.
  */
 struct vop_vector ufs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 	.vop_access =		ufs_access,
 	.vop_advlock =		ufs_advlock,
 	.vop_bmap =		ufs_bmap,
 	.vop_cachedlookup =	ufs_lookup,
 	.vop_close =		ufs_close,
 	.vop_create =		ufs_create,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_link =		ufs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		ufs_mkdir,
 	.vop_mknod =		ufs_mknod,
 	.vop_open =		ufs_open,
 	.vop_pathconf =		ufs_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ufs_print,
 	.vop_readdir =		ufs_readdir,
 	.vop_readlink =		ufs_readlink,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_remove =		ufs_remove,
 	.vop_rename =		ufs_rename,
 	.vop_rmdir =		ufs_rmdir,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_strategy =		ufs_strategy,
 	.vop_symlink =		ufs_symlink,
 	.vop_whiteout =		ufs_whiteout,
 #ifdef UFS_EXTATTR
 	/* Extended attribute operations, only with UFS_EXTATTR. */
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	/* ACL operations, only with UFS_ACL. */
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
 
 /*
  * Operation vector for fifos that live on a ufs filesystem.  Anything
  * not listed here falls back to fifo_specops; close and kqfilter go
  * through the ufsfifo_*() wrappers, and fsync/read/write are left as
  * VOP_PANIC at this layer.
  */
 struct vop_vector ufs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_access =		ufs_access,
 	.vop_close =		ufsfifo_close,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_kqfilter =		ufsfifo_kqfilter,
 	.vop_print =		ufs_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_write =		VOP_PANIC,
 #ifdef UFS_EXTATTR
 	/* Extended attribute operations, only with UFS_EXTATTR. */
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	/* ACL operations, only with UFS_ACL. */
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
Index: head/sys/vm/swap_pager.c
===================================================================
--- head/sys/vm/swap_pager.c	(revision 175201)
+++ head/sys/vm/swap_pager.c	(revision 175202)
@@ -1,2543 +1,2543 @@
 /*-
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *				New Swap System
  *				Matthew Dillon
  *
  * Radix Bitmap 'blists'.
  *
  *	- The new swapper uses the new radix bitmap code.  This should scale
  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
  *	  arbitrary degree of fragmentation.
  *
  * Features:
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
  *	  pages.  
  *
  *	- on the fly deallocation of swap
  *
  *	- No more garbage collection required.  Unnecessarily allocated swap
  *	  blocks only exist for dirty vm_page_t's now and these are already
  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
  *	  removal of invalidated swap blocks when a page is destroyed
  *	  or renamed.
  *
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  *
  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
  *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 #include "opt_swap.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/blist.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 /*
  * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, or 16
  * pages per allocation.  The historical recommendation was a value of 8;
  * note that, unless overridden at build time, it defaults below to
  * MAX_PAGEOUT_CLUSTER, which in turn defaults to 16.
  * The 16-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define MAX_PAGEOUT_CLUSTER 16
 #endif
 
 #if !defined(SWB_NPAGES)
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
 /*
  * Piecemeal swap metadata structure.  Swap is stored in a radix tree.
  *
  * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix
  * is basically 8.  Assuming PAGE_SIZE == 4096, one tree level represents
  * 32K worth of data, two levels represent 256K, three levels represent
  * 2 MBytes.   This is acceptable.
  *
  * Overall memory utilization is about the same as the old swap structure.
  *
  * Each struct swblock describes SWAP_META_PAGES consecutive page indices
  * of one object; SWAP_META_MASK extracts the position of a page index
  * within its swblock.
  */
 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
 #define SWAP_META_PAGES		(SWB_NPAGES * 2)
 #define SWAP_META_MASK		(SWAP_META_PAGES - 1)
 
 struct swblock {
 	struct swblock	*swb_hnext;	/* next swblock on the same swhash chain */
 	vm_object_t	swb_object;	/* lookup key: owning object */
 	vm_pindex_t	swb_index;	/* lookup key: first page index covered */
 	int		swb_count;	/* presumably the number of slots in use -- TODO confirm */
 	daddr_t		swb_pages[SWAP_META_PAGES];	/* one slot per covered page */
 };
 
 /* "swapdev" mutex; presumably guards the swap device list -- TODO confirm */
 static struct mtx sw_dev_mtx;
 /* List of struct swdevt entries. */
 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 /* Compared against nswap_lowat/nswap_hiwat (in pages) by swp_sizecheck(). */
 int swap_pager_avail;
 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
 
 static void swapdev_strategy(struct buf *, struct swdevt *sw);
 
 #define SWM_FREE	0x02	/* free, period			*/
 #define SWM_POP		0x04	/* pop out			*/
 
 int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static int nsw_rcount;		/* free read buffers			*/
 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
 /*
  * Swap metadata hash: swhash is an array of swblock chain heads sized to
  * a power of two in swap_pager_swap_init(), swhash_mask is that size
  * minus one, and swhash_mtx is the "swap_pager swhash" mutex.
  */
 static struct swblock **swhash;
 static int swhash_mask;
 static struct mtx swhash_mtx;
 
 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
 /* Held exclusively by swap_pager_alloc() around named-object lookup/creation. */
 static struct sx sw_alloc_sx;
 
 
 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
  *
  * NOBJLIST() picks one of NOBJLISTS lists from the handle's address
  * (shifted right by 4 and masked), so NOBJLISTS must be a power of 2.
  */
 
 #define NOBJLISTS		8
 
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
 static struct mtx sw_alloc_mtx;	/* protect list manipulation */ 
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];	/* named objects, by handle hash */
 static uma_zone_t	swap_zone;	/* "SWAPMETA" zone of struct swblock */
 static struct vm_object	swap_zone_obj;	/* object handed to uma_zone_set_obj() for swap_zone */
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
  * (see vm/swap_pager.h).
  */
 static vm_object_t
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 				      vm_prot_t prot, vm_ooffset_t offset);
 static void	swap_pager_dealloc(vm_object_t object);
 static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
 static void	swap_pager_init(void);
 static void	swap_pager_unswapped(vm_page_t);
 static void	swap_pager_swapoff(struct swdevt *sp);
 
 /* Pager operations vector for the swap pager. */
 struct pagerops swappagerops = {
 	.pgo_init =	swap_pager_init,	/* early system initialization of pager	*/
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
 };
 
 /*
  * dmmax is in page-sized chunks with the new swap system.  It was
  * dev-bsized chunks in the old.  dmmax is always a power of 2.
  * (swap_pager_init() sets it to SWB_NPAGES * 2.)
  *
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
 static int dmmax;
 static int nswap_lowat = 128;	/* in pages; below this swap_pager_almost_full is set */
 static int nswap_hiwat = 512;	/* in pages; above this swap_pager_almost_full is cleared */
 
 SYSCTL_INT(_vm, OID_AUTO, dmmax,
 	CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static int	swapongeom(struct thread *, struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
 /*
  * Swap bitmap functions
  */
 static void	swp_pager_freeswapspace(daddr_t blk, int npages);
 static daddr_t	swp_pager_getswapspace(int npages);
 
 /*
  * Metadata functions
  */
 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
 /*
  * SWP_SIZECHECK() -	refresh the swap exhaustion indicators
  *
  *	While swap_pager_avail is below nswap_lowat, swap_pager_almost_full
  *	is raised (with a one-time console warning on the transition).
  *	Otherwise swap_pager_full ( task killing ) is cleared, and
  *	swap_pager_almost_full is cleared only once the available count
  *	also exceeds nswap_hiwat, giving lowat/hiwat hysteresis.
  *
  *	No restrictions on call
  *	This routine may not block.
  *	This routine must be called at splvm()
  */
 static void
 swp_sizecheck(void)
 {
 
 	if (swap_pager_avail >= nswap_lowat) {
 		swap_pager_full = 0;
 		if (swap_pager_avail > nswap_hiwat)
 			swap_pager_almost_full = 0;
 		return;
 	}
 
 	/* Running low: warn once per transition into the low state. */
 	if (swap_pager_almost_full == 0) {
 		printf("swap_pager: out of swap space\n");
 		swap_pager_almost_full = 1;
 	}
 }
 
 /*
  * SWP_PAGER_HASH() -	locate the swap metadata slot for (object, index)
  *
  *	The page index is first rounded down to a SWAP_META_PAGES
  *	boundary.  The matching swhash chain is then walked.  The return
  *	value is the address of the link that points at the matching
  *	swblock, or the address of the chain's terminating NULL link when
  *	no swblock matches.
  *
  *	This routine must be called at splvm().
  */
 static struct swblock **
 swp_pager_hash(vm_object_t object, vm_pindex_t index)
 {
 	struct swblock **link;
 
 	index &= ~(vm_pindex_t)SWAP_META_MASK;
 	for (link = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
 	    *link != NULL;
 	    link = &(*link)->swb_hnext) {
 		if ((*link)->swb_object == object &&
 		    (*link)->swb_index == index)
 			break;
 	}
 	return (link);
 }
 
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run 
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  *
  *	Sets up the named-object lists, the list and swap-device mutexes,
  *	and the device stripe size.
  */
 static void
 swap_pager_init(void)
 {
 	int list;
 
 	/* Start with every named-object list empty. */
 	for (list = 0; list < NOBJLISTS; list++)
 		TAILQ_INIT(&swap_pager_object_list[list]);
 
 	mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 
 	/* Device stripe, counted in PAGE_SIZE'd blocks. */
 	dmmax = SWB_NPAGES * 2;
 }
 
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
  *	Expected to be started from pageout process once, prior to entering
  *	its main loop.
  *
  *	Sets the swap buffer limits (nsw_*), creates and sizes the
  *	"SWAPMETA" zone, and allocates the swblock hash table together
  *	with its mutex.
  */
 void
 swap_pager_swap_init(void)
 {
 	int n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
 	 * exhaust the pbufs completely.  Make sure we
 	 * initialize workable values (0 will work for hysteresis
 	 * but it isn't very efficient).
 	 *
 	 * The nsw_cluster_max is constrained by the bp->b_pages[]
 	 * array (MAXPHYS/PAGE_SIZE) and our locally defined
 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 	 * constrained by the swap device interleave stripe size.
 	 *
 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is 
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
 	 * a higher value would probably generate only a limited improvement
 	 * with three or four active swap devices since the system does not
 	 * typically have to pageout at extreme bandwidths.   We will want
 	 * at least 2 per swap devices, and 4 is a pretty good value if you
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 */
 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
 
 	mtx_lock(&pbuf_mtx);
 	nsw_rcount = (nswbuf + 1) / 2;
 	nsw_wcount_sync = (nswbuf + 3) / 4;
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_unlock(&pbuf_mtx);
 
 	/*
 	 * Initialize our zone.  Right now I'm just guessing on the number
 	 * we need based on the number of pages in the system.  Each swblock
 	 * can hold SWAP_META_PAGES pages, so this is probably overkill.
 	 * This reservation is typically limited to around 32MB by default.
 	 *
 	 * The starting guess is half the page count, capped by maxswzone
 	 * (a byte limit) when that is non-zero.
 	 */
 	n = cnt.v_page_count / 2;
 	if (maxswzone && n > maxswzone / sizeof(struct swblock))
 		n = maxswzone / sizeof(struct swblock);
 	n2 = n;
 	swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 	if (swap_zone == NULL)
 		panic("failed to create swap_zone.");
 	do {
 		if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
 		 * size of the previous attempt.
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 	/*
 	 * NOTE(review): if every attempt fails the loop ends with n == 0
 	 * and the only indication is the message below.
 	 */
 	if (n2 != n)
 		printf("Swap zone entries reduced from %d to %d.\n", n2, n);
 	n2 = n;
 
 	/*
 	 * Initialize our meta-data hash table.  The swapper does not need to
 	 * be quite as efficient as the VM system, so we do not use an 
 	 * oversized hash table.
 	 *
 	 * 	n: 		size of hash table, must be power of 2
 	 *	swhash_mask:	hash table index mask
 	 *
 	 * The loop picks the smallest power of 2 that is not below n2 / 8
 	 * (and at least 1).
 	 */
 	for (n = 1; n < n2 / 8; n *= 2)
 		;
 	swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
 	swhash_mask = n - 1;
 	mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
 }
 
 /*
  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
  *	and then converting it with swp_pager_meta_build().
  *
  *	Without a handle a fresh object is always created.  With a handle
  *	the lookup and any creation are performed under Giant and the
  *	exclusive sw_alloc_sx lock, so that an existing named object is
  *	returned instead of a duplicate being created.  The object size
  *	in pages is offset + size rounded up to a page.  prot is unused.
  *
  * MPSAFE
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		 vm_ooffset_t offset)
 {
 	vm_object_t obj;
 	vm_pindex_t npages;
 
 	npages = OFF_TO_IDX(offset + PAGE_MASK + size);
 
 	/* Unnamed region: nothing to look up, just build a new object. */
 	if (handle == NULL) {
 		obj = vm_object_allocate(OBJT_DEFAULT, npages);
 
 		VM_OBJECT_LOCK(obj);
 		swp_pager_meta_build(obj, 0, SWAPBLK_NONE);
 		VM_OBJECT_UNLOCK(obj);
 		return (obj);
 	}
 
 	mtx_lock(&Giant);
 	/*
 	 * Reference existing named region or allocate new one.  There
 	 * should not be a race here against swp_pager_meta_build()
 	 * as called from vm_page_remove() in regards to the lookup
 	 * of the handle.
 	 */
 	sx_xlock(&sw_alloc_sx);
 	obj = vm_pager_object_lookup(NOBJLIST(handle), handle);
 	if (obj == NULL) {
 		obj = vm_object_allocate(OBJT_DEFAULT, npages);
 		obj->handle = handle;
 
 		VM_OBJECT_LOCK(obj);
 		swp_pager_meta_build(obj, 0, SWAPBLK_NONE);
 		VM_OBJECT_UNLOCK(obj);
 	}
 	sx_xunlock(&sw_alloc_sx);
 	mtx_unlock(&Giant);
 	return (obj);
 }
 
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
  *	The swap backing for the object is destroyed.  The code is 
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
  *
  *	This routine may block, but no longer does. 
  *
  *	The object must be locked or unreferenceable.
  */
 static void
 swap_pager_dealloc(vm_object_t object)
 {
 	void *named;
 
 	/*
 	 * A named object is unlinked from its handle list first, so that
 	 * lookups by handle no longer find it while we wait below for
 	 * paging in progress to drain.
 	 */
 	named = object->handle;
 	if (named != NULL) {
 		mtx_lock(&sw_alloc_mtx);
 		TAILQ_REMOVE(NOBJLIST(named), object, pager_object_list);
 		mtx_unlock(&sw_alloc_mtx);
 	}
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
 	 * Release whatever swap metadata is still recorded for the object.
 	 * Only the metadata is walked; swap blocks that are still tied to
 	 * vm_page_t's of this object are not chased down here.
 	 */
 	swp_pager_meta_free_all(object);
 }
 
 /************************************************************************
  *			SWAP PAGER BITMAP ROUTINES			*
  ************************************************************************/
 
 /*
  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
  *
  *	Allocate swap for the requested number of pages.  The starting
  *	swap block number (a page index) is returned or SWAPBLK_NONE
  *	if the allocation failed.
  *
  *	Also has the side effect of advising that somebody made a mistake
  *	when they configured swap and didn't configure enough.
  *
  *	Must be called at splvm() to avoid races with bitmap frees from
  *	vm_page_remove() aka swap_pager_page_removed().
  *
  *	This routine may not block
  *	This routine must be called at splvm().
  *
  *	We allocate in round-robin fashion from the configured devices.
  */
 static daddr_t
 swp_pager_getswapspace(int npages)
 {
 	struct swdevt *dev;
 	daddr_t blk;
 	int tries;
 
 	blk = SWAPBLK_NONE;
 	mtx_lock(&sw_dev_mtx);
 
 	/*
 	 * Visit each configured device at most once, starting at the
 	 * round-robin cursor (swdevhd) and wrapping to the head of the
 	 * device list when the cursor runs off the end.
 	 */
 	for (tries = 0, dev = swdevhd; tries < nswapdev;
 	    tries++, dev = TAILQ_NEXT(dev, sw_list)) {
 		if (dev == NULL)
 			dev = TAILQ_FIRST(&swtailq);
 		/* Devices being closed are never allocated from. */
 		if ((dev->sw_flags & SW_CLOSING) != 0)
 			continue;
 		blk = blist_alloc(dev->sw_blist, npages);
 		if (blk == SWAPBLK_NONE)
 			continue;
 
 		/*
 		 * Success: convert the device-relative block to a global
 		 * one, update the accounting and advance the cursor past
 		 * this device.
 		 */
 		blk += dev->sw_first;
 		dev->sw_used += npages;
 		swap_pager_avail -= npages;
 		swp_sizecheck();
 		swdevhd = TAILQ_NEXT(dev, sw_list);
 		mtx_unlock(&sw_dev_mtx);
 		return (blk);
 	}
 
 	/*
 	 * No device could satisfy the request.  Complain only on the
 	 * transition into the "full" state, then reset the cursor.
 	 */
 	if (swap_pager_full != 2) {
 		printf("swap_pager_getswapspace(%d): failed\n", npages);
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	return (blk);
 }
 
 static int
 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
 {
 
 	/* True when blk lies in the half-open range [sw_first, sw_end). */
 	if (blk < sp->sw_first)
 		return (0);
 	return (blk < sp->sw_end);
 }
 	
 static void
 swp_pager_strategy(struct buf *bp)
 {
 	struct swdevt *dev;
 
 	/*
 	 * Find the swap device whose block range contains the buffer's
 	 * starting block and hand the buffer to that device's strategy
 	 * routine.  The device list lock is dropped before the call.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(bp->b_blkno, dev))
 			continue;
 		mtx_unlock(&sw_dev_mtx);
 		dev->sw_strategy(bp, dev);
 		return;
 	}
 	/* Every swap block must belong to some configured device. */
 	panic("Swapdev not found");
 }
 	
 
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space 
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
  *	Note:  This routine may not block (it could in the old swap code),
  *	and through the use of the new blist routines it does not block.
  *
  *	We must be called at splvm() to avoid races with bitmap frees from
  *	vm_page_remove() aka swap_pager_page_removed().
  *
  *	This routine may not block
  *	This routine must be called at splvm().
  */
 static void
 swp_pager_freeswapspace(daddr_t blk, int npages)
 {
 	struct swdevt *dev;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(blk, dev))
 			continue;
 
 		/* The in-use count drops regardless of the device state. */
 		dev->sw_used -= npages;
 
 		/*
 		 * While the device is being closed its blocks are not
 		 * handed back to the blist, so that they cannot be
 		 * allocated again.
 		 */
 		if (!(dev->sw_flags & SW_CLOSING)) {
 			blist_free(dev->sw_blist, blk - dev->sw_first,
 			    npages);
 			swap_pager_avail += npages;
 			swp_sizecheck();
 		}
 		mtx_unlock(&sw_dev_mtx);
 		return;
 	}
 	/* Every swap block must belong to some configured device. */
 	panic("Swapdev not found");
 }
 
 /*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
  *	This is a globally accessible routine.
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
  *	The external callers of this routine typically have already destroyed 
  *	or renamed vm_page_t's associated with this range in the object so 
  *	we should be ok.
  *
  *	This routine may be called at any spl.  We up our spl to splvm temporarily
  *	in order to perform the metadata removal.
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 
 	/*
 	 * Thin exported wrapper: the caller must hold the object lock
 	 * (asserted here); the work is done by swp_pager_meta_free() on
 	 * the page range described by (start, size).
 	 */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	swp_pager_meta_free(object, start, size);
 }
 
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
  *	Assigns swap blocks to the specified range within the object.  The 
  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  *
  *	Returns 0 on success, -1 on failure.
  */
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 	int n = 0;			/* blocks left in current chunk */
 	daddr_t blk = SWAPBLK_NONE;	/* next unassigned block in chunk */
 	vm_pindex_t beg = start;	/* save start index */
 
 	VM_OBJECT_LOCK(object);
 	while (size) {
 		if (n == 0) {
 			/*
 			 * Current chunk is used up: grab a new one, as large
 			 * as possible, halving the request each time the
 			 * allocator cannot satisfy it.
 			 */
 			n = BLIST_MAX_ALLOC;
 			while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
 				n >>= 1;
 				if (n == 0) {
 					/*
 					 * Out of swap: undo the assignments
 					 * made so far and report failure.
 					 */
 					swp_pager_meta_free(object, beg, start - beg);
 					VM_OBJECT_UNLOCK(object);
 					return (-1);
 				}
 			}
 		}
 		/* Assign one block of the chunk to the next page index. */
 		swp_pager_meta_build(object, start, blk);
 		--size;
 		++start;
 		++blk;
 		--n;
 	}
 	/*
 	 * The last chunk may be larger than what was still needed.  The
 	 * unused tail [blk, blk + n) was never entered into the object's
 	 * metadata, so it has to be handed back to the raw allocator
 	 * directly.  (Freeing metadata at 'start' here instead would leak
 	 * those blocks and would discard swap assigned to pages beyond the
 	 * requested range.)
 	 */
 	if (n > 0)
 		swp_pager_freeswapspace(blk, n);
 	VM_OBJECT_UNLOCK(object);
 	return (0);
 }
 
 /*
  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  *			and destroy the source.
  *
  *	Copy any valid swapblks from the source to the destination.  In
  *	cases where both the source and destination have a valid swapblk,
  *	we keep the destination's.
  *
  *	This routine is allowed to block.  It may block allocating metadata
  *	indirectly through swp_pager_meta_build() or if paging is still in
  *	progress on the source. 
  *
  *	This routine can be called at any spl
  *
  *	XXX vm_page_collapse() kinda expects us not to block because we 
  *	supposedly do not need to allocate memory, but for the moment we
  *	*may* have to get a little memory from the zone allocator, but
  *	it is taken from the interrupt memory.  We should be ok. 
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
  *	The source object is of type OBJT_SWAP.
  *
  *	The source and destination objects must be locked or 
  *	inaccessible (XXX are they ?)
  */
 void
 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t offset, int destroysource)
 {
 	vm_pindex_t pg;
 	daddr_t dblk, sblk;
 
 	VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
 
 	/*
 	 * A named source that is going away is taken off its handle list
 	 * up front.
 	 */
 	if (destroysource && srcobject->handle != NULL) {
 		mtx_lock(&sw_alloc_mtx);
 		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
 		    pager_object_list);
 		mtx_unlock(&sw_alloc_mtx);
 	}
 
 	/*
 	 * Walk every page index of the destination.  Source index is the
 	 * destination index shifted by 'offset'.
 	 */
 	for (pg = 0; pg < dstobject->size; ++pg) {
 		/* Look at (without modifying) the destination's swapblk. */
 		dblk = swp_pager_meta_ctl(dstobject, pg, 0);
 
 		if (dblk != SWAPBLK_NONE) {
 			/*
 			 * The destination already has its own block, which
 			 * wins; the corresponding source block is freed.
 			 */
 			swp_pager_meta_ctl(srcobject, pg + offset, SWM_FREE);
 			continue;
 		}
 
 		/*
 		 * Destination slot is empty: detach the block from the
 		 * source (SWM_POP) and, if there was one, attach it to the
 		 * destination.
 		 */
 		sblk = swp_pager_meta_ctl(srcobject, pg + offset, SWM_POP);
 		if (sblk == SWAPBLK_NONE)
 			continue;
 
 		/*
 		 * swp_pager_meta_build() may sleep.  Both objects carry a
 		 * paging-in-progress reference across the call and the
 		 * source lock is dropped for its duration.
 		 */
 		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_UNLOCK(srcobject);
 		vm_object_pip_add(dstobject, 1);
 		swp_pager_meta_build(dstobject, pg, sblk);
 		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_LOCK(srcobject);
 		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
 	 * When the source is being destroyed, drop whatever swap it still
 	 * holds and turn it back into a default object, matching the fact
 	 * that it was removed from the swap pager's list above.
 	 */
 	if (destroysource) {
 		swp_pager_meta_free_all(srcobject);
 		srcobject->type = OBJT_DEFAULT;
 	}
 }
 
 /*
  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
  *				the requested page.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page within a reasonable
  *	distance.  We do not try to restrict it to the swap device stripe
  *	(that is handled in getpages/putpages).  It probably isn't worth
  *	doing here.
  */
 static boolean_t
 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
 {
 	daddr_t base;
 	int dist, run;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Swap block backing the requested page, if any. */
 	base = swp_pager_meta_ctl(object, pindex, 0);
 	if (base == SWAPBLK_NONE) {
 		if (before != NULL)
 			*before = 0;
 		if (after != NULL)
 			*after = 0;
 		return (FALSE);
 	}
 
 	/*
 	 * Count how many pages immediately below pindex are backed by
 	 * swap blocks that directly precede 'base', without going below
 	 * page index 0 and looking at no more than SWB_NPAGES/2 - 1 pages.
 	 */
 	if (before != NULL) {
 		for (run = 0; run + 1 < (SWB_NPAGES/2); ++run) {
 			dist = run + 1;
 			if (dist > pindex)
 				break;
 			if (swp_pager_meta_ctl(object, pindex - dist, 0) !=
 			    base - dist)
 				break;
 		}
 		*before = run;
 	}
 
 	/*
 	 * Same in the forward direction: pages above pindex whose swap
 	 * blocks directly follow 'base'.
 	 */
 	if (after != NULL) {
 		for (run = 0; run + 1 < (SWB_NPAGES/2); ++run) {
 			dist = run + 1;
 			if (swp_pager_meta_ctl(object, pindex + dist, 0) !=
 			    base + dist)
 				break;
 		}
 		*after = run;
 	}
 	return (TRUE);
 }
 
 /*
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
  *	not, from the page.  
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
  *	calls us in a special-case situation
  *
  *	NOTE!!!  If the page is clean and the swap was valid, the caller
  *	should make the page dirty before calling this routine.  This routine
  *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
  *	depends on it.
  *
  *	This routine may not block
  *	This routine must be called at splvm()
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	vm_object_t obj = m->object;
 
 	/* The page's object must be locked by the caller. */
 	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
 	/* Free the swap block recorded for this page's index, if any. */
 	swp_pager_meta_ctl(obj, m->pindex, SWM_FREE);
 }
 
 /*
  * SWAP_PAGER_GETPAGES() - bring pages in from swap
  *
  *	Attempt to retrieve (m, count) pages from backing store, but make
  *	sure we retrieve at least m[reqpage].  We try to load in as large
  *	a chunk surrounding m[reqpage] as is contiguous in swap and which
  *	belongs to the same object.
  *
  *	The code is designed for asynchronous operation and 
  *	immediate-notification of 'reqpage' but tends not to be
  *	used that way.  Please do not optimize-out this algorithmic
  *	feature, I intend to improve on it in the future.
  *
  *	The parent has a single vm_object_pip_add() reference prior to
  *	calling us and we should return with the same.
  *
  *	The parent has BUSY'd the pages.  We should return with 'm'
  *	left busy, but the others adjusted.
  */
 static int
 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
 {
 	struct buf *bp;
 	vm_page_t mreq;
 	int i;
 	int j;
 	daddr_t blk;
 
 	/*
 	 * Read m[reqpage], plus as many of its neighbours in m[] as are
 	 * contiguous with it on swap, in a single I/O.  Returns
 	 * VM_PAGER_FAIL when the requested page has no swap block,
 	 * VM_PAGER_ERROR when the page is not fully valid after the read,
 	 * and VM_PAGER_OK otherwise.
 	 *
 	 * The object lock is dropped and re-taken inside this function;
 	 * every return path is reached with the lock held.
 	 */
 	mreq = m[reqpage];
 
 	KASSERT(mreq->object == object,
 	    ("swap_pager_getpages: object mismatch %p/%p",
 	    object, mreq->object));
 
 	/*
 	 * Calculate range to retrieve.  The pages have already been assigned
 	 * their swapblks.  We require a *contiguous* range but we know it to
 	 * not span devices.   If we do not supply it, bad things
 	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
 	 * loops are set up such that the case(s) are handled implicitly.
 	 *
 	 * After the two loops below, [i, j) is the sub-range of m[] that
 	 * will be read; it always contains reqpage.
 	 */
 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
 
 	/* Extend the range downwards while the swap blocks stay contiguous. */
 	for (i = reqpage - 1; i >= 0; --i) {
 		daddr_t iblk;
 
 		iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
 		if (blk != iblk + (reqpage - i))
 			break;
 	}
 	/* i stopped on the first page NOT in the range; step back onto it. */
 	++i;
 
 	/* Extend the range upwards in the same way; j ends one past the end. */
 	for (j = reqpage + 1; j < count; ++j) {
 		daddr_t jblk;
 
 		jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
 		if (blk != jblk - (j - reqpage))
 			break;
 	}
 
 	/*
 	 * free pages outside our collection range.   Note: we never free
 	 * mreq, it must remain busy throughout.
 	 */
 	if (0 < i || j < count) {
 		int k;
 
 		vm_page_lock_queues();
 		for (k = 0; k < i; ++k)
 			vm_page_free(m[k]);
 		for (k = j; k < count; ++k)
 			vm_page_free(m[k]);
 		vm_page_unlock_queues();
 	}
 
 	/*
 	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq
 	 * still busy, but the others unbusied.
 	 */
 	if (blk == SWAPBLK_NONE)
 		return (VM_PAGER_FAIL);
 
 	/*
 	 * Getpbuf() can sleep.
 	 */
 	VM_OBJECT_UNLOCK(object);
 	/*
 	 * Get a swap buffer header to perform the IO
 	 */
 	bp = getpbuf(&nsw_rcount);
 	bp->b_flags |= B_PAGING;
 
 	/*
 	 * map our page(s) into kva for input
 	 */
 	pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
 
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
 	/* blk belongs to m[reqpage]; back up to the block of m[i]. */
 	bp->b_blkno = blk - (reqpage - i);
 	bp->b_bcount = PAGE_SIZE * (j - i);
 	bp->b_bufsize = PAGE_SIZE * (j - i);
 	/* Index of the requested page within the buffer's page array. */
 	bp->b_pager.pg_reqpage = reqpage - i;
 
 	VM_OBJECT_LOCK(object);
 	{
 		int k;
 
 		/*
 		 * Record the pages in the buffer and flag each one as
 		 * having swap I/O in progress.
 		 */
 		for (k = i; k < j; ++k) {
 			bp->b_pages[k - i] = m[k];
 			m[k]->oflags |= VPO_SWAPINPROG;
 		}
 	}
 	bp->b_npages = j - i;
 
 	PCPU_INC(cnt.v_swapin);
 	PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
 
 	/*
 	 * We still hold the lock on mreq, and our automatic completion routine
 	 * does not remove it.
 	 */
 	vm_object_pip_add(object, bp->b_npages);
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
 	 * The other pages in our m[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 	 */
 	BUF_KERNPROC(bp);
 	swp_pager_strategy(bp);
 
 	/*
 	 * wait for the page we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the meta-data.
 	 *
 	 * Each sleep is bounded by a 20 second timeout (hz*20); a non-zero
 	 * msleep() return only produces the diagnostic below and the wait
 	 * then continues.
 	 *
 	 * NOTE(review): the diagnostic reads fields of bp, although the
 	 * comment above states that bp must not be considered valid once
 	 * the I/O has been started.  Confirm whether this dereference is
 	 * safe, or capture the values before calling the strategy routine.
 	 */
 	VM_OBJECT_LOCK(object);
 	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
 		mreq->oflags |= VPO_WANTED;
 		vm_page_lock_queues();
 		vm_page_flag_set(mreq, PG_REFERENCED);
 		vm_page_unlock_queues();
 		PCPU_INC(cnt.v_intrans);
 		if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
 
 	/*
 	 * mreq is left busied after completion, but all the other pages
 	 * are freed.  If we had an unrecoverable read error the page will
 	 * not be valid.
 	 */
 	if (mreq->valid != VM_PAGE_BITS_ALL) {
 		return (VM_PAGER_ERROR);
 	} else {
 		return (VM_PAGER_OK);
 	}
 
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
 	 * the page clean when we return, causing the page to possibly revert
 	 * to all zeros later.
 	 */
 }
 
 /*
  *	swap_pager_putpages: 
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
  *	are automatically converted to SWAP objects.
  *
  *	In a low memory situation we may block in VOP_STRATEGY(), but the new 
  *	vm_page reservation system coupled with properly written VFS devices 
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
  *	The parent has N vm_object_pip_add() references prior to
  *	calling us and will remove references for rtvals[] that are
  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
  *	completion.
  *
  *	The parent has soft-busy'd the pages it passes us and will unbusy
  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
 void
 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
     boolean_t sync, int *rtvals)
 {
 	int i;
 	int n = 0;	/* number of pages handled by the current chunk */
 
 	/*
 	 * Write the pages m[0..count) to swap in chunks, storing a
 	 * per-page result in rtvals[].  The object lock is released
 	 * shortly after entry (end of Step 1) and re-acquired just before
 	 * returning.
 	 */
 	if (count && m[0]->object != object) {
 		panic("swap_pager_putpages: object mismatch %p/%p", 
 		    object, 
 		    m[0]->object
 		);
 	}
 
 	/*
 	 * Step 1
 	 *
 	 * Turn object into OBJT_SWAP
 	 * check for bogus sysops
 	 * force sync if not pageout process
 	 */
 	if (object->type != OBJT_SWAP)
 		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 	VM_OBJECT_UNLOCK(object);
 
 	if (curproc != pageproc)
 		sync = TRUE;
 
 	/*
 	 * Step 2
 	 *
 	 * Update nsw parameters from swap_async_max sysctl values.
 	 * Do not let the sysop crash the machine with bogus numbers.
 	 */
 	mtx_lock(&pbuf_mtx);
 	if (swap_async_max != nsw_wcount_async_max) {
 		/* NB: this 'n' shadows the function-level 'n' in this scope. */
 		int n;
 
 		/*
 		 * limit range to [1, nswbuf / 2]
 		 */
 		if ((n = swap_async_max) > nswbuf / 2)
 			n = nswbuf / 2;
 		if (n < 1)
 			n = 1;
 		swap_async_max = n;
 
 		/*
 		 * Adjust difference ( if possible ).  If the current async
 		 * count is too low, we may not be able to make the adjustment
 		 * at this time.
 		 */
 		n -= nsw_wcount_async_max;
 		if (nsw_wcount_async + n >= 0) {
 			nsw_wcount_async += n;
 			nsw_wcount_async_max += n;
 			wakeup(&nsw_wcount_async);
 		}
 	}
 	mtx_unlock(&pbuf_mtx);
 
 	/*
 	 * Step 3
 	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
 	 *
 	 * Each pass handles pages [i, i + n); the loop then advances by n.
 	 */
 	for (i = 0; i < count; i += n) {
 		int j;
 		struct buf *bp;
 		daddr_t blk;
 
 		/*
 		 * Maximum I/O size is limited by a number of factors.
 		 */
 		n = min(BLIST_MAX_ALLOC, count - i);
 		n = min(n, nsw_cluster_max);
 
 		/*
 		 * Get biggest block of swap we can.  If we fail, fall
 		 * back and try to allocate a smaller block.  Don't go
 		 * overboard trying to allocate space if it would overly
 		 * fragment swap.
 		 */
 		while (
 		    (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
 		    n > 4
 		) {
 			n >>= 1;
 		}
 		if (blk == SWAPBLK_NONE) {
 			/* No swap for this chunk: fail its pages and move on. */
 			for (j = 0; j < n; ++j)
 				rtvals[i+j] = VM_PAGER_FAIL;
 			continue;
 		}
 
 		/*
 		 * All I/O parameters have been satisfied, build the I/O
 		 * request and assign the swap space.
 		 */
 		if (sync == TRUE) {
 			bp = getpbuf(&nsw_wcount_sync);
 		} else {
 			bp = getpbuf(&nsw_wcount_async);
 			bp->b_flags = B_ASYNC;
 		}
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
 		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
 
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 
 		/*
 		 * Under the object lock: record the new swap block for each
 		 * page, keep the page dirty, flag it as having swap I/O in
 		 * progress and enter it in the buffer's page array.
 		 */
 		VM_OBJECT_LOCK(object);
 		for (j = 0; j < n; ++j) {
 			vm_page_t mreq = m[i+j];
 
 			swp_pager_meta_build(
 			    mreq->object, 
 			    mreq->pindex,
 			    blk + j
 			);
 			vm_page_dirty(mreq);
 			rtvals[i+j] = VM_PAGER_OK;
 
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
 		VM_OBJECT_UNLOCK(object);
 		bp->b_npages = n;
 		/*
 		 * Must set dirty range for NFS to work.
 		 */
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
 		PCPU_INC(cnt.v_swapout);
 		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
 
 		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		if (sync == FALSE) {
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
 
 			for (j = 0; j < n; ++j)
 				rtvals[i+j] = VM_PAGER_PEND;
 			/* restart outer loop */
 			continue;
 		}
 
 		/*
 		 * synchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		bp->b_iodone = bdone;
 		swp_pager_strategy(bp);
 
 		/*
 		 * Wait for the sync I/O to complete, then update rtvals.
 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
 		 * our async completion routine at the end, thus avoiding a
 		 * double-free.
 		 */
 		bwait(bp, PVM, "swwrt");
 		for (j = 0; j < n; ++j)
 			rtvals[i+j] = VM_PAGER_PEND;
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
 		 */
 		swp_pager_async_iodone(bp);
 	}
 	/* Return with the object locked again. */
 	VM_OBJECT_LOCK(object);
 }
 
 /*
  *	swp_pager_async_iodone:
  *
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
  *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
  *	the pages are vm_page_t->busy'd.  On completion of a READ we unbusy
  *	(PG_BUSY) all pages except the 'main' request page.  On completion of
  *	a WRITE we unbusy (vm_page_t->busy) all pages ( we can do this
  *	because we marked them all VM_PAGER_PEND on return from putpages ).
  *
  *	This routine may not block.
  *	This routine is called at splbio() or better
  *
  *	We up ourselves to splvm() as required for various vm_page related
  *	calls.
  */
 static void
 swp_pager_async_iodone(struct buf *bp)
 {
 	int i;
 	vm_object_t object = NULL;	/* object of the buffer's pages, if any */
 
 	/*
 	 * Completion handling for one swap buffer: log an I/O error if one
 	 * occurred, unmap the pages, update each page's state according to
 	 * the direction and outcome of the I/O, drop the paging-in-progress
 	 * references and release the buffer.
 	 */
 
 	/*
 	 * report error
 	 */
 	if (bp->b_ioflags & BIO_ERROR) {
 		printf(
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 		    (long)bp->b_blkno, 
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
 	}
 
 	/*
 	 * remove the mapping for kernel virtual
 	 */
 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 
 	/*
 	 * The object is taken from the first page of the buffer and stays
 	 * locked for the whole page loop below.
 	 */
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
 		VM_OBJECT_LOCK(object);
 	}
 	vm_page_lock_queues();
 	/*
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
 	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The erroneous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
 		/* I/O on this page is over, whatever the outcome. */
 		m->oflags &= ~VPO_SWAPINPROG;
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
 			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
 				 * When reading, reqpage needs to stay
 				 * locked for the parent, but all other
 				 * pages can be freed.  We still want to
 				 * wakeup the parent waiting on the page,
 				 * though.  ( also: pg_reqpage can be -1 and
 				 * not match anything ).
 				 *
 				 * We have to wake specifically requested pages
 				 * up too because we cleared VPO_SWAPINPROG and
 				 * someone may be waiting for that.
 				 *
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
 				if (i != bp->b_pager.pg_reqpage)
 					vm_page_free(m);
 				else
 					vm_page_flash(m);
 				/*
 				 * If i == bp->b_pager.pg_reqpage, do not wake
 				 * the page up.  The caller needs to.
 				 */
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
 				vm_page_dirty(m);
 				vm_page_activate(m);
 				vm_page_io_finish(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
 			 * For read success, clear dirty bits.  Nobody should
 			 * have this page mapped but don't take any chances,
 			 * make sure the pmap modify bits are also cleared.
 			 *
 			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
 			 *
 			 * If not the requested page then deactivate it.
 			 *
 			 * Note that the requested page, reqpage, is left
 			 * busied, but we still have to wake it up.  The
 			 * other pages are released (unbusied) by
 			 * vm_page_wakeup().
 			 *
 			 * NOTE(review): an earlier version of this comment
 			 * said reqpage's valid bits are left to the caller,
 			 * but the code below sets m->valid for every page,
 			 * the requested one included.
 			 */
 			pmap_clear_modify(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 
 			/*
 			 * We have to wake specifically requested pages
 			 * up too because we cleared VPO_SWAPINPROG and
 			 * could be waiting for it in getpages.  However,
 			 * be sure to not unbusy getpages specifically
 			 * requested page - getpages expects it to be
 			 * left busy.
 			 */
 			if (i != bp->b_pager.pg_reqpage) {
 				vm_page_deactivate(m);
 				vm_page_wakeup(m);
 			} else {
 				vm_page_flash(m);
 			}
 		} else {
 			/*
 			 * For write success, clear the modify and dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiters up ).
 			 */
 			pmap_clear_modify(m);
 			vm_page_undirty(m);
 			vm_page_io_finish(m);
 			if (vm_page_count_severe())
 				vm_page_try_to_cache(m);
 		}
 	}
 	vm_page_unlock_queues();
 
 	/*
 	 * adjust pip.  NOTE: the original parent may still have its own
 	 * pip refs on the object.
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
 		VM_OBJECT_UNLOCK(object);
 	}
 
 	/* 
 	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
 	if (bp->b_vp) {
 		    bp->b_vp = NULL;
 		    bp->b_bufobj = NULL;
 	}
 	/*
 	 * release the physical I/O buffer back to the counter it was
 	 * taken from: reads, async writes or sync writes.
 	 */
 	relpbuf(
 	    bp, 
 	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
 		((bp->b_flags & B_ASYNC) ? 
 		    &nsw_wcount_async : 
 		    &nsw_wcount_sync
 		)
 	    )
 	);
 }
 
 /*
  *	swap_pager_isswapped:
  *
  *	Return 1 if at least one page in the given object is paged
  *	out to the given swap device.
  *
  *	This routine may not block.
  */
 int
 swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
 {
 	daddr_t index = 0;	/* page index of the swblock being probed */
 	int bcount;		/* number of swblocks located so far */
 	int i;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/* Only swap objects carry swap metadata. */
 	if (object->type != OBJT_SWAP)
 		return (0);
 
 	/*
 	 * Probe the hash at successive SWAP_META_PAGES-aligned indices
 	 * until every swblock the object accounts for (swp_bcount) has
 	 * been examined.  Only probes that actually find a swblock count
 	 * towards that total; counting empty probes as well would end the
 	 * scan early for a sparsely populated object and miss blocks that
 	 * sit at higher indices.
 	 */
 	mtx_lock(&swhash_mtx);
 	for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; ) {
 		struct swblock *swap;
 
 		if ((swap = *swp_pager_hash(object, index)) != NULL) {
 			for (i = 0; i < SWAP_META_PAGES; ++i) {
 				if (swp_pager_isondev(swap->swb_pages[i], sp)) {
 					mtx_unlock(&swhash_mtx);
 					return (1);
 				}
 			}
 			bcount++;
 		}
 		index += SWAP_META_PAGES;
 		/* Sanity limit: the accounted blocks must exist somewhere. */
 		if (index > 0x20000000)
 			panic("swap_pager_isswapped: failed to locate all swap meta blocks");
 	}
 	mtx_unlock(&swhash_mtx);
 	return (0);
 }
 
 /*
  * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
  *
  *	This routine dissociates the page at the given index within a
  *	swap block from its backing store, paging it in if necessary.
  *	If the page is paged in, it is placed in the inactive queue,
  *	since it had its backing store ripped out from under it.
  *	We also attempt to swap in all other pages in the swap block,
  *	we only guarantee that the one at the specified index is
  *	paged in.
  *
  *	XXX - The code to page the whole block in doesn't work, so we
  *	      revert to the one-by-one behavior for now.  Sigh.
  */
 static inline void
 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t pg;
 
 	vm_object_pip_add(object, 1);
 	pg = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 
 	if (pg->valid != VM_PAGE_BITS_ALL) {
 		/*
 		 * The page is not fully valid, so it has to be read back
 		 * from swap.  After the read it is dirtied and handed to
 		 * vm_page_dontneed().
 		 */
 		if (swap_pager_getpages(object, &pg, 1, 0) != VM_PAGER_OK)
 			panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
 		vm_object_pip_subtract(object, 1);
 		vm_page_lock_queues();
 		vm_page_dirty(pg);
 		vm_page_dontneed(pg);
 		vm_page_unlock_queues();
 	} else {
 		/*
 		 * The page is already fully valid; no I/O is needed.  It is
 		 * activated and dirtied.
 		 */
 		vm_object_pip_subtract(object, 1);
 		vm_page_lock_queues();
 		vm_page_activate(pg);
 		vm_page_dirty(pg);
 		vm_page_unlock_queues();
 	}
 
 	/*
 	 * Common tail: wake up the page, then detach it from its swap
 	 * backing store.
 	 */
 	vm_page_wakeup(pg);
 	vm_pager_page_unswapped(pg);
 }
 
 /*
  *	swap_pager_swapoff:
  *
  *	Page in all of the pages that have been paged out to the
  *	given device.  The corresponding blocks in the bitmap must be
  *	marked as allocated and the device must be flagged SW_CLOSING.
  *	There may be no processes swapped out to the device.
  *
  *	This routine may block.
  */
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
 	struct swblock *swap;
 	int i, j, retries;
 
 	GIANT_REQUIRED;
 
 	/*
 	 * Scan the whole swap hash for blocks that live on device 'sp' and
 	 * force each one in.  The scan is repeated (up to 100 extra passes,
 	 * pausing hz / 20 ticks between them) for as long as the device
 	 * still reports blocks in use.
 	 */
 	retries = 0;
 full_rescan:
 	mtx_lock(&swhash_mtx);
 	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 restart:
 		/*
 		 * Walk hash chain i.  The walk is restarted from the head of
 		 * the chain after every page-in, because swhash_mtx was
 		 * dropped in the meantime.
 		 */
 		for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
 			vm_object_t object = swap->swb_object;
 			vm_pindex_t pindex = swap->swb_index;
                         for (j = 0; j < SWAP_META_PAGES; ++j) {
                                 if (swp_pager_isondev(swap->swb_pages[j], sp)) {
 					/*
 					 * avoid deadlock: only try-lock the
 					 * object while swhash_mtx is held.
 					 * On failure this swblock is skipped
 					 * for now (the break leaves the j
 					 * loop) and is picked up again by a
 					 * later full rescan.
 					 */
 					if (!VM_OBJECT_TRYLOCK(object)) {
 						break;
 					} else {
 						mtx_unlock(&swhash_mtx);
 						swp_pager_force_pagein(object,
 						    pindex + j);
 						VM_OBJECT_UNLOCK(object);
 						mtx_lock(&swhash_mtx);
 						goto restart;
 					}
 				}
                         }
 		}
 	}
 	mtx_unlock(&swhash_mtx);
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
 		 * make another pass.  We have marked this device as
 		 * SW_CLOSING, so the activity should finish soon.
 		 */
 		retries++;
 		if (retries > 100) {
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
 		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 }
 
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
  *
  *	These routines manipulate the swap metadata stored in the 
  *	OBJT_SWAP object.  All swp_*() routines must be called at
  *	splvm() because swap can be freed up by the low level vm_page
  *	code which might be called from interrupts beyond what splbio() covers.
  *
  *	Swap metadata is implemented with a global hash and not directly
  *	linked into the object.  Instead the object simply contains
  *	appropriate tracking counters.
  */
 
 /*
  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
  *
  *	We first convert the object to a swap object if it is a default
  *	object.
  *
  *	The specified swapblk is added to the object's swap metadata.  If
  *	the swapblk is not valid, it is freed instead.  Any previously
  *	assigned swapblk is freed.
  *
  *	This routine must be called at splvm(), except when used to convert
  *	an OBJT_DEFAULT object into an OBJT_SWAP object.
  */
 static void
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
 	struct swblock *swap;
 	struct swblock **pswap;
 	int idx;
 
 	/* The caller must hold the object lock. */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * Convert default object to swap object if necessary
 	 */
 	if (object->type != OBJT_SWAP) {
 		object->type = OBJT_SWAP;
 		object->un_pager.swp.swp_bcount = 0;
 
 		/*
 		 * Objects with a handle are also put on the handle's
 		 * NOBJLIST() list, under sw_alloc_mtx.
 		 */
 		if (object->handle != NULL) {
 			mtx_lock(&sw_alloc_mtx);
 			TAILQ_INSERT_TAIL(
 			    NOBJLIST(object->handle),
 			    object, 
 			    pager_object_list
 			);
 			mtx_unlock(&sw_alloc_mtx);
 		}
 	}
 	
 	/*
 	 * Locate hash entry.  If not found create, but if we aren't adding
 	 * anything just return.  If we run out of space in the map we wait
 	 * and, since the hash table may have changed, retry.
 	 */
 retry:
 	mtx_lock(&swhash_mtx);
 	pswap = swp_pager_hash(object, pindex);
 
 	if ((swap = *pswap) == NULL) {
 		int i;
 
 		/* Nothing stored and nothing to store: no swblock needed. */
 		if (swapblk == SWAPBLK_NONE)
 			goto done;
 
 		/*
 		 * Non-sleeping allocation, since swhash_mtx is held.  On
 		 * failure both locks are dropped before waiting in VM_WAIT,
 		 * then the object lock is retaken and the lookup redone.
 		 */
 		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
 		if (swap == NULL) {
 			mtx_unlock(&swhash_mtx);
 			VM_OBJECT_UNLOCK(object);
 			if (uma_zone_exhausted(swap_zone))
 				printf("swap zone exhausted, increase kern.maxswzone\n");
 			VM_WAIT;
 			VM_OBJECT_LOCK(object);
 			goto retry;
 		}
 
 		/*
 		 * Initialize the new swblock: it covers the SWAP_META_PAGES
 		 * aligned range containing pindex and starts out empty.
 		 */
 		swap->swb_hnext = NULL;
 		swap->swb_object = object;
 		swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
 		swap->swb_count = 0;
 
 		++object->un_pager.swp.swp_bcount;
 
 		for (i = 0; i < SWAP_META_PAGES; ++i)
 			swap->swb_pages[i] = SWAPBLK_NONE;
 	}
 
 	/*
 	 * Delete prior contents of metadata
 	 */
 	idx = pindex & SWAP_META_MASK;
 
 	if (swap->swb_pages[idx] != SWAPBLK_NONE) {
 		swp_pager_freeswapspace(swap->swb_pages[idx], 1);
 		--swap->swb_count;
 	}
 
 	/*
 	 * Enter block into metadata
 	 */
 	/* swb_count tracks the number of slots that are not SWAPBLK_NONE. */
 	swap->swb_pages[idx] = swapblk;
 	if (swapblk != SWAPBLK_NONE)
 		++swap->swb_count;
 done:
 	mtx_unlock(&swhash_mtx);
 }
 
 /*
  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
  *
  *	The requested range of blocks is freed, with any associated swap 
  *	returned to the swap bitmap.
  *
  *	This routine will free swap metadata structures as they are cleaned 
  *	out.  This routine does *NOT* operate on swap metadata associated
  *	with resident pages.
  *
  *	This routine must be called at splvm()
  */
 static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 {
 	struct swblock **slot, *blk;
 	daddr_t blkno;
 	int off, skip;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	/*
 	 * Handle one page index per iteration, taking and dropping
 	 * swhash_mtx each time around.
 	 */
 	while (count > 0) {
 		mtx_lock(&swhash_mtx);
 		slot = swp_pager_hash(object, index);
 		blk = *slot;
 
 		if (blk == NULL) {
 			/*
 			 * No swblock covers this index: jump ahead to the
 			 * start of the next swblock-sized range.
 			 */
 			skip = SWAP_META_PAGES - (index & SWAP_META_MASK);
 			count -= skip;
 			index += skip;
 			mtx_unlock(&swhash_mtx);
 			continue;
 		}
 
 		off = index & SWAP_META_MASK;
 		blkno = blk->swb_pages[off];
 		if (blkno != SWAPBLK_NONE) {
 			/* Return the block to the bitmap, clear the slot. */
 			swp_pager_freeswapspace(blkno, 1);
 			blk->swb_pages[off] = SWAPBLK_NONE;
 			blk->swb_count--;
 			if (blk->swb_count == 0) {
 				/* Last slot emptied: unlink and free it. */
 				*slot = blk->swb_hnext;
 				uma_zfree(swap_zone, blk);
 				--object->un_pager.swp.swp_bcount;
 			}
 		}
 		--count;
 		++index;
 		mtx_unlock(&swhash_mtx);
 	}
 }
 
 /*
  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
  *
  *	This routine locates and destroys all swap metadata associated with
  *	an object.
  *
  *	This routine must be called at splvm()
  */
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
 	struct swblock **slot, *blk;
 	daddr_t cursor, blkno;
 	int k;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	/*
 	 * Probe successive swblock-aligned indices, starting at zero,
 	 * until the object's swblock counter reaches zero.
 	 */
 	for (cursor = 0; object->un_pager.swp.swp_bcount != 0; ) {
 		mtx_lock(&swhash_mtx);
 		slot = swp_pager_hash(object, cursor);
 		blk = *slot;
 		if (blk != NULL) {
 			/* Release every swap block the swblock records. */
 			for (k = 0; k < SWAP_META_PAGES; k++) {
 				blkno = blk->swb_pages[k];
 				if (blkno == SWAPBLK_NONE)
 					continue;
 				--blk->swb_count;
 				swp_pager_freeswapspace(blkno, 1);
 			}
 			if (blk->swb_count != 0)
 				panic("swap_pager_meta_free_all: swb_count != 0");
 			/* Unlink from the hash chain and free the swblock. */
 			*slot = blk->swb_hnext;
 			uma_zfree(swap_zone, blk);
 			--object->un_pager.swp.swp_bcount;
 		}
 		mtx_unlock(&swhash_mtx);
 
 		/* Sanity limit on how far the index scan may run. */
 		cursor += SWAP_META_PAGES;
 		if (cursor > 0x20000000)
 			panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
 	}
 }
 
 /*
  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
  *
  *	This routine is capable of looking up, popping, or freeing
  *	swapblk assignments in the swap meta data or in the vm_page_t.
  *	The routine typically returns the swapblk being looked-up, or popped,
  *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
  *	was invalid.  This routine will automatically free any invalid 
  *	meta-data swapblks.
  *
  *	It is not possible to store invalid swapblks in the swap meta data
  *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
  *
  *	When acting on a busy resident page and paging is in progress, we 
  *	have to wait until paging is complete but otherwise can act on the 
  *	busy page.
  *
  *	This routine must be called at splvm().
  *
  *	SWM_FREE	remove and free swap block from metadata
  *	SWM_POP		remove from meta data but do not free.. pop it out
  */
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
 	struct swblock **slot, *blk;
 	daddr_t blkno;
 	int off;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/*
 	 * Only OBJT_SWAP objects carry swap metadata, and even those
 	 * may not have a swblock for this index yet.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
 	blkno = SWAPBLK_NONE;
 	mtx_lock(&swhash_mtx);
 	slot = swp_pager_hash(object, pindex);
 	blk = *slot;
 	if (blk == NULL)
 		goto out;
 
 	off = pindex & SWAP_META_MASK;
 	blkno = blk->swb_pages[off];
 	if (blkno == SWAPBLK_NONE)
 		goto out;
 
 	/* SWM_FREE releases the swap block and reports SWAPBLK_NONE. */
 	if ((flags & SWM_FREE) != 0) {
 		swp_pager_freeswapspace(blkno, 1);
 		blkno = SWAPBLK_NONE;
 	}
 
 	/* Both SWM_FREE and SWM_POP remove the entry from the metadata. */
 	if ((flags & (SWM_FREE|SWM_POP)) != 0) {
 		blk->swb_pages[off] = SWAPBLK_NONE;
 		blk->swb_count--;
 		if (blk->swb_count == 0) {
 			*slot = blk->swb_hnext;
 			uma_zfree(swap_zone, blk);
 			--object->un_pager.swp.swp_bcount;
 		}
 	}
 out:
 	mtx_unlock(&swhash_mtx);
 	return (blkno);
 }
 
 /*
  * System call swapon(name) enables swapping on device name,
  * which must be in the swdevsw.  Return EBUSY
  * if already swapping on this device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct swapon_args {
 	char *name;
 };
 #endif
 
 /* 
  * MPSAFE
  */
 /* ARGSUSED */
 int
 swapon(struct thread *td, struct swapon_args *uap)
 {
 	struct vattr attr;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	/* The caller needs the PRIV_SWAPON privilege. */
 	error = priv_check(td, PRIV_SWAPON);
 	if (error)
 		return (error);
 
 	/*
 	 * Serialize with the other swap-device syscalls: sleep until
 	 * swdev_syscall_active is clear, then claim it.  It is cleared
 	 * and one waiter woken at "done".
 	 */
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 	swdev_syscall_active = 1;
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swap_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
 
 	/* Look up the user-supplied path, following symlinks. */
 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Disk vnodes go through the GEOM backend.  Otherwise "error"
 	 * holds whatever vn_isdisk() stored through its second argument,
 	 * unless the regular-file-on-network-filesystem case below
 	 * overrides it.
 	 */
 	if (vn_isdisk(vp, &error)) {
 		error = swapongeom(td, vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
 	/*
 	 * The vnode is released only on failure; on success it stays
 	 * recorded in the new swap device (see swaponsomething()).
 	 */
 	if (error)
 		vrele(vp);
 done:
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Create a swdevt for a newly opened backing store and add it to swtailq.
  *
  *	vp		vnode recorded as sw_vp
  *	id		backend-private handle recorded as sw_id (the callers
  *			in this file pass a GEOM consumer or the vnode itself)
  *	nblks		size in DEV_BSIZE blocks; clamped and converted to
  *			pages below
  *	strategy/close	backend I/O and close callbacks
  *	dev		device number recorded as sw_dev
  */
 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
 	u_long mblocks;
 
 	/*
 	 * If we go beyond this, we get overflows in the radix
 	 * tree bitmap code.
 	 */
 	mblocks = 0x40000000 / BLIST_META_RADIX;
 	if (nblks > mblocks) {
 		printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n",
 			mblocks);
 		nblks = mblocks;
 	}
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 * 
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
 	/* M_WAITOK | M_ZERO: the allocation may sleep and comes back zeroed. */
 	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
 	sp->sw_flags = 0;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 
 	sp->sw_blist = blist_create(nblks);
 	/*
 	 * Do not free the first two block in order to avoid overwriting
 	 * any bsd label at the front of the partition
 	 */
 	/* NOTE(review): nblks - 2 wraps if nblks < 2 here -- confirm callers. */
 	blist_free(sp->sw_blist, 2, nblks - 2);
 
 	/*
 	 * Place the device in the global swap address space just past the
 	 * highest sw_end of any device already on swtailq.
 	 */
 	dvbase = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 		if (tsp->sw_end >= dvbase) {
 			/*
 			 * We put one uncovered page between the devices
 			 * in order to definitively prevent any cross-device
 			 * I/O requests
 			 */
 			dvbase = tsp->sw_end + 1;
 		}
 	}
 	sp->sw_first = dvbase;
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
 	swap_pager_avail += nblks;
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * SYSCALL: swapoff(devname)
  *
  * Disable swapping on the given device.
  *
  * XXX: Badly designed system call: it should use a device index
  * rather than filename as specification.  We keep sw_vp around
  * only to make this work.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct swapoff_args {
 	char *name;
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 swapoff(struct thread *td, struct swapoff_args *uap)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
 	int error;
 
 	/* The caller needs the PRIV_SWAPOFF privilege. */
 	error = priv_check(td, PRIV_SWAPOFF);
 	if (error)
 		return (error);
 
 	/*
 	 * Serialize with the other swap-device syscalls: sleep until
 	 * swdev_syscall_active is clear, then claim it.  It is cleared
 	 * and one waiter woken at "done".
 	 */
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
 
 	/* Resolve the user-supplied path to a vnode. */
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	/*
 	 * NOTE(review): nothing in this function releases vp after the
 	 * lookup (swapon() calls vrele() on its error path).  Looks like
 	 * a vnode reference leak on every call -- confirm against
 	 * namei()'s reference semantics.
 	 */
 
 	/* Find the swap device that was configured on this vnode. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	/* sp is NULL when the loop ran off the end of the list. */
 	if (sp == NULL) {
 		error = EINVAL;
 		goto done;
 	}
 	error = swapoff_one(sp, td->td_ucred);
 done:
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Remove one swap device: check policy and memory headroom, block new
  * allocations, page everything back in, close the backend and free the
  * swdevt.  Giant must be held.  Returns 0, ENOMEM, or (with MAC) the
  * error from mac_system_check_swapoff().
  */
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred)
 {
 	u_long nblks, dvbase;
 #ifdef MAC
 	int error;
 #endif
 
 	mtx_assert(&Giant, MA_OWNED);
 #ifdef MAC
 	/* The MAC check is made with the swap vnode exclusively locked. */
-	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
 	(void) VOP_UNLOCK(sp->sw_vp, 0, curthread);
 	if (error != 0)
 		return (error);
 #endif
 	nblks = sp->sw_nblks;
 
 	/*
 	 * We can turn off this swap device safely only if the
 	 * available virtual memory in the system will fit the amount
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
 	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 	    nblks + nswap_lowat) {
 		return (ENOMEM);
 	}
 
 	/*
 	 * Prevent further allocations on this device.
 	 */
 	/*
 	 * Mark the whole blist allocated in dmmax-sized steps;
 	 * swap_pager_avail is reduced by what each blist_fill() returns.
 	 * NOTE(review): dvbase indexes the device-relative blist, which
 	 * swaponsomething() creates with sw_nblks entries, yet the bound
 	 * is sw_end, a global swap address.  Looks like it should be
 	 * sw_nblks -- confirm.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
 	for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
 		swap_pager_avail -= blist_fill(sp->sw_blist,
 		     dvbase, dmmax);
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * Page in the contents of the device and close it.
 	 */
 	swap_pager_swapoff(sp);
 
 	sp->sw_close(curthread, sp);
 	sp->sw_id = NULL;
 	/* Unlink the device and update the global swap state. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	if (nswapdev == 0) {
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	/* Do not leave swdevhd pointing at the device being freed. */
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
 	return (0);
 }
 
 /*
  * Try to remove every configured swap device, using thread0's
  * credentials.  A failure on one device is reported with printf() and
  * the loop moves on to the next.
  */
 void
 swapoff_all(void)
 {
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
  
 	/* Same serialization against swapon()/swapoff() as those syscalls. */
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
  
 	/*
 	 * sw_dev_mtx is held only while stepping the list; it is dropped
 	 * around swapoff_one(), which takes it itself.  The _SAFE iterator
 	 * is used because swapoff_one() frees sp on success.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		mtx_unlock(&sw_dev_mtx);
 		/* Capture a printable name before sp can be freed. */
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = sp->sw_vp->v_rdev->si_name;
 		else
 			devname = "[file]";
 		error = swapoff_one(sp, thread0.td_ucred);
 		if (error != 0) {
 			printf("Cannot remove swap device %s (error=%d), "
 			    "skipping.\n", devname, error);
 		} else if (bootverbose) {
 			printf("Swap device %s removed.\n", devname);
 		}
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
  
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
 }
 
 void
 swap_pager_status(int *total, int *used)
 {
 	struct swdevt *sp;
 
 	*total = 0;
 	*used = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		*total += sp->sw_nblks;
 		*used += sp->sw_used;
 	}
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * Sysctl handler for vm.swap_info.<n>: copy out a struct xswdev
  * describing the n'th swap device on swtailq.
  *
  * Returns EINVAL unless exactly one name component (the index) was
  * given, ENOENT if there is no n'th device, otherwise the result of
  * SYSCTL_OUT().
  */
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
 	int	*name = (int *)arg1;
 	int	n;
 	struct xswdev xs;
 	struct swdevt *sp;
 
 	if (arg2 != 1) /* name length */
 		return (EINVAL);
 
 	n = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (n == *name) {
 			/*
 			 * Snapshot the device while sw_dev_mtx is still
 			 * held: once the lock is dropped swapoff_one() may
 			 * unlink and free sp.  Only the copy-out, which
 			 * must not run under the mutex, is done unlocked.
 			 */
 			xs.xsw_version = XSWDEV_VERSION;
 			xs.xsw_dev = sp->sw_dev;
 			xs.xsw_flags = sp->sw_flags;
 			xs.xsw_nblks = sp->sw_nblks;
 			xs.xsw_used = sp->sw_used;
 			mtx_unlock(&sw_dev_mtx);
 
 			return (SYSCTL_OUT(req, &xs, sizeof(xs)));
 		}
 		n++;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (ENOENT);
 }
 
 /*
  * Read-only sysctls under "vm": the nswapdev counter, and the swap_info
  * node whose lookups are served by sysctl_vm_swap_info().
  */
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
  * vmspace_swap_count() - count the approximate swap usage in pages for a
  *			  vmspace.
  *
  *	The map must be locked.
  *
  *	Swap usage is determined by taking the proportional swap used by
  *	VM objects backing the VM map.  To make up for fractional losses,
  *	if the VM object has any swap use at all the associated map entries
  *	count for at least 1 swap page.
  */
 int
 vmspace_swap_count(struct vmspace *vmspace)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	int total;
 
 	total = 0;
 	map = &vmspace->vm_map;
 	entry = map->header.next;
 	while (entry != &map->header) {
 		/* Only entries directly backed by a VM object are counted. */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 		    (object = entry->object.vm_object) == NULL) {
 			entry = entry->next;
 			continue;
 		}
 
 		VM_OBJECT_LOCK(object);
 		if (object->type == OBJT_SWAP &&
 		    object->un_pager.swp.swp_bcount != 0) {
 			int n = (entry->end - entry->start) / PAGE_SIZE;
 
 			/*
 			 * Scale the object's swap footprint by the share of
 			 * the object this entry spans; the "+ 1" makes any
 			 * object with swap count for at least one page.
 			 */
 			total += object->un_pager.swp.swp_bcount *
 			    SWAP_META_PAGES * n / object->size + 1;
 		}
 		VM_OBJECT_UNLOCK(object);
 		entry = entry->next;
 	}
 	return (total);
 }
 
 /*
  * GEOM backend
  *
  * Swapping onto disk devices.
  *
  */
 
 static g_orphan_t swapgeom_orphan;
 
 /*
  * GEOM class "SWAP" used for the consumers created by swapongeom_ev().
  * Its only callback is the orphan handler, swapgeom_orphan().
  */
 static struct g_class g_swap_class = {
 	.name = "SWAP",
 	.version = G_VERSION,
 	.orphan = swapgeom_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 
 
 /*
  * bio completion callback set by swapgeom_strategy(): copy the outcome
  * of the GEOM request back into the originating buf (kept in
  * bio_caller2), complete the buf, then destroy the bio.
  */
 static void
 swapgeom_done(struct bio *bp2)
 {
 	struct buf *orig;
 
 	orig = bp2->bio_caller2;
 
 	orig->b_error = bp2->bio_error;
 	orig->b_resid = orig->b_bcount - bp2->bio_completed;
 	orig->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error != 0)
 		orig->b_ioflags |= BIO_ERROR;
 
 	bufdone(orig);
 	g_destroy_bio(bp2);
 }
 
 /*
  * Strategy routine of the GEOM backend: wrap the buf in a bio and hand
  * it to the device's consumer.  Completion arrives in swapgeom_done().
  */
 static void
 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct g_consumer *consumer;
 	struct bio *req;
 
 	consumer = sp->sw_id;
 	if (consumer == NULL) {
 		/* No consumer recorded for this device: fail with ENXIO. */
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = ENXIO;
 		bufdone(bp);
 		return;
 	}
 
 	req = g_alloc_bio();
 #if 0
 	/*
 	 * XXX: We shouldn't really sleep here when we run out of buffers
 	 * XXX: but the alternative is worse right now.
 	 */
 	if (req == NULL) {
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 #endif
 	/*
 	 * b_blkno is a page-granular address in the global swap space;
 	 * rebase it to the device and convert it to a byte offset.
 	 */
 	req->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	req->bio_length = bp->b_bcount;
 	req->bio_data = bp->b_data;
 	req->bio_cmd = bp->b_iocmd;
 	req->bio_caller2 = bp;
 	req->bio_done = swapgeom_done;
 	g_io_request(req, consumer);
 }
 
 static void
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list)
 		if (sp->sw_id == cp)
 			sp->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * GEOM event handler posted by swapgeom_close(): drop the access counts
  * taken in swapongeom_ev(), then detach and destroy the consumer passed
  * in arg.
  */
 static void
 swapgeom_close_ev(void *arg, int flags)
 {
 	struct g_consumer *consumer = arg;
 
 	g_access(consumer, -1, -1, 0);
 	g_detach(consumer);
 	g_destroy_consumer(consumer);
 }
 
 /*
  * Close routine of the GEOM backend: post swapgeom_close_ev() for the
  * device's sw_id and wait for it.
  */
 static void
 swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 	void *consumer;
 
 	consumer = sw->sw_id;
 	/* XXX: direct call when Giant untangled */
 	g_waitfor_event(swapgeom_close_ev, consumer, M_WAITOK, NULL);
 }
 
 
 /*
  * Argument block passed from swapongeom() to the swapongeom_ev() event
  * handler.
  */
 struct swh0h0 {
 	struct cdev *dev;	/* in: device to swap on (vp->v_rdev) */
 	struct vnode *vp;	/* in: vnode recorded in the swap device */
 	int	error;		/* out: result set by swapongeom_ev() */
 };
 
 /*
  * GEOM event handler that does the work of swapongeom(): find the
  * provider behind swh->dev, refuse it if already in use for swap,
  * attach a consumer with read/write access and register the device.
  * The result is returned in swh->error (0, ENODEV, EBUSY, or the
  * g_access() error).
  */
 static void
 swapongeom_ev(void *arg, int flags)
 {
 	struct swh0h0 *swh;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	/* One geom instance, created on first use and then shared. */
 	static struct g_geom *gp;
 	struct swdevt *sp;
 	u_long nblks;
 	int error;
 
 	swh = arg;
 	swh->error = 0;
 	pp = g_dev_getprovider(swh->dev);
 	if (pp == NULL) {
 		swh->error = ENODEV;
 		return;
 	}
 	/* EBUSY if some swap device already has a consumer on pp. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
 			swh->error = EBUSY;
 			return;
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap", NULL);
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	/*
 	 * XXX: Everytime you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
 	if (error) {
 		/* Undo the attach and hand the error back. */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		swh->error = error;
 		return;
 	}
 	/* Size in DEV_BSIZE blocks; swaponsomething() converts to pages. */
 	nblks = pp->mediasize / DEV_BSIZE;
 	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
 	    swapgeom_close, dev2udev(swh->dev));
 	swh->error = 0;
 	return;
 }
 
 /*
  * Enable swapping on the disk device behind vp.  The work is done by
  * swapongeom_ev(), run as a GEOM event while vp is held exclusively
  * locked.  Returns the g_waitfor_event() error if there is one,
  * otherwise the error reported by the handler.
  */
 static int
 swapongeom(struct thread *td, struct vnode *vp)
 {
 	int error;
 	struct swh0h0 swh;
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	swh.dev = vp->v_rdev;
 	swh.vp = vp;
 	swh.error = 0;
 	/* XXX: direct call when Giant untangled */
 	error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
 	if (!error)
 		error = swh.error;
 	VOP_UNLOCK(vp, 0, td);
 	return (error);
 }
 
 /*
  * VNODE backend
  *
  * This is used mainly for network filesystem (read: probably only tested
  * with NFS) swapfiles.
  *
  */
 
 /*
  * Strategy routine of the vnode backend: retarget the buf at the swap
  * vnode stored in sp->sw_id and pass it to bstrategy().
  */
 static void
 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct vnode *vp2;
 
 	/*
 	 * Rebase the block number to the start of this device and convert
 	 * it with ctodb() (presumably pages to DEV_BSIZE blocks, as in
 	 * swaponsomething() -- confirm).
 	 */
 	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 
 	vp2 = sp->sw_id;
 	vhold(vp2);
 	/*
 	 * For writes, move the write reference from whatever bufobj the
 	 * buf was charged to over to the swap vnode's bufobj.
 	 */
 	if (bp->b_iocmd == BIO_WRITE) {
 		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	}
 	if (bp->b_bufobj != &vp2->v_bufobj)
 		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 	return;
 }
 
 /*
  * Close routine of the vnode backend: close the swap vnode that
  * swaponvp() opened for read/write and release the reference on it.
  */
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
 	struct vnode *swapvp;
 
 	swapvp = sp->sw_vp;
 	VOP_CLOSE(swapvp, FREAD | FWRITE, td->td_ucred, td);
 	vrele(swapvp);
 }
 
 
 /*
  * Enable swapping on a vnode (the regular-file case from swapon()).
  * nblks is the size handed to swaponsomething().  Returns ENXIO for a
  * zero size, EBUSY if the vnode is already a swap device, or the error
  * from the MAC check or VOP_OPEN().
  */
 static int
 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 {
 	struct swdevt *sp;
 	int error;
 
 	if (nblks == 0)
 		return (ENXIO);
 	/* Refuse a vnode that is already in use as a swap device. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == vp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
     
-	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/*
 	 * With MAC compiled in, the VOP_OPEN() below is the body of the
 	 * "if (error == 0)" and runs only when the policy check passes;
 	 * without MAC it runs unconditionally.
 	 */
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	(void) VOP_UNLOCK(vp, 0, td);
 	if (error)
 		return (error);
 
 	/* The vnode serves as both sw_vp and the backend handle sw_id. */
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 	    NODEV);
 	return (0);
 }
Index: head/sys/vm/vm_contig.c
===================================================================
--- head/sys/vm/vm_contig.c	(revision 175201)
+++ head/sys/vm/vm_contig.c	(revision 175202)
@@ -1,289 +1,289 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/linker_set.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_extern.h>
 
 /*
  * Try to launder one page so that its memory can be reused.
  *
  * Called with the page queues lock held (asserted below).  Returns
  * EAGAIN if the object could not be locked or is dead, EBUSY if the
  * page was busy and had to be slept on, and 0 otherwise.  "next" is
  * passed on to vm_pageout_fallback_object_lock().
  */
 static int
 vm_contig_launder_page(vm_page_t m, vm_page_t *next)
 {
 	vm_object_t object;
 	vm_page_t m_tmp;
 	struct vnode *vp;
 	struct mount *mp;
 	int vfslocked;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	object = m->object;
 	/*
 	 * NOTE(review): the object is unlocked on this failure path, so
 	 * vm_pageout_fallback_object_lock() presumably returns with the
 	 * object locked even when it fails -- confirm.
 	 */
 	if (!VM_OBJECT_TRYLOCK(object) &&
 	    !vm_pageout_fallback_object_lock(m, next)) {
 		VM_OBJECT_UNLOCK(object);
 		return (EAGAIN);
 	}
 	/*
 	 * NOTE(review): the queues lock is retaken here, so
 	 * vm_page_sleep_if_busy() presumably drops it when it sleeps --
 	 * confirm.
 	 */
 	if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
 		VM_OBJECT_UNLOCK(object);
 		vm_page_lock_queues();
 		return (EBUSY);
 	}
 	vm_page_test_dirty(m);
 	if (m->dirty == 0 && m->hold_count == 0)
 		pmap_remove_all(m);
 	if (m->dirty) {
 		if ((object->flags & OBJ_DEAD) != 0) {
 			VM_OBJECT_UNLOCK(object);
 			return (EAGAIN);
 		}
 		if (object->type == OBJT_VNODE) {
 			/*
 			 * Vnode-backed: drop the queues lock, hold an object
 			 * reference across the unlocked window, and clean
 			 * the whole object synchronously with the vnode
 			 * locked.  The queues lock is retaken before return.
 			 */
 			vm_page_unlock_queues();
 			vp = object->handle;
 			vm_object_reference_locked(object);
 			VM_OBJECT_UNLOCK(object);
 			(void) vn_start_write(vp, &mp, V_WAIT);
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			VM_OBJECT_LOCK(object);
 			vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_UNLOCK(object);
 			VOP_UNLOCK(vp, 0, curthread);
 			VFS_UNLOCK_GIANT(vfslocked);
 			vm_object_deallocate(object);
 			vn_finished_write(mp);
 			vm_page_lock_queues();
 			return (0);
 		} else if (object->type == OBJT_SWAP ||
 			   object->type == OBJT_DEFAULT) {
 			/* Anonymous memory: write this one page out, synchronously. */
 			m_tmp = m;
 			vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
 			VM_OBJECT_UNLOCK(object);
 			return (0);
 		}
 	} else if (m->hold_count == 0)
 		/* Clean and not held: move the page to the cache. */
 		vm_page_cache(m);
 	VM_OBJECT_UNLOCK(object);
 	return (0);
 }
 
 /*
  * Walk the given page queue and try to launder a page from it.
  * Returns TRUE as soon as vm_contig_launder_page() reports success (0)
  * for some page.  Returns FALSE if a page reports EBUSY or the queue is
  * exhausted.  Any other result moves on to the next page.
  */
 static int
 vm_contig_launder(int queue)
 {
 	vm_page_t pg, pg_next;
 
 	TAILQ_FOREACH_SAFE(pg, &vm_page_queues[queue].pl, pageq, pg_next) {
 		/* Marker pages are placeholders, not real pages. */
 		if ((pg->flags & PG_MARKER) != 0)
 			continue;
 
 		KASSERT(VM_PAGE_INQUEUE2(pg, queue),
 		    ("vm_contig_launder: page %p's queue is not %d", pg, queue));
 		switch (vm_contig_launder_page(pg, &pg_next)) {
 		case 0:
 			return (TRUE);
 		case EBUSY:
 			return (FALSE);
 		default:
 			break;
 		}
 	}
 	return (FALSE);
 }
 
 /*
  *	Frees the given physically contiguous pages.
  *
  *	N.B.: Any pages with PG_ZERO set must, in fact, be zero filled.
  */
 static void
 vm_page_release_contig(vm_page_t m, vm_pindex_t count)
 {
 	vm_pindex_t i;
 
 	/* Free the pages one by one, leaving PG_ZERO as it is. */
 	for (i = 0; i < count; i++)
 		vm_page_free_toq(&m[i]);
 }
 
 /*
  *	Allocates a region from the kernel address map, inserts the
  *	given physically contiguous pages into the kernel object,
  *	creates a wired mapping from the region to the pages, and
  *	returns the region's starting virtual address.  If M_ZERO is
  *	specified through the given flags, then the pages are zeroed
  *	before they are mapped.
  */
 static void *
 contigmapping(vm_page_t m, vm_pindex_t npages, int flags)
 {
 	vm_object_t object = kernel_object;
 	vm_map_t map = kernel_map;
 	vm_offset_t addr, tmp_addr;
 	vm_pindex_t i;
  
 	/* Find a free range of npages pages in the kernel map. */
 	vm_map_lock(map);
 	if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
 	    != KERN_SUCCESS) {
 		vm_map_unlock(map);
 		return (NULL);
 	}
 	/*
 	 * Map kernel_object over the range, at the offset
 	 * addr - VM_MIN_KERNEL_ADDRESS.
 	 * NOTE(review): the vm_map_insert() result is not checked.
 	 */
 	vm_object_reference(object);
 	vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
 	    addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
 	vm_map_unlock(map);
 	tmp_addr = addr;
 	/*
 	 * Insert each page into the object at the index matching its
 	 * address, zeroing it first if M_ZERO was asked for and the page
 	 * is not already flagged PG_ZERO.
 	 */
 	VM_OBJECT_LOCK(object);
 	for (i = 0; i < npages; i++) {
 		vm_page_insert(&m[i], object,
 		    OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
 		if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO))
 			pmap_zero_page(&m[i]);
 		tmp_addr += PAGE_SIZE;
 	}
 	VM_OBJECT_UNLOCK(object);
 	/* Wire the new range. */
 	vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	return ((void *)addr);
 }
 
 /*
  * Allocate "size" bytes (rounded up to whole pages) of physically
  * contiguous memory and map it into the kernel map.  low, high,
  * alignment and boundary are passed unchanged to
  * vm_phys_alloc_contig().  M_ZERO in flags zeroes the pages and
  * M_NOWAIT limits the retries.  Returns the kernel virtual address, or
  * NULL on failure.
  */
 void *
 contigmalloc(
 	unsigned long size,	/* should be size_t here and for malloc() */
 	struct malloc_type *type,
 	int flags,
 	vm_paddr_t low,
 	vm_paddr_t high,
 	unsigned long alignment,
 	unsigned long boundary)
 {
 	void *ret;
 	vm_page_t pages;
 	unsigned long npgs;
 	int actl, actmax, inactl, inactmax, tries;
 
 	npgs = round_page(size) >> PAGE_SHIFT;
 	tries = 0;
 retry:
 	pages = vm_phys_alloc_contig(npgs, low, high, alignment, boundary);
 	if (pages == NULL) {
 		/*
 		 * Allocation failed.  One retry is allowed with M_NOWAIT,
 		 * three without.  Each retry launders more than the last:
 		 *   tries == 0: nothing is laundered (both limits are 0)
 		 *   tries == 1: up to v_inactive_count inactive pages
 		 *   tries == 2: also up to v_active_count active pages
 		 */
 		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
 			vm_page_lock_queues();
 			inactl = 0;
 			inactmax = tries < 1 ? 0 : cnt.v_inactive_count;
 			actl = 0;
 			actmax = tries < 2 ? 0 : cnt.v_active_count;
 again:
 			/*
 			 * Each successful launder restarts from the inactive
 			 * queue; active pages are tried only once the
 			 * inactive queue yields nothing more or its limit
 			 * is reached.
 			 */
 			if (inactl < inactmax &&
 			    vm_contig_launder(PQ_INACTIVE)) {
 				inactl++;
 				goto again;
 			}
 			if (actl < actmax &&
 			    vm_contig_launder(PQ_ACTIVE)) {
 				actl++;
 				goto again;
 			}
 			vm_page_unlock_queues();
 			tries++;
 			goto retry;
 		}
 		ret = NULL;
 	} else {
 		/*
 		 * Got the pages: map them.  If mapping fails, give the
 		 * pages back; otherwise charge the malloc type.
 		 */
 		ret = contigmapping(pages, npgs, flags);
 		if (ret == NULL)
 			vm_page_release_contig(pages, npgs);
 		else
 			malloc_type_allocated(type, npgs << PAGE_SHIFT);
 	}
 	return (ret);
 }
 
 /*
  * Release memory obtained from contigmalloc(): free the kernel map
  * range and credit the page-rounded size back to the malloc type.
  */
 void
 contigfree(void *addr, unsigned long size, struct malloc_type *type)
 {
 	vm_pindex_t pagecnt;
 
 	pagecnt = round_page(size) >> PAGE_SHIFT;
 
 	kmem_free(kernel_map, (vm_offset_t)addr, size);
 	malloc_type_freed(type, pagecnt << PAGE_SHIFT);
 }
Index: head/sys/vm/vm_object.c
===================================================================
--- head/sys/vm/vm_object.c	(revision 175201)
+++ head/sys/vm/vm_object.c	(revision 175202)
@@ -1,2257 +1,2257 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #define EASY_SCAN_FACTOR       8
 
 #define MSYNC_FLUSH_HARDSEQ	0x01
 #define MSYNC_FLUSH_SOFTSEQ	0x02
 
 /*
  * msync / VM object flushing optimizations
  */
 static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
 SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags,
         CTLFLAG_RW, &msync_flush_flags, 0, "");
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static void	vm_object_qcollapse(vm_object_t object);
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
 static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
 SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
 
 static long object_collapses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses, 0, "VM object collapses");
 
 static long object_bypasses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses, 0, "VM object bypasses");
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages",
 	    object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
 	KASSERT(object->cache == NULL,
 	    ("object %p has cached pages",
 	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 }
 #endif
 
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	bzero(&object->mtx, sizeof(object->mtx));
 	VM_OBJECT_LOCK_INIT(object, "standard object");
 
 	/* These are true for any object that has been freed */
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
 	return (0);
 }
 
 void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->root = NULL;
 	object->type = type;
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
 	object->flags = 0;
 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 		object->flags = OBJ_ONEMAPPING;
 	object->pg_color = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
 	object->cache = NULL;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module.
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
 }
 
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->flags &= ~bits;
 }
 
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress += i;
 }
 
 void
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress -= i;
 }
 
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (i)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	while (object->paging_in_progress) {
 		object->flags |= OBJ_PIPWNT;
 		msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
 	}
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Returns a new object with the given size.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(type, size, object);
 	return (object);
 }
 
 
 /*
  *	vm_object_reference:
  *
  *	Gets another reference to the given object.  Note: OBJ_DEAD
  *	objects can be referenced during final cleaning.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 	struct vnode *vp;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		int vfslocked;
 
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vget(vp, LK_RETRY, curthread);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else
 		VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
  *
  *	The object must be locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT((object->flags & OBJ_DEAD) == 0,
 	    ("vm_object_reference_locked: dead object referenced"));
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 		vref(vp);
 	}
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
 		vprint("vm_object_vndeallocate", vp);
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
 	object->ref_count--;
 	if (object->ref_count == 0) {
 		mp_fixme("Unlocked vflag access.");
 		vp->v_vflag &= ~VV_TEXT;
 	}
 	VM_OBJECT_UNLOCK(object);
 	/*
 	 * vrele may need a vop lock
 	 */
 	vrele(vp);
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 
 	while (object != NULL) {
 		int vfslocked;
 
 		vfslocked = 0;
 	restart:
 		VM_OBJECT_LOCK(object);
 		if (object->type == OBJT_VNODE) {
 			struct vnode *vp = (struct vnode *) object->handle;
 
 			/*
 			 * Conditionally acquire Giant for a vnode-backed
 			 * object.  We have to be careful since the type of
 			 * a vnode object can change while the object is
 			 * unlocked.
 			 */
 			if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
 				vfslocked = 1;
 				if (!mtx_trylock(&Giant)) {
 					VM_OBJECT_UNLOCK(object);
 					mtx_lock(&Giant);
 					goto restart;
 				}
 			}
 			vm_object_vndeallocate(object);
 			VFS_UNLOCK_GIANT(vfslocked);
 			return;
 		} else
 			/*
 			 * This is to handle the case that the object
 			 * changed type while we dropped its lock to
 			 * obtain Giant.
 			 */
 			VFS_UNLOCK_GIANT(vfslocked);
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
 
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.
 		 * A ref count of 1 may be a special case depending on the
 		 * shadow count being 0 or 1.
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
 			VM_OBJECT_UNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
 			if (object->shadow_count == 0) {
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				vm_object_t robject;
 
 				robject = LIST_FIRST(&object->shadow_head);
 				KASSERT(robject != NULL,
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
 				if (!VM_OBJECT_TRYLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					object->ref_count++;
 					VM_OBJECT_UNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
 					pause("vmo_de", 1);
 					continue;
 				}
 				/*
 				 * Collapse object into its shadow unless its
 				 * shadow is dead.  In that case, object will
 				 * be deallocated by the thread that is
 				 * deallocating its shadow.
 				 */
 				if ((robject->flags & OBJ_DEAD) == 0 &&
 				    (robject->handle == NULL) &&
 				    (robject->type == OBJT_DEFAULT ||
 				     robject->type == OBJT_SWAP)) {
 
 					robject->ref_count++;
 retry:
 					if (robject->paging_in_progress) {
 						VM_OBJECT_UNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_UNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
 						msleep(object,
 						    VM_OBJECT_MTX(object),
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_LOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else
 						VM_OBJECT_UNLOCK(object);
 
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
 						goto doterm;
 					}
 					object = robject;
 					vm_object_collapse(object);
 					VM_OBJECT_UNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_UNLOCK(robject);
 			}
 			VM_OBJECT_UNLOCK(object);
 			return;
 		}
 doterm:
 		temp = object->backing_object;
 		if (temp != NULL) {
 			VM_OBJECT_LOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
 			temp->generation++;
 			VM_OBJECT_UNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
 		 * Don't double-terminate, we could be in a termination
 		 * recursion due to the terminate having to sync data
 		 * to disk.
 		 */
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
 			VM_OBJECT_UNLOCK(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 	vm_page_t p;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/*
 	 * Make sure no one uses us.
 	 */
 	vm_object_set_flag(object, OBJ_DEAD);
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!object->paging_in_progress,
 		("vm_object_terminate: pageout in progress"));
 
 	/*
 	 * Clean and free the pages, as appropriate. All references to the
 	 * object are gone, so we don't need to lock it.
 	 */
 	if (object->type == OBJT_VNODE) {
 		struct vnode *vp = (struct vnode *)object->handle;
 
 		/*
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_UNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, NULL, 0, 0);
 
 		VM_OBJECT_LOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
 	/*
 	 * Now free any remaining pages. For internal objects, this also
 	 * removes them from paging queues. Don't free wired pages, just
 	 * remove them from the object. 
 	 */
 	vm_page_lock_queues();
 	while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
 			("vm_object_terminate: freeing busy page %p "
 			"p->busy = %d, p->flags %x\n", p, p->busy, p->flags));
 		if (p->wire_count == 0) {
 			vm_page_free(p);
 			cnt.v_pfree++;
 		} else {
 			vm_page_remove(p);
 		}
 	}
 	vm_page_unlock_queues();
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
 	if (__predict_false(object->cache != NULL))
 		vm_page_cache_free(object, 0, 0);
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * Remove the object from the global object list.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_REMOVE(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 
 	/*
 	 * Free the space for the object.
 	 */
 	uma_zfree(obj_zone, object);
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags)
 {
 	vm_page_t p, np;
 	vm_pindex_t tstart, tend;
 	vm_pindex_t pi;
 	int clearobjflags;
 	int pagerflags;
 	int curgeneration;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_VNODE ||
 		(object->flags & OBJ_MIGHTBEDIRTY) == 0)
 		return;
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
 
 	vm_object_set_flag(object, OBJ_CLEANING);
 
 	tstart = start;
 	if (end == 0) {
 		tend = object->size;
 	} else {
 		tend = end;
 	}
 
 	vm_page_lock_queues();
 	/*
 	 * If the caller is smart and only msync()s a range he knows is
 	 * dirty, we may be able to avoid an object scan.  This results in
 	 * a phenominal improvement in performance.  We cannot do this
 	 * as a matter of course because the object may be huge - e.g.
 	 * the size might be in the gigabytes or terrabytes.
 	 */
 	if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
 		vm_pindex_t tscan;
 		int scanlimit;
 		int scanreset;
 
 		scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
 		if (scanreset < 16)
 			scanreset = 16;
 		pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
 
 		scanlimit = scanreset;
 		tscan = tstart;
 		while (tscan < tend) {
 			curgeneration = object->generation;
 			p = vm_page_lookup(object, tscan);
 			if (p == NULL || p->valid == 0) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			vm_page_test_dirty(p);
 			if ((p->dirty & p->valid) == 0) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			/*
 			 * If we have been asked to skip nosync pages and 
 			 * this is a nosync page, we can't continue.
 			 */
 			if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			scanlimit = scanreset;
 
 			/*
 			 * This returns 0 if it was unable to busy the first
 			 * page (i.e. had to sleep).
 			 */
 			tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
 		}
 
 		/*
 		 * If everything was dirty and we flushed it successfully,
 		 * and the requested range is not the entire object, we
 		 * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
 		 * return immediately.
 		 */
 		if (tscan >= tend && (tstart || tend < object->size)) {
 			vm_page_unlock_queues();
 			vm_object_clear_flag(object, OBJ_CLEANING);
 			return;
 		}
 		pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
 	}
 
 	/*
 	 * Generally set CLEANCHK interlock and make the page read-only so
 	 * we can then clear the object flags.
 	 *
 	 * However, if this is a nosync mmap then the object is likely to 
 	 * stay dirty so do not mess with the page and do not clear the
 	 * object flags.
 	 */
 	clearobjflags = 1;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		p->oflags |= VPO_CLEANCHK;
 		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
 			clearobjflags = 0;
 		else
 			pmap_remove_write(p);
 	}
 
 	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
 		struct vnode *vp;
 
 		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 		if (object->type == OBJT_VNODE &&
 		    (vp = (struct vnode *)object->handle) != NULL) {
 			VI_LOCK(vp);
 			if (vp->v_iflag & VI_OBJDIRTY)
 				vp->v_iflag &= ~VI_OBJDIRTY;
 			VI_UNLOCK(vp);
 		}
 	}
 
 rescan:
 	curgeneration = object->generation;
 
 	for (p = TAILQ_FIRST(&object->memq); p; p = np) {
 		int n;
 
 		np = TAILQ_NEXT(p, listq);
 
 again:
 		pi = p->pindex;
 		if ((p->oflags & VPO_CLEANCHK) == 0 ||
 			(pi < tstart) || (pi >= tend) ||
 		    p->valid == 0) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		/*
 		 * If we have been asked to skip nosync pages and this is a
 		 * nosync page, skip it.  Note that the object flags were
 		 * not cleared in this case so we do not have to set them.
 		 */
 		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		n = vm_object_page_collect_flush(object, p,
 			curgeneration, pagerflags);
 		if (n == 0)
 			goto rescan;
 
 		if (object->generation != curgeneration)
 			goto rescan;
 
 		/*
 		 * Try to optimize the next page.  If we can't we pick up
 		 * our (random) scan where we left off.
 		 */
 		if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
 			if ((p = vm_page_lookup(object, pi + n)) != NULL)
 				goto again;
 		}
 	}
 	vm_page_unlock_queues();
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
 #endif
 
 	vm_object_clear_flag(object, OBJ_CLEANING);
 	return;
 }
 
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
 {
 	int runlen;
 	int maxf;
 	int chkb;
 	int maxb;
 	int i;
 	vm_pindex_t pi;
 	vm_page_t maf[vm_pageout_page_count];
 	vm_page_t mab[vm_pageout_page_count];
 	vm_page_t ma[vm_pageout_page_count];
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	pi = p->pindex;
 	while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
 		vm_page_lock_queues();
 		if (object->generation != curgeneration) {
 			return(0);
 		}
 	}
 	maxf = 0;
 	for(i = 1; i < vm_pageout_page_count; i++) {
 		vm_page_t tp;
 
 		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
 			if ((tp->oflags & VPO_BUSY) ||
 				((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
 				 (tp->oflags & VPO_CLEANCHK) == 0) ||
 				(tp->busy != 0))
 				break;
 			vm_page_test_dirty(tp);
 			if ((tp->dirty & tp->valid) == 0) {
 				tp->oflags &= ~VPO_CLEANCHK;
 				break;
 			}
 			maf[ i - 1 ] = tp;
 			maxf++;
 			continue;
 		}
 		break;
 	}
 
 	maxb = 0;
 	chkb = vm_pageout_page_count -  maxf;
 	if (chkb) {
 		for(i = 1; i < chkb;i++) {
 			vm_page_t tp;
 
 			if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
 				if ((tp->oflags & VPO_BUSY) ||
 					((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
 					 (tp->oflags & VPO_CLEANCHK) == 0) ||
 					(tp->busy != 0))
 					break;
 				vm_page_test_dirty(tp);
 				if ((tp->dirty & tp->valid) == 0) {
 					tp->oflags &= ~VPO_CLEANCHK;
 					break;
 				}
 				mab[ i - 1 ] = tp;
 				maxb++;
 				continue;
 			}
 			break;
 		}
 	}
 
 	for(i = 0; i < maxb; i++) {
 		int index = (maxb - i) - 1;
 		ma[index] = mab[i];
 		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
 	p->oflags &= ~VPO_CLEANCHK;
 	ma[maxb] = p;
 	for(i = 0; i < maxf; i++) {
 		int index = (maxb + i) + 1;
 		ma[index] = maf[i];
 		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
 	runlen = maxb + maxf + 1;
 
 	vm_pageout_flush(ma, runlen, pagerflags);
 	for (i = 0; i < runlen; i++) {
 		if (ma[i]->valid & ma[i]->dirty) {
 			pmap_remove_write(ma[i]);
 			ma[i]->oflags |= VPO_CLEANCHK;
 
 			/*
 			 * maxf will end up being the actual number of pages
 			 * we wrote out contiguously, non-inclusive of the
 			 * first page.  We do not count look-behind pages.
 			 */
 			if (i >= maxb + 1 && (maxf > i - maxb - 1))
 				maxf = i - maxb - 1;
 		}
 	}
 	return(maxf + 1);
 }
 
 /*
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  */
 void
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int flags;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_LOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_UNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		int vfslocked;
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 		flags |= invalidate ? OBJPC_INVAL : 0;
 		VM_OBJECT_LOCK(object);
 		vm_object_page_clean(object,
 		    OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK),
 		    flags);
 		VM_OBJECT_UNLOCK(object);
 		VOP_UNLOCK(vp, 0, curthread);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vn_finished_write(mp);
 		VM_OBJECT_LOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		boolean_t purge;
 		purge = old_msync || (object->type == OBJT_DEVICE);
 		vm_object_page_remove(object,
 		    OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK),
 		    purge ? FALSE : TRUE);
 	}
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
 {
 	vm_pindex_t end, tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m;
 
 	/* A NULL object has no pages to advise on. */
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	/* The page indices processed are [pindex, pindex + count). */
 	end = pindex + count;
 	/*
 	 * Locate and adjust resident pages
 	 */
 	for (; pindex < end; pindex += 1) {
 relookup:
 		/*
 		 * Start (or, after sleeping on a busy page, restart) the
 		 * search for this index at the top-level object.
 		 */
 		tobject = object;
 		tpindex = pindex;
 shadowlookup:
 		/*
 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
 		 * and those pages must be OBJ_ONEMAPPING.
 		 */
 		if (advise == MADV_FREE) {
 			if ((tobject->type != OBJT_DEFAULT &&
 			     tobject->type != OBJT_SWAP) ||
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
 		}
 		m = vm_page_lookup(tobject, tpindex);
 		if (m == NULL && advise == MADV_WILLNEED) {
 			/*
 			 * If the page is cached, reactivate it.
 			 */
 			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
 			    VM_ALLOC_NOBUSY);
 		}
 		if (m == NULL) {
 			/*
 			 * There may be swap even if there is no backing page
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
 			/*
 			 * next object
 			 */
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
 			/*
 			 * Lock the backing object before releasing the
 			 * current one.  The top-level object is never
 			 * released here; it stays locked for the whole walk.
 			 */
 			VM_OBJECT_LOCK(backing_object);
 			/* Translate the index into the backing object. */
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
 				VM_OBJECT_UNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
 		}
 		/*
 		 * If the page is busy or not in a normal active state,
 		 * we skip it.  If the page is not managed there are no
 		 * page queues to mess with.  Things can break if we mess
 		 * with pages in any of the below states.
 		 */
 		vm_page_lock_queues();
 		if (m->hold_count ||
 		    m->wire_count ||
 		    (m->flags & PG_UNMANAGED) ||
 		    m->valid != VM_PAGE_BITS_ALL) {
 			vm_page_unlock_queues();
 			goto unlock_tobject;
 		}
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			/*
 			 * Wait for the busy page and then redo the lookup
 			 * for this index.  msleep() is handed tobject's
 			 * mutex together with PDROP, and object's lock is
 			 * released by hand when it is a different object;
 			 * only object is relocked before retrying.
 			 */
 			vm_page_flag_set(m, PG_REFERENCED);
 			vm_page_unlock_queues();
 			if (object != tobject)
 				VM_OBJECT_UNLOCK(object);
 			m->oflags |= VPO_WANTED;
 			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0);
 			VM_OBJECT_LOCK(object);
   			goto relookup;
 		}
 		if (advise == MADV_WILLNEED) {
 			vm_page_activate(m);
 		} else if (advise == MADV_DONTNEED) {
 			vm_page_dontneed(m);
 		} else if (advise == MADV_FREE) {
 			/*
 			 * Mark the page clean.  This will allow the page
 			 * to be freed up by the system.  However, such pages
 			 * are often reused quickly by malloc()/free()
 			 * so we do not do anything that would cause
 			 * a page fault if we can help it.
 			 *
 			 * Specifically, we do not try to actually free
 			 * the page now nor do we try to put it in the
 			 * cache (which would cause a page fault on reuse).
 			 *
 			 * But we do make the page as freeable as we
 			 * can without actually taking the step of unmapping
 			 * it.
 			 */
 			pmap_clear_modify(m);
 			m->dirty = 0;
 			m->act_count = 0;
 			vm_page_dontneed(m);
 		}
 		vm_page_unlock_queues();
 		/* The swap copy, if any, is stale once the page is freed. */
 		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 			swap_pager_freespace(tobject, tpindex, 1);
 unlock_tobject:
 		/* object itself is only unlocked after the loop. */
 		if (tobject != object)
 			VM_OBJECT_UNLOCK(tobject);
 	}	
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Place a new OBJT_DEFAULT object of the given length in front of
  *	the object referenced by *object, making the existing object the
  *	new object's backing object at *offset.  The caller's reference
  *	to the source is taken over by the new object.
  *
  *	On return *object names the new object and *offset is zero.
  *	Nothing is changed when the source is an unnamed default/swap
  *	object with a single reference, since it is not shared.
  */
 void
 vm_object_shadow(
 	vm_object_t *object,	/* IN/OUT */
 	vm_ooffset_t *offset,	/* IN/OUT */
 	vm_size_t length)
 {
 	vm_object_t backing, shadow;
 	int unshared;
 
 	backing = *object;
 
 	/*
 	 * A source that nobody else can see needs no shadow object.
 	 */
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		unshared = (backing->ref_count == 1 &&
 		    backing->handle == NULL &&
 		    (backing->type == OBJT_DEFAULT ||
 		    backing->type == OBJT_SWAP));
 		VM_OBJECT_UNLOCK(backing);
 		if (unshared)
 			return;
 	}
 
 	shadow = vm_object_allocate(OBJT_DEFAULT, length);
 
 	/*
 	 * The new object gains a reference to the source while the caller
 	 * gives one up by switching to the new object, so the source's
 	 * reference count is left untouched.  The offset into the source
 	 * is recorded in the new object.
 	 */
 	shadow->backing_object = backing;
 	shadow->backing_object_offset = *offset;
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		LIST_INSERT_HEAD(&backing->shadow_head, shadow, shadow_list);
 		backing->shadow_count++;
 		backing->generation++;
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Derive the new object's page color from the source so
 		 * that coloring stays consistent across the pair.
 		 */
 		shadow->flags |= backing->flags & (OBJ_NEEDGIANT | OBJ_COLORED);
 		shadow->pg_color = (backing->pg_color + OFF_TO_IDX(*offset)) &
 		    ((1 << (VM_NFREEORDER - 1)) - 1);
 #else
 		shadow->flags |= backing->flags & OBJ_NEEDGIANT;
 #endif
 		VM_OBJECT_UNLOCK(backing);
 	}
 
 	/*
 	 * Hand the new object back; it is addressed from offset zero.
 	 */
 	*offset = 0;
 	*object = shadow;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  *
  * Only OBJT_DEFAULT and OBJT_SWAP objects with more than one reference
  * are split; otherwise the function returns without changing anything.
  *
  * Locking: the entry's object is unlocked and relocked below, so the
  * caller is presumably required to hold its lock on entry (NOTE(review):
  * inferred from the unlock/lock pairing; confirm against the caller).
  * On the early-return paths the original object is locked on return.
  * After a successful split the new object, which has replaced the
  * original in the entry, is returned locked instead.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	/* The lock is dropped while the new object is allocated. */
 	VM_OBJECT_UNLOCK(orig_object);
 
 	/* Page range of orig_object covered by the map entry. */
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_LOCK(new_object);
 	VM_OBJECT_LOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_LOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
 			/*
 			 * The backing object is marked dead: abandon the
 			 * split, discard the new object and return with
 			 * the original object locked again.
 			 */
 			VM_OBJECT_UNLOCK(source);
 			VM_OBJECT_UNLOCK(orig_object);
 			VM_OBJECT_UNLOCK(new_object);
 			vm_object_deallocate(new_object);
 			VM_OBJECT_LOCK(orig_object);
 			return;
 		}
 		/*
 		 * Make new_object shadow the same backing object as
 		 * orig_object, at the correspondingly shifted offset.
 		 */
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
 		source->generation++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
 		VM_OBJECT_UNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
 	}
 	new_object->flags |= orig_object->flags & OBJ_NEEDGIANT;
 retry:
 	/*
 	 * Position m on the first resident page whose index is at or
 	 * beyond offidxstart (or NULL if there is none).
 	 */
 	if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
 		if (m->pindex < offidxstart) {
 			m = vm_page_splay(offidxstart, orig_object->root);
 			if ((orig_object->root = m)->pindex < offidxstart)
 				m = TAILQ_NEXT(m, listq);
 		}
 	}
 	vm_page_lock_queues();
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		/* Fetch the successor first; m is about to change objects. */
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			/*
 			 * Sleep on the page using orig_object's mutex (no
 			 * PDROP is passed); new_object is unlocked around
 			 * the sleep.  The scan then starts over.
 			 */
 			vm_page_flag_set(m, PG_REFERENCED);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(new_object);
 			m->oflags |= VPO_WANTED;
 			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
 			VM_OBJECT_LOCK(new_object);
 			goto retry;
 		}
 		vm_page_rename(m, new_object, idx);
 		/* page automatically made dirty by rename and cache handled */
 		/* Moved pages stay busy until they are woken up below. */
 		vm_page_busy(m);
 	}
 	vm_page_unlock_queues();
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired.
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 
 		/*
 		 * Transfer any cached pages from orig_object to new_object.
 		 */
 		if (__predict_false(orig_object->cache != NULL))
 			vm_page_cache_transfer(orig_object, offidxstart,
 			    new_object);
 	}
 	VM_OBJECT_UNLOCK(orig_object);
 	/* Release the busy state set on every page that was moved. */
 	TAILQ_FOREACH(m, &new_object->memq, listq)
 		vm_page_wakeup(m);
 	VM_OBJECT_UNLOCK(new_object);
 	/*
 	 * Point the entry at the new object (from offset zero) and drop
 	 * the reference it held on the original.
 	 */
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	/* Return with the entry's (new) object locked. */
 	VM_OBJECT_LOCK(new_object);
 }
 
 /*
  * Operation bits accepted by vm_object_backing_scan() in its op argument.
  */
 /* Only test whether the parent covers every resident backing page. */
 #define	OBSC_TEST_ALL_SHADOWED	0x0001
 /* Collapse pages into the parent, skipping busy or invalid pages. */
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 /* Collapse pages into the parent, sleeping on busy pages. */
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 /*
  *	vm_object_backing_scan:
  *
  *	Walk the resident pages of object's backing object.  Depending on
  *	the OBSC_* bits in op, either test whether object shadows each of
  *	those pages, or dispose of them / move them into object as part of
  *	a collapse.
  *
  *	Both object and its backing object must be locked (asserted).
  *
  *	Returns 1, except that an OBSC_TEST_ALL_SHADOWED scan returns 0
  *	when the backing object is not OBJT_DEFAULT or when a backing page
  *	is found that object does not shadow.
  */
 static int
 vm_object_backing_scan(vm_object_t object, int op)
 {
 	int r = 1;
 	vm_page_t p;
 	vm_object_t backing_object;
 	vm_pindex_t backing_offset_index;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
 
 	backing_object = object->backing_object;
 	/* Backing-object index that corresponds to index 0 of object. */
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Initial conditions
 	 */
 	if (op & OBSC_TEST_ALL_SHADOWED) {
 		/*
 		 * We do not want to have to test for the existence of cache
 		 * or swap pages in the backing object.  XXX but with the
 		 * new swapper this would be pretty easy to do.
 		 *
 		 * XXX what about anonymous MAP_SHARED memory that hasn't
 		 * been ZFOD faulted yet?  If we do not test for this, the
 		 * shadow test may succeed! XXX
 		 */
 		if (backing_object->type != OBJT_DEFAULT) {
 			return (0);
 		}
 	}
 	if (op & OBSC_COLLAPSE_WAIT) {
 		/* Flag the backing object as dead before the scan starts. */
 		vm_object_set_flag(backing_object, OBJ_DEAD);
 	}
 
 	/*
 	 * Our scan
 	 */
 	p = TAILQ_FIRST(&backing_object->memq);
 	while (p) {
 		/* Saved up front: p may be freed or renamed below. */
 		vm_page_t next = TAILQ_NEXT(p, listq);
 		/* Index this page would have in object. */
 		vm_pindex_t new_pindex = p->pindex - backing_offset_index;
 
 		if (op & OBSC_TEST_ALL_SHADOWED) {
 			vm_page_t pp;
 
 			/*
 			 * Ignore pages outside the parent object's range
 			 * and outside the parent object's mapping of the
 			 * backing object.
 			 *
 			 * note that we do not busy the backing object's
 			 * page.
 			 */
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				p = next;
 				continue;
 			}
 
 			/*
 			 * See if the parent has the page or if the parent's
 			 * object pager has the page.  If the parent has the
 			 * page but the page is not valid, the parent's
 			 * object pager must have the page.
 			 *
 			 * If this fails, the parent does not completely shadow
 			 * the object and we might as well give up now.
 			 */
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    (pp == NULL || pp->valid == 0) &&
 			    !vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				r = 0;
 				break;
 			}
 		}
 
 		/*
 		 * Check for busy page
 		 */
 		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
 			vm_page_t pp;
 
 			if (op & OBSC_COLLAPSE_NOWAIT) {
 				/*
 				 * Never sleep: busy or invalid pages are
 				 * simply left in the backing object.
 				 */
 				if ((p->oflags & VPO_BUSY) ||
 				    !p->valid || 
 				    p->busy) {
 					p = next;
 					continue;
 				}
 			} else if (op & OBSC_COLLAPSE_WAIT) {
 				if ((p->oflags & VPO_BUSY) || p->busy) {
 					/*
 					 * Both object locks are given up
 					 * for the sleep (object by hand,
 					 * backing_object via PDROP) and
 					 * retaken afterwards.
 					 */
 					vm_page_lock_queues();
 					vm_page_flag_set(p, PG_REFERENCED);
 					vm_page_unlock_queues();
 					VM_OBJECT_UNLOCK(object);
 					p->oflags |= VPO_WANTED;
 					msleep(p, VM_OBJECT_MTX(backing_object),
 					    PDROP | PVM, "vmocol", 0);
 					VM_OBJECT_LOCK(object);
 					VM_OBJECT_LOCK(backing_object);
 					/*
 					 * If we slept, anything could have
 					 * happened.  Since the object is
 					 * marked dead, the backing offset
 					 * should not have changed so we
 					 * just restart our scan.
 					 */
 					p = TAILQ_FIRST(&backing_object->memq);
 					continue;
 				}
 			}
 
 			KASSERT(
 			    p->object == backing_object,
 			    ("vm_object_backing_scan: object mismatch")
 			);
 
 			/*
 			 * Destroy any associated swap
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				swap_pager_freespace(
 				    backing_object, 
 				    p->pindex,
 				    1
 				);
 			}
 
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				/*
 				 * Page is out of the parent object's range, we
 				 * can simply destroy it.  A page that is
 				 * still wired is only removed from the
 				 * object instead of being freed.
 				 */
 				vm_page_lock_queues();
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock_queues();
 				p = next;
 				continue;
 			}
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    pp != NULL ||
 			    vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				/*
 				 * page already exists in parent OR swap exists
 				 * for this location in the parent.  Destroy
 				 * the original page from the backing object.
 				 *
 				 * Leave the parent's page alone
 				 */
 				vm_page_lock_queues();
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock_queues();
 				p = next;
 				continue;
 			}
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Rename the reservation.
 			 */
 			vm_reserv_rename(p, object, backing_object,
 			    backing_offset_index);
 #endif
 
 			/*
 			 * Page does not exist in parent, rename the
 			 * page from the backing object to the main object.
 			 *
 			 * If the page was mapped to a process, it can remain
 			 * mapped through the rename.
 			 */
 			vm_page_lock_queues();
 			vm_page_rename(p, object, new_pindex);
 			vm_page_unlock_queues();
 			/* page automatically made dirty by rename */
 		}
 		p = next;
 	}
 	return (r);
 }
 
 
 /*
  * A restricted form of collapse that can be used earlier, including
  * while paging is in progress on an object.  It makes one non-sleeping
  * (OBSC_COLLAPSE_NOWAIT) pass over the backing object, and only when
  * the backing object's reference count is exactly one.  The result is
  * not a complete collapse, but it should plug nearly all of the
  * remaining leaks.
  *
  * Both object and its backing object must be locked.
  */
 static void
 vm_object_qcollapse(vm_object_t object)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	backing_object = object->backing_object;
 	VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
 
 	if (backing_object->ref_count == 1)
 		vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  *
  *	When the backing object has other references it cannot be
  *	collapsed; instead the object is made to bypass it, provided
  *	the object shadows all of its pages.  The procedure repeats
  *	with each new backing object until no further progress is
  *	possible.
  *
  *	The object must be locked (asserted).
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	
 	while (TRUE) {
 		vm_object_t backing_object;
 
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
 		 * The object exists and the backing object exists.
 		 */
 		if ((backing_object = object->backing_object) == NULL)
 			break;
 
 		/*
 		 * we check the backing object first, because it is most likely
 		 * not collapsable.
 		 *
 		 * Both objects must be unnamed (no handle), of type
 		 * OBJT_DEFAULT or OBJT_SWAP, and not marked dead.
 		 */
 		VM_OBJECT_LOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
 		    (backing_object->flags & OBJ_DEAD) ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 
 		/*
 		 * With paging in progress on either object, only the
 		 * non-sleeping partial collapse is attempted and the
 		 * loop ends.
 		 */
 		if (
 		    object->paging_in_progress != 0 ||
 		    backing_object->paging_in_progress != 0
 		) {
 			vm_object_qcollapse(object);
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
 		 * vm_object_backing_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.
 			 */
 			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Break any reservations from backing_object.
 			 */
 			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
 				vm_reserv_break_all(backing_object);
 #endif
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 
 				/*
 				 * Free any cached pages from backing_object.
 				 */
 				if (__predict_false(backing_object->cache != NULL))
 					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
 			 * Note that the reference to
 			 * backing_object->backing_object moves from within
 			 * backing_object to within object.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			backing_object->generation++;
 			if (backing_object->backing_object) {
 				/*
 				 * On the next object's shadow list, object
 				 * takes backing_object's place.
 				 */
 				VM_OBJECT_LOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
 				    object, shadow_list);
 				/*
 				 * The shadow_count has not changed.
 				 */
 				backing_object->backing_object->generation++;
 				VM_OBJECT_UNLOCK(backing_object->backing_object);
 			}
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
 			VM_OBJECT_UNLOCK(backing_object);
 
 			/* Unlink from the global object list, then free. */
 			mtx_lock(&vm_object_list_mtx);
 			TAILQ_REMOVE(
 			    &vm_object_list, 
 			    backing_object,
 			    object_list
 			);
 			mtx_unlock(&vm_object_list_mtx);
 
 			uma_zfree(obj_zone, backing_object);
 
 			object_collapses++;
 		} else {
 			vm_object_t new_backing_object;
 
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 *
 			 * The scan is skipped when every page of object
 			 * is resident.
 			 */
 			if (object->resident_page_count != object->size &&
 			    vm_object_backing_scan(object,
 			    OBSC_TEST_ALL_SHADOWED) == 0) {
 				VM_OBJECT_UNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			backing_object->generation++;
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
 				/*
 				 * object becomes an additional shadow of
 				 * the next object and takes its own
 				 * reference on it.
 				 */
 				VM_OBJECT_LOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
 				    shadow_list
 				);
 				new_backing_object->shadow_count++;
 				new_backing_object->generation++;
 				vm_object_reference_locked(new_backing_object);
 				VM_OBJECT_UNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
 			VM_OBJECT_UNLOCK(backing_object);
 			object_bypasses++;
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	Removes all physical pages in the given range from the
  *	object's list of pages.  If the range's end is zero, all
  *	physical pages from the range's start to the end of the object
  *	are deleted.
  *
  *	If clean_only is set, valid pages that are still dirty after
  *	pmap_remove_write() has been applied are left in the object.
  *
  *	Cached pages in the range are released as well.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     boolean_t clean_only)
 {
 	vm_page_t p, next;
 	int wirings;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/* With no resident pages only the cached pages need handling. */
 	if (object->resident_page_count == 0)
 		goto skipmemq;
 
 	/*
 	 * Since physically-backed objects do not use managed pages, we can't
 	 * remove pages from the object (we must instead remove the page
 	 * references, and then destroy the object).
 	 */
 	KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
 	    object == kmem_object,
 	    ("attempt to remove pages from a physical object"));
 
 	/*
 	 * Raise the object's paging-in-progress count for the duration of
 	 * the scan; it is dropped by vm_object_pip_wakeup() below.
 	 */
 	vm_object_pip_add(object, 1);
 again:
 	/* The scan restarts here after sleeping on a busy page. */
 	vm_page_lock_queues();
 	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		if (p->pindex < start) {
 			p = vm_page_splay(start, object->root);
 			if ((object->root = p)->pindex < start)
 				p = TAILQ_NEXT(p, listq);
 		}
 	}
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter start
 	 * or (2) NULL.
 	 */
 	for (;
 	     p != NULL && (p->pindex < end || end == 0);
 	     p = next) {
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * If the page is wired for any reason besides the
 		 * existence of managed, wired mappings, then it cannot
 		 * be freed.  Its mappings are still removed and, unless
 		 * clean_only is set, its contents are invalidated.
 		 */
 		if ((wirings = p->wire_count) != 0 &&
 		    (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
 			pmap_remove_all(p);
 			/* Account for removal of managed, wired mappings. */
 			p->wire_count -= wirings;
 			if (!clean_only)
 				p->valid = 0;
 			continue;
 		}
 		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
 			goto again;
 		if (clean_only && p->valid) {
 			/* Keep pages that still hold valid dirty data. */
 			pmap_remove_write(p);
 			if (p->valid & p->dirty)
 				continue;
 		}
 		pmap_remove_all(p);
 		/* Account for removal of managed, wired mappings. */
 		if (wirings != 0)
 			p->wire_count -= wirings;
 		vm_page_free(p);
 	}
 	vm_page_unlock_queues();
 	vm_object_pip_wakeup(object);
 skipmemq:
 	if (__predict_false(object->cache != NULL))
 		vm_page_cache_free(object, start, end);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Coalesces two objects backing up adjoining
  *			regions of memory into a single object.
  *
  *	Returns TRUE if the objects were combined (a NULL prev_object
  *	trivially succeeds), FALSE otherwise.
  *
  *	NOTE:	Only works at the moment if the second object is NULL -
  *		if it's not, which object do we lock first?
  *
  *	Parameters:
  *		prev_object	First object to coalesce
  *		prev_offset	Offset into prev_object
  *		prev_size	Size of reference to prev_object
  *		next_size	Size of reference to the second object
  *
  *	Conditions:
  *	The object must *not* be locked; it is locked and unlocked here.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
 	vm_size_t prev_size, vm_size_t next_size)
 {
 	vm_pindex_t next_pindex, end_pindex;
 	vm_size_t prev_pages, next_pages;
 	boolean_t merged;
 
 	if (prev_object == NULL)
 		return (TRUE);
 
 	merged = FALSE;
 	VM_OBJECT_LOCK(prev_object);
 
 	/* Only default and swap objects can be extended this way. */
 	if (prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP)
 		goto done;
 
 	/*
 	 * Attempt a collapse first.  An object that still shadows another
 	 * one afterwards cannot be coalesced, because pages not mapped
 	 * through the previous entry may be in use anyway.
 	 */
 	vm_object_collapse(prev_object);
 	if (prev_object->backing_object != NULL)
 		goto done;
 
 	/* Work in pages from here on. */
 	prev_pages = prev_size >> PAGE_SHIFT;
 	next_pages = next_size >> PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_pages;
 	end_pindex = next_pindex + next_pages;
 
 	/*
 	 * A shared object may only be grown at its very end.
 	 */
 	if (prev_object->ref_count > 1 && prev_object->size != next_pindex)
 		goto done;
 
 	/*
 	 * Discard pages (and swap) left behind in the new range by an
 	 * earlier deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex, end_pindex,
 		    FALSE);
 		if (prev_object->type == OBJT_SWAP)
 			swap_pager_freespace(prev_object, next_pindex,
 			    next_pages);
 	}
 
 	/*
 	 * Grow the object when the new range reaches past its end.
 	 */
 	if (end_pindex > prev_object->size)
 		prev_object->size = end_pindex;
 	merged = TRUE;
 done:
 	VM_OBJECT_UNLOCK(prev_object);
 	return (merged);
 }
 
 /*
  * Set OBJ_MIGHTBEDIRTY on the object if it is not set already.  For a
  * vnode-backed object with a vnode attached, VI_OBJDIRTY is also set
  * on that vnode under its interlock.
  *
  * The object must be locked.
  */
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Already flagged: nothing more to do. */
 	if (object->flags & OBJ_MIGHTBEDIRTY)
 		return;
 	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 
 	if (object->type != OBJT_VNODE)
 		return;
 	vp = (struct vnode *)object->handle;
 	if (vp == NULL)
 		return;
 
 	VI_LOCK(vp);
 	vp->v_iflag |= VI_OBJDIRTY;
 	VI_UNLOCK(vp);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 /*
  * Report (1/0) whether object is reachable from map.
  *
  * With a NULL entry every entry of map is examined.  With a sub-map
  * entry every entry of that sub-map is examined.  Otherwise the entry's
  * own object and its chain of backing objects are compared against
  * object.  At most nentries entries of a map are visited.
  */
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t scan;
 	vm_map_entry_t cur;
 	vm_object_t obj;
 	int remaining;
 
 	if (map == NULL)
 		return (0);
 
 	/* Leaf entry: search its shadow chain. */
 	if (entry != NULL && (entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		for (obj = entry->object.vm_object; obj != NULL;
 		    obj = obj->backing_object) {
 			if (obj == object)
 				return (1);
 		}
 		return (0);
 	}
 
 	/* Whole map or sub-map: recurse on each of its entries. */
 	scan = (entry == NULL) ? map : entry->object.sub_map;
 	cur = scan->header.next;
 	for (remaining = scan->nentries;
 	    remaining != 0 && cur != &scan->header;
 	    remaining--, cur = cur->next) {
 		if (_vm_object_in_map(scan, object, cur))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Report (1/0) whether object is reachable from the address space of
  * any process, or from kernel_map, kmem_map, pager_map or buffer_map.
  *
  * The process list is walked without taking allproc_lock; the original
  * code kept the sx_slock()/sx_sunlock() calls commented out.
  */
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Processes without an address space are skipped. */
 		if (p->p_vmspace != NULL &&
 		    _vm_object_in_map(&p->p_vmspace->vm_map, object, NULL))
 			return (1);
 	}
 
 	return (_vm_object_in_map(kernel_map, object, NULL) ||
 	    _vm_object_in_map(kmem_map, object, NULL) ||
 	    _vm_object_in_map(pager_map, object, NULL) ||
 	    _vm_object_in_map(buffer_map, object, NULL));
 }
 
 /*
  * DDB "show vmochk": consistency check over every object on
  * vm_object_list.  Each internal object (no handle, OBJT_DEFAULT or
  * OBJT_SWAP) is reported if its reference count is zero and if it
  * cannot be found in any map.
  */
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
 
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		/* Only internal objects are of interest. */
 		if (object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		    object->type != OBJT_SWAP))
 			continue;
 
 		if (object->ref_count == 0)
 			db_printf("vmochk: internal obj has zero ref count: %ld\n",
 			    (long)object->size);
 
 		if (vm_object_in_map(object))
 			continue;
 		db_printf(
 		    "vmochk: internal obj is not in a map: "
 		    "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 		    object->ref_count, (u_long)object->size,
 		    (u_long)object->size,
 		    (void *)object->backing_object);
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	int count;
 
 	if (object == NULL)
 		return;
 
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	db_indent += 2;
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 	}
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* XXX. */
 #undef count
 
 /*
  * XXX Non-static entry point, needed so that vm_map_print can call
  * the "show object" printer.  All arguments are passed straight
  * through to vm_object_print_static().
  */
 void
 vm_object_print(/* db_expr_t */ long addr, boolean_t have_addr,
     /* db_expr_t */ long count, char *modif)
 {
 
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t object;
 	int nl = 0;
 	int c;
 
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		vm_pindex_t idx, fidx;
 		vm_pindex_t osize;
 		vm_paddr_t pa = -1;
 		int rcount;
 		vm_page_t m;
 
 		db_printf("new object: %p\n", (void *)object);
 		if (nl > 18) {
 			c = cngetc();
 			if (c != ' ')
 				return;
 			nl = 0;
 		}
 		nl++;
 		rcount = 0;
 		fidx = 0;
 		osize = object->size;
 		if (osize > 128)
 			osize = 128;
 		for (idx = 0; idx < osize; idx++) {
 			m = vm_page_lookup(object, idx);
 			if (m == NULL) {
 				if (rcount) {
 					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 						(long)fidx, rcount, (long)pa);
 					if (nl > 18) {
 						c = cngetc();
 						if (c != ' ')
 							return;
 						nl = 0;
 					}
 					nl++;
 					rcount = 0;
 				}
 				continue;
 			}
 
 				
 			if (rcount &&
 				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
 				++rcount;
 				continue;
 			}
 			if (rcount) {
 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 					(long)fidx, rcount, (long)pa);
 				if (nl > 18) {
 					c = cngetc();
 					if (c != ' ')
 						return;
 					nl = 0;
 				}
 				nl++;
 			}
 			fidx = idx;
 			pa = VM_PAGE_TO_PHYS(m);
 			rcount = 1;
 		}
 		if (rcount) {
 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 				(long)fidx, rcount, (long)pa);
 			if (nl > 18) {
 				c = cngetc();
 				if (c != ' ')
 					return;
 				nl = 0;
 			}
 			nl++;
 		}
 	}
 }
 #endif /* DDB */
Index: head/sys/vm/vnode_pager.c
===================================================================
--- head/sys/vm/vnode_pager.c	(revision 175201)
+++ head/sys/vm/vnode_pager.c	(revision 175202)
@@ -1,1212 +1,1212 @@
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1993, 1994 John S. Dyson
  * Copyright (c) 1995, David Greenman
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
  */
 
 /*
  * Page to/from files (vnodes).
  */
 
 /*
  * TODO:
  *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
  *	greatly re-simplify the vnode_pager.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vmmeter.h>
 #include <sys/limits.h>
 #include <sys/conf.h>
 #include <sys/sf_buf.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
 static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
     daddr_t *rtaddress, int *run);
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t);
 
 /*
  * Pager operations vector for vnode-backed objects.  Each slot named
  * here points at the corresponding vnode_pager_* routine in this file;
  * members that are not named are left zero by the designated-initializer
  * syntax.
  */
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
 /* Counter passed to getpbuf()/relpbuf() by the input routines in this file. */
 int vnode_pbuf_freecnt;
 
 /*
  * Create the VM system backing object for this vnode.
  *
  * Nothing is done when the vnode is neither a disk nor VMIO-capable, or
  * when it already has an object that is not marked OBJ_DEAD.  If the
  * existing object is dead, sleep on it and re-check after wakeup.  An
  * isize of zero means "determine the size here": IDX_TO_OFF(INT_MAX) for
  * disks, va_size from VOP_GETATTR() otherwise.
  *
  * Every visible path returns 0, including the VOP_GETATTR() failure path.
  */
 int
 vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
 {
 	vm_object_t object;
 	vm_ooffset_t size = isize;
 	struct vattr va;
 
 	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
 		return (0);
 
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_LOCK(object);
 		if (!(object->flags & OBJ_DEAD)) {
 			/* A live object is already attached; nothing to do. */
 			VM_OBJECT_UNLOCK(object);
 			return (0);
 		}
 		/*
 		 * The object is dead.  Drop the vnode lock, flag that
 		 * someone is waiting (vnode_pager_dealloc() issues the
 		 * wakeup when it sees OBJ_DISCONNECTWNT) and sleep.
 		 * msleep() is passed PDROP, so the object mutex is not
 		 * unlocked again here.  The vnode is relocked before the
 		 * loop re-reads vp->v_object.
 		 */
 		VOP_UNLOCK(vp, 0, td);
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	if (size == 0) {
 		if (vn_isdisk(vp, NULL)) {
 			size = IDX_TO_OFF(INT_MAX);
 		} else {
 			if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0)
 				return (0);
 			size = va.va_size;
 		}
 	}
 
 	object = vnode_pager_alloc(vp, size, 0, 0);
 	/*
 	 * Dereference the reference we just created.  This assumes
 	 * that the object is associated with the vp.
 	 */
 	VM_OBJECT_LOCK(object);
 	object->ref_count--;
 	VM_OBJECT_UNLOCK(object);
 	/* Balance the vref() that vnode_pager_alloc() performs on vp. */
 	vrele(vp);
 
 	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
 
 	return (0);
 }
 
 /*
  * Detach and tear down the VM object hanging off vp, if there is one.
  * The vnode must be exclusively locked (asserted below).  On return
  * vp->v_object is NULL.
  */
 void
 vnode_destroy_vobject(struct vnode *vp)
 {
 	struct vm_object *object;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_LOCK(object);
 	if (object->ref_count != 0) {
 		/*
 		 * Still referenced: only the pager is torn down.
 		 * Woe to the process that tries to page now :-).
 		 */
 		vm_pager_deallocate(object);
 		VM_OBJECT_UNLOCK(object);
 	} else if ((object->flags & OBJ_DEAD) != 0) {
 		/*
 		 * vclean() may be called twice.  The first call removes
 		 * the primary reference to the object; the second goes one
 		 * further and terminates it.  The object is already dead,
 		 * so do not terminate it a second time.
 		 */
 		VM_OBJECT_UNLOCK(object);
 	} else {
 		/*
 		 * Unreferenced and not yet dead: terminate it.  No unlock
 		 * follows, exactly as before; presumably
 		 * vm_object_terminate() consumes the object lock -- confirm.
 		 */
 		vm_object_terminate(object);
 	}
 	vp->v_object = NULL;
 }
 
 
 /*
  * Allocate (or lookup) pager for a vnode.
  * Handle is a vnode pointer.
  *
  * MPSAFE
  */
 vm_object_t
 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		  vm_ooffset_t offset)
 {
 	vm_object_t object;
 	struct vnode *vp;
 
 	/*
 	 * Pageout to vnode, no can do yet.
 	 */
 	if (handle == NULL)
 		return (NULL);
 
 	vp = (struct vnode *) handle;
 
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_alloc");
 
 	/*
 	 * If the object is being terminated, wait for it to
 	 * go away.
 	 *
 	 * The loop ends in one of two states: object == NULL (the vnode
 	 * has no object), or object is a non-dead object whose lock is
 	 * still held, because the break happens before any unlock.
 	 * msleep() is passed PDROP, so the lock is not held when the loop
 	 * goes around again.
 	 */
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_LOCK(object);
 		if ((object->flags & OBJ_DEAD) == 0)
 			break;
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
 	}
 
 	if (vp->v_usecount == 0)
 		panic("vnode_pager_alloc: no vnode reference");
 
 	if (object == NULL) {
 		/*
 		 * And an object of the appropriate size
 		 */
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		/* Remember the exact byte size alongside the page count. */
 		object->un_pager.vnp.vnp_size = size;
 
 		object->handle = handle;
 		if (VFS_NEEDSGIANT(vp->v_mount))
 			vm_object_set_flag(object, OBJ_NEEDGIANT);
 		vp->v_object = object;
 	} else {
 		/* Existing live object: take a reference, drop its lock. */
 		object->ref_count++;
 		VM_OBJECT_UNLOCK(object);
 	}
 	/* Both paths take a vnode reference on behalf of the object user. */
 	vref(vp);
 	return (object);
 }
 
 /*
  *	The object must be locked.
  */
 static void
 vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 
 	vp = object->handle;
 	if (vp == NULL)
 		panic("vnode_pager_dealloc: pager already dealloced");
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/* Let paging activity on the object finish first. */
 	vm_object_pip_wait(object, "vnpdea");
 
 	/* Sever the object from the vnode and mark it dead. */
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 
 	/* Wake threads sleeping on the object with OBJ_DISCONNECTWNT set. */
 	if ((object->flags & OBJ_DISCONNECTWNT) != 0) {
 		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
 		wakeup(object);
 	}
 
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
 	vp->v_object = NULL;
 	vp->v_vflag &= ~VV_TEXT;
 }
 
 /*
  * Report whether the backing vnode holds data for page index pindex.
  *
  * The object must be locked on entry (asserted).  The lock is dropped
  * around the VOP_BMAP() call and reacquired before returning.
  *
  * Returns FALSE when there is no usable vnode, when pindex lies at or
  * beyond the recorded file size, or when VOP_BMAP() maps the block to -1.
  * Returns TRUE otherwise, including when VOP_BMAP() itself fails.
  *
  * before/after, when non-NULL, are handed to VOP_BMAP() and then
  * converted from filesystem blocks to pages.
  */
 static boolean_t
 vnode_pager_haspage(object, pindex, before, after)
 	vm_object_t object;
 	vm_pindex_t pindex;
 	int *before;
 	int *after;
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
 	int err;
 	daddr_t reqblock;
 	int poff;
 	int bsize;
 	int pagesperblock, blocksperpage;
 	int vfslocked;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * If no vp or vp is doomed or marked transparent to VM, we do not
 	 * have the page.
 	 */
 	if (vp == NULL || vp->v_iflag & VI_DOOMED)
 		return FALSE;
 	/*
 	 * If the offset is beyond end of file we do
 	 * not have the page.
 	 */
 	if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
 		return FALSE;
 
 	/*
 	 * Translate the page index into a filesystem block number.  Either
 	 * a block holds one or more pages (pagesperblock > 0), or a page
 	 * spans several blocks (blocksperpage is then used).
 	 */
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 	blocksperpage = 0;
 	if (pagesperblock > 0) {
 		reqblock = pindex / pagesperblock;
 	} else {
 		blocksperpage = (PAGE_SIZE / bsize);
 		reqblock = pindex * blocksperpage;
 	}
 	VM_OBJECT_UNLOCK(object);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
 	VFS_UNLOCK_GIANT(vfslocked);
 	VM_OBJECT_LOCK(object);
 	/* A failed BMAP is reported as "page present". */
 	if (err)
 		return TRUE;
 	/* Block number -1: no backing block for this page. */
 	if (bn == -1)
 		return FALSE;
 	if (pagesperblock > 0) {
 		/* poff: index of the page within its block. */
 		poff = pindex - (reqblock * pagesperblock);
 		if (before) {
 			*before *= pagesperblock;
 			*before += poff;
 		}
 		if (after) {
 			int numafter;
 			*after *= pagesperblock;
 			numafter = pagesperblock - (poff + 1);
 			/* Do not count pages past the recorded file size. */
 			if (IDX_TO_OFF(pindex + numafter) >
 			    object->un_pager.vnp.vnp_size) {
 				numafter =
 		    		    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
 				    pindex;
 			}
 			*after += numafter;
 		}
 	} else {
 		if (before) {
 			*before /= blocksperpage;
 		}
 
 		if (after) {
 			*after /= blocksperpage;
 		}
 	}
 	return TRUE;
 }
 
 /*
  * Lets the VM system know about a change in size for a file.
  * We adjust our own internal size and flush any cached pages in
  * the associated object that are affected by the size change.
  *
  * Note: this routine may be invoked as a result of a pager put
  * operation (possibly at object termination time), so we must be careful.
  */
 void
 vnode_pager_setsize(vp, nsize)
 	struct vnode *vp;
 	vm_ooffset_t nsize;
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t nobjsize;
 
 	/* No VM object attached: nothing to resize. */
 	if ((object = vp->v_object) == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	if (nsize == object->un_pager.vnp.vnp_size) {
 		/*
 		 * Hasn't changed size
 		 */
 		VM_OBJECT_UNLOCK(object);
 		return;
 	}
 	/* New object size in pages; a partial last page is rounded up. */
 	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
 	if (nsize < object->un_pager.vnp.vnp_size) {
 		/*
 		 * File has shrunk. Toss any cached pages beyond the new EOF.
 		 */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    FALSE);
 		/*
 		 * this gets rid of garbage at the end of a page that is now
 		 * only partially backed by the vnode.
 		 *
 		 * XXX for some reason (I don't know yet), if we take a
 		 * completely invalid page and mark it partially valid
 		 * it can screw up NFS reads, so we don't allow the case.
 		 */
 		if ((nsize & PAGE_MASK) &&
 		    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
 		    m->valid != 0) {
 			/* base: first byte past EOF in the page; size: rest. */
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
 			/*
 			 * Clear out partial-page garbage in case
 			 * the page has been mapped.
 			 */
 			pmap_zero_page_area(m, base, size);
 
 			/*
 			 * Clear out partial-page dirty bits.  This
 			 * has the side effect of setting the valid
 			 * bits, but that is ok.  There are a bunch
 			 * of places in the VM system where we expected
 			 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
 			 * case is one of them.  If the page is still
 			 * partially dirty, make it fully dirty.
 			 *
 			 * note that we do not clear out the valid
 			 * bits.  This would prevent bogus_page
 			 * replacement from working properly.
 			 */
 			vm_page_lock_queues();
 			vm_page_set_validclean(m, base, size);
 			if (m->dirty != 0)
 				m->dirty = VM_PAGE_BITS_ALL;
 			vm_page_unlock_queues();
 		} else if ((nsize & PAGE_MASK) &&
 		    __predict_false(object->cache != NULL)) {
 			/*
 			 * No resident, partly valid page at the new EOF,
 			 * but the object has cached pages: free those in
 			 * the range [OFF_TO_IDX(nsize), nobjsize).
 			 */
 			vm_page_cache_free(object, OFF_TO_IDX(nsize),
 			    nobjsize);
 		}
 	}
 	/* Record the new byte size and page count (growth and shrink). */
 	object->un_pager.vnp.vnp_size = nsize;
 	object->size = nobjsize;
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  * calculate the linear (byte) disk address of specified virtual
  * file address
  */
 static int
 vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
     int *run)
 {
 	daddr_t blkno, blkoff;
 	int error, iosize;
 
 	/* Negative offsets and doomed vnodes cannot be mapped. */
 	if (address < 0 || (vp->v_iflag & VI_DOOMED))
 		return -1;
 
 	/* Split the byte offset into filesystem block and remainder. */
 	iosize = vp->v_mount->mnt_stat.f_iosize;
 	blkno = address / iosize;
 	blkoff = address % iosize;
 
 	error = VOP_BMAP(vp, blkno, NULL, rtaddress, run, NULL);
 	if (error != 0)
 		return (error);
 
 	/* -1 means "no block"; otherwise advance to the byte's sector. */
 	if (*rtaddress != -1)
 		*rtaddress += blkoff / DEV_BSIZE;
 
 	/*
 	 * Convert the run reported by VOP_BMAP() from blocks to pages,
 	 * counting from the page that contains address.
 	 */
 	if (run != NULL) {
 		*run += 1;
 		*run *= iosize/PAGE_SIZE;
 		*run -= blkoff/PAGE_SIZE;
 	}
 
 	return (0);
 }
 
 /*
  * small block filesystem vnode pager input
  */
 static int
 vnode_pager_input_smlfs(object, m)
 	vm_object_t object;
 	vm_page_t m;
 {
 	int i;
 	struct vnode *vp;
 	struct bufobj *bo;
 	struct buf *bp;
 	struct sf_buf *sf;
 	daddr_t fileaddr;
 	vm_offset_t bsize;
 	int error = 0;
 
 	vp = object->handle;
 	if (vp->v_iflag & VI_DOOMED)
 		return VM_PAGER_BAD;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	/*
 	 * Only the bufobj output (bo) of this call is used below; the
 	 * return value is not checked.
 	 */
 	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
 
 	/* Map the page so the individual blocks can be read into it. */
 	sf = sf_buf_alloc(m, 0);
 
 	/* Fill the page one filesystem block (bsize bytes) at a time. */
 	for (i = 0; i < PAGE_SIZE / bsize; i++) {
 		vm_ooffset_t address;
 
 		/* Skip blocks whose valid bits are already set. */
 		if (vm_page_bits(i * bsize, bsize) & m->valid)
 			continue;
 
 		address = IDX_TO_OFF(m->pindex) + i * bsize;
 		if (address >= object->un_pager.vnp.vnp_size) {
 			/* Past EOF: handled like an unmapped block below. */
 			fileaddr = -1;
 		} else {
 			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
 			if (error)
 				break;
 		}
 		if (fileaddr != -1) {
 			bp = getpbuf(&vnode_pbuf_freecnt);
 
 			/* build a minimal buffer header */
 			bp->b_iocmd = BIO_READ;
 			bp->b_iodone = bdone;
 			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 			bp->b_rcred = crhold(curthread->td_ucred);
 			bp->b_wcred = crhold(curthread->td_ucred);
 			bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
 			bp->b_blkno = fileaddr;
 			pbgetbo(bo, bp);
 			bp->b_bcount = bsize;
 			bp->b_bufsize = bsize;
 			bp->b_runningbufspace = bp->b_bufsize;
 			atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
 			/* do the input */
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
 
 			/* Wait here for the read to complete. */
 			bwait(bp, PVM, "vnsrd");
 
 			if ((bp->b_ioflags & BIO_ERROR) != 0)
 				error = EIO;
 
 			/*
 			 * free the buffer header back to the swap buffer pool
 			 */
 			pbrelbo(bp);
 			relpbuf(bp, &vnode_pbuf_freecnt);
 			if (error)
 				break;
 
 			VM_OBJECT_LOCK(object);
 			vm_page_lock_queues();
 			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(object);
 		} else {
 			/*
 			 * No backing block: mark the range valid and clean
 			 * and fill it with zeros.
 			 */
 			VM_OBJECT_LOCK(object);
 			vm_page_lock_queues();
 			vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(object);
 			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
 		}
 	}
 	sf_buf_free(sf);
 	vm_page_lock_queues();
 	pmap_clear_modify(m);
 	vm_page_unlock_queues();
 	/* Any error that broke out of the loop becomes VM_PAGER_ERROR. */
 	if (error) {
 		return VM_PAGER_ERROR;
 	}
 	return VM_PAGER_OK;
 
 }
 
 
 /*
  * old style vnode pager input routine
  */
 static int
 vnode_pager_input_old(object, m)
 	vm_object_t object;
 	vm_page_t m;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 	int size;
 	struct sf_buf *sf;
 	struct vnode *vp;
 
 	/*
 	 * The object is locked on entry (asserted) and on return; the lock
 	 * is dropped only for the duration of the VOP_READ() below.
 	 */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	error = 0;
 
 	/*
 	 * Return failure if beyond current EOF
 	 */
 	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
 		return VM_PAGER_BAD;
 	} else {
 		/* Read a whole page, or up to EOF if that comes first. */
 		size = PAGE_SIZE;
 		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
 			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 
 		/*
 		 * Allocate a kernel virtual address and initialize so that
 		 * we can use VOP_READ/WRITE routines.
 		 */
 		sf = sf_buf_alloc(m, 0);
 
 		aiov.iov_base = (caddr_t)sf_buf_kva(sf);
 		aiov.iov_len = size;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = IDX_TO_OFF(m->pindex);
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_rw = UIO_READ;
 		auio.uio_resid = size;
 		auio.uio_td = curthread;
 
 		error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
 		if (!error) {
 			/* count: bytes actually transferred by the read. */
 			int count = size - auio.uio_resid;
 
 			/*
 			 * A read that transferred nothing is an error; a
 			 * short read has the rest of the page zeroed.
 			 */
 			if (count == 0)
 				error = EINVAL;
 			else if (count != PAGE_SIZE)
 				bzero((caddr_t)sf_buf_kva(sf) + count,
 				    PAGE_SIZE - count);
 		}
 		sf_buf_free(sf);
 
 		VM_OBJECT_LOCK(object);
 	}
 	vm_page_lock_queues();
 	pmap_clear_modify(m);
 	vm_page_undirty(m);
 	vm_page_unlock_queues();
 	/* The page is marked fully valid only when the read succeeded. */
 	if (!error)
 		m->valid = VM_PAGE_BITS_ALL;
 	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
 }
 
 /*
  * generic vnode pager input routine
  */
 
 /*
  * Local media VFS's that do not implement their own VOP_GETPAGES
  * should have their VOP_GETPAGES call to vnode_pager_generic_getpages()
  * to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_GETPAGES.
  */
 static int
 vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
 {
 	struct vnode *vp;
 	int giant_held, nbytes, result;
 
 	/* VOP_GETPAGES() is given a byte count, not a page count. */
 	nbytes = count * PAGE_SIZE;
 
 	/*
 	 * The object lock is released for the duration of the VOP call and
 	 * taken again before returning to the caller.
 	 */
 	vp = object->handle;
 	VM_OBJECT_UNLOCK(object);
 	giant_held = VFS_LOCK_GIANT(vp->v_mount);
 	result = VOP_GETPAGES(vp, m, nbytes, reqpage, 0);
 	KASSERT(result != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 	VFS_UNLOCK_GIANT(giant_held);
 	VM_OBJECT_LOCK(object);
 	return (result);
 }
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
  */
 int
 vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
 	struct vnode *vp;
 	vm_page_t *m;
 	int bytecount;
 	int reqpage;
 {
 	vm_object_t object;
 	vm_offset_t kva;
 	off_t foff, tfoff, nextoff;
 	int i, j, size, bsize, first;
 	daddr_t firstaddr, reqblock;
 	struct bufobj *bo;
 	int runpg;
 	int runend;
 	struct buf *bp;
 	int count;
 	int error;
 
 	object = vp->v_object;
 	/* bytecount is converted to the number of entries used in m[]. */
 	count = bytecount / PAGE_SIZE;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("vnode_pager_generic_getpages does not support devices"));
 	if (vp->v_iflag & VI_DOOMED)
 		return VM_PAGER_BAD;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	/* get the UNDERLYING device for the file with VOP_BMAP() */
 
 	/*
 	 * originally, we did not check for an error return value -- assuming
 	 * an fs always has a bmap entry point -- that assumption is wrong!!!
 	 */
 	foff = IDX_TO_OFF(m[reqpage]->pindex);
 
 	/*
 	 * if we can't bmap, use old VOP code
 	 */
 	error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
 	if (error == EOPNOTSUPP) {
 		/*
 		 * Free every page except the requested one, then read that
 		 * page through vnode_pager_input_old().
 		 */
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		PCPU_INC(cnt.v_vnodein);
 		PCPU_INC(cnt.v_vnodepgsin);
 		error = vnode_pager_input_old(object, m[reqpage]);
 		VM_OBJECT_UNLOCK(object);
 		return (error);
 	} else if (error != 0) {
 		/* Any other BMAP failure: free the extra pages and fail. */
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return (VM_PAGER_ERROR);
 
 		/*
 		 * if the blocksize is smaller than a page size, then use
 		 * special small filesystem code.  NFS sometimes has a small
 		 * blocksize, but it can handle large reads itself.
 		 */
 	} else if ((PAGE_SIZE / bsize) > 1 &&
 	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		PCPU_INC(cnt.v_vnodein);
 		PCPU_INC(cnt.v_vnodepgsin);
 		return vnode_pager_input_smlfs(object, m[reqpage]);
 	}
 
 	/*
 	 * If we have a completely valid page available to us, we can
 	 * clean up and return.  Otherwise we have to re-read the
 	 * media.
 	 */
 	VM_OBJECT_LOCK(object);
 	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return VM_PAGER_OK;
 	} else if (reqblock == -1) {
 		/*
 		 * BMAP reported no block for the requested page: hand back
 		 * a zero-filled, fully valid page without doing any I/O.
 		 */
 		pmap_zero_page(m[reqpage]);
 		vm_page_undirty(m[reqpage]);
 		m[reqpage]->valid = VM_PAGE_BITS_ALL;
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return (VM_PAGER_OK);
 	}
 	m[reqpage]->valid = 0;
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * here on direct device I/O
 	 */
 	firstaddr = -1;
 
 	/*
 	 * calculate the run that includes the required page
 	 *
 	 * "first" tracks the index where the current candidate run starts;
 	 * runpg is filled in by vnode_pager_addr() for the page at index i.
 	 */
 	for (first = 0, i = 0; i < count; i = runend) {
 		if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
 		    &runpg) != 0) {
 			/* Free the remaining pages except reqpage and fail. */
 			VM_OBJECT_LOCK(object);
 			vm_page_lock_queues();
 			for (; i < count; i++)
 				if (i != reqpage)
 					vm_page_free(m[i]);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(object);
 			return (VM_PAGER_ERROR);
 		}
 		if (firstaddr == -1) {
 			/*
 			 * No disk block behind page i.  That is fatal for
 			 * the requested page when it lies inside the file;
 			 * otherwise drop the page and restart the run at
 			 * the next index.
 			 */
 			VM_OBJECT_LOCK(object);
 			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
 				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
 				    (intmax_t)firstaddr, (uintmax_t)(foff >> 32),
 				    (uintmax_t)foff,
 				    (uintmax_t)
 				    (object->un_pager.vnp.vnp_size >> 32),
 				    (uintmax_t)object->un_pager.vnp.vnp_size);
 			}
 			vm_page_lock_queues();
 			vm_page_free(m[i]);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(object);
 			runend = i + 1;
 			first = runend;
 			continue;
 		}
 		runend = i + runpg;
 		if (runend <= reqpage) {
 			/* Run ends before reqpage: free it and keep going. */
 			VM_OBJECT_LOCK(object);
 			vm_page_lock_queues();
 			for (j = i; j < runend; j++)
 				vm_page_free(m[j]);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(object);
 		} else {
 			/*
 			 * Run covers reqpage: free any pages past the end
 			 * of the run, shrink count to match, and stop.
 			 */
 			if (runpg < (count - first)) {
 				VM_OBJECT_LOCK(object);
 				vm_page_lock_queues();
 				for (i = first + runpg; i < count; i++)
 					vm_page_free(m[i]);
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(object);
 				count = first + runpg;
 			}
 			break;
 		}
 		first = runend;
 	}
 
 	/*
 	 * the first and last page have been calculated now, move input pages
 	 * to be zero based...
 	 */
 	if (first != 0) {
 		m += first;
 		count -= first;
 		reqpage -= first;
 	}
 
 	/*
 	 * calculate the file virtual address for the transfer
 	 */
 	foff = IDX_TO_OFF(m[0]->pindex);
 
 	/*
 	 * calculate the size of the transfer
 	 */
 	size = count * PAGE_SIZE;
 	KASSERT(count > 0, ("zero count"));
 	/* Do not read past the recorded end of file. */
 	if ((foff + size) > object->un_pager.vnp.vnp_size)
 		size = object->un_pager.vnp.vnp_size - foff;
 	KASSERT(size > 0, ("zero size"));
 
 	/*
 	 * round up physical size for real devices.
 	 */
 	if (1) {
 		int secmask = bo->bo_bsize - 1;
 		KASSERT(secmask < PAGE_SIZE && secmask > 0,
 		    ("vnode_pager_generic_getpages: sector size %d too large",
 		    secmask + 1));
 		size = (size + secmask) & ~secmask;
 	}
 
 	bp = getpbuf(&vnode_pbuf_freecnt);
 	kva = (vm_offset_t) bp->b_data;
 
 	/*
 	 * and map the pages to be read into the kva
 	 */
 	pmap_qenter(kva, m, count);
 
 	/* build a minimal buffer header */
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = bdone;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	bp->b_blkno = firstaddr;
 	pbgetbo(bo, bp);
 	bp->b_bcount = size;
 	bp->b_bufsize = size;
 	bp->b_runningbufspace = bp->b_bufsize;
 	atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, count);
 
 	/* do the input */
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/* Wait here for the read to complete. */
 	bwait(bp, PVM, "vnread");
 
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		error = EIO;
 
 	if (!error) {
 		/* Zero the part of the mapping the transfer did not cover. */
 		if (size != count * PAGE_SIZE)
 			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
 	}
 	pmap_qremove(kva, count);
 
 	/*
 	 * free the buffer header back to the swap buffer pool
 	 */
 	pbrelbo(bp);
 	relpbuf(bp, &vnode_pbuf_freecnt);
 
 	/*
 	 * Walk the pages that were part of the transfer; tfoff is the file
 	 * offset of page i and nextoff the offset just past it.
 	 */
 	VM_OBJECT_LOCK(object);
 	vm_page_lock_queues();
 	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
 		mt = m[i];
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
 			 * Read filled up entire page.
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(mt);	/* should be an assert? XXX */
 			pmap_clear_modify(mt);
 		} else {
 			/*
 			 * Read did not fill up entire page.  Since this
 			 * is getpages, the page may be mapped, so we have
 			 * to zero the invalid portions of the page even
 			 * though we aren't setting them valid.
 			 *
 			 * Currently we do not set the entire page valid,
 			 * we just try to clear the piece that we couldn't
 			 * read.
 			 */
 			vm_page_set_validclean(mt, 0,
 			    object->un_pager.vnp.vnp_size - tfoff);
 			/* handled by vm_fault now */
 			/* vm_page_zero_invalid(mt, FALSE); */
 		}
 		
 		if (i != reqpage) {
 
 			/*
 			 * whether or not to leave the page activated is up in
 			 * the air, but we should put the page on a page queue
 			 * somewhere. (it already is in the object). Result:
 			 * It appears that empirical results show that
 			 * deactivating pages is best.
 			 */
 
 			/*
 			 * just in case someone was asking for this page we
 			 * now tell them that it is ok to use
 			 */
 			if (!error) {
 				if (mt->oflags & VPO_WANTED)
 					vm_page_activate(mt);
 				else
 					vm_page_deactivate(mt);
 				vm_page_wakeup(mt);
 			} else {
 				vm_page_free(mt);
 			}
 		}
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(object);
 	if (error) {
 		printf("vnode_pager_getpages: I/O read error\n");
 	}
 	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 /*
  * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
  * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
  * vnode_pager_generic_putpages() to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_PUTPAGES.
  */
 /*
  * Pager putpages entry point: pass the pages to the vnode's VOP_PUTPAGES.
  *
  * The object lock is released around the VOP call and reacquired before
  * returning.  count is converted to a byte count for the VOP; per-page
  * results are written by the VOP into rtvals.  The VOP's own return value
  * is only checked by the KASSERT.
  */
 static void
 vnode_pager_putpages(object, m, count, sync, rtvals)
 	vm_object_t object;
 	vm_page_t *m;
 	int count;
 	boolean_t sync;
 	int *rtvals;
 {
 	int rtval;
 	struct vnode *vp;
 	int bytes = count * PAGE_SIZE;
 
 	/*
 	 * Force synchronous operation if we are extremely low on memory
 	 * to prevent a low-memory deadlock.  VOP operations often need to
 	 * allocate more memory to initiate the I/O ( i.e. do a BMAP 
 	 * operation ).  The swapper handles the case by limiting the amount
 	 * of asynchronous I/O, but that sort of solution doesn't scale well
 	 * for the vnode pager without a lot of work.
 	 *
 	 * Also, the backing vnode's iodone routine may not wake the pageout
 	 * daemon up.  This should be probably be addressed XXX.
 	 */
 
 	/*
 	 * NOTE(review): sync is forwarded as the VOP_PUTPAGES flags word but
 	 * is OR'ed with OBJPC_SYNC here -- confirm that bit matches the
 	 * VM_PAGER_PUT_SYNC bit the filesystem tests.
 	 */
 	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
 		sync |= OBJPC_SYNC;
 
 	/*
 	 * Call device-specific putpages function
 	 */
 	vp = object->handle;
 	VM_OBJECT_UNLOCK(object);
 	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
 	KASSERT(rtval != EOPNOTSUPP, 
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_LOCK(object);
 }
 
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_PUTPAGES.
  *
  * This is typically called indirectly via the pageout daemon and
  * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * than delayed.
  */
 int
 vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
 	struct vnode *vp;
 	vm_page_t *m;
 	int bytecount;
 	int flags;
 	int *rtvals;
 {
 	int i;
 	vm_object_t object;
 	int count;
 
 	int maxsize, ncount;
 	vm_ooffset_t poffset;
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 	int ioflags;
 	int ppscheck = 0;
 	/* State for ppsratecheck(): rate-limits the diagnostics below. */
 	static struct timeval lastfail;
 	static int curfail;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
 
 	/* Default every page's result to "try again". */
 	for (i = 0; i < count; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
 
 	if ((int64_t)m[0]->pindex < 0) {
 		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
 			(long)m[0]->pindex, (u_long)m[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
 		return VM_PAGER_BAD;
 	}
 
 	/* maxsize: bytes to write; ncount: pages that maxsize covers. */
 	maxsize = count * PAGE_SIZE;
 	ncount = count;
 
 	poffset = IDX_TO_OFF(m[0]->pindex);
 
 	/*
 	 * If the page-aligned write is larger than the actual file we
 	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
 	 *
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > poffset) {
 			int pgoff;
 
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
 				vm_page_lock_queues();
 				vm_page_clear_dirty(m[ncount - 1], pgoff,
 					PAGE_SIZE - pgoff);
 				vm_page_unlock_queues();
 			}
 		} else {
 			/* The whole range starts at or past EOF. */
 			maxsize = 0;
 			ncount = 0;
 		}
 		/* Pages entirely beyond EOF are reported as bad. */
 		if (ncount < count) {
 			for (i = ncount; i < count; i++) {
 				rtvals[i] = VM_PAGER_BAD;
 			}
 		}
 	}
 
 	/*
 	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
 	 * rather than a bdwrite() to prevent paging I/O from saturating 
 	 * the buffer cache.  Dummy-up the sequential heuristic to cause
 	 * large ranges to cluster.  If neither IO_SYNC or IO_ASYNC is set,
 	 * the system decides how to cluster.
 	 */
 	ioflags = IO_VMIO;
 	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
 		ioflags |= IO_SYNC;
 	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
 	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
 	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
 
 	/*
 	 * Build a uio with a null base address and UIO_NOCOPY describing
 	 * maxsize bytes at file offset poffset.
 	 */
 	aiov.iov_base = (caddr_t) 0;
 	aiov.iov_len = maxsize;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = poffset;
 	auio.uio_segflg = UIO_NOCOPY;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_resid = maxsize;
 	auio.uio_td = (struct thread *) 0;
 	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
 	PCPU_INC(cnt.v_vnodeout);
 	PCPU_ADD(cnt.v_vnodepgsout, ncount);
 
 	if (error) {
 		if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
 			printf("vnode_pager_putpages: I/O error %d\n", error);
 	}
 	if (auio.uio_resid) {
 		if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
 			printf("vnode_pager_putpages: residual I/O %d at %lu\n",
 			    auio.uio_resid, (u_long)m[0]->pindex);
 	}
 	/*
 	 * The first ncount pages are reported as VM_PAGER_OK whether or
 	 * not VOP_WRITE() returned an error; errors are only logged above.
 	 */
 	for (i = 0; i < ncount; i++) {
 		rtvals[i] = VM_PAGER_OK;
 	}
 	return rtvals[0];
 }
 
 /*
  * Walk the backing-object chain starting at first_object, find the first
  * OBJT_VNODE object and return its vnode after a successful vget() with
  * LK_SHARED.
  *
  * first_object must be locked on entry (asserted) and is locked again on
  * every return path.  Intermediate objects are locked hand-over-hand and
  * are unlocked before returning.
  *
  * Returns NULL when the chain contains no vnode object, when the vnode
  * object is marked OBJ_DEAD, or when the object stopped being a vnode
  * object while vget() was failing.
  */
 struct vnode *
 vnode_pager_lock(vm_object_t first_object)
 {
 	struct vnode *vp;
 	vm_object_t backing_object, object;
 
 	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
 	for (object = first_object; object != NULL; object = backing_object) {
 		if (object->type != OBJT_VNODE) {
 			/*
 			 * Step down the chain: lock the backing object
 			 * before releasing the current one.  first_object
 			 * itself is never unlocked here.
 			 */
 			if ((backing_object = object->backing_object) != NULL)
 				VM_OBJECT_LOCK(backing_object);
 			if (object != first_object)
 				VM_OBJECT_UNLOCK(object);
 			continue;
 		}
 	retry:
 		if (object->flags & OBJ_DEAD) {
 			if (object != first_object)
 				VM_OBJECT_UNLOCK(object);
 			return NULL;
 		}
 		vp = object->handle;
 		/*
 		 * Take the vnode interlock before dropping the object locks;
 		 * vget() is passed LK_INTERLOCK for it.
 		 */
 		VI_LOCK(vp);
 		VM_OBJECT_UNLOCK(object);
 		if (first_object != object)
 			VM_OBJECT_UNLOCK(first_object);
 		VFS_ASSERT_GIANT(vp->v_mount);
 		if (vget(vp, LK_CANRECURSE | LK_INTERLOCK |
 		    LK_RETRY | LK_SHARED, curthread)) {
 			/*
 			 * vget() failed.  Reacquire the object locks and try
 			 * again unless the object is no longer vnode-backed.
 			 */
 			VM_OBJECT_LOCK(first_object);
 			if (object != first_object)
 				VM_OBJECT_LOCK(object);
 			if (object->type != OBJT_VNODE) {
 				if (object != first_object)
 					VM_OBJECT_UNLOCK(object);
 				return NULL;
 			}
 			printf("vnode_pager_lock: retrying\n");
 			goto retry;
 		}
 		VM_OBJECT_LOCK(first_object);
 		return (vp);
 	}
 	return NULL;
 }