Index: head/sys/fs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vnops.c (revision 154151) +++ head/sys/fs/cd9660/cd9660_vnops.c (revision 154152) @@ -1,832 +1,833 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vop_setattr_t cd9660_setattr; static vop_open_t cd9660_open; static vop_access_t cd9660_access; static vop_getattr_t cd9660_getattr; static vop_ioctl_t cd9660_ioctl; static vop_pathconf_t cd9660_pathconf; static vop_read_t cd9660_read; struct isoreaddir; static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off); static int iso_shipdir(struct isoreaddir *idp); static vop_readdir_t cd9660_readdir; static vop_readlink_t cd9660_readlink; static vop_strategy_t cd9660_strategy; /* * Setattr call. Only allowed for block and character special devices. 
*/ static int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: + case VMARKER: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); mode_t mode = ap->a_mode; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * filesystem. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid, ip->inode.iso_gid, ap->a_mode, ap->a_cred, NULL)); } static int cd9660_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct iso_node *ip = VTOI(ap->a_vp); vnode_create_vobject(ap->a_vp, ip->i_size, ap->a_td); return 0; } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_mnt->im_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = ap->a_td; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for ioctl. 
*/ static int cd9660_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); switch (ap->a_command) { case FIOGETLBA: *(int *)(ap->a_data) = ip->iso_start; return 0; default: return (ENOTTY); } } /* * Vnode op for reading. */ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct iso_node *ip = VTOI(vp); struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; int seqcount; long size, n, on; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); seqcount = ap->a_ioflag >> IO_SEQSHIFT; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (seqcount > 1 && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; static int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } static int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = 
idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode *vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. 
*/ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level, imp->im_flags, imp->im_d2l); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. */ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... 
* 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = uma_zalloc(namei_zone, M_WAITOK); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) uma_zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); uma_zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; struct bufobj *bo; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (ip->i_mnt->im_bshift - DEV_BSHIFT); if ((long)bp->b_blkno == -1) /* XXX: cut&paste junk ? */ clrbuf(bp); } if ((long)bp->b_blkno == -1) { /* XXX: cut&paste junk ? */ bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_mnt->im_bo; BO_STRATEGY(bo, bp); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. 
*/ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures for cd9660 */ struct vop_vector cd9660_vnodeops = { .vop_default = &default_vnodeops, .vop_open = cd9660_open, .vop_access = cd9660_access, .vop_bmap = cd9660_bmap, .vop_cachedlookup = cd9660_lookup, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_ioctl = cd9660_ioctl, .vop_lookup = vfs_cache_lookup, .vop_pathconf = cd9660_pathconf, .vop_read = cd9660_read, .vop_readdir = cd9660_readdir, .vop_readlink = cd9660_readlink, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_strategy = cd9660_strategy, }; /* * Special device vnode ops */ struct vop_vector cd9660_fifoops = { .vop_default = &fifo_specops, .vop_access = cd9660_access, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, }; Index: head/sys/gnu/fs/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 154151) +++ head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 154152) @@ -1,1157 +1,1161 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ /*- * COPYRIGHT.INFO says this has some GPL'd code from ext2_super.c in it * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *, struct thread *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_vptofh_t ext2_vptofh; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, .vfs_vptofh = ext2_vptofh, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); #define bsd_malloc malloc #define bsd_free free static int ext2_check_sb_compat(struct ext2_super_block *es, struct cdev *dev, int ronly); static int compute_sb_data(struct vnode * devvp, struct ext2_super_block * es, struct ext2_sb_info * fs); static const char *ext2_opts[] = { "from", "export" }; /* * VFS Operations. * * mount system call */ static int ext2_mount(mp, td) struct mount *mp; struct thread *td; { struct export_args *export; struct vfsoptlist *opts; struct vnode *devvp; struct ext2mount *ump = 0; struct ext2_sb_info *fs; char *path, *fspec; int error, flags, len; mode_t accessmode; struct nameidata nd, *ndp = &nd; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN - 1) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. 
*/ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->s_rd_only == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT, td); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp, LK_NOWAIT, 0, td)) return (EBUSY); error = ext2_flushfiles(mp, flags, td); vfs_unbusy(mp, td); if (!error && fs->s_wasvalid) { fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } fs->s_rd_only = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->s_rd_only && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->s_es, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (suser(td)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td)) != 0) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); } DROP_GIANT(); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 || (fs->s_es->s_state & EXT2_ERROR_FS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } fs->s_es->s_state &= ~EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); fs->s_rd_only = 0; mp->mnt_flag &= ~MNT_RDONLY; } if (fspec == NULL) { error = vfs_getopt(opts, "export", (void **)&export, &len); if (error || len != sizeof(struct export_args)) return (EINVAL); /* Process export requests. */ return (vfs_export(mp, export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (suser(td)) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td)) != 0) { vput(devvp); return (error); } } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp, td); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). 
*/ strncpy(fs->fs_fsmnt, path, MAXMNTLEN); fs->fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } /* * checks that the data in the descriptor blocks make sense * this is taken from ext2/super.c */ static int ext2_check_descriptors (struct ext2_sb_info * sb) { int i; int desc_block = 0; unsigned long block = sb->s_es->s_first_data_block; struct ext2_group_desc * gdp = NULL; /* ext2_debug ("Checking group descriptors"); */ for (i = 0; i < sb->s_groups_count; i++) { /* examine next descriptor block */ if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sb->s_group_desc[desc_block++]->b_data; if (gdp->bg_block_bitmap < block || gdp->bg_block_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Block bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_block_bitmap); return 0; } if (gdp->bg_inode_bitmap < block || gdp->bg_inode_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_bitmap); return 0; } if (gdp->bg_inode_table < block || gdp->bg_inode_table + sb->s_itb_per_group >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode table for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_table); return 0; } block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; } static int ext2_check_sb_compat(es, dev, ronly) struct ext2_super_block *es; struct cdev *dev; int ronly; { if (es->s_magic != EXT2_SUPER_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), es->s_magic, EXT2_SUPER_MAGIC); return (1); } if (es->s_rev_level > EXT2_GOOD_OLD_REV) { if (es->s_feature_incompat & ~EXT2_FEATURE_INCOMPAT_SUPP) { printf( "WARNING: mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } if (!ronly && (es->s_feature_ro_compat & ~EXT2_FEATURE_RO_COMPAT_SUPP)) { printf( "WARNING: R/W mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } } return (0); } /* * this computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in */ static int compute_sb_data(devvp, es, fs) struct vnode * devvp; struct ext2_super_block * es; struct ext2_sb_info * fs; { int db_count, error; int i, j; int logic_sb_block = 1; /* XXX for now */ #if 1 #define V(v) #else #define V(v) printf(#v"= %d\n", fs->v); #endif fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; V(s_blocksize) fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size; V(s_bshift) fs->s_fsbtodb = es->s_log_block_size + 1; V(s_fsbtodb) fs->s_qbmask = fs->s_blocksize - 1; V(s_bmask) fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es); V(s_blocksize_bits) fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size; V(s_frag_size) if (fs->s_frag_size) fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size; V(s_frags_per_block) fs->s_blocks_per_group = es->s_blocks_per_group; V(s_blocks_per_group) fs->s_frags_per_group = es->s_frags_per_group; V(s_frags_per_group) fs->s_inodes_per_group = es->s_inodes_per_group; V(s_inodes_per_group) fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE; V(s_inodes_per_block) fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block; V(s_itb_per_group) fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc); V(s_desc_per_block) /* s_resuid / s_resgid ? 
*/ fs->s_groups_count = (es->s_blocks_count - es->s_first_data_block + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); V(s_groups_count) db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) / EXT2_DESC_PER_BLOCK(fs); fs->s_db_per_group = db_count; V(s_db_per_group) fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *), M_EXT2MNT, M_WAITOK); /* adjust logic_sb_block */ if(fs->s_blocksize > SBSIZE) /* Godmar thinks: if the blocksize is greater than 1024, then the superblock is logically part of block zero. */ logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), fs->s_blocksize, NOCRED, &fs->s_group_desc[i]); if(error) { for (j = 0; j < i; j++) brelse(fs->s_group_desc[j]); bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: unable to read group descriptors (%d)\n", error); return EIO; } LCK_BUF(fs->s_group_desc[i]) } if(!ext2_check_descriptors(fs)) { for (j = 0; j < db_count; j++) ULCK_BUF(fs->s_group_desc[j]) bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: (ext2_check_descriptors failure) " "unable to read group descriptors\n"); return EIO; } for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) { fs->s_inode_bitmap_number[i] = 0; fs->s_inode_bitmap[i] = NULL; fs->s_block_bitmap_number[i] = 0; fs->s_block_bitmap[i] = NULL; } fs->s_loaded_inode_bitmaps = 0; fs->s_loaded_block_bitmaps = 0; if (es->s_rev_level == EXT2_GOOD_OLD_REV || (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_LARGE_FILE) == 0) fs->fs_maxfilesize = 0x7fffffff; else fs->fs_maxfilesize = 0x7fffffffffffffff; return 0; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ext2_reload(struct mount *mp, struct thread *td) { - struct vnode *vp, *nvp, *devvp; + struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2_super_block * es; struct ext2_sb_info *fs; int error; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return error; } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); loop: MNT_ILOCK(mp); - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. 
*/ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); + MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Common code for mount and mountroot */ static int ext2_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ext2mount *ump; struct buf *bp; struct ext2_sb_info *fs; struct ext2_super_block * es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; int error; int ronly; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((es->s_state & EXT2_VALID_FS) == 0 || (es->s_state & EXT2_ERROR_FS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = bsd_malloc(sizeof *ump, M_EXT2MNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); /* I don't know whether this is the right strategy. Note that we dynamically allocate both an ext2_sb_info and an ext2_super_block while Linux keeps the super block in a locked buffer */ ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), M_EXT2MNT, M_WAITOK); ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), M_EXT2MNT, M_WAITOK); bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block)); if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs))) goto out; /* * We don't free the group descriptors allocated by compute_sb_data() * until ext2_unmount(). This is OK since the mount will succeed. */ brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->s_rd_only = ronly; /* ronly is set according to mnt_flags */ /* if the fs is not mounted read-only, make sure the super block is always written back on a sync() */ fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 
1 : 0; if (ronly == 0) { fs->s_dirt = 1; /* mark it modified */ fs->s_es->s_state &= ~EXT2_VALID_FS; /* set fs invalid */ } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* setting those two parameters allowed us to use ufs_bmap w/o changse ! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->s_es->s_log_block_size + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { bsd_free(ump->um_e2fs->s_es, M_EXT2MNT); bsd_free(ump->um_e2fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * unmount system call */ static int ext2_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ext2mount *ump; struct ext2_sb_info *fs; int error, flags, ronly, i; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, td)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->s_rd_only; if (ronly == 0) { if (fs->s_wasvalid) fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } /* release buffers containing group descriptors */ for(i = 0; i < fs->s_db_per_group; i++) ULCK_BUF(fs->s_group_desc[i]) bsd_free(fs->s_group_desc, M_EXT2MNT); /* release cached inode/block bitmaps */ for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_inode_bitmap[i]) ULCK_BUF(fs->s_inode_bitmap[i]) for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_block_bitmap[i]) ULCK_BUF(fs->s_block_bitmap[i]) DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); bsd_free(fs->s_es, M_EXT2MNT); bsd_free(fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get file system statistics. 
 * taken from ext2/super.c ext2_statfs
 */
static int
ext2_statfs(mp, sbp, td)
	struct mount *mp;
	struct statfs *sbp;
	struct thread *td;
{
	unsigned long overhead;
	struct ext2mount *ump;
	struct ext2_sb_info *fs;
	struct ext2_super_block *es;
	int i, nsb;

	ump = VFSTOEXT2(mp);
	fs = ump->um_e2fs;
	es = fs->s_es;
	if (es->s_magic != EXT2_SUPER_MAGIC)
		panic("ext2_statfs - magic number spoiled");

	/*
	 * Compute the overhead (FS structures)
	 */
	if (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER) {
		nsb = 0;
		for (i = 0 ; i < fs->s_groups_count; i++)
			if (ext2_group_sparse(i))
				nsb++;
	} else
		nsb = fs->s_groups_count;
	overhead = es->s_first_data_block +
	    /* Superblocks and block group descriptors: */
	    nsb * (1 + fs->s_db_per_group) +
	    /* Inode bitmap, block bitmap, and inode table: */
	    fs->s_groups_count * (1 + 1 + fs->s_itb_per_group);
	sbp->f_bsize = EXT2_FRAG_SIZE(fs);
	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
	sbp->f_blocks = es->s_blocks_count - overhead;
	sbp->f_bfree = es->s_free_blocks_count;
	sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count;
	sbp->f_files = es->s_inodes_count;
	sbp->f_ffree = es->s_free_inodes_count;
	return (0);
}

/*
 * Go through the disk queues to initiate sandbagged IO;
 * go through the inodes to write those that have been modified;
 * initiate the writing of the super block if it has been modified.
 *
 * Note: we are always called with the filesystem marked `MPBUSY'.
 */
static int
ext2_sync(mp, waitfor, td)
	struct mount *mp;
	int waitfor;
	struct thread *td;
{
-	struct vnode *nvp, *vp;
+	struct vnode *mvp, *vp;
	struct inode *ip;
	struct ext2mount *ump = VFSTOEXT2(mp);
	struct ext2_sb_info *fs;
	int error, allerror = 0;

	fs = ump->um_e2fs;
	if (fs->s_dirt != 0 && fs->s_rd_only != 0) {		/* XXX */
		printf("fs = %s\n", fs->fs_fsmnt);
		panic("ext2_sync: rofs mod");
	}
	/*
	 * Write back each (modified) inode.
	 */
	MNT_ILOCK(mp);
loop:
-	MNT_VNODE_FOREACH(vp, mp, nvp) {
+	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) {
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
		    waitfor == MNT_LAZY)) {
			VI_UNLOCK(vp);
			MNT_ILOCK(mp);
			continue;
		}
		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
		if (error) {
			MNT_ILOCK(mp);
-			if (error == ENOENT)
+			if (error == ENOENT) {
+				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
				goto loop;
+			}
			continue;
		}
		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
			allerror = error;
		VOP_UNLOCK(vp, 0, td);
		vrele(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * Force stale file system control information to be flushed.
	 */
	if (waitfor != MNT_LAZY) {
		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
			allerror = error;
		VOP_UNLOCK(ump->um_devvp, 0, td);
	}
	/*
	 * Write back modified superblock.
	 */
	if (fs->s_dirt != 0) {
		fs->s_dirt = 0;
		fs->s_es->s_wtime = time_second;
		if ((error = ext2_sbupdate(ump, waitfor)) != 0)
			allerror = error;
	}
	return (allerror);
}

/*
 * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
 * in from disk.  If it is in core, wait for the lock bit to clear, then
 * return the inode locked.  Detection and handling of mount points must be
 * done by the calling routine.
*/ static int ext2_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct ext2_sb_info *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int i, error; int used_blocks; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); dev = ump->um_dev; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ext2_sync() if a sync happens to fire right then, * which will cause a panic because ext2_sync() blindly * dereferences vp->v_data (as well it should). */ ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_number = ino; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ #if 0 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino))); #endif if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->s_blocksize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vput(vp); brelse(bp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ino)), ip); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; ip->i_prealloc_count = 0; ip->i_prealloc_block = 0; /* now we want to make sure that block pointers for unused blocks are zeroed out - ext2_balloc depends on this although for regular files and directories only */ if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) { used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize; for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } /* ext2_print_inode(ip); */ brelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. 
exflagsp and credanonp */ static int ext2_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct ext2_sb_info *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino > fs->s_groups_count * fs->s_es->s_inodes_per_group) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct inode *ip; struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(mp, waitfor) struct ext2mount *mp; int waitfor; { struct ext2_sb_info *fs = mp->um_e2fs; struct ext2_super_block *es = fs->s_es; struct buf *bp; int error = 0; /* printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no"); */ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here */ return (error); } /* * Return the root of a filesystem. */ static int ext2_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *nvp; int error; error = VFS_VGET(mp, (ino_t)ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: head/sys/isofs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/isofs/cd9660/cd9660_vnops.c (revision 154151) +++ head/sys/isofs/cd9660/cd9660_vnops.c (revision 154152) @@ -1,832 +1,833 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vop_setattr_t cd9660_setattr; static vop_open_t cd9660_open; static vop_access_t cd9660_access; static vop_getattr_t cd9660_getattr; static vop_ioctl_t cd9660_ioctl; static vop_pathconf_t cd9660_pathconf; static vop_read_t cd9660_read; struct isoreaddir; static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off); static int iso_shipdir(struct isoreaddir *idp); static vop_readdir_t cd9660_readdir; static vop_readlink_t cd9660_readlink; static vop_strategy_t cd9660_strategy; /* * Setattr call. Only allowed for block and character special devices. */ static int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: + case VMARKER: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); mode_t mode = ap->a_mode; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * filesystem. 
*/ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid, ip->inode.iso_gid, ap->a_mode, ap->a_cred, NULL)); } static int cd9660_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct iso_node *ip = VTOI(ap->a_vp); vnode_create_vobject(ap->a_vp, ip->i_size, ap->a_td); return 0; } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_mnt->im_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = ap->a_td; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for ioctl. */ static int cd9660_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); switch (ap->a_command) { case FIOGETLBA: *(int *)(ap->a_data) = ip->iso_start; return 0; default: return (ENOTTY); } } /* * Vnode op for reading. 
*/ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct iso_node *ip = VTOI(vp); struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; int seqcount; long size, n, on; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); seqcount = ap->a_ioflag >> IO_SEQSHIFT; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (seqcount > 1 && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; static int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } static int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode 
*vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. */ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level, imp->im_flags, imp->im_d2l); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. 
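/*
 * A minimal user-space sketch (not the kernel code itself) of the cookie
 * bookkeeping in cd9660_readdir around this point: the cookie count is
 * guessed as uio_resid / 16, one cookie is handed out per directory entry
 * shipped by iso_uiodir(), and the caller is finally told how many were
 * actually used (guessed minus remaining).  Buffer and entry sizes below
 * are made-up example values.
 */
#include <stdio.h>

int
main(void)
{
	int uio_resid = 512;		/* hypothetical space left in the caller's buffer */
	int guessed = uio_resid / 16;	/* initial cookie estimate, as above */
	int remaining = guessed;
	int entry_sizes[4] = { 24, 32, 28, 40 };	/* hypothetical d_reclen values */
	int i;

	for (i = 0; i < 4 && remaining > 0; i++) {
		if (uio_resid < entry_sizes[i])
			break;		/* iso_uiodir() stops when the entry no longer fits */
		remaining--;		/* one offset cookie stored per entry copied out */
		uio_resid -= entry_sizes[i];
	}
	/* equivalent of *ap->a_ncookies = ncookies - idp->ncookies */
	printf("cookies used: %d of %d guessed\n", guessed - remaining, guessed);
	return (0);
}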
*/ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... * 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = uma_zalloc(namei_zone, M_WAITOK); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) uma_zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); uma_zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; struct bufobj *bo; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (ip->i_mnt->im_bshift - DEV_BSHIFT); if ((long)bp->b_blkno == -1) /* XXX: cut&paste junk ? */ clrbuf(bp); } if ((long)bp->b_blkno == -1) { /* XXX: cut&paste junk ? */ bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_mnt->im_bo; BO_STRATEGY(bo, bp); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. 
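/*
 * A small user-space sketch of the logical-to-physical translation done in
 * cd9660_strategy above, assuming the usual 2048-byte ISO 9660 logical
 * block and 512-byte device blocks (DEV_BSHIFT == 9).  The extent start
 * and logical block number are made-up example values.
 */
#include <sys/types.h>
#include <stdio.h>

#define DEV_BSHIFT	9			/* 512-byte device blocks */
#define dbtob(db)	((off_t)(db) << DEV_BSHIFT)

int
main(void)
{
	int im_bshift = 11;	/* log2(2048): ISO 9660 logical block size */
	long iso_start = 100;	/* hypothetical extent start, in logical blocks */
	long lblkno = 3;	/* hypothetical logical block within the file */
	long blkno;

	/* b_blkno = (iso_start + b_lblkno) << (im_bshift - DEV_BSHIFT) */
	blkno = (iso_start + lblkno) << (im_bshift - DEV_BSHIFT);
	printf("device block %ld, byte offset %lld\n",
	    blkno, (long long)dbtob(blkno));
	return (0);
}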
*/ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures for cd9660 */ struct vop_vector cd9660_vnodeops = { .vop_default = &default_vnodeops, .vop_open = cd9660_open, .vop_access = cd9660_access, .vop_bmap = cd9660_bmap, .vop_cachedlookup = cd9660_lookup, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_ioctl = cd9660_ioctl, .vop_lookup = vfs_cache_lookup, .vop_pathconf = cd9660_pathconf, .vop_read = cd9660_read, .vop_readdir = cd9660_readdir, .vop_readlink = cd9660_readlink, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_strategy = cd9660_strategy, }; /* * Special device vnode ops */ struct vop_vector cd9660_fifoops = { .vop_default = &fifo_specops, .vop_access = cd9660_access, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, }; Index: head/sys/kern/vfs_default.c =================================================================== --- head/sys/kern/vfs_default.c (revision 154151) +++ head/sys/kern/vfs_default.c (revision 154152) @@ -1,658 +1,660 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_nostrategy(struct vop_strategy_args *); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * */ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_advlock = VOP_EINVAL, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_getpages = vop_stdgetpages, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_ioctl = VOP_ENOTTY, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lease = VOP_NULL, .vop_lock = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, }; /* * Series of placeholder functions for various error returns for * VOPs. */ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. */ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vprint("vnode", ap->a_vp); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. 
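/*
 * A condensed user-space model of the vop_vector fallback described above:
 * an operation missing from a filesystem's table is looked up in the vector
 * named by vop_default, and one missing there as well ends up returning
 * EOPNOTSUPP.  The two-level lookup and the operation names here are
 * simplified stand-ins, not the real VOP dispatch code.
 */
#include <errno.h>
#include <stdio.h>
#include <stddef.h>

struct my_vop_vector {
	const struct my_vop_vector *vop_default;
	int (*vop_read)(void);
	int (*vop_ioctl)(void);
};

static int std_ioctl(void) { return (ENOTTY); }
static int fs_read(void)   { return (0); }

static const struct my_vop_vector my_default_vnodeops = {
	.vop_default = NULL,
	.vop_ioctl = std_ioctl,	/* like .vop_ioctl = VOP_ENOTTY above */
};

static const struct my_vop_vector my_fs_vnodeops = {
	.vop_default = &my_default_vnodeops,
	.vop_read = fs_read,	/* the filesystem implements read only */
};

/* Walk the vop_default chain until some vector supplies the operation. */
static int
call_ioctl(const struct my_vop_vector *v)
{
	for (; v != NULL; v = v->vop_default)
		if (v->vop_ioctl != NULL)
			return (v->vop_ioctl());
	return (EOPNOTSUPP);	/* nothing anywhere: the bypass case */
}

int
main(void)
{
	printf("ioctl via fallback -> %d\n", call_ioctl(&my_fs_vnodeops));
	return (0);
}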
*/ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) struct vop_lock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp), ap->a_td)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock, ap->a_td)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. * Stay in sync with kern_conf.c::no_poll(). */ if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { *(ap->a_mpp) = ap->a_vp->v_mount; return (0); } /* XXX Needs good comment and VOP_BMAP(9) manpage */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct buf *bp; struct bufobj *bo; struct buf *nbp; int error = 0; int maxretry = 1000; /* large, arbitrarily chosen */ VI_LOCK(vp); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. 
*/ loop2: TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; VI_UNLOCK(vp); KASSERT(bp->b_bufobj == &vp->v_bufobj, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, &vp->v_bufobj)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } VI_LOCK(vp); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (ap->a_waitfor == MNT_WAIT) { bo = &vp->v_bufobj; bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) == 0) continue; if (error == 0 && --maxretry >= 0) goto loop1; error = EAGAIN; } } VI_UNLOCK(vp); if (error == EAGAIN) vprint("fsync: giving up on dirty", vp); return (error); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdvptofh (vp, fhp) struct vnode *vp; struct fid *fhp; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg, td) struct mount *mp; int cmds; uid_t uid; void *arg; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, lockreq, allerror = 0; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. 
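/*
 * A user-space sketch of the MARK/SCAN idea used by vop_stdfsync above,
 * with a plain array standing in for the dirty buffer list.  Each pass
 * clears a "scanned" mark, and the scan loop skips anything already
 * marked, so a buffer that keeps reappearing dirty cannot make a single
 * pass spin forever.
 */
#include <stdio.h>
#include <stdbool.h>

struct fakebuf {
	bool dirty;
	bool scanned;	/* stands in for BV_SCANNED */
};

int
main(void)
{
	struct fakebuf bufs[4] = {
		{ true, false }, { false, false }, { true, false }, { true, false }
	};
	int i, flushed = 0;

	/* loop1: MARK/SCAN initialization */
	for (i = 0; i < 4; i++)
		bufs[i].scanned = false;

	/* loop2: flush each dirty, not-yet-scanned buffer exactly once */
	for (i = 0; i < 4; i++) {
		if (bufs[i].scanned || !bufs[i].dirty)
			continue;
		bufs[i].scanned = true;
		bufs[i].dirty = false;	/* "write it out" */
		flushed++;
	}
	printf("flushed %d buffers this pass\n", flushed);
	return (0);
}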
*/ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); - if (error == ENOENT) + if (error == ENOENT) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; + } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } int vfs_stdnosync (mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { return (0); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; struct thread *td; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0, td); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } /* end of vfs default ops */ Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 154151) +++ head/sys/kern/vfs_mount.c (revision 154152) @@ -1,1904 +1,2002 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_rootdevname.h" #include "opt_ddb.h" #include "opt_mac.h" #ifdef DDB #include #endif #define ROOTNAME "root_device" #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, int fsflags, void *fsdata); static struct mount *vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp, const char *fspath, struct thread *td); static int vfs_mountroot_ask(void); static int vfs_mountroot_try(const char *mountfrom); static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); static void free_mntarg(struct mntarg *ma); static void vfs_mount_destroy(struct mount *, struct thread *); static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); +MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store */ /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "rdonly", "ro", "rw", "suid", "exec", NULL }; /* * The root specifiers we will try if RB_CDROM is specified. */ static char *cdrom_rootdevnames[] = { "cd9660:cd0", "cd9660:acd0", NULL }; /* legacy find-root code */ char *rootdevnames[2] = {NULL, NULL}; #ifndef ROOTDEVNAME # define ROOTDEVNAME NULL #endif static const char *ctrootdevname = ROOTDEVNAME; /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); #ifdef INVARIANTS else if (opt->len != 0) panic("%s: mount option with NULL value but length != 0", __func__); #endif free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ static void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. 
"noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ static int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused; unsigned int i, iovcnt; int error, namelen, optlen; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *new; TAILQ_FOREACH(opt, opts, link) { /* * Check that this option hasn't been redefined * nor cancelled with a "no" mount option. */ opt2 = TAILQ_FIRST(toopts); while (opt2 != NULL) { if (strcmp(opt2->name, opt->name) == 0) goto next; if (strncmp(opt2->name, "no", 2) == 0 && strcmp(opt2->name + 2, opt->name) == 0) { vfs_freeopt(toopts, opt2); goto next; } opt2 = TAILQ_NEXT(opt2, link); } /* We want this option, duplicate it. 
*/ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK); strcpy(new->name, opt->name); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else { new->value = NULL; } new->len = opt->len; TAILQ_INSERT_TAIL(toopts, new, link); next: continue; } } /* * --------------------------------------------------------------------- * Mount a filesystem */ int nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; struct iovec *iov; unsigned int i; int error; u_int iovcnt; /* Kick out MNT_ROOTFS early as it is legal internally */ if (uap->flags & MNT_ROOTFS) return (EINVAL); iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) return (EINVAL); error = copyinuio(uap->iovp, iovcnt, &auio); if (error) return (error); iov = auio->uio_iov; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > MMAXOPTIONLEN) { free(auio, M_IOV); return (EINVAL); } iov++; } error = vfs_donmount(td, uap->flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ /* * Allocate and initialize the mount point struct. */ static struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct thread *td) { struct mount *mp; mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(td->td_ucred); mp->mnt_stat.f_owner = td->td_ucred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_init_mount(mp); mac_create_mount(td->td_ucred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). 
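/*
 * A sketch of what the iovec list consumed by nmount() above looks like
 * from userland, assuming the FreeBSD nmount(2) interface: name/value
 * pairs, an even iovec count of at least four, with "fstype" and "fspath"
 * mandatory and NUL-terminated.  The mount point and device below are
 * made-up example values, and the actual nmount() call is left commented
 * out.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void
build_iov(struct iovec *iov, int i, const char *s)
{
	iov[i].iov_base = (void *)(uintptr_t)s;
	iov[i].iov_len = strlen(s) + 1;	/* include the terminating NUL */
}

int
main(void)
{
	struct iovec iov[6];

	build_iov(iov, 0, "fstype");
	build_iov(iov, 1, "cd9660");
	build_iov(iov, 2, "fspath");
	build_iov(iov, 3, "/mnt");
	build_iov(iov, 4, "from");
	build_iov(iov, 5, "/dev/cd0");

	/* would be: error = nmount(iov, 6, MNT_RDONLY); */
	printf("%d iovecs, flags %#x\n", 6, MNT_RDONLY);
	return (0);
}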
*/ static void vfs_mount_destroy(struct mount *mp, struct thread *td) { + MNT_ILOCK(mp); + if (mp->mnt_holdcnt != 0) { + printf("Waiting for mount point to be unheld\n"); + while (mp->mnt_holdcnt != 0) { + mp->mnt_holdcntwaiters++; + msleep(&mp->mnt_holdcnt, MNT_MTX(mp), + PZERO, "mntdestroy", 0); + mp->mnt_holdcntwaiters--; + } + printf("mount point unheld\n"); + } + if (mp->mnt_writeopcount > 0) { + printf("Waiting for mount point write ops\n"); + while (mp->mnt_writeopcount > 0) { + mp->mnt_kern_flag |= MNTK_SUSPEND; + msleep(&mp->mnt_writeopcount, + MNT_MTX(mp), + PZERO, "mntdestroy2", 0); + } + printf("mount point write ops completed\n"); + } + MNT_IUNLOCK(mp); mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); vfs_unbusy(mp,td); lockdestroy(&mp->mnt_lock); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); + if (mp->mnt_writeopcount != 0) + panic("vfs_mount_destroy: nonzero writeopcount"); + if (mp->mnt_nvnodelistsize != 0) + panic("vfs_mount_destroy: nonzero nvnodelistsize"); + mp->mnt_writeopcount = -1000; + mp->mnt_nvnodelistsize = -1000; MNT_IUNLOCK(mp); mtx_destroy(&mp->mnt_mtx); #ifdef MAC mac_destroy_mount(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); free(mp, M_MOUNT); } static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; errmsg_len = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); else errmsg_len = 0; /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. 
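/*
 * A user-space pthread sketch of the hold-count handshake added to
 * vfs_mount_destroy() above: the destroyer sleeps while the hold count is
 * nonzero, and whoever drops the count to zero wakes it up (the kernel
 * uses msleep()/wakeup() on &mp->mnt_holdcnt for this).  The single
 * "iterator" thread below is a made-up stand-in for a vnode-list walker
 * still holding the mount.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t holdcnt_cv = PTHREAD_COND_INITIALIZER;
static int holdcnt = 1;		/* one hypothetical iterator still active */
static int holdcntwaiters = 0;

static void *
release_hold(void *arg)
{
	pthread_mutex_lock(&lock);
	holdcnt--;			/* like the drop in __mnt_vnode_markerfree() */
	if (holdcnt == 0 && holdcntwaiters != 0)
		pthread_cond_broadcast(&holdcnt_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, release_hold, NULL);
	pthread_mutex_lock(&lock);
	while (holdcnt != 0) {		/* like the wait loop in vfs_mount_destroy() */
		holdcntwaiters++;
		pthread_cond_wait(&holdcnt_cv, &lock);
		holdcntwaiters--;
	}
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("mount point unheld\n");
	return (0);
}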
*/ if (vfs_getopt(optlist, "update", NULL, NULL) == 0) fsflags |= MNT_UPDATE; if (vfs_getopt(optlist, "async", NULL, NULL) == 0) fsflags |= MNT_ASYNC; if (vfs_getopt(optlist, "force", NULL, NULL) == 0) fsflags |= MNT_FORCE; if (vfs_getopt(optlist, "multilabel", NULL, NULL) == 0) fsflags |= MNT_MULTILABEL; if (vfs_getopt(optlist, "noasync", NULL, NULL) == 0) fsflags &= ~MNT_ASYNC; if (vfs_getopt(optlist, "noatime", NULL, NULL) == 0) fsflags |= MNT_NOATIME; if (vfs_getopt(optlist, "noclusterr", NULL, NULL) == 0) fsflags |= MNT_NOCLUSTERR; if (vfs_getopt(optlist, "noclusterw", NULL, NULL) == 0) fsflags |= MNT_NOCLUSTERW; if (vfs_getopt(optlist, "noexec", NULL, NULL) == 0) fsflags |= MNT_NOEXEC; if (vfs_getopt(optlist, "nosuid", NULL, NULL) == 0) fsflags |= MNT_NOSUID; if (vfs_getopt(optlist, "nosymfollow", NULL, NULL) == 0) fsflags |= MNT_NOSYMFOLLOW; if (vfs_getopt(optlist, "noro", NULL, NULL) == 0) fsflags &= ~MNT_RDONLY; if (vfs_getopt(optlist, "ro", NULL, NULL) == 0) fsflags |= MNT_RDONLY; if (vfs_getopt(optlist, "rdonly", NULL, NULL) == 0) fsflags |= MNT_RDONLY; if (vfs_getopt(optlist, "rw", NULL, NULL) == 0) fsflags &= ~MNT_RDONLY; if (vfs_getopt(optlist, "snapshot", NULL, NULL) == 0) fsflags |= MNT_SNAPSHOT; if (vfs_getopt(optlist, "suiddir", NULL, NULL) == 0) fsflags |= MNT_SUIDDIR; if (vfs_getopt(optlist, "sync", NULL, NULL) == 0) fsflags |= MNT_SYNCHRONOUS; if (vfs_getopt(optlist, "union", NULL, NULL) == 0) fsflags |= MNT_UNION; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { error = ENAMETOOLONG; goto bail; } mtx_lock(&Giant); error = vfs_domount(td, fstype, fspath, fsflags, optlist); mtx_unlock(&Giant); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { strncpy(fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copystr(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len, NULL); } } if (error != 0) vfs_freeopts(optlist); return (error); } /* * --------------------------------------------------------------------- * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; int error; /* Kick out MNT_ROOTFS early as it is legal internally */ uap->flags &= ~MNT_ROOTFS; if (uap->data == NULL) return (EINVAL); fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (!error) { mtx_lock(&Giant); /* XXX ? 
*/ vfsp = vfs_byname_kld(fstype, td, &error); mtx_unlock(&Giant); } free(fstype, M_TEMP); if (error) return (error); if (vfsp == NULL) return (ENOENT); if (vfsp->vfc_vfsops->vfs_cmount == NULL) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td); return (error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Flags common to all filesystems. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ int fsflags, /* Flags common to all filesystems. */ void *fsdata /* Options local to the filesystem. */ ) { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, kern_flag = 0; struct vattr va; struct nameidata nd; mtx_assert(&Giant, MA_OWNED); /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { if ((error = suser(td)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & (MNT_EXPORTED | MNT_SUIDDIR)) { if ((error = suser(td)) != 0) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for * unprivileged users. */ if (suser(td) != 0) fsflags |= MNT_NOSUID | MNT_USER; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_vflag & VV_ROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; kern_flag = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error) { vput(vp); return (error); } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = fsdata; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); } else { /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. 
*/ error = VOP_GETATTR(vp, &va, td->td_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_ucred->cr_uid) { if ((error = suser(td)) != 0) { vput(vp); return (error); } } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) { vput(vp); return (error); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = fsdata; } /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if (mp->mnt_flag & MNT_UPDATE) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = kern_flag; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. */ cache_purge(vp); if (!error) { struct vnode *newdp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td)) panic("mount: lost mount"); mountcheckdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vfs_mount_destroy(mp, td); vput(vp); } return (error); } /* * --------------------------------------------------------------------- * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { char *path; int flags; } */ *uap; { struct mount *mp; char *pathbuf; int error, id0, id1; if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { if ((error = suser(td)) != 0) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { /* Decode the filesystem ID. 
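/*
 * A tiny user-space sketch of the MNT_BYFSID path decoded just below: a
 * caller may pass "FSID:<val0>:<val1>" instead of a mount path, and the
 * two integers are recovered with sscanf.  The example string is made up.
 */
#include <stdio.h>

int
main(void)
{
	const char *pathbuf = "FSID:1118327203:95";	/* hypothetical fsid */
	int id0, id1;

	if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
		printf("not a filesystem ID\n");
		return (1);
	}
	printf("looking for fsid val[0]=%d val[1]=%d\n", id0, id1);
	return (0);
}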
*/ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) break; } mtx_unlock(&mountlist_mtx); } else { mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) break; } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error) return (error); /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); mtx_lock(&Giant); error = dounmount(mp, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * Do the actual filesystem unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_assert(&Giant, MA_OWNED); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) mp->mnt_kern_flag |= MNTK_UNMOUNTF; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td); if (error) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. 
*/ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_UNMOUNT, 0); vfs_mount_destroy(mp, td); if (coveredvp != NULL) vrele(coveredvp); return (0); } /* * --------------------------------------------------------------------- * Mounting of root filesystem * */ struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(&root_holds); struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&mountlist_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&mountlist_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { mtx_lock(&mountlist_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&mountlist_mtx); free(h, M_DEVBUF); } static void root_mount_wait(void) { struct root_hold_token *h; for (;;) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&mountlist_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&mountlist_mtx); break; } printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold", hz); } } static void set_rootvnode(struct thread *td) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td)) panic("Cannot find root vnode"); p = td->td_proc; FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_UNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); } /* * Mount /devfs as our root filesystem, but do not put it on the mountlist * yet. Create a /dev -> / symlink so that absolute pathnames will lookup. */ static void devfs_first(void) { struct thread *td = curthread; struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp = NULL; int error; vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return; mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td); error = VFS_MOUNT(mp, td); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); set_rootvnode(td); error = kern_symlink(td, "/", "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); } /* * Surgically move our devfs to be mounted on /dev. 
*/ static void devfs_fixup(struct thread *td) { struct nameidata nd; int error; struct vnode *vp, *dvp; struct mount *mp; /* Remove our devfs mount from the mountlist and purge the cache */ mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mp); VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td); VI_LOCK(dvp); dvp->v_iflag &= ~VI_MOUNT; dvp->v_mountedhere = NULL; VI_UNLOCK(dvp); /* Set up the real rootvnode, and purge the cache */ TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL; set_rootvnode(td); cache_purgevfs(rootvnode->v_mount); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (error) { printf("Lookup of /dev for devfs, error: %d\n", error); return; } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VDIR) { vput(vp); } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error) { vput(vp); } cache_purge(vp); mp->mnt_vnodecovered = vp; vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); VOP_UNLOCK(vp, 0, td); vfs_unbusy(mp, td); vput(dvp); /* Unlink the no longer needed /dev/dev -> / symlink */ kern_unlink(td, "/dev/dev", UIO_SYSSPACE); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * Find and mount the root filesystem */ void vfs_mountroot(void) { char *cp; int error, i, asked = 0; root_mount_wait(); devfs_first(); /* * We are booted with instructions to prompt for the root filesystem. */ if (boothowto & RB_ASKNAME) { if (!vfs_mountroot_ask()) return; asked = 1; } /* * The root filesystem information is compiled in, and we are * booted with instructions to use it. */ if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) { if (!vfs_mountroot_try(ctrootdevname)) return; ctrootdevname = NULL; } /* * We've been given the generic "use CDROM as root" flag. This is * necessary because one media may be used in many different * devices, so we need to search for them. */ if (boothowto & RB_CDROM) { for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { if (!vfs_mountroot_try(cdrom_rootdevnames[i])) return; } } /* * Try to use the value read by the loader from /etc/fstab, or * supplied via some other means. This is the preferred * mechanism. */ cp = getenv("vfs.root.mountfrom"); if (cp != NULL) { error = vfs_mountroot_try(cp); freeenv(cp); if (!error) return; } /* * Try values that may have been computed by code during boot */ if (!vfs_mountroot_try(rootdevnames[0])) return; if (!vfs_mountroot_try(rootdevnames[1])) return; /* * If we (still) have a compiled-in default, try it. */ if (ctrootdevname != NULL) if (!vfs_mountroot_try(ctrootdevname)) return; /* * Everything so far has failed, prompt on the console if we haven't * already tried that. */ if (!asked) if (!vfs_mountroot_ask()) return; panic("Root mount failed, startup aborted."); } /* * Mount (mountfrom) as the root filesystem. 
*/ static int vfs_mountroot_try(const char *mountfrom) { struct mount *mp; char *vfsname, *path; time_t timebase; int error; char patt[32]; vfsname = NULL; path = NULL; mp = NULL; error = EINVAL; if (mountfrom == NULL) return (error); /* don't complain */ printf("Trying to mount root from %s\n", mountfrom); /* parse vfs name and path */ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); vfsname[0] = path[0] = 0; sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); if (sscanf(mountfrom, patt, vfsname, path) < 1) goto out; if (path[0] == '\0') strcpy(path, ROOTNAME); error = kernel_vmount( MNT_RDONLY | MNT_ROOTFS, "fstype", vfsname, "fspath", "/", "from", path, NULL); if (error == 0) { /* * We mount devfs prior to mounting the / FS, so the first * entry will typically be devfs. */ mp = TAILQ_FIRST(&mountlist); KASSERT(mp != NULL, ("%s: mountlist is empty", __func__)); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Typically devfs has no time stamp and the only other FS * is the actual / FS. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; do { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } while (mp != NULL); inittodr(timebase); devfs_fixup(curthread); } out: free(path, M_MOUNT); free(vfsname, M_MOUNT); return (error); } /* * --------------------------------------------------------------------- * Interactive root filesystem selection code. */ static int vfs_mountroot_ask(void) { char name[128]; for(;;) { printf("\nManual root filesystem specification:\n"); printf(" : Mount using filesystem \n"); #if defined(__i386__) || defined(__ia64__) printf(" eg. ufs:da0s1a\n"); #else printf(" eg. ufs:/dev/da0a\n"); #endif printf(" ? List valid disk boot devices\n"); printf(" Abort manual input\n"); printf("\nmountroot> "); gets(name, sizeof(name), 1); if (name[0] == '\0') return (1); if (name[0] == '?') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (!vfs_mountroot_try(name)) return (0); } } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; const char **t, *p; TAILQ_FOREACH(opt, opts, link) { p = opt->name; if (p[0] == 'n' && p[1] == 'o') p += 2; for(t = global_opts; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; for(t = legal; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; printf("mount option <%s> is unknown\n", p); return (EINVAL); } return (0); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. 
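/*
 * A user-space sketch of the "vfsname:path" parsing in vfs_mountroot_try()
 * above: a scanf pattern is built so the filesystem name and the device
 * path each fit their buffers, and a missing path falls back to a default
 * name.  Buffer sizes and the example root specification are illustrative
 * only.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *mountfrom = "ufs:/dev/da0s1a";	/* example root spec */
	char vfsname[16], path[64], patt[32];

	vfsname[0] = path[0] = '\0';
	snprintf(patt, sizeof(patt), "%%%d[a-z0-9]:%%%ds",
	    (int)sizeof(vfsname) - 1, (int)sizeof(path) - 1);
	if (sscanf(mountfrom, patt, vfsname, path) < 1)
		return (1);
	if (path[0] == '\0')
		strcpy(path, "root_device");	/* the ROOTNAME fallback */
	printf("fstype=%s from=%s\n", vfsname, path);
	return (0);
}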
*/ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; int i; if (opts == NULL) return (-1); i = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) return (i); ++i; } return (-1); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } /* * This is a helper function for filesystems to traverse their * vnodes. 
See MNT_VNODE_FOREACH() in sys/mount.h */ struct vnode * -__mnt_vnode_next(struct vnode **nvp, struct mount *mp) +__mnt_vnode_next(struct vnode **mvp, struct mount *mp) { struct vnode *vp; - mtx_assert(&mp->mnt_mtx, MA_OWNED); + mtx_assert(MNT_MTX(mp), MA_OWNED); - vp = *nvp; + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + vp = TAILQ_NEXT(*mvp, v_nmntvnodes); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + /* Check if we are done */ - if (vp == NULL) + if (vp == NULL) { + __mnt_vnode_markerfree(mvp, mp); return (NULL); - /* If our next vnode is no longer ours, start over */ - if (vp->v_mount != mp) - vp = TAILQ_FIRST(&mp->mnt_nvnodelist); - /* Save pointer to next vnode in list */ - if (vp != NULL) - *nvp = TAILQ_NEXT(vp, v_nmntvnodes); - else - *nvp = NULL; + } + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } + +struct vnode * +__mnt_vnode_first(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + + /* Check if we are done */ + if (vp == NULL) { + *mvp = NULL; + return (NULL); + } + mp->mnt_holdcnt++; + MNT_IUNLOCK(mp); + *mvp = (struct vnode *) malloc(sizeof(struct vnode), + M_VNODE_MARKER, + M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + (*mvp)->v_type = VMARKER; + + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + + /* Check if we are done */ + if (vp == NULL) { + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + MNT_ILOCK(mp); + *mvp = NULL; + mp->mnt_holdcnt--; + if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) + wakeup(&mp->mnt_holdcnt); + return (NULL); + } + mp->mnt_markercnt++; + (*mvp)->v_mount = mp; + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + return (vp); +} + + +void +__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) + return; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + MNT_ILOCK(mp); + *mvp = NULL; + + mp->mnt_markercnt--; + mp->mnt_holdcnt--; + if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) + wakeup(&mp->mnt_holdcnt); +} + int __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. 
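 * (Illustrative: with a name such as "noatime", a nonzero flag strips the
 * leading "no" and records "atime", while a zero flag records "noatime".)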
* name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, use printf. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, int flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) 
{ struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, -1); } va_end(ap); error = kernel_mount(ma, flags); return (error); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 154151) +++ head/sys/kern/vfs_subr.c (revision 154152) @@ -1,3924 +1,3919 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static void insmntque(struct vnode *vp, struct mount *mp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vdropl(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vdestroy(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ #if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \ defined(__ia64__) || defined(__sparc64__) int mpsafe_vfs = 1; #else int mpsafe_vfs; #endif TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; -int vttoif_tab[9] = { +int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, - S_IFSOCK, S_IFIFO, S_IFMT, + S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 
*/ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. 
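 *
 * (Illustrative, hypothetical consumer of this knob: a filesystem sizing
 * its inode hash along the lines of
 * "ihashtbl = hashinit(desiredvnodes, M_TEMP, &ihash);".)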
*/ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Hook for calling soft updates. */ int (*softdep_process_worklist_hook)(struct mount *); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. */ desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, td) struct mount *mp; int flags; struct mtx *interlkp; struct thread *td; { int lkflags; MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) { MNT_IUNLOCK(mp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0); if (interlkp) mtx_lock(interlkp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); lkflags = LK_SHARED | LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, td) struct mount *mp; struct thread *td; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); } /* * Lookup a mount point by filesystem identifier. 
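 *
 * Minimal lookup sketch (hypothetical caller turning a file handle into
 * its mount point):
 *
 *	struct mount *mp;
 *
 *	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL)
 *		return (ESTALE);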
*/ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access priveledged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if ((mp->mnt_flag & MNT_USER) == 0 || mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = suser(td)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(tsp) struct timespec *tsp; { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. 
* the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct thread *td; struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; td = curthread; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; - while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { + while (count != 0) { + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + if (vp == NULL) + break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK, td); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0, td); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. 
*/ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; struct thread *td = FIRST_THREAD_IN_PROC(p); mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kthread_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) /* * Routines having to do with the management of the vnode table. 
*/ static void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_destroy_vnode(vp); #endif if (vp->v_pollinfo != NULL) { knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note); mtx_destroy(&vp->v_pollinfo->vpi_lock); uma_zfree(vnodepoll_zone, vp->v_pollinfo); } #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct thread *td = curthread; struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) const char *tag; struct mount *mp; struct vop_vector *vops; struct vnode **vpp; { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. 
*/ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; bo->bo_mtx = &vp->v_interlock; bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_init_vnode(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_associate_vnode_singlelabel(mp, vp); else if (mp == NULL) printf("NULL mp in getnewvnode()\n"); #endif delmntque(vp); if (mp != NULL) { insmntque(vp, mp); bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; if (vp->v_mount == NULL) return; mp = vp->v_mount; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_IUNLOCK(mp); } /* * Insert into list of vnodes for the new mount point, if available. */ static void insmntque(struct vnode *vp, struct mount *mp) { vp->v_mount = mp; VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); MNT_ILOCK(vp->v_mount); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, + ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(vp->v_mount); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. 
*/ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(bufv, flags, bo, slpflag, slptimeo) struct bufv *bufv; int flags; struct bufobj *bo; int slpflag, slptimeo; { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. 
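	 * E.g. (illustrative) with a 512 byte blksize: length 0 yields
	 * trunclbn 0, length 512 yields 1, and length 513 yields 2.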
*/ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: VI_LOCK(vp); bo = &vp->v_bufobj; anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } VI_LOCK(vp); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } VI_LOCK(vp); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); VI_LOCK(vp); goto restartsync; } } bufobj_wwait(bo, 0, 0); VI_UNLOCK(vp); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. 
*/ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. 
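 *
 * Illustrative call pattern (hypothetical caller); bgetvp() asserts that
 * the vnode interlock is held when it is entered:
 *
 *	VI_LOCK(vp);
 *	bgetvp(vp, bp);
 *	VI_UNLOCK(vp);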
*/ void bgetvp(struct vnode *vp, struct buf *bp) { VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); ASSERT_VI_LOCKED(vp, "bgetvp"); vholdl(vp); bp->b_vp = vp; bp->b_bufobj = &vp->v_bufobj; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; vdropl(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) static int sync_vnode(struct bufobj *bo, struct thread *td) { struct vnode *vp; struct mount *mp; vp = bo->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp, NULL) != 0) return (1); if (VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (1); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VI_LOCK(vp); if ((bo->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(bo, syncdelay); } vdropl(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. 
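 *
 * (Illustrative arithmetic, assuming the default SYNCER_MAXDELAY of 32:
 * the pending table is a 32-slot ring, so a request queued while
 * syncer_delayno is 30 with a delay of 15 lands in slot
 * (30 + 15) & 31 == 13.)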
*/ static void sched_sync(void) { struct synclist *next; struct synclist *slp; struct bufobj *bo; long starttime; struct thread *td = FIRST_THREAD_IN_PROC(updateproc); static int dummychan; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; mtx_lock(&Giant); last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); for (;;) { mtx_lock(&sync_mtx); if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kthread_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while ((bo = LIST_FIRST(slp)) != NULL) { error = sync_vnode(bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; mtx_unlock(&sync_mtx); /* * Do soft update processing. */ if (softdep_process_worklist_hook != NULL) (*softdep_process_worklist_hook)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ mtx_lock(&sync_mtx); if (rushjob > 0) { rushjob -= 1; mtx_unlock(&sync_mtx); continue; } mtx_unlock(&sync_mtx); /* * Just sleep for a short period if time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. 
*/ if (syncer_state != SYNCER_RUNNING) tsleep(&dummychan, PPAUSE, "syncfnl", hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { struct thread *td; int ret = 0; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { struct thread *td; if (howto & RB_NOSYNC) return; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ VI_LOCK(vp); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif VI_UNLOCK(vp); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. 
*/ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new filesystem type). */ int vget(vp, flags, td) struct vnode *vp; int flags; struct thread *td; { int oweinact; int oldflags; int usecount; int error; error = 0; oldflags = flags; oweinact = 0; if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* * If the inactive call was deferred because vput() was called * with a shared lock, we have to do it here before another thread * gets a reference to data that should be dead. */ if (vp->v_iflag & VI_OWEINACT) { if (flags & LK_NOWAIT) { VI_UNLOCK(vp); return (EBUSY); } flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; oweinact = 1; } usecount = vp->v_usecount; v_incr_usecount(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { VI_LOCK(vp); /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. */ v_decr_usecount(vp); /* * Print warning when race below occur: * * thread1 thread2 * ------- ------- * v_usecount=0 * vref(vp) v_usecount=1 * vget(vp) * v_incr_usecount(vp) v_usecount=2 * vn_lock(vp) * vrele(vp) v_usecount=1 * v_decr_usecount(vp) v_usecount=0 * * In such situation VOP_INACTIVE() will not be called for * the vnode vp. */ if (usecount > 0 && vp->v_usecount == 0) printf("vinactive() won't be called for vp=%p\n", vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); if (oweinact) { VI_LOCK(vp); if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VI_UNLOCK(vp); if ((oldflags & LK_TYPE_MASK) == 0) VOP_UNLOCK(vp, 0, td); } return (0); } /* * Increase the reference count of a vnode. 
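 *
 * Illustrative pairing (hypothetical caller): every vref() is balanced by
 * a later vrele() once the reference is no longer needed:
 *
 *	vref(vp);
 *	... use vp ...
 *	vrele(vp);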
*/ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { VI_LOCK(vp); vinactive(vp, td); VOP_UNLOCK(vp, 0, td); } else VI_LOCK(vp); vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0, td); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td); VI_LOCK(vp); if (error) goto done; } if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. 
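 *
 * Hold counts pair with vhold()/vholdl(); a typical pattern (illustrative
 * only, mirroring vflush() below) keeps the vnode from being recycled
 * while the caller sleeps waiting for its lock:
 *
 *	VI_LOCK(vp);
 *	vholdl(vp);
 *	error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 *	...
 *	vdrop(vp);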
*/ static void vdropl(struct vnode *vp) { if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_LOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, rootrefs, flags, td) struct mount *mp; int rootrefs; int flags; struct thread *td; { - struct vnode *vp, *nvp, *rootvp = NULL; + struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); if (error) { vdrop(vp); MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. 
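 *
 * The per-mount traversal in this function uses the marker-based
 * MNT_VNODE_FOREACH(); whenever the scan must be restarted, the marker
 * has to be taken off the vnode list first.  Condensed sketch of the
 * pattern (illustrative only, following the code above):
 *
 *	MNT_ILOCK(mp);
 * loop:
 *	MNT_VNODE_FOREACH(vp, mp, mvp) {
 *		VI_LOCK(vp);
 *		vholdl(vp);
 *		MNT_IUNLOCK(mp);
 *		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 *		if (error) {
 *			vdrop(vp);
 *			MNT_ILOCK(mp);
 *			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 *			goto loop;
 *		}
 *		... work on vp ...
 *		MNT_ILOCK(mp);
 *	}
 *	MNT_IUNLOCK(mp);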
*/ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); vgone(rootvp); VOP_UNLOCK(rootvp, 0, td); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_LOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_LOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); #if 0 /* XXX Need to fix ttyvp before I enable this. */ VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); #endif td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_write_suspend_wait(vp, NULL, V_WAIT); if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) vinvalbuf(vp, 0, td, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(dev) struct cdev *dev; { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. 
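 *
 * Illustrative call (not part of this change); the format string is
 * printed ahead of the vnode description:
 *
 *	vn_printf(vp, "vflush: busy vnode ");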
*/ static char *typename[] = -{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", + "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[96]; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strcat(buf, "|VV_ROOT"); if (vp->v_vflag & VV_TEXT) strcat(buf, "|VV_TEXT"); if (vp->v_vflag & VV_SYSTEM) strcat(buf, "|VV_SYSTEM"); if (vp->v_iflag & VI_DOOMED) strcat(buf, "|VI_DOOMED"); if (vp->v_iflag & VI_FREE) strcat(buf, "|VI_FREE"); printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (VOP_ISLOCKED(vp, NULL)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } #endif /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
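 *
 * The non-deprecated interface is the "vfs.conflist" node registered
 * above; a userland sketch (illustrative only) reads it with
 * sysctlbyname(3):
 *
 *	size_t len;
 *	struct xvfsconf *xvfsp;
 *
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	xvfsp = malloc(len);
 *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);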
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct thread *td = req->td; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp, td); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; struct vm_object *obj; - int tries; - tries = 5; MNT_ILOCK(mp); -loop: - TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) { - if (vp->v_mount != mp) { - if (--tries > 0) - goto loop; - break; - } - + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); - if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { - if (--tries > 0) - goto loop; - break; - } } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); if (vp->v_pollinfo != NULL) { uma_zfree(vnodepoll_zone, vi); return; } vp->v_pollinfo = vi; mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); } /* * Record a process's interest in events which might happen to * a vnode. 
Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, td, events) struct vnode *vp; struct thread *td; short events; { if (vp->v_pollinfo == NULL) v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return events; } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return 0; } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } VI_LOCK(vp); vn_syncer_add_to_worklist(&vp->v_bufobj, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); VI_UNLOCK(vp); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct thread *td = ap->a_td; int error, asyncflag; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. 
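 *
 * Condensed flow of the lazy pass below (illustrative only):
 *
 *	vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td);
 *	vn_start_write(NULL, &mp, V_NOWAIT);
 *	vfs_msync(mp, MNT_NOWAIT);
 *	error = VFS_SYNC(mp, MNT_LAZY, td);
 *	vn_finished_write(mp);
 *	vfs_unbusy(mp, td);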
*/ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, td); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, td); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, td); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct bufobj *bo; VI_LOCK(vp); bo = &vp->v_bufobj; vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } VI_UNLOCK(vp); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) enum vtype type; mode_t file_mode; uid_t file_uid; gid_t file_gid; mode_t acc_mode; struct ucred *cred; int *privused; { mode_t dac_granted; #ifdef CAPABILITIES mode_t cap_granted; #endif /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. 
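 *
 * Worked example (illustrative only, not part of this change):
 *
 *	file_mode = 0644, file_uid = 1001, file_gid = 20
 *	cred: uid 2002, member of group 20
 *	acc_mode = VREAD | VWRITE
 *
 *	owner check:  skipped (uid mismatch)
 *	group check:  S_IRGRP grants VREAD only; VWRITE still missing,
 *	              so control transfers to privcheck
 *	privcheck:    succeeds only if suser_cred(cred, SUSER_ALLOWJAIL)
 *	              returns 0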
*/ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: if (!suser_cred(cred, SUSER_ALLOWJAIL)) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #ifdef CAPABILITIES /* * Build a capability mask to determine if the set of capabilities * satisfies the requirements when combined with the granted mask * from above. * For each capability, if the capability is required, bitwise * or the request type onto the cap_granted mask. */ cap_granted = 0; if (type == VDIR) { /* * For directories, use CAP_DAC_READ_SEARCH to satisfy * VEXEC requests, instead of CAP_DAC_EXECUTE. */ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL)) cap_granted |= (VWRITE | VAPPEND); if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL)) cap_granted |= VADMIN; if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #endif return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, int access) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly * manipulate system attributes. * * XXX What capability should apply here? * Probably CAP_SYS_SETFFLAG. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (suser_cred(cred, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, access, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
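 *
 * These knobs are runtime sysctls under the debug tree, e.g.
 * (illustrative only):
 *
 *	sysctl debug.vfs_badlock_ddb=0
 *	sysctl debug.vfs_badlock_backtrace=1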
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter("lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (BUF_REFCNT(bp) < 1) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter("lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? 
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { return (EINVAL); } VCTLTOREQ(&vc, req); return (VFS_SYSCTL(mp, vc.vc_op, req)); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0, curthread); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
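 *
 * These filters back the EVFILT_READ / EVFILT_WRITE / EVFILT_VNODE
 * registrations made through vfs_kqfilter() above.  Minimal userland
 * sketch (illustrative only):
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	    NOTE_WRITE | NOTE_DELETE | NOTE_ATTRIB, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	register the knote
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	wait for an event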
*/ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct thread *td) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td); } } Index: head/sys/nfs4client/nfs4_vfsops.c =================================================================== --- head/sys/nfs4client/nfs4_vfsops.c (revision 154151) +++ head/sys/nfs4client/nfs4_vfsops.c (revision 154152) @@ -1,872 +1,873 @@ /* $FreeBSD$ */ /* $Id: nfs_vfsops.c,v 1.38 2003/11/05 14:59:01 rees Exp $ */ /*- * copyright (c) 2003 * the regents of the university of michigan * all rights reserved * * permission is granted to use, copy, create derivative works and redistribute * this software and such derivative works for any purpose, so long as the name * of the university of michigan is not used in any advertising or publicity * pertaining to the use or distribution of this software without specific, * written prior authorization. if the above copyright notice or any other * identification of the university of michigan is included in any copy of any * portion of this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the university * of michigan as to its fitness for any purpose, and without warranty by the * university of michigan of any kind, either express or implied, including * without limitation the implied warranties of merchantability and fitness for * a particular purpose. the regents of the university of michigan shall not be * liable for any damages, including special, indirect, incidental, or * consequential damages, with respect to any claim arising out of or in * connection with the use of the software, even if it has been or is hereafter * advised of the possibility of such damages. */ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. 
All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, nfs4, CTLFLAG_RW, 0, "NFS4 filesystem"); SYSCTL_STRUCT(_vfs_nfs4, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, "S,nfsstats"); static void nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp); static void nfs4_daemon(void *arg); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, struct vnode **, struct ucred *cred); static int nfs4_do_setclientid(struct nfsmount *nmp, struct ucred *cred); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = nfs4_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = nfs4_uninit, .vfs_unmount = nfs_unmount, }; VFS_SET(nfs_vfsops, nfs4, VFCF_NETWORK); static struct nfs_rpcops nfs4_rpcops = { nfs4_readrpc, nfs4_writerpc, nfs4_writebp, nfs4_readlinkrpc, nfs4_invaldir, nfs4_commit, }; /* So that loader and kldload(2) can find us, wherever we are.. 
*/ MODULE_VERSION(nfs4, 1); void nfsargs_ntoh(struct nfs_args *); int nfs4_init(struct vfsconf *vfsp) { rpcclnt_init(); nfs4dev_init(); idmap_init(); nfsm_v4init(); return (0); } int nfs4_uninit(struct vfsconf *vfsp) { rpcclnt_uninit(); nfs4dev_uninit(); idmap_uninit(); return (0); } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct vnode *vp; struct nfs_statfs *sfp; caddr_t bpos, dpos; struct nfsmount *nmp = VFSTONFS(mp); int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfsnode *np; struct nfs4_compound cp; struct nfs4_oparg_getattr ga; struct nfsv4_fattr *fap = &ga.fa; #ifndef nolint sfp = NULL; #endif error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); nfsstats.rpccnt[NFSPROC_FSSTAT]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_fsattrbm; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "statfs()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, td->td_ucred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vfsop_statfs(fap, sbp, mp); nfsmout: error = nfs_v4postop(&cp, error); vput(vp); if (mrep != NULL) m_freem(mrep); return (error); } static void nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp) { int s; int adjsock; int maxio; s = splnet(); /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ if (argp->sotype == SOCK_STREAM) nmp->nm_flag &= ~NFSMNT_NOCONN; nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. 
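 *
 * Further down, user-supplied transfer sizes are sanity checked; a wsize
 * request is rounded down to a multiple of the NFS block size and then
 * clamped to maxio and MAXBSIZE.  For example (assuming NFS_FABLKSIZE is
 * 512, an assumption made here only for illustration):
 *
 *	wsize = 8500  ->  8500 & ~(NFS_FABLKSIZE - 1) = 8192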
*/ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if (argp->flags & NFSMNT_NFSV3) { if (argp->sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXDATA; } else maxio = NFS_V2MAXDATA; if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* Round down to multiple of blocksize */ nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize <= 0) nmp->nm_wsize = NFS_FABLKSIZE; } if (nmp->nm_wsize > maxio) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* Round down to multiple of blocksize */ nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize <= 0) nmp->nm_rsize = NFS_FABLKSIZE; } if (nmp->nm_rsize > maxio) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if (nmp->nm_readdirsize > maxio) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0) { if (argp->maxgrouplist <= NFS_MAXGRPS) nmp->nm_numgrps = argp->maxgrouplist; else nmp->nm_numgrps = NFS_MAXGRPS; } if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) { if (argp->deadthresh <= NFS_MAXDEADTHRESH) nmp->nm_deadthresh = argp->deadthresh; else nmp->nm_deadthresh = NFS_MAXDEADTHRESH; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_rpcclnt.rc_so && adjsock) { nfs_safedisconnect(nmp); if (nmp->nm_sotype == SOCK_DGRAM) { while (nfs4_connect(nmp)) { printf("nfs_args: retrying connect\n"); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } } } } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. 
*/ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct nfs_args args; int error; error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args)); if (error) return (error); ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } static int nfs_mount(struct mount *mp, struct thread *td) { int error; struct nfs_args args; struct sockaddr *nam; struct vnode *vp; char hst[MNAMELEN]; size_t len; if (mp->mnt_flag & MNT_ROOTFS) { printf("NFSv4: nfs_mountroot not supported\n"); return EINVAL; } error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args); if (error) return (error); if (args.version != NFS_ARGSVERSION) return (EPROGMISMATCH); if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) return (EIO); /* * When doing an update, we can't change from or to * v3, switch lockd strategies or change cookie translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_NOLOCKD)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_NOLOCKD)); nfs_decode_args(nmp, &args); return (0); } error = copyinstr(args.hostname, hst, MNAMELEN-1, &len); if (error) return (error); bzero(&hst[len], MNAMELEN - len); /* sockargs() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error) return (error); error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred); return (error); } /* * renew should be done async * should re-scan mount queue each time */ struct proc *nfs4_daemonproc; static int nfs4_do_renew(struct nfsmount *nmp, struct ucred *cred) { struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; int error; mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, sizeof(uint64_t)); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_do_renew()"); nfsm_v4build_renew(&cp, nmp->nm_clientid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_renew(&cp); nmp->nm_last_renewal = time_second; return (0); nfsmout: error = nfs_v4postop(&cp, error); /* XXX */ if (mrep != NULL) m_freem(mrep); return (error); } static void nfs4_daemon(void *arg) { struct mount *mp; struct nfsmount *nmp; int nmounts; while (1) { nmounts = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (strcmp(mp->mnt_vfc->vfc_name, "nfs4") != 0) continue; nmounts++; nmp = VFSTONFS(mp); if (time_second < nmp->nm_last_renewal + nmp->nm_lease_time - 4) continue; mtx_unlock(&mountlist_mtx); mtx_lock(&Giant); nfs4_do_renew(nmp, (struct ucred *) arg); mtx_unlock(&Giant); mtx_lock(&mountlist_mtx); } mtx_unlock(&mountlist_mtx); /* Must kill the daemon here, or module unload will cause a panic */ if (nmounts == 0) { mtx_lock(&Giant); nfs4_daemonproc = NULL; mtx_unlock(&Giant); /*printf("nfsv4 renewd exiting\n");*/ kthread_exit(0); } tsleep(&nfs4_daemonproc, PVFS, "nfs4", 2 * hz); } } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, struct vnode **vpp, struct ucred *cred) { struct nfsmount *nmp; char *rpth, *cp1, *cp2; int nlkup = 0, error; struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; struct nfs4_oparg_lookup lkup; struct nfs4_oparg_getfh gfh; struct nfs4_oparg_getattr ga; struct thread *td = curthread; /* XXX */ if 
(mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ FREE(nam, M_SONAME); return (0); } else { nmp = uma_zalloc(nfsmount_zone, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_bufq); mp->mnt_data = (qaddr_t)nmp; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; nmp->nm_maxfilesize = 0xffffffffLL; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_deadthresh = NFS_MAXDEADTHRESH; vfs_mountedfrom(mp, hst); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_rpcops = &nfs4_rpcops; /* XXX */ mp->mnt_stat.f_iosize = PAGE_SIZE; argp->flags |= (NFSMNT_NFSV3 | NFSMNT_NFSV4); nfs_decode_args(nmp, argp); if ((error = nfs4_connect(nmp))) goto bad; mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_fsinfobm; nfs_v4initcompound(&cp); /* Get remote path */ rpth = hst; strsep(&rpth, ":"); nfsm_v4build_compound(&cp, "mountnfs()"); nfsm_v4build_putrootfh(&cp); for (cp1 = rpth; cp1 && *cp1; cp1 = cp2) { while (*cp1 == '/') cp1++; if (!*cp1) break; for (cp2 = cp1; *cp2 && *cp2 != '/'; cp2++) ; lkup.name = cp1; lkup.namelen = cp2 - cp1; nfsm_v4build_lookup(&cp, &lkup); nlkup++; } nfsm_v4build_getfh(&cp, &gfh); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putrootfh(&cp); while (nlkup--) nfsm_v4dissect_lookup(&cp); nfsm_v4dissect_getfh(&cp, &gfh); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vfsop_fsinfo(&ga.fa, nmp); nmp->nm_state |= NFSSTA_GOTFSINFO; /* Copy root fh into nfsmount. */ nmp->nm_fhsize = gfh.fh_len; bcopy(&gfh.fh_val, nmp->nm_fh, nmp->nm_fhsize); nmp->nm_last_renewal = time_second; if ((error = nfs4_do_setclientid(nmp, cred)) != 0) goto nfsmout; /* Start renewd if it isn't already running */ if (nfs4_daemonproc == NULL) kthread_create(nfs4_daemon, crdup(cred), &nfs4_daemonproc, (RFPROC|RFMEM), 0, "nfs4rd"); return (0); nfsmout: error = nfs_v4postop(&cp, error); /* XXX */ if (mrep != NULL) m_freem(mrep); bad: nfs_disconnect(nmp); uma_zfree(nfsmount_zone, nmp); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct nfsmount *nmp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush(, td) to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (flags & FORCECLOSE) { error = nfs_nmcancelreqs(nmp); if (error) return (error); nfs4dev_purge(); } error = vflush(mp, 0, flags, td); if (error) return (error); /* * We are now committed to the unmount. 
*/ nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); /* XXX there's a race condition here for SMP */ wakeup(&nfs4_daemonproc); uma_zfree(nfsmount_zone, nmp); return (0); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor, struct thread *td) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, allerror = 0; /* * Force stale buffer cache information to be flushed. */ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (VOP_ISLOCKED(vp, NULL) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } static int nfs4_do_setclientid(struct nfsmount *nmp, struct ucred *cred) { struct nfs4_oparg_setclientid scid; struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; struct route ro; char *ipsrc = NULL, uaddr[24], name[24]; int try = 0; static unsigned long seq; int error; #ifndef NFS4_USE_RPCCLNT return (0); #endif if (nmp->nm_clientid) { printf("nfs4_do_setclientid: already have clientid!\n"); error = 0; goto nfsmout; } /* Try not to re-use clientids */ if (seq == 0) seq = time_second; #ifdef NFS4_USE_RPCCLNT scid.cb_netid = (nmp->nm_rpcclnt.rc_sotype == SOCK_STREAM) ? 
"tcp" : "udp"; #endif scid.cb_netid = "tcp"; scid.cb_netidlen = 3; scid.cb_prog = 0x1234; /* XXX */ /* Do a route lookup to find our source address for talking to this server */ bzero(&ro, sizeof ro); #ifdef NFS4_USE_RPCCLNT ro.ro_dst = *nmp->nm_rpcclnt.rc_name; #endif rtalloc(&ro); if (ro.ro_rt == NULL) { error = EHOSTUNREACH; goto nfsmout; } ipsrc = inet_ntoa(IA_SIN(ifatoia(ro.ro_rt->rt_ifa))->sin_addr); sprintf(uaddr, "%s.12.48", ipsrc); scid.cb_univaddr = uaddr; scid.cb_univaddrlen = strlen(uaddr); RTFREE(ro.ro_rt); try_again: sprintf(name, "%s-%d", ipsrc, (int) ((seq + try) % 1000000L)); scid.namelen = strlen(name); scid.name = name; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_do_setclientid()"); nfsm_v4build_setclientid(&cp, &scid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_setclientid(&cp, &scid); nmp->nm_clientid = scid.clientid; error = nfs_v4postop(&cp, error); /* Confirm */ m_freem(mrep); mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_do_setclientid() (confirm)"); nfsm_v4build_setclientid_confirm(&cp, &scid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_setclientid_confirm(&cp); nfsmout: error = nfs_v4postop(&cp, error); if (mrep) m_freem(mrep); if (error == NFSERR_CLID_INUSE && (++try < NFS4_SETCLIENTID_MAXTRIES)) goto try_again; return (error); } Index: head/sys/nfsclient/nfs_vfsops.c =================================================================== --- head/sys/nfsclient/nfs_vfsops.c (revision 154151) +++ head/sys/nfsclient/nfs_vfsops.c (revision 154152) @@ -1,1079 +1,1080 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_NFSREQ, "nfsclient_req", "NFS request header"); MALLOC_DEFINE(M_NFSBIGFH, "nfsclient_bigfh", "NFS version 3 file handle"); MALLOC_DEFINE(M_NFSDIROFF, "nfsclient_diroff", "NFS directory offset data"); MALLOC_DEFINE(M_NFSHASH, "nfsclient_hash", "NFS hash tables"); MALLOC_DEFINE(M_NFSDIRECTIO, "nfsclient_directio", "NFS Direct IO async write state"); uma_zone_t nfsmount_zone; struct nfsstats nfsstats; SYSCTL_NODE(_vfs, OID_AUTO, nfs, CTLFLAG_RW, 0, "NFS filesystem"); SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, "S,nfsstats"); static int nfs_ip_paranoia = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW, &nfs_ip_paranoia, 0, ""); #ifdef NFS_DEBUG int nfs_debug; SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, ""); #endif static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY, downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); /* how long between console messages "nfs server foo not responding" */ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY, downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); static int nfs_iosize(struct nfsmount *nmp); static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, struct vnode **, struct ucred *cred); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; static vfs_sysctl_t nfs_sysctl; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = nfs_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = nfs_uninit, .vfs_unmount = nfs_unmount, .vfs_sysctl = nfs_sysctl, }; VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfs, 1); static struct nfs_rpcops nfs_rpcops = { nfs_readrpc, nfs_writerpc, nfs_writebp, nfs_readlinkrpc, nfs_invaldir, nfs_commit, }; /* * This structure must be filled in by a primary bootstrap or bootstrap * server for a diskless/dataless machine. It is initialized below just * to ensure that it is allocated to initialized data (.data not .bss). 
*/ struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD, &nfs_diskless_valid, 0, ""); SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD, nfsv3_diskless.root_hostnam, 0, ""); SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD, &nfsv3_diskless.root_saddr, sizeof nfsv3_diskless.root_saddr, "%Ssockaddr_in", ""); void nfsargs_ntoh(struct nfs_args *); static int nfs_mountdiskless(char *, int, struct sockaddr_in *, struct nfs_args *, struct thread *, struct vnode **, struct mount *); static void nfs_convert_diskless(void); static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs); static int nfs_iosize(struct nfsmount *nmp) { int iosize; /* * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer * space. */ iosize = max(nmp->nm_rsize, nmp->nm_wsize); if (iosize < PAGE_SIZE) iosize = PAGE_SIZE; return iosize; } static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs) { args->version = NFS_ARGSVERSION; args->addr = oargs->addr; args->addrlen = oargs->addrlen; args->sotype = oargs->sotype; args->proto = oargs->proto; args->fh = oargs->fh; args->fhsize = oargs->fhsize; args->flags = oargs->flags; args->wsize = oargs->wsize; args->rsize = oargs->rsize; args->readdirsize = oargs->readdirsize; args->timeo = oargs->timeo; args->retrans = oargs->retrans; args->maxgrouplist = oargs->maxgrouplist; args->readahead = oargs->readahead; args->deadthresh = oargs->deadthresh; args->hostname = oargs->hostname; } static void nfs_convert_diskless(void) { bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif, sizeof(struct ifaliasreq)); bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway, sizeof(struct sockaddr_in)); nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args); nfsv3_diskless.root_fhsize = NFSX_V2FH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH); bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr, sizeof(struct sockaddr_in)); bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN); nfsv3_diskless.root_time = nfs_diskless.root_time; bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam, MAXHOSTNAMELEN); nfs_diskless_valid = 3; } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct vnode *vp; struct nfs_statfs *sfp; caddr_t bpos, dpos; struct nfsmount *nmp = VFSTONFS(mp); int error = 0, v3 = (nmp->nm_flag & NFSMNT_NFSV3), retattr; struct mbuf *mreq, *mrep, *md, *mb; struct nfsnode *np; u_quad_t tquad; #ifndef nolint sfp = NULL; #endif error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, td->td_ucred, td); nfsstats.rpccnt[NFSPROC_FSSTAT]++; mreq = nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_FSSTAT, td, td->td_ucred); if (v3) nfsm_postop_attr(vp, retattr); if (error) { if (mrep != NULL) m_freem(mrep); goto nfsmout; } sfp = nfsm_dissect(struct nfs_statfs *, NFSX_STATFS(v3)); sbp->f_iosize = nfs_iosize(nmp); if (v3) { sbp->f_bsize = NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_tbytes); sbp->f_blocks = tquad / NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_fbytes); 
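/*
 * sf_fbytes and sf_abytes, like sf_tbytes above, are byte counts in the
 * v3 FSSTAT reply; they are scaled down to NFS_FABLKSIZE units here so
 * they fit the block-sized fields of struct statfs.
 */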
sbp->f_bfree = tquad / NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_abytes); sbp->f_bavail = tquad / NFS_FABLKSIZE; sbp->f_files = (fxdr_unsigned(int32_t, sfp->sf_tfiles.nfsuquad[1]) & 0x7fffffff); sbp->f_ffree = (fxdr_unsigned(int32_t, sfp->sf_ffiles.nfsuquad[1]) & 0x7fffffff); } else { sbp->f_bsize = fxdr_unsigned(int32_t, sfp->sf_bsize); sbp->f_blocks = fxdr_unsigned(int32_t, sfp->sf_blocks); sbp->f_bfree = fxdr_unsigned(int32_t, sfp->sf_bfree); sbp->f_bavail = fxdr_unsigned(int32_t, sfp->sf_bavail); sbp->f_files = 0; sbp->f_ffree = 0; } m_freem(mrep); nfsmout: vput(vp); return (error); } /* * nfs version 3 fsinfo rpc call */ int nfs_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, struct thread *td) { struct nfsv3_fsinfo *fsp; u_int32_t pref, max; caddr_t bpos, dpos; int error = 0, retattr; struct mbuf *mreq, *mrep, *md, *mb; u_int64_t maxfsize; nfsstats.rpccnt[NFSPROC_FSINFO]++; mreq = nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, 1); nfsm_request(vp, NFSPROC_FSINFO, td, cred); nfsm_postop_attr(vp, retattr); if (!error) { fsp = nfsm_dissect(struct nfsv3_fsinfo *, NFSX_V3FSINFO); pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref); if (pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE) nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); max = fxdr_unsigned(u_int32_t, fsp->fs_wtmax); if (max < nmp->nm_wsize && max > 0) { nmp->nm_wsize = max & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = max; } pref = fxdr_unsigned(u_int32_t, fsp->fs_rtpref); if (pref < nmp->nm_rsize && pref >= NFS_FABLKSIZE) nmp->nm_rsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); max = fxdr_unsigned(u_int32_t, fsp->fs_rtmax); if (max < nmp->nm_rsize && max > 0) { nmp->nm_rsize = max & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = max; } pref = fxdr_unsigned(u_int32_t, fsp->fs_dtpref); if (pref < nmp->nm_readdirsize && pref >= NFS_DIRBLKSIZ) nmp->nm_readdirsize = (pref + NFS_DIRBLKSIZ - 1) & ~(NFS_DIRBLKSIZ - 1); if (max < nmp->nm_readdirsize && max > 0) { nmp->nm_readdirsize = max & ~(NFS_DIRBLKSIZ - 1); if (nmp->nm_readdirsize == 0) nmp->nm_readdirsize = max; } maxfsize = fxdr_hyper(&fsp->fs_maxfilesize); if (maxfsize > 0 && maxfsize < nmp->nm_maxfilesize) nmp->nm_maxfilesize = maxfsize; nmp->nm_mountp->mnt_stat.f_iosize = nfs_iosize(nmp); nmp->nm_state |= NFSSTA_GOTFSINFO; } m_freem(mrep); nfsmout: return (error); } /* * Mount a remote root fs via. nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: * - do enough of "ifconfig" by calling ifioctl() so that the system * can talk to the server * - If nfs_diskless.mygateway is filled in, use that address as * a default gateway. * - build the rootfs mount point and call mountnfs() to do the rest. * * It is assumed to be safe to read, modify, and write the nfsv3_diskless * structure, as well as other global NFS client variables here, as * nfs_mountroot() will be called once in the boot before any other NFS * client activity occurs. 
*/ int nfs_mountroot(struct mount *mp, struct thread *td) { struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; int error, i; u_long l; char buf[128]; NET_ASSERT_GIANT(); #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif if (nfs_diskless_valid == 0) return (-1); if (nfs_diskless_valid == 1) nfs_convert_diskless(); /* * XXX splnet, so networks will receive... */ splnet(); /* * Do enough of ifconfig(8) so that the critical net interface can * talk to the server. */ error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0, td->td_ucred, td); if (error) panic("nfs_mountroot: socreate(%04x): %d", nd->myif.ifra_addr.sa_family, error); #if 0 /* XXX Bad idea */ /* * We might not have been told the right interface, so we pass * over the first ten interfaces of the same kind, until we get * one of them configured. */ for (i = strlen(nd->myif.ifra_name) - 1; nd->myif.ifra_name[i] >= '0' && nd->myif.ifra_name[i] <= '9'; nd->myif.ifra_name[i] ++) { error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if(!error) break; } #endif error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if (error) panic("nfs_mountroot: SIOCAIFADDR: %d", error); soclose(so); /* * If the gateway field is filled in, set it as the default route. * Note that pxeboot will set a default route of 0 if the route * is not set by the DHCP server. Check also for a value of 0 * to avoid panicking inappropriately in that situation. */ if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); error = rtrequest(RTM_ADD, (struct sockaddr *)&sin, (struct sockaddr *)&nd->mygateway, (struct sockaddr *)&mask, RTF_UP | RTF_GATEWAY, NULL); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } /* * Create the rootfs mount point. */ nd->root_args.fh = nd->root_fh; nd->root_args.fhsize = nd->root_fhsize; l = ntohl(nd->root_saddr.sin_addr.s_addr); snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s", (l >> 24) & 0xff, (l >> 16) & 0xff, (l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam); printf("NFS ROOT: %s\n", buf); if ((error = nfs_mountdiskless(buf, MNT_RDONLY, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { return (error); } /* * This is not really an nfs issue, but it is much easier to * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. */ bcopy(nd->my_hostnam, hostname, MAXHOSTNAMELEN); hostname[MAXHOSTNAMELEN - 1] = '\0'; for (i = 0; i < MAXHOSTNAMELEN; i++) if (hostname[i] == '\0') break; inittodr(ntohl(nd->root_time)); return (0); } /* * Internal version of mount system call for diskless setup. */ static int nfs_mountdiskless(char *path, int mountflag, struct sockaddr_in *sin, struct nfs_args *args, struct thread *td, struct vnode **vpp, struct mount *mp) { struct sockaddr *nam; int error; mp->mnt_kern_flag = 0; mp->mnt_flag = mountflag; nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK); if ((error = mountnfs(args, mp, nam, path, vpp, td->td_ucred)) != 0) { printf("nfs_mountroot: mount %s on /: %d\n", path, error); return (error); } return (0); } static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp) { int s; int adjsock; int maxio; s = splnet(); /* * Set read-only flag if requested; otherwise, clear it if this is * an update. 
If this is not an update, then either the read-only * flag is already clear, or this is a root mount and it was set * intentionally at some previous point. */ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_UPDATE) mp->mnt_flag &= ~MNT_RDONLY; /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ if (argp->sotype == SOCK_STREAM) nmp->nm_flag &= ~NFSMNT_NOCONN; /* Also clear RDIRPLUS if not NFSv3, it crashes some servers */ if ((argp->flags & NFSMNT_NFSV3) == 0) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. */ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if (argp->flags & NFSMNT_NFSV3) { if (argp->sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXDATA; } else maxio = NFS_V2MAXDATA; if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* Round down to multiple of blocksize */ nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize <= 0) nmp->nm_wsize = NFS_FABLKSIZE; } if (nmp->nm_wsize > maxio) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* Round down to multiple of blocksize */ nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize <= 0) nmp->nm_rsize = NFS_FABLKSIZE; } if (nmp->nm_rsize > maxio) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if (nmp->nm_readdirsize > maxio) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0) { if (argp->maxgrouplist <= NFS_MAXGRPS) nmp->nm_numgrps = argp->maxgrouplist; else nmp->nm_numgrps = NFS_MAXGRPS; } if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_WCOMMITSIZE) 
&& argp->wcommitsize >= 0) { if (argp->wcommitsize < nmp->nm_wsize) nmp->nm_wcommitsize = nmp->nm_wsize; else nmp->nm_wcommitsize = argp->wcommitsize; } if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) { if (argp->deadthresh <= NFS_MAXDEADTHRESH) nmp->nm_deadthresh = argp->deadthresh; else nmp->nm_deadthresh = NFS_MAXDEADTHRESH; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_so && adjsock) { nfs_safedisconnect(nmp); if (nmp->nm_sotype == SOCK_DGRAM) while (nfs_connect(nmp, NULL)) { printf("nfs_args: retrying connect\n"); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } } } static const char *nfs_opts[] = { "from", "nfs_args", NULL }; /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_mount(struct mount *mp, struct thread *td) { int error; struct nfs_args args; struct sockaddr *nam; struct vnode *vp; char hst[MNAMELEN]; size_t len; u_char nfh[NFSX_V3FHMAX]; if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) return (EINVAL); if (mp->mnt_flag & MNT_ROOTFS) return (nfs_mountroot(mp, td)); error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args); if (error) return (error); if (args.version != NFS_ARGSVERSION) { return (EPROGMISMATCH); } if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) return (EIO); /* * When doing an update, we can't change from or to * v3, switch lockd strategies or change cookie translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)); nfs_decode_args(mp, nmp, &args); return (0); } /* * Make the nfs_ip_paranoia sysctl serve as the default connection * or no-connection mode for those protocols that support * no-connection mode (the flag will be cleared later for protocols * that do not support no-connection mode). This will allow a client * to receive replies from a different IP then the request was * sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid), * not 0. */ if (nfs_ip_paranoia == 0) args.flags |= NFSMNT_NOCONN; if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) return (EINVAL); error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error) return (error); error = copyinstr(args.hostname, hst, MNAMELEN-1, &len); if (error) return (error); bzero(&hst[len], MNAMELEN - len); /* sockargs() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error) return (error); args.fh = nfh; error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred); return (error); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. 
*/ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { int error; struct nfs_args args; error = copyin(data, &args, sizeof (struct nfs_args)); if (error) return (error); ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, struct vnode **vpp, struct ucred *cred) { struct nfsmount *nmp; struct nfsnode *np; int error; struct vattr attrs; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ FREE(nam, M_SONAME); return (0); } else { nmp = uma_zalloc(nfsmount_zone, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_bufq); mp->mnt_data = (qaddr_t)nmp; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; /* * V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too * high, depending on whether we end up with negative offsets in * the client or server somewhere. 2GB-1 may be safer. * * For V3, nfs_fsinfo will adjust this as necessary. Assume maximum * that we can handle until we find out otherwise. * XXX Our "safe" limit on the client is what we can store in our * buffer cache using signed(!) block numbers. */ if ((argp->flags & NFSMNT_NFSV3) == 0) nmp->nm_maxfilesize = 0xffffffffLL; else nmp->nm_maxfilesize = (u_int64_t)0x80000000 * DEV_BSIZE - 1; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; if ((argp->flags & NFSMNT_NFSV3) && argp->sotype == SOCK_STREAM) { nmp->nm_wsize = nmp->nm_rsize = NFS_MAXDATA; } else { nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; } nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000); nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_deadthresh = NFS_MAXDEADTHRESH; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; nmp->nm_fhsize = argp->fhsize; bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize); bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_rpcops = &nfs_rpcops; nfs_decode_args(mp, nmp, argp); if (nmp->nm_sotype == SOCK_STREAM) mtx_init(&nmp->nm_nfstcpstate.mtx, "NFS/TCP state lock", NULL, MTX_DEF); /* * For Connection based sockets (TCP,...) defer the connect until * the first request, in case the server is not responding. */ if (nmp->nm_sotype == SOCK_DGRAM && (error = nfs_connect(nmp, NULL))) goto bad; /* * This is silly, but it has to be set so that vinifod() works. * We do not want to do an nfs_statfs() here since we can get * stuck on a dead server and we are holding a lock on the mount * point. */ mp->mnt_stat.f_iosize = nfs_iosize(nmp); /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their * number == ROOTINO (2). */ error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) goto bad; *vpp = NFSTOV(np); /* * Get file attributes and transfer parameters for the * mountpoint. 
This has the side effect of filling in * (*vpp)->v_type with the correct value. */ if (argp->flags & NFSMNT_NFSV3) nfs_fsinfo(nmp, *vpp, curthread->td_ucred, curthread); else VOP_GETATTR(*vpp, &attrs, curthread->td_ucred, curthread); /* * Lose the lock but keep the ref. */ VOP_UNLOCK(*vpp, 0, curthread); return (0); bad: if (nmp->nm_sotype == SOCK_STREAM) mtx_destroy(&nmp->nm_nfstcpstate.mtx); nfs_disconnect(nmp); uma_zfree(nfsmount_zone, nmp); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct nfsmount *nmp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (flags & FORCECLOSE) { error = nfs_nmcancelreqs(nmp); if (error) return (error); } /* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */ error = vflush(mp, 1, flags, td); if (error) return (error); /* * We are now committed to the unmount. */ nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); if (nmp->nm_sotype == SOCK_STREAM) mtx_destroy(&nmp->nm_nfstcpstate.mtx); uma_zfree(nfsmount_zone, nmp); return (0); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); /* * Get transfer parameters and attributes for root vnode once. */ if ((nmp->nm_state & NFSSTA_GOTFSINFO) == 0 && (nmp->nm_flag & NFSMNT_NFSV3)) { nfs_fsinfo(nmp, vp, curthread->td_ucred, curthread); } if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor, struct thread *td) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, allerror = 0; /* * Force stale buffer cache information to be flushed. */ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (VOP_ISLOCKED(vp, NULL) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } static int nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req) { struct nfsmount *nmp = VFSTONFS(mp); struct vfsquery vq; int error; bzero(&vq, sizeof(vq)); switch (op) { #if 0 case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 
1 : 0; if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } if (req->newptr != NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); if (val) nmp->nm_flag |= NFSMNT_NOLOCKS; else nmp->nm_flag &= ~NFSMNT_NOLOCKS; } break; #endif case VFS_CTL_QUERY: if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; #if 0 if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) vq.vq_flags |= VQ_NOTRESPLOCK; #endif error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } if (req->newptr != NULL) { error = vfs_suser(mp, req->td); if (error) return (error); error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; } break; default: return (ENOTSUP); } return (0); } Index: head/sys/sys/mount.h =================================================================== --- head/sys/sys/mount.h (revision 154151) +++ head/sys/sys/mount.h (revision 154152) @@ -1,688 +1,702 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #endif typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. 
*/ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_reserved; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 88 /* size of on/from name bufs */ #define STATFS_VERSION 0x20030518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; #define MMAXOPTIONLEN 65536 /* maximum length of a mount option */ TAILQ_HEAD(vnodelst, vnode); struct vfsoptlist; struct vfsopt; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. 
* */ struct mount { TAILQ_ENTRY(mount) mnt_list; /* mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ struct vnodelst mnt_nvnodelist; /* list of vnodes this mount */ struct lock mnt_lock; /* mount structure lock */ struct mtx mnt_mtx; /* mount structure interlock */ int mnt_writeopcount; /* write syscalls in progress */ u_int mnt_flag; /* flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_kern_flag; /* kernel only flags */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_mntlabel; /* MAC label for the mount */ struct label *mnt_fslabel; /* MAC label for the fs */ int mnt_nvnodelistsize; /* # of vnodes on this mount */ u_int mnt_hashseed; /* Random seed for vfs_hash */ + int mnt_markercnt; /* marker vnodes in use */ + int mnt_holdcnt; /* hold count */ + int mnt_holdcntwaiters; /* waits on hold count */ }; -struct vnode *__mnt_vnode_next(struct vnode **nvp, struct mount *mp); +struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp); +struct vnode *__mnt_vnode_first(struct vnode **mvp, struct mount *mp); +void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); -#define MNT_VNODE_FOREACH(vp, mp, vp2) \ - for ((vp2) = TAILQ_FIRST(&(mp)->mnt_nvnodelist); \ - (vp = __mnt_vnode_next(&(vp2), (mp))) != NULL;) +#define MNT_VNODE_FOREACH(vp, mp, mvp) \ + for (vp = __mnt_vnode_first(&(mvp), (mp)); \ + (vp) != NULL; vp = __mnt_vnode_next(&(mvp), (mp))) +#define MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp) \ + __mnt_vnode_markerfree(&(mvp), (mp)) + +#define MNT_VNODE_FOREACH_ABORT(mp, mvp) \ + do { \ + MNT_ILOCK(mp); \ + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); \ + MNT_IUNLOCK(mp); \ + } while (0) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #endif /* _KERNEL */ /* * User specifiable flags. */ #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ #define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ #define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously */ #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ #define MNT_SOFTDEP 0x00200000 /* soft updates being done */ #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects */ #define MNT_ACLS 0x08000000 /* ACL support enabled */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ /* * NFS export related mount flags. 
*/ #define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_MULTILABEL | MNT_ACLS) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Still available. */ #define MNT_SPARE_0x00000010 0x00000010 #define MNT_SPARE_0x02000000 0x02000000 /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_MPSAFE 0x20000000 /* Filesystem is MPSAFE. */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ /* * Sysctl CTL_VFS definitions. 
* * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Export arguments for local filesystem mount calls. */ struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. 
*/ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode*/ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on. */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure. */ size_t vc_len; /* sizeof said structure. */ u_int32_t vc_spare[12]; /* spare (must be zero). */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? (vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. 
*/ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; typedef int vfs_cmount_t(struct mntarg *ma, void *data, int flags, struct thread *td); typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp, struct thread *td); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg, struct thread *td); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, struct thread *td); typedef int vfs_sync_t(struct mount *mp, int waitfor, struct thread *td); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp); typedef int vfs_vptofh_t(struct vnode *vp, struct fid *fhp); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td); typedef int vfs_mount_t(struct mount *mp, struct thread *td); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_vptofh_t *vfs_vptofh; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP, P) (*(MP)->mnt_op->vfs_mount)(MP, P) #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) #define VFS_ROOT(MP, FLAGS, VPP, P) \ (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP, P) #define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) #define VFS_STATFS(MP, SBP, P) __vfs_statfs((MP), (SBP), (P)) #define VFS_SYNC(MP, WAIT, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, P) #define VFS_VGET(MP, INO, FLAGS, VPP) \ (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) #define VFS_FHTOVP(MP, FIDP, VPP) \ (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) #define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED) \ (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED) #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) #define VFS_SYSCTL(MP, OP, REQ) \ (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ) extern int mpsafe_vfs; #define VFS_NEEDSGIANT(MP) \ (!mpsafe_vfs || ((MP) != NULL && ((MP)->mnt_kern_flag & MNTK_MPSAFE) == 0)) #define VFS_LOCK_GIANT(MP) __extension__ \ ({ \ int _locked; \ if (VFS_NEEDSGIANT((MP))) { \ mtx_lock(&Giant); \ _locked = 1; \ } else \ _locked = 0; \ _locked; \ }) #define VFS_UNLOCK_GIANT(locked) if ((locked)) mtx_unlock(&Giant); #define VFS_ASSERT_GIANT(MP) do \ { \ if (VFS_NEEDSGIANT((MP))) \ mtx_assert(&Giant, MA_OWNED); \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 1); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #include /* * Version numbers. 
*/ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION VFS_VERSION_00 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, int flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); int vfs_lock(struct mount *); /* lock a vfs */ void vfs_msync(struct mount *, int); void vfs_unlock(struct mount *); /* unlock a vfs */ int vfs_busy(struct mount *, int, struct mtx *, struct thread *); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); int vfs_allocate_syncvnode(struct mount *); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *, struct thread *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; /* * Declarations for these vfs default operations are located in * kern/vfs_default.c, they should be used instead of making "dummy" * functions or casting entries in the VFS op table to "enopnotsupp()". */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_vptofh_t vfs_stdvptofh; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; /* XXX - these should be indirect functions!!! 
*/ int softdep_fsync(struct vnode *); int softdep_process_worklist(struct mount *); #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 154151) +++ head/sys/sys/vnode.h (revision 154152) @@ -1,730 +1,731 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ /* * XXX - compatability until lockmgr() goes away or all the #includes are * updated. */ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ -enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, + VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
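/*
 * Sketch only, with hypothetical "examplefs" names: how a filesystem
 * typically allocates its private per-file node and hangs it from v_data
 * after getnewvnode(), so its later vnode operations can recover it.
 */
static int
examplefs_newnode(struct mount *mp, struct vnode **vpp)
{
	struct examplefs_node *np;
	struct vnode *vp;
	int error;

	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
	if (error)
		return (error);
	np = malloc(sizeof(*np), M_TEMP, M_WAITOK | M_ZERO);
	vp->v_data = np;		/* recovered by the fs in its later VOPs */
	*vpp = vp;
	return (0);
}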
*/ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * f - freelist mutex * G - Giant * i - interlock * m - mntvnodes mutex * p - pollinfo lock * s - spechash mutex * S - syncer mutex * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). * * XXX Not all fields are locked yet and some fields that are marked are not * locked consistently. This is a work in progress. Requires Giant! */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type; /* u vnode type */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. */ LIST_ENTRY(vnode) v_hashlist; u_int v_hash; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct vnode *v_dd; /* c .. vnode */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ int v_holdcnt; /* i prevents recycling. */ int v_usecount; /* i ref count of users */ u_long v_iflag; /* i vnode flags (see below) */ u_long v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_freelist; /* f vnode freelist */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. 
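/*
 * A rough, illustrative rendering of the list-walking protocol described
 * in the locking comment above, using a mount point's vnode list; vp, mp
 * and td are assumed to be in scope.  Because the list lock is dropped in
 * step 3, a bare TAILQ_FOREACH like this can lose its place, which is why
 * the MNT_VNODE_FOREACH() iterator uses a marker vnode (the VMARKER type
 * added by this change).
 */
	MNT_ILOCK(mp);					/* 1) lock the list */
	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		VI_LOCK(vp);				/* 2) pin it with the interlock */
		MNT_IUNLOCK(mp);			/* 3) drop the list lock */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
			MNT_ILOCK(mp);			/* 4) vget failed; skip it */
			continue;
		}
		/* 6) ... operate on vp ... */
		vput(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);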
*/ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, 1) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_AGE 0x0040 /* Insert vnode at head of free list */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_OBJDIRTY 0x0400 /* object might be dirty */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. 
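/*
 * Minimal illustration of the VNOVAL convention used by struct vattr:
 * VATTR_NULL() marks every field "no change", and only the fields being
 * set are filled in before VOP_SETATTR().  Sketch only; vp, td and the
 * credential are assumed to be in scope.
 */
	struct vattr va;
	int error;

	VATTR_NULL(&va);
	va.va_mode = 0644;		/* everything else stays VNOVAL */
	error = VOP_SETATTR(vp, &va, td->td_ucred, td);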
*/ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_MARK_ATIME 0x04 /* setting atime for execve/mmap */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VEXEC 000100 /* execute/search permission */ #define VWRITE 000200 /* write permission */ #define VREAD 000400 /* read permission */ #define VSVTX 001000 /* save swapped text even after use */ #define VSGID 002000 /* set group id on execution */ #define VSUID 004000 /* set user id on execution */ #define VADMIN 010000 /* permission to administer */ #define VSTAT 020000 /* permission to retrieve attrs */ #define VAPPEND 040000 /* permission to write/append */ #define VALLPERM (VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. 
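/*
 * Quick illustration of the vtype/inode-format conversion macros above;
 * the values noted in the comments are what the macros evaluate to.
 */
	enum vtype t;
	int imode;

	t = IFTOVT(S_IFDIR | 0755);	/* S_IFDIR maps to VDIR */
	imode = MAKEIMODE(VREG, 0644);	/* VREG maps back to S_IFREG | 0644 */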
"/") vnode */ extern int async_io_version; /* 0 or POSIX version of AIO i'face */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime)(int deltat); #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. 
*/ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str); #endif void assert_vop_locked(struct vnode *vp, const char *str); #if 0 void assert_vop_slocked(struct vnode *vp, const char *str); #endif void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) #endif #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) #endif #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) #define ASSERT_VI_UNLOCKED(vp, str) #define ASSERT_VOP_ELOCKED(vp, str) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) #endif #define ASSERT_VOP_LOCKED(vp, str) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) #endif #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; extern int (*lease_check_hook)(struct vop_lease_args *); extern int (*softdep_process_worklist_hook)(struct mount *); /* cache_* may belong in namei.h. 
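/*
 * Typical, purely illustrative use of the lock assertions above inside a
 * vnode operation; the surrounding function and filesystem are hypothetical.
 * With DEBUG_VFS_LOCKS the assertion fires if the caller broke the locking
 * protocol; without it the macro compiles away to nothing.
 */
static int
examplefs_read(struct vop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;

	ASSERT_VOP_LOCKED(vp, "examplefs_read");
	/* ... perform the read ... */
	return (0);
}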
*/ void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); void cache_purge(struct vnode *vp); void cache_purgevfs(struct mount *mp); int cache_leaf_test(struct vnode *vp); int change_dir(struct vnode *vp, struct thread *td); int change_root(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); u_quad_t init_va_filerev(void); int lease_check(struct vop_lease_args *ap); int speedup_syncer(void); #define textvp_fullpath(p, rb, rfb) \ vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); void vdrop(struct vnode *); void vfs_add_vnodeops(const void *); void vfs_rm_vnodeops(const void *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); int vinvalbuf(struct vnode *vp, int save, struct thread *td, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); #define vprint(label, vp) vn_printf((vp), "%s\n", (label)) int vrecycle(struct vnode *vp, struct thread *td); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int vn_lock(struct vnode *vp, int flags, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, int fdidx); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, struct ucred *cred, int fdidx); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, int *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_write_suspend_wait(struct vnode *vp, struct mount *mp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void 
vfs_write_resume(struct mount *mp); int vfs_write_suspend(struct mount *mp); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); /* These are called from within the actual VOPS. */ void vop_create_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_strategy_pre(void *a); void vop_symlink_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error, osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred, \ curthread); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? 
NOTE_EXTEND : 0)); \ } void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, size_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); int vfs_hash_get(struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct thread *td); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 154151) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 154152) @@ -1,2231 +1,2233 @@ /*- * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } #else TAILQ_HEAD(snaphead, inode); struct snapdata { struct snaphead sn_head; daddr_t sn_listsize; daddr_t *sn_blklist; struct lock sn_lock; }; static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the following flag allows this * crash persistence to be disabled. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. 
*/ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; int flag = mp->mnt_flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp, *sbp = NULL; struct nameidata nd; struct mount *wrtmp; struct vattr vat; - struct vnode *vp, *xvp, *nvp, *devvp; + struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; /* * XXX: make sure we don't go to out1 before we setup sn */ sn = (void *)0xdeadbeef; /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); vput(nd.ni_dvp); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); return (error); } vp = nd.ni_vp; ip = VTOI(vp); devvp = ip->i_devvp; /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; if ((error = readblock(vp, bp, numblks - 1)) != 0) goto out; bawrite(bp); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. 
*/ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) goto out; VOP_UNLOCK(vp, 0, td); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); saved_nice = td->td_proc->p_nice; sched_nice(td->td_proc, 0); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (collectsnapstats) nanotime(&starttime); /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &sbp); if (error) { brelse(sbp); sbp = NULL; goto out1; } loc = blkoff(fs, fs->fs_sblockloc); copy_fs = (struct fs *)(sbp->b_data + loc); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; loop: - MNT_VNODE_FOREACH(xvp, mp, nvp) { + MNT_VNODE_FOREACH(xvp, mp, mvp) { VI_LOCK(xvp); MNT_IUNLOCK(mp); if ((xvp->v_iflag & VI_DOOMED) || xvp->v_usecount == 0 || xvp->v_type == VNON || (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode); VOP_UNLOCK(xvp, 0, td); if (error) { free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto out1; } MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the snapshots * to share. 
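/*
 * Sketch of the marker-based iteration used in the loop above (names are
 * illustrative).  MNT_VNODE_FOREACH() keeps the scan position with a
 * marker vnode of type VMARKER, so the mount interlock can be dropped
 * while a real vnode is examined; leaving the loop early must remove the
 * marker with MNT_VNODE_FOREACH_ABORT(), or the _ILOCKED variant when the
 * mount interlock is still held.
 */
	struct vnode *vp, *mvp;

	MNT_ILOCK(mp);
	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		MNT_IUNLOCK(mp);
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
			MNT_ILOCK(mp);
			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
			goto bail;		/* or restart the whole scan */
		}
		/* ... examine vp ... */
		vput(vp);
		MNT_ILOCK(mp);
	}
bail:
	MNT_IUNLOCK(mp);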
In either case, acquire the snapshot lock and give * up our original private lock. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn != NULL) { xp = TAILQ_FIRST(&sn->sn_head); VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; mp_fixme("si_snapdata setting is racey."); devvp->v_rdev->si_snapdata = sn; xp = NULL; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); transferlockers(&vp->v_lock, vp->v_vnlock); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %d already on list", ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); vp->v_vflag |= VV_SYSTEM; out1: KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (sbp == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. 
*/ if (ip->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len;; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) FREE(space, M_UFSMNT); /* * If another process is currently writing the buffer containing * the inode for this snapshot then a deadlock can occur. Drop * the snapshot lock until the buffer has been written. */ VOP_UNLOCK(vp, 0, td); (void) bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, NOCRED, &nbp); brelse(nbp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); done: FREE(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); out: if (saved_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); sched_nice(td->td_proc, saved_nice); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { FREE(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); mp->mnt_flag = flag; if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void) ffs_syncvnode(vp, MNT_WAIT); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); vn_finished_write(wrtmp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. 
*/ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } UFS_LOCK(ip->i_ump); ACTIVESET(fs, cg); UFS_UNLOCK(ip->i_ump); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = 0; } if (ip->i_ump->um_fstype == UFS1) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? 
rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. 
*/ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
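 * That is, getblk() obtains a buffer for the indirect block, b_blkno is
 * filled in by hand from the blkno the caller already knows, and
 * readblock() performs the raw read, so no logical-to-physical lookup is
 * needed even when the block is not in the cache.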
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. 
* It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ip->i_devvp->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ fs = ip->i_fs; ump = ip->i_ump; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct lock *lkp; struct buf *ibp; struct fs *fs; struct thread *td = curthread; ufs2_daddr_t numblks, blkno, dblk, *snapblklist; int error, loc, last; struct snapdata *sn; ip = VTOI(vp); fs = ip->i_fs; devvp = ip->i_devvp; sn = devvp->v_rdev->si_snapdata; /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ if (ip->i_nextsnap.tqe_prev != 0) { lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td); VI_LOCK(devvp); TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; lkp = vp->v_vnlock; vp->v_vnlock = &vp->v_lock; lockmgr(lkp, LK_RELEASE, NULL, td); if (TAILQ_FIRST(&sn->sn_head) != 0) { VI_UNLOCK(devvp); } else { snapblklist = sn->sn_blklist; sn->sn_blklist = 0; sn->sn_listsize = 0; devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); lockmgr(lkp, LK_RELEASE, NULL, td); lockdestroy(lkp); free(sn, M_UFSMNT); FREE(snapblklist, M_UFSMNT); } } /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
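 * A claim is recognizable because the stored address equals the logical
 * block number converted to fragments, hence the
 * dblk == blkstofrags(fs, blkno) tests below.  As an illustration with
 * hypothetical geometry (fs_frag == 8), direct block 5 is claimed only if
 * its pointer reads back as blkstofrags(fs, 5) == 40; such blocks are
 * offered to ffs_snapblkfree() so another snapshot may take them over.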
*/ for (blkno = 1; blkno < NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (ip->i_ump->um_fstype == UFS1) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * Lookup block being written. 
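 * Direct blocks (lbn < NDADDR) are read straight from the inode; for
 * larger lbns the containing indirect block is fetched with BA_METAONLY
 * and the pointer is picked out at index (lbn - NDADDR) % NINDIR(fs).
 * As a worked example with hypothetical geometry (NDADDR == 12,
 * NINDIR(fs) == 2048), lbn 5000 sits at index (5000 - 12) % 2048 == 892
 * of its indirect block.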
*/ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %jd from inum %d\n", "Grabonremove: snapino", ip->i_number, (intmax_t)lbn, inum); #endif if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", "Copyonremove: snapino ", ip->i_number, (intmax_t)lbn, "for inum", inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. 
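 * The savedcbp flushed below holds the old contents read for the first
 * snapshot that needed a copy; any later snapshots were satisfied by
 * bcopy()ing from it, so the device is read at most once per copied block.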
*/ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; sn = devvp->v_rdev->si_snapdata; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if ((ip->i_flags & SF_SNAPSHOT) == 0) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void)ffs_syncvnode(vp, MNT_WAIT); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the * snapshots to share. In either case, acquire the snapshot * lock and give up our original private lock. */ VI_LOCK(devvp); if (sn != NULL) { VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; devvp->v_rdev->si_snapdata = sn; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); transferlockers(&vp->v_lock, vp->v_vnlock); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %d already on list", ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0, td); } /* * No usable snapshots found. */ if (vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. 
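 * The list is stored just past the end of the snapshot image, at byte
 * offset lblktosize(fs, howmany(fs_size, fs_frag)).  The first daddr_t
 * there is the list length itself, which is why the second read below
 * backs uio_offset up by sizeof(snaplistsize) and then pulls in
 * snaplistsize * sizeof(daddr_t) bytes, count word included.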
*/ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0, td); return; } MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0, td); FREE(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0, td); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; sn = devvp->v_rdev->si_snapdata; VI_LOCK(devvp); while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) { vp = ITOV(xp); vp->v_vnlock = &vp->v_lock; TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; if (xp->i_effnlink > 0) { VI_UNLOCK(devvp); vrele(vp); VI_LOCK(devvp); } } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; VI_UNLOCK(devvp); if (sn->sn_blklist != NULL) { FREE(sn->sn_blklist, M_UFSMNT); sn->sn_blklist = NULL; sn->sn_listsize = 0; } lockdestroy(&sn->sn_lock); free(sn, M_UFSMNT); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = 0; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ip->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. 
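 * The check above is a binary search of the sorted sn_blklist; entry 0
 * holds the list length (see ffs_snapshot_mount), so the search starts at
 * index 1.  A hit means no copy is required for this logical block and
 * the write proceeds immediately, avoiding the snapshot lock and the
 * per-snapshot lookups below.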
*/ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) { VI_UNLOCK(devvp); if (bp->b_runningbufspace) atomic_add_int(&runningbufspace, bp->b_runningbufspace); return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef DIAGNOSTIC if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %jd for ", ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. 
*/ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (bp->b_runningbufspace) atomic_add_int(&runningbufspace, bp->b_runningbufspace); return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); do msleep(bip, NULL, PRIBIO, "snaprdb", hz/10); while (!(bip->bio_flags & BIO_DONE)); bp->b_error = bip->bio_error; g_destroy_bio(bip); return (bp->b_error); } #endif Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 154151) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 154152) @@ -1,1740 +1,1744 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_sbupdate(struct ufsmount *, int); static int ffs_reload(struct mount *, struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_vptofh_t ffs_vptofh; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_vptofh = ffs_vptofh, }; VFS_SET(ufs_vfsops, ufs, 0); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, }; static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr", "clusterw", "exec", "errmsg", "export", "force", "from", "multilabel", "snapshot", "suid", "suiddir", "symfollow", "sync", "update", "union", NULL }; static int ffs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; struct ufsmount *ump = 0; struct fs *fs; int error, flags; mode_t accessmode; struct nameidata ndp; struct export_args export; char *fspec; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mp->mnt_flag |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0) mp->mnt_flag |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0) mp->mnt_flag |= MNT_FORCE; if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0) mp->mnt_flag |= MNT_MULTILABEL; if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0) mp->mnt_flag &= ~MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOATIME; if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOCLUSTERR; if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOCLUSTERW; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) mp->mnt_flag |= MNT_SNAPSHOT; if (vfs_getopt(mp->mnt_optnew, "update", NULL, 
NULL) == 0) mp->mnt_flag |= MNT_UPDATE; export.ex_root = -2; /* DEFAULT_ROOTID */ if (mp->mnt_flag & MNT_RDONLY) export.ex_flags = MNT_EXRDONLY; else export.ex_flags = 0; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); /* * Flush any dirty data. */ if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) { vn_finished_write(mp); return (error); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vn_finished_write(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vn_finished_write(mp); return (error); } vn_finished_write(mp); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; mp->mnt_flag |= MNT_RDONLY; } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (suser(td)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td)) != 0) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * If we're the root device, we may not have an E count * yet, get it now. */ if (ump->um_cp->ace == 0) error = g_access(ump->um_cp, 0, 1, 1); else error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; mp->mnt_flag &= ~MNT_RDONLY; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) mp->mnt_flag &= ~MNT_ASYNC; /* * Keep MNT_ACLS flag if it is stored in superblock. 
*/ if ((fs->fs_flags & FS_ACLS) != 0) mp->mnt_flag |= MNT_ACLS; /* * If not updating name, process export requests. */ error = 0; if (vfs_getopt(mp->mnt_optnew, "export", NULL, NULL) == 0) { error = vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof export); } if (error == 0 && export.ex_flags != 0) return (vfs_export(mp, &export)); /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (suser(td)) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){ vput(devvp); return (error); } } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(struct mount *mp, struct thread *td) { - struct vnode *vp, *nvp, *devvp; + struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. 
*/ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; /* The file system is still read-only. */ newfs->fs_ronly = 1; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); + MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; dev = devvp->v_rdev; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); /* * If we are a root mount, drop the E flag so fsck can do its magic. * We will pick it up again when we remount R/W. 
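 * g_access() takes deltas for the consumer's read, write and exclusive
 * counts, so g_access(cp, 0, 0, -1) below gives back only the exclusive
 * bit taken by g_vfs_open(); the device stays open read-only while fsck
 * is free to open it for writing.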
*/ if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS)) error = g_access(cp, 0, 0, -1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &ffs_ops; bp = NULL; ump = NULL; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg 
* sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || vfs_getvfs(&mp->mnt_stat.f_fsid)) vfs_getnewfsid(mp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; if ((fs->fs_flags & FS_MULTILABEL) != 0) #ifdef MAC mp->mnt_flag |= MNT_MULTILABEL; #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", fs->fs_fsmnt); #endif if ((fs->fs_flags & FS_ACLS) != 0) #ifdef UFS_ACL mp->mnt_flag |= MNT_ACLS; #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", fs->fs_fsmnt); #endif ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } /* * Initialize filesystem stat information in mount struct. */ #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ #ifndef QUOTA mp->mnt_kern_flag |= MNTK_MPSAFE; #endif return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); } else { ufs_extattr_uepm_destroy(&ump->um_extattr); } #endif if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) return (error); } fs = ump->um_fs; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT); if (error) { fs->fs_clean = 0; return (error); } } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); mtx_destroy(UFS_MTX(ump)); free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. 
*/ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ffs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { - struct vnode *nvp, *vp, *devvp; + struct vnode *mvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; struct bufobj *bo; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. 
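 * The marker vnode (mvp) passed to MNT_VNODE_FOREACH keeps our place on
 * the mount's vnode list across the points where the interlock is
 * dropped; whenever the scan is abandoned early (the goto loop cases) the
 * MNT_VNODE_FOREACH_ABORT*() calls pull the marker back off the list
 * before the iteration is restarted.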
*/ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); - if (error == ENOENT || error == ENOLCK) + if (error == ENOENT || error == ENOLCK) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; + } continue; } if ((error = ffs_syncvnode(vp, waitfor)) != 0) allerror = error; vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) { MNT_ILOCK(mp); goto loop; } } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; VI_LOCK(devvp); bo = &devvp->v_bufobj; if (waitfor != MNT_LAZY && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td); if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(devvp, 0, td); if (allerror == 0 && waitfor == MNT_WAIT) { MNT_ILOCK(mp); goto loop; } } else VI_UNLOCK(devvp); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive and shared locking. */ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. 
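 * ino_to_fsba() yields the filesystem block holding the on-disk inode and
 * ino_to_fsbo() its slot within that block.  As a rough illustration with
 * hypothetical geometry (16K blocks, 256-byte UFS2 dinodes, 64 inodes per
 * block), inode 100 would be slot 100 % 64 == 36 of its inode block.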
*/ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din1->di_ouid; /* XXX */ ip->i_gid = ip->i_din1->di_ogid; /* XXX */ } /* XXX */ #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_associate_vnode_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, vpp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ffs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct inode *ip; struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. 
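[Editor's sketch, not part of the change] ffs_vptofh() packs only the inode number and generation into the NFS file handle, and ffs_fhtovp() re-validates the number against the superblock range before handing off to ufs_fhtovp(), where the generation is compared. A small user-space version of that round trip; struct myfid and the EXAMPLE_* constants are made up for illustration, the real bounds come from ROOTINO and fs_ncg * fs_ipg.

	#include <errno.h>
	#include <stdio.h>

	#define EXAMPLE_ROOTINO	2		/* first valid inode number */
	#define EXAMPLE_MAXINO	(64 * 1024)	/* stands in for ncg * ipg */

	struct myfid {
		unsigned long	fid_ino;
		unsigned long	fid_gen;
	};

	static int
	handle_to_inode(const struct myfid *fh, unsigned long gen_on_disk)
	{
		/* Range check first, as ffs_fhtovp() does. */
		if (fh->fid_ino < EXAMPLE_ROOTINO || fh->fid_ino >= EXAMPLE_MAXINO)
			return (ESTALE);
		/* A mismatched generation means the inode was freed and reused. */
		if (fh->fid_gen != gen_on_disk)
			return (ESTALE);
		return (0);
	}

	int
	main(void)
	{
		struct myfid fh = { .fid_ino = 100, .fid_gen = 12345 };

		printf("fresh handle: %d\n", handle_to_inode(&fh, 12345));
		printf("reused inode: %d\n", handle_to_inode(&fh, 99999));
		return (0);
	}

This is also why ffs_vget() forces a nonzero i_gen with arc4random() on old filesystems: a zero generation would defeat the staleness check.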
*/ static int ffs_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { struct fs *fs = mp->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* Grab an extra reference to be dropped by the bufdone() below. */ bufobj_wrefl(bufobj); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); #ifdef SOFTUPDATES if (LIST_FIRST(&bp->b_dep) != NULL) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. 
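[Editor's sketch, not part of the change] The summary-information loop in ffs_sbupdate() writes fs_cssize bytes in full-block chunks of fs_frag fragments, trimming only the final chunk. The arithmetic is easy to get wrong, so here is a stand-alone version with example geometry (16k blocks, 2k fragments, 11 fragments of summary data); the numbers are illustrative only.

	#include <stdio.h>

	#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

	int
	main(void)
	{
		int fsize = 2048, bsize = 16384, frag = bsize / fsize;
		int cssize = 11 * 2048;			/* bytes of summary info */
		int blks = howmany(cssize, fsize);	/* fragments to write */
		int i, size;

		for (i = 0; i < blks; i += frag) {
			size = bsize;
			if (i + frag > blks)
				size = (blks - i) * fsize;	/* short final chunk */
			printf("write %d bytes starting at fragment %d\n", size, i);
		}
		return (0);
	}

With these numbers the loop issues one 16384-byte write followed by one 6144-byte write, covering exactly cssize bytes.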
*/ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES /* move over the dependencies */ if (LIST_FIRST(&bp->b_dep) != NULL) softdep_move_dependencies(bp, newbp); #endif /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. 
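[Editor's sketch, not part of the change] The core of ffs_bufwrite() is a decision: if the buffer is eligible for background writing and the caller is asynchronous, clone the data, mark the original BV_BKGRDINPROG, and write the clone so the original stays usable; a synchronous caller that finds a background write in flight must wait for it. The sketch below is a simplified, single-threaded illustration of that decision only; struct mybuf and the MYB_* flags track the kernel names loosely, and the real code does the waiting with msleep() and the completion in ffs_backgroundwritedone().

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define MYB_ASYNC	0x01	/* caller does not wait for the write */
	#define MYB_BKGRDWRITE	0x02	/* block may be written in the background */
	#define MYB_BKGRDINPROG	0x04	/* a background copy is in flight */

	struct mybuf {
		int	flags;
		size_t	size;
		char	*data;
	};

	static void
	issue_write(const struct mybuf *bp, const char *how)
	{
		printf("%s write of %zu bytes\n", how, bp->size);
	}

	static void
	my_bufwrite(struct mybuf *bp)
	{
		struct mybuf copy;

		if ((bp->flags & MYB_BKGRDINPROG) != 0 &&
		    (bp->flags & MYB_ASYNC) == 0) {
			/* Synchronous caller: would sleep until the copy lands. */
			printf("waiting for background write to finish\n");
			bp->flags &= ~MYB_BKGRDINPROG;
		}
		if ((bp->flags & (MYB_BKGRDWRITE | MYB_ASYNC)) ==
		    (MYB_BKGRDWRITE | MYB_ASYNC)) {
			/* Clone and write the clone; the original stays usable. */
			copy = *bp;
			copy.data = malloc(bp->size);
			memcpy(copy.data, bp->data, bp->size);
			bp->flags |= MYB_BKGRDINPROG;
			issue_write(&copy, "background");
			free(copy.data);
			return;
		}
		issue_write(bp, "foreground");
	}

	int
	main(void)
	{
		char payload[512] = "dirty block";
		struct mybuf bp =
		    { MYB_BKGRDWRITE | MYB_ASYNC, sizeof(payload), payload };

		my_bufwrite(&bp);	/* takes the background path */
		bp.flags &= ~MYB_ASYNC;
		my_bufwrite(&bp);	/* sync caller waits, then writes */
		return (0);
	}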
*/ bqrelse(bp); bp = newbp; } /* Let the normal bufwrite do the rest for us */ return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { #ifdef SOFTUPDATES if (LIST_FIRST(&bp->b_dep) != NULL) buf_start(bp); #endif if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); bp->b_flags &= ~B_VALIDSUSPWRT; if ((vp->v_vflag & VV_COPYONWRITE) && vp->v_rdev->si_snapdata != NULL && (error = (ffs_copyonwrite)(vp, bp)) != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } g_vfs_strategy(bo, bp); } Index: head/sys/ufs/ufs/ufs_quota.c =================================================================== --- head/sys/ufs/ufs/ufs_quota.c (revision 154151) +++ head/sys/ufs/ufs/ufs_quota.c (revision 154152) @@ -1,1041 +1,1047 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_security_bsd); static int unprivileged_get_quota = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, &unprivileged_get_quota, 0, "Unprivileged processes may retrieve quotas for other uids and gids"); static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries"); /* * Quota name to error message mapping. 
*/ static char *quotatypes[] = INITQFNAMES; static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int); static int chkiqchg(struct inode *, ino_t, struct ucred *, int); static int dqget(struct vnode *, u_long, struct ufsmount *, int, struct dquot **); static int dqsync(struct vnode *, struct dquot *); static void dqflush(struct vnode *); #ifdef DIAGNOSTIC static void dqref(struct dquot *); static void chkdquot(struct inode *); #endif /* * Set up the quotas for an inode. * * This routine completely defines the semantics of quotas. * If other criterion want to be used to establish quotas, the * MAXQUOTAS value in quotas.h should be increased, and the * additional dquots set up here. */ int getinoquota(ip) struct inode *ip; { struct ufsmount *ump; struct vnode *vp = ITOV(ip); int error; ump = VFSTOUFS(vp->v_mount); /* * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. */ if (ip->i_dquot[USRQUOTA] == NODQUOT && (error = dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && error != EINVAL) return (error); /* * Set up the group quota based on file gid. * EINVAL means that quotas are not enabled. */ if (ip->i_dquot[GRPQUOTA] == NODQUOT && (error = dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); } /* * Update disk usage, and take corrective action. */ int chkdq(ip, change, cred, flags) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int flags; { struct dquot *dq; ufs2_daddr_t ncurblocks; int i, error; #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkdq1", 0); } ncurblocks = dq->dq_curblocks + change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; } return (0); } if ((flags & FORCE) == 0 && suser_cred(cred, 0)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; error = chkdqchg(ip, change, cred, i); if (error) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkdq2", 0); } /* Reset timer when crossing soft limit */ if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i]; dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkdqchg(ip, change, cred, type) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int type; { struct dquot *dq = ip->i_dquot[type]; ufs2_daddr_t ncurblocks = dq->dq_curblocks + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s disk limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_BLKS; } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow space * allocation. Reset time limit as they cross their soft limit. 
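[Editor's sketch, not part of the change] chkdqchg() implements the usual two-tier quota policy: a hard limit can never be crossed, while crossing a soft limit only starts a grace timer that turns into a hard refusal once it expires. A stand-alone version of just that policy; the function name, the one-week grace period, and the sample numbers are illustrative, the real values live in struct dquot and ump->um_btime.

	#include <errno.h>
	#include <stdio.h>
	#include <time.h>

	#define GRACE_PERIOD	(7 * 24 * 60 * 60)	/* example: one week */

	/*
	 * Return 0 if "change" more blocks may be charged, EDQUOT otherwise.
	 * "*btime" is the running grace-period deadline, as in dq_btime.
	 */
	static int
	check_block_quota(long cur, long change, long soft, long hard,
	    time_t *btime, time_t now)
	{
		long ncur = cur + change;

		if (hard != 0 && ncur >= hard)
			return (EDQUOT);		/* hard limit: always refuse */
		if (soft != 0 && ncur >= soft) {
			if (cur < soft) {
				/* Just crossed the soft limit: start the clock. */
				*btime = now + GRACE_PERIOD;
				return (0);
			}
			if (now > *btime)
				return (EDQUOT);	/* grace period ran out */
		}
		return (0);
	}

	int
	main(void)
	{
		time_t btime = 0, now = time(NULL);

		printf("%d\n", check_block_quota(95, 10, 100, 200, &btime, now));
		printf("%d\n", check_block_quota(105, 10, 100, 200, &btime, now));
		printf("%d\n", check_block_quota(150, 60, 100, 200, &btime, now));
		return (0);
	}

The first call crosses the soft limit and starts the timer, the second succeeds inside the grace period, and the third fails against the hard limit, mirroring the order of checks in chkdqchg().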
*/ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type]; if (ip->i_uid == cred->cr_uid) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded"); return (0); } if (time_second > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded for too long"); dq->dq_flags |= DQ_BLKS; } return (EDQUOT); } } return (0); } /* * Check the inode limit, applying corrective action. */ int chkiq(ip, change, cred, flags) struct inode *ip; ino_t change; struct ucred *cred; int flags; { struct dquot *dq; ino_t ncurinodes; int i, error; #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); /* XXX: change is unsigned */ if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkiq1", 0); } ncurinodes = dq->dq_curinodes + change; /* XXX: ncurinodes is unsigned */ if (ncurinodes >= 0) dq->dq_curinodes = ncurinodes; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; } return (0); } if ((flags & FORCE) == 0 && suser_cred(cred, 0)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; error = chkiqchg(ip, change, cred, i); if (error) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkiq2", 0); } /* Reset timer when crossing soft limit */ if (dq->dq_curinodes + change >= dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i]; dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkiqchg(ip, change, cred, type) struct inode *ip; ino_t change; struct ucred *cred; int type; { struct dquot *dq = ip->i_dquot[type]; ino_t ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow inode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s inode limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_INODS; } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow inode * allocation. Reset time limit as they cross their soft limit. 
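[Editor's note] The "XXX: change is unsigned" comments in chkiq() flag a genuine wrinkle: ino_t is unsigned, so a comparison like "change < 0" can never be true, and a caller that passes -1 actually delivers a huge positive value by wrap-around. A two-line demonstration, with a local typedef standing in for ino_t:

	#include <stdio.h>

	typedef unsigned int example_ino_t;	/* ino_t is unsigned on FreeBSD */

	int
	main(void)
	{
		example_ino_t change = (example_ino_t)-1;	/* caller passed -1 */

		/* Always false: an unsigned value is never below zero
		 * (most compilers warn about exactly this comparison). */
		if (change < 0)
			printf("negative change\n");
		else
			printf("wrapped to %u\n", change);
		return (0);
	}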
*/ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; if (ip->i_uid == cred->cr_uid) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded"); return (0); } if (time_second > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded for too long"); dq->dq_flags |= DQ_INODS; } return (EDQUOT); } } return (0); } #ifdef DIAGNOSTIC /* * On filesystems with quotas enabled, it is an error for a file to change * size and not to have a dquot structure associated with it. */ static void chkdquot(ip) struct inode *ip; { struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); int i; for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP || (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) continue; if (ip->i_dquot[i] == NODQUOT) { vprint("chkdquot: missing dquot", ITOV(ip)); panic("chkdquot: missing dquot"); } } } #endif /* * Code to process quotactl commands. */ /* * Q_QUOTAON - set up a quota file for a particular filesystem. */ int quotaon(td, mp, type, fname) struct thread *td; struct mount *mp; int type; void *fname; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *vp, **vpp; - struct vnode *nvp; + struct vnode *mvp; struct dquot *dq; int error, flags; struct nameidata nd; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); vpp = &ump->um_quotas[type]; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, -1); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (EACCES); } if (*vpp != vp) quotaoff(td, mp, type); ump->um_qflags[type] |= QTF_OPENING; mp->mnt_flag |= MNT_QUOTA; ASSERT_VOP_LOCKED(vp, "quotaon"); vp->v_vflag |= VV_SYSTEM; *vpp = vp; /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ ump->um_cred[type] = crhold(td->td_ucred); ump->um_btime[type] = MAX_DQ_TIME; ump->um_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->um_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->um_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } if (vp->v_type == VNON || vp->v_writecount == 0) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); continue; } error = getinoquota(VTOI(vp)); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); - if (error) + if (error) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); break; + } } MNT_IUNLOCK(mp); ump->um_qflags[type] &= ~QTF_OPENING; if (error) quotaoff(td, mp, type); return (error); } /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. 
*/ int quotaoff(td, mp, type) struct thread *td; struct mount *mp; int type; { struct vnode *vp; - struct vnode *qvp, *nvp; + struct vnode *qvp, *mvp; struct ufsmount *ump = VFSTOUFS(mp); struct dquot *dq; struct inode *ip; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); if ((qvp = ump->um_quotas[type]) == NULLVP) return (0); ump->um_qflags[type] |= QTF_CLOSING; /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } ip = VTOI(vp); dq = ip->i_dquot[type]; ip->i_dquot[type] = NODQUOT; dqrele(vp, dq); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); dqflush(qvp); ASSERT_VOP_LOCKED(qvp, "quotaoff"); qvp->v_vflag &= ~VV_SYSTEM; error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td); ump->um_quotas[type] = NULLVP; crfree(ump->um_cred[type]); ump->um_cred[type] = NOCRED; ump->um_qflags[type] &= ~QTF_CLOSING; for (type = 0; type < MAXQUOTAS; type++) if (ump->um_quotas[type] != NULLVP) break; if (type == MAXQUOTAS) mp->mnt_flag &= ~MNT_QUOTA; return (error); } /* * Q_GETQUOTA - return current values in a dqblk structure. */ int getquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; int error; switch (type) { case USRQUOTA: if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) { error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); } break; case GRPQUOTA: if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) { error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); } break; default: return (EINVAL); } error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq); if (error) return (error); error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk)); dqrele(NULLVP, dq); return (error); } /* * Q_SETQUOTA - assign an entire dqblk structure. */ int setquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct dquot *ndq; struct ufsmount *ump = VFSTOUFS(mp); struct dqblk newlim; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = copyin(addr, &newlim, sizeof (struct dqblk)); if (error) return (error); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "setqta", 0); } /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. 
*/ newlim.dqb_curblocks = dq->dq_curblocks; newlim.dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { newlim.dqb_btime = dq->dq_btime; newlim.dqb_itime = dq->dq_itime; } if (newlim.dqb_bsoftlimit && dq->dq_curblocks >= newlim.dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) newlim.dqb_btime = time_second + ump->um_btime[type]; if (newlim.dqb_isoftlimit && dq->dq_curinodes >= newlim.dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) newlim.dqb_itime = time_second + ump->um_itime[type]; dq->dq_dqb = newlim; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; dqrele(NULLVP, dq); return (0); } /* * Q_SETUSE - set current inode and block usage. */ int setuse(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct ufsmount *ump = VFSTOUFS(mp); struct dquot *ndq; struct dqblk usage; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = copyin(addr, &usage, sizeof (struct dqblk)); if (error) return (error); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "setuse", 0); } /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. */ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && usage.dqb_curblocks >= dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && usage.dqb_curinodes >= dq->dq_isoftlimit) dq->dq_itime = time_second + ump->um_itime[type]; dq->dq_curblocks = usage.dqb_curblocks; dq->dq_curinodes = usage.dqb_curinodes; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; dqrele(NULLVP, dq); return (0); } /* * Q_SYNC - sync quota files to disk. */ int qsync(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; /* XXX */ - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; struct dquot *dq; int i, error; /* * Check if the mount point has any quotas. * If not, simply return. */ for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) return (0); /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { MNT_ILOCK(mp); - if (error == ENOENT) + if (error == ENOENT) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; + } continue; } for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) dqsync(vp, dq); } vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Code pertaining to management of the in-core dquot data structures. 
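[Editor's sketch, not part of the change] setquota() above keeps the kernel's current usage counters and restarts the grace timer only when the new soft limit puts a user over a limit they were previously under (either because there was no soft limit before, or because usage was below the old one). A small stand-alone statement of that rule; the struct and field names only loosely shadow struct dqblk.

	#include <stdio.h>
	#include <time.h>

	struct limits {
		long	bsoftlimit;
		long	curblocks;
		time_t	btime;		/* grace deadline, 0 if not running */
	};

	/*
	 * Apply a new soft limit, restarting the grace clock only when the
	 * change pushes previously-compliant usage over the (new) limit.
	 */
	static void
	apply_new_soft_limit(struct limits *lp, long newsoft, time_t now,
	    time_t grace)
	{
		if (newsoft != 0 && lp->curblocks >= newsoft &&
		    (lp->bsoftlimit == 0 || lp->curblocks < lp->bsoftlimit))
			lp->btime = now + grace;
		lp->bsoftlimit = newsoft;
	}

	int
	main(void)
	{
		struct limits l = { 0, 500, 0 };	/* no limit yet, 500 in use */
		time_t now = time(NULL);

		apply_new_soft_limit(&l, 400, now, 7 * 24 * 3600);
		printf("soft %ld, grace %s\n", l.bsoftlimit,
		    l.btime != 0 ? "running" : "not running");
		return (0);
	}

setuse() applies the mirror-image rule: the timer restarts when newly imported usage crosses an existing soft limit.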
*/ #define DQHASH(dqvp, id) \ (&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash]) static LIST_HEAD(dqhash, dquot) *dqhashtbl; static u_long dqhash; /* * Dquot free list. */ #define DQUOTINC 5 /* minimum free dquots desired */ static TAILQ_HEAD(dqfreelist, dquot) dqfreelist; static long numdquot, desireddquot = DQUOTINC; /* * Initialize the quota system. */ void dqinit() { dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); TAILQ_INIT(&dqfreelist); } /* * Shut down the quota system. */ void dquninit() { struct dquot *dq; hashdestroy(dqhashtbl, M_DQUOT, dqhash); while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) { TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); free(dq, M_DQUOT); } } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ static int dqget(vp, id, ump, type, dqp) struct vnode *vp; u_long id; struct ufsmount *ump; int type; struct dquot **dqp; { struct thread *td = curthread; /* XXX */ struct dquot *dq; struct dqhash *dqh; struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; dqvp = ump->um_quotas[type]; if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { *dqp = NODQUOT; return (EINVAL); } /* * Check the cache first. */ dqh = DQHASH(dqvp, id); LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; /* * Cache hit with no references. Take * the structure off the free list. */ if (dq->dq_cnt == 0) TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); DQREF(dq); *dqp = dq; return (0); } /* * Not in cache, allocate a new one. */ if (TAILQ_FIRST(&dqfreelist) == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; if (numdquot < desireddquot) { dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK | M_ZERO); numdquot++; } else { if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { tablefull("dquot"); *dqp = NODQUOT; return (EUSERS); } if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) panic("dqget: free dquot isn't"); TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); if (dq->dq_ump != NULL) LIST_REMOVE(dq, dq_hash); } /* * Initialize the contents of the dquot structure. */ if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); LIST_INSERT_HEAD(dqh, dq, dq_hash); DQREF(dq); dq->dq_flags = DQ_LOCK; dq->dq_id = id; dq->dq_ump = ump; dq->dq_type = type; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == sizeof(struct dqblk) && error == 0) bzero(&dq->dq_dqb, sizeof(struct dqblk)); if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags = 0; /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { LIST_REMOVE(dq, dq_hash); dqrele(vp, dq); *dqp = NODQUOT; return (error); } /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_itime == 0) dq->dq_itime = time_second + ump->um_itime[type]; } *dqp = dq; return (0); } #ifdef DIAGNOSTIC /* * Obtain a reference to a dquot. 
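[Editor's sketch, not part of the change] The in-core dquot cache hashes on the quota-file vnode pointer plus the uid/gid, masked into a power-of-two table whose size (and mask) come from hashinit(9). A user-space sketch of the same scheme using the LIST macros; TABLE_SIZE and hash_mask stand in for what hashinit() would return, and bucket_for() has the same shape as DQHASH().

	#include <sys/queue.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TABLE_SIZE	64		/* must be a power of two */

	struct entry {
		LIST_ENTRY(entry)	hash_link;
		const void		*file;	/* quota-file identity (a pointer) */
		unsigned long		id;	/* uid or gid */
	};
	static LIST_HEAD(bucket, entry) table[TABLE_SIZE];
	static const unsigned long hash_mask = TABLE_SIZE - 1;

	/* Same shape as DQHASH(): fold the pointer and the id, mask into table. */
	static struct bucket *
	bucket_for(const void *file, unsigned long id)
	{
		return (&table[(((uintptr_t)file >> 8) + id) & hash_mask]);
	}

	int
	main(void)
	{
		static int quota_file;			/* any stable address */
		struct entry e = { .file = &quota_file, .id = 1001 };
		struct entry *found;

		LIST_INSERT_HEAD(bucket_for(e.file, e.id), &e, hash_link);
		LIST_FOREACH(found, bucket_for(&quota_file, 1001), hash_link)
			if (found->id == 1001 && found->file == &quota_file)
				printf("cache hit for id %lu\n", found->id);
		return (0);
	}

As in dqget(), a lookup that misses the cache is followed by allocating or recycling an entry and inserting it at the head of the same bucket.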
*/ static void dqref(dq) struct dquot *dq; { dq->dq_cnt++; } #endif /* * Release a reference to a dquot. */ void dqrele(vp, dq) struct vnode *vp; struct dquot *dq; { if (dq == NODQUOT) return; if (dq->dq_cnt > 1) { dq->dq_cnt--; return; } if (dq->dq_flags & DQ_MOD) (void) dqsync(vp, dq); if (--dq->dq_cnt > 0) return; TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); } /* * Update the disk quota in the quota file. */ static int dqsync(vp, dq) struct vnode *vp; struct dquot *dq; { struct thread *td = curthread; /* XXX */ struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; if (dq == NODQUOT) panic("dqsync: dquot"); if ((dq->dq_flags & DQ_MOD) == 0) return (0); if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) panic("dqsync: file"); (void) vn_write_suspend_wait(dqvp, NULL, V_WAIT); if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+2, "dqsync", 0); if ((dq->dq_flags & DQ_MOD) == 0) { if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); return (0); } } dq->dq_flags |= DQ_LOCK; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = (struct thread *)0; error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); if (auio.uio_resid && error == 0) error = EIO; if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); return (error); } /* * Flush all entries from the cache for a particular vnode. */ static void dqflush(vp) struct vnode *vp; { struct dquot *dq, *nextdq; struct dqhash *dqh; /* * Move all dquot's that used to refer to this quota * file off their hash chains (they will eventually * fall off the head of the free list and be re-used). */ for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { nextdq = LIST_NEXT(dq, dq_hash); if (dq->dq_ump->um_quotas[dq->dq_type] != vp) continue; if (dq->dq_cnt) panic("dqflush: stray dquot"); LIST_REMOVE(dq, dq_hash); dq->dq_ump = (struct ufsmount *)0; } } } Index: head/usr.bin/fstat/fstat.c =================================================================== --- head/usr.bin/fstat/fstat.c (revision 154151) +++ head/usr.bin/fstat/fstat.c (revision 154152) @@ -1,946 +1,947 @@ /*- * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)fstat.c 8.3 (Berkeley) 5/2/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _KERNEL #include #include #include #include #include #include #include #include #undef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fstat.h" #define TEXT -1 #define CDIR -2 #define RDIR -3 #define TRACE -4 #define MMAP -5 #define JDIR -6 DEVS *devs; #ifdef notdef struct nlist nl[] = { { "" }, }; #endif int fsflg, /* show files on same filesystem as file(s) argument */ pflg, /* show files open by a particular pid */ uflg; /* show files open by a particular (effective) user */ int checkfile; /* true if restricting to particular files or filesystems */ int nflg; /* (numerical) display f.s. and rdev as dev_t */ int vflg; /* display errors in locating kernel data objects etc... 
*/ int mflg; /* include memory-mapped files */ struct file **ofiles; /* buffer of pointers to file structures */ int maxfiles; #define ALLOC_OFILES(d) \ if ((d) > maxfiles) { \ free(ofiles); \ ofiles = malloc((d) * sizeof(struct file *)); \ if (ofiles == NULL) { \ err(1, NULL); \ } \ maxfiles = (d); \ } char *memf, *nlistf; kvm_t *kd; static void fstat_kvm(int, int); static void fstat_sysctl(int, int); void dofiles(struct kinfo_proc *kp); void dommap(struct kinfo_proc *kp); void vtrans(struct vnode *vp, int i, int flag); int ufs_filestat(struct vnode *vp, struct filestat *fsp); int nfs_filestat(struct vnode *vp, struct filestat *fsp); int devfs_filestat(struct vnode *vp, struct filestat *fsp); char *getmnton(struct mount *m); void pipetrans(struct pipe *pi, int i, int flag); void socktrans(struct socket *sock, int i); void getinetproto(int number); int getfname(const char *filename); void usage(void); char *kdevtoname(struct cdev *dev); int main(int argc, char **argv) { struct passwd *passwd; int arg, ch, what; arg = 0; what = KERN_PROC_ALL; nlistf = memf = NULL; while ((ch = getopt(argc, argv, "fmnp:u:vN:M:")) != -1) switch((char)ch) { case 'f': fsflg = 1; break; case 'M': memf = optarg; break; case 'N': nlistf = optarg; break; case 'm': mflg = 1; break; case 'n': nflg = 1; break; case 'p': if (pflg++) usage(); if (!isdigit(*optarg)) { warnx("-p requires a process id"); usage(); } what = KERN_PROC_PID; arg = atoi(optarg); break; case 'u': if (uflg++) usage(); if (!(passwd = getpwnam(optarg))) errx(1, "%s: unknown uid", optarg); what = KERN_PROC_UID; arg = passwd->pw_uid; break; case 'v': vflg = 1; break; case '?': default: usage(); } if (*(argv += optind)) { for (; *argv; ++argv) { if (getfname(*argv)) checkfile = 1; } if (!checkfile) /* file(s) specified, but none accessable */ exit(1); } if (fsflg && !checkfile) { /* -f with no files means use wd */ if (getfname(".") == 0) exit(1); checkfile = 1; } if (memf != NULL) fstat_kvm(what, arg); else fstat_sysctl(what, arg); exit(0); } static void print_header(void) { if (nflg) printf("%s", "USER CMD PID FD DEV INUM MODE SZ|DV R/W"); else printf("%s", "USER CMD PID FD MOUNT INUM MODE SZ|DV R/W"); if (checkfile && fsflg == 0) printf(" NAME\n"); else putchar('\n'); } static void fstat_kvm(int what, int arg) { struct kinfo_proc *p, *plast; char buf[_POSIX2_LINE_MAX]; int cnt; ALLOC_OFILES(256); /* reserve space for file pointers */ /* * Discard setgid privileges if not the running kernel so that bad * guys can't print interesting stuff from kernel memory. 
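[Editor's sketch, not part of the change] fstat_kvm() drops its setgid privilege as soon as a non-default kernel or core file is requested, then uses libkvm to walk the process table. A minimal stand-alone example of the same libkvm calls; it assumes the default running kernel (NULL name and core file), lists only pid and command name, and skips fstat's zombie and file-descriptor handling entirely.

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <sys/user.h>
	#include <err.h>
	#include <fcntl.h>
	#include <kvm.h>
	#include <limits.h>
	#include <stdio.h>

	int
	main(void)
	{
		char errbuf[_POSIX2_LINE_MAX];
		struct kinfo_proc *kp;
		kvm_t *kd;
		int i, cnt;

		/* NULLs mean "the running kernel"; O_RDONLY is all we need. */
		kd = kvm_openfiles(NULL, NULL, NULL, O_RDONLY, errbuf);
		if (kd == NULL)
			errx(1, "%s", errbuf);
		kp = kvm_getprocs(kd, KERN_PROC_ALL, 0, &cnt);
		if (kp == NULL)
			errx(1, "%s", kvm_geterr(kd));
		for (i = 0; i < cnt; i++)
			printf("%5d %s\n", kp[i].ki_pid, kp[i].ki_comm);
		kvm_close(kd);
		return (0);
	}

When -M or -N is given, fstat instead reads kernel memory directly, which is why it discards the setgid kmem privilege before opening arbitrary files.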
*/ if (nlistf != NULL || memf != NULL) setgid(getgid()); if ((kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, buf)) == NULL) errx(1, "%s", buf); setgid(getgid()); #ifdef notdef if (kvm_nlist(kd, nl) != 0) errx(1, "no namelist: %s", kvm_geterr(kd)); #endif if ((p = kvm_getprocs(kd, what, arg, &cnt)) == NULL) errx(1, "%s", kvm_geterr(kd)); print_header(); for (plast = &p[cnt]; p < plast; ++p) { if (p->ki_stat == SZOMB) continue; dofiles(p); if (mflg) dommap(p); } } static void fstat_sysctl(int what, int arg) { /* not yet implemented */ fstat_kvm(what, arg); } const char *Uname, *Comm; int Pid; #define PREFIX(i) printf("%-8.8s %-10s %5d", Uname, Comm, Pid); \ switch(i) { \ case TEXT: \ printf(" text"); \ break; \ case CDIR: \ printf(" wd"); \ break; \ case RDIR: \ printf(" root"); \ break; \ case TRACE: \ printf(" tr"); \ break; \ case MMAP: \ printf(" mmap"); \ break; \ case JDIR: \ printf(" jail"); \ break; \ default: \ printf(" %4d", i); \ break; \ } /* * print open files attributed to this process */ void dofiles(struct kinfo_proc *kp) { int i; struct file file; struct filedesc filed; Uname = user_from_uid(kp->ki_uid, 0); Pid = kp->ki_pid; Comm = kp->ki_comm; if (kp->ki_fd == NULL) return; if (!KVM_READ(kp->ki_fd, &filed, sizeof (filed))) { dprintf(stderr, "can't read filedesc at %p for pid %d\n", (void *)kp->ki_fd, Pid); return; } /* * root directory vnode, if one */ if (filed.fd_rdir) vtrans(filed.fd_rdir, RDIR, FREAD); /* * current working directory vnode */ vtrans(filed.fd_cdir, CDIR, FREAD); /* * jail root, if any. */ if (filed.fd_jdir) vtrans(filed.fd_jdir, JDIR, FREAD); /* * ktrace vnode, if one */ if (kp->ki_tracep) vtrans(kp->ki_tracep, TRACE, FREAD|FWRITE); /* * text vnode, if one */ if (kp->ki_textvp) vtrans(kp->ki_textvp, TEXT, FREAD); /* * open files */ #define FPSIZE (sizeof (struct file *)) #define MAX_LASTFILE (0x1000000) /* Sanity check on filed.fd_lastfile */ if (filed.fd_lastfile <= -1 || filed.fd_lastfile > MAX_LASTFILE) return; ALLOC_OFILES(filed.fd_lastfile+1); if (!KVM_READ(filed.fd_ofiles, ofiles, (filed.fd_lastfile+1) * FPSIZE)) { dprintf(stderr, "can't read file structures at %p for pid %d\n", (void *)filed.fd_ofiles, Pid); return; } for (i = 0; i <= filed.fd_lastfile; i++) { if (ofiles[i] == NULL) continue; if (!KVM_READ(ofiles[i], &file, sizeof (struct file))) { dprintf(stderr, "can't read file %d at %p for pid %d\n", i, (void *)ofiles[i], Pid); continue; } if (file.f_type == DTYPE_VNODE) vtrans(file.f_vnode, i, file.f_flag); else if (file.f_type == DTYPE_SOCKET) { if (checkfile == 0) socktrans(file.f_data, i); } #ifdef DTYPE_PIPE else if (file.f_type == DTYPE_PIPE) { if (checkfile == 0) pipetrans(file.f_data, i, file.f_flag); } #endif #ifdef DTYPE_FIFO else if (file.f_type == DTYPE_FIFO) { if (checkfile == 0) vtrans(file.f_vnode, i, file.f_flag); } #endif else { dprintf(stderr, "unknown file type %d for file %d of pid %d\n", file.f_type, i, Pid); } } } void dommap(struct kinfo_proc *kp) { vm_map_t map; struct vmspace vmspace; struct vm_map_entry entry; vm_map_entry_t entryp; struct vm_object object; vm_object_t objp; int prot, fflags; if (!KVM_READ(kp->ki_vmspace, &vmspace, sizeof(vmspace))) { dprintf(stderr, "can't read vmspace at %p for pid %d\n", (void *)kp->ki_vmspace, Pid); return; } map = &vmspace.vm_map; for (entryp = map->header.next; entryp != &kp->ki_vmspace->vm_map.header; entryp = entry.next) { if (!KVM_READ(entryp, &entry, sizeof(entry))) { dprintf(stderr, "can't read vm_map_entry at %p for pid %d\n", (void *)entryp, Pid); return; } if 
(entry.eflags & MAP_ENTRY_IS_SUB_MAP) continue; if ((objp = entry.object.vm_object) == NULL) continue; for (; objp; objp = object.backing_object) { if (!KVM_READ(objp, &object, sizeof(object))) { dprintf(stderr, "can't read vm_object at %p for pid %d\n", (void *)objp, Pid); return; } } prot = entry.protection; fflags = (prot & VM_PROT_READ ? FREAD : 0) | (prot & VM_PROT_WRITE ? FWRITE : 0); switch (object.type) { case OBJT_VNODE: vtrans((struct vnode *)object.handle, MMAP, fflags); break; default: break; } } } char * kdevtoname(struct cdev *dev) { struct cdev si; if (!KVM_READ(dev, &si, sizeof si)) return (NULL); return (strdup(si.__si_namebuf)); } void vtrans(struct vnode *vp, int i, int flag) { struct vnode vn; struct filestat fst; char rw[3], mode[15], tagstr[12], *tagptr; const char *badtype, *filename; filename = badtype = NULL; if (!KVM_READ(vp, &vn, sizeof (struct vnode))) { dprintf(stderr, "can't read vnode at %p for pid %d\n", (void *)vp, Pid); return; } if (!KVM_READ(&vp->v_tag, &tagptr, sizeof tagptr) || !KVM_READ(tagptr, tagstr, sizeof tagstr)) { dprintf(stderr, "can't read v_tag at %p for pid %d\n", (void *)vp, Pid); return; } tagstr[sizeof(tagstr) - 1] = '\0'; if (vn.v_type == VNON) badtype = "none"; else if (vn.v_type == VBAD) badtype = "bad"; else { if (!strcmp("ufs", tagstr)) { if (!ufs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("devfs", tagstr)) { if (!devfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("nfs", tagstr)) { if (!nfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("msdosfs", tagstr)) { if (!msdosfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("isofs", tagstr)) { if (!isofs_filestat(&vn, &fst)) badtype = "error"; } else { static char unknown[32]; snprintf(unknown, sizeof unknown, "?(%s)", tagstr); badtype = unknown; } } if (checkfile) { int fsmatch = 0; DEVS *d; if (badtype) return; for (d = devs; d != NULL; d = d->next) if (d->fsid == fst.fsid) { fsmatch = 1; if (d->ino == fst.fileid) { filename = d->name; break; } } if (fsmatch == 0 || (filename == NULL && fsflg == 0)) return; } PREFIX(i); if (badtype) { (void)printf(" - - %10s -\n", badtype); return; } if (nflg) (void)printf(" %2d,%-2d", major(fst.fsid), minor(fst.fsid)); else (void)printf(" %-8s", getmnton(vn.v_mount)); if (nflg) (void)sprintf(mode, "%o", fst.mode); else strmode(fst.mode, mode); (void)printf(" %6ld %10s", fst.fileid, mode); switch (vn.v_type) { case VBLK: case VCHR: { char *name; name = kdevtoname(vn.v_rdev); if (nflg || !name) printf(" %2d,%-2d", major(fst.rdev), minor(fst.rdev)); else { printf(" %6s", name); free(name); } break; } default: printf(" %6lu", fst.size); } rw[0] = '\0'; if (flag & FREAD) strcat(rw, "r"); if (flag & FWRITE) strcat(rw, "w"); printf(" %2s", rw); if (filename && !fsflg) printf(" %s", filename); putchar('\n'); } int ufs_filestat(struct vnode *vp, struct filestat *fsp) { struct inode inode; if (!KVM_READ(VTOI(vp), &inode, sizeof (inode))) { dprintf(stderr, "can't read inode at %p for pid %d\n", (void *)VTOI(vp), Pid); return 0; } /* * The st_dev from stat(2) is a dev_t. These kernel structures * contain cdev pointers. 
We need to convert to dev_t to make * comparisons */ fsp->fsid = dev2udev(inode.i_dev); fsp->fileid = (long)inode.i_number; fsp->mode = (mode_t)inode.i_mode; fsp->size = (u_long)inode.i_size; #if should_be_but_is_hard /* XXX - need to load i_ump and i_din[12] from kernel memory */ if (inode.i_ump->um_fstype == UFS1) fsp->rdev = inode.i_din1->di_rdev; else fsp->rdev = inode.i_din2->di_rdev; #else fsp->rdev = 0; #endif return 1; } int devfs_filestat(struct vnode *vp, struct filestat *fsp) { struct devfs_dirent devfs_dirent; struct mount mount; struct vnode vnode; if (!KVM_READ(vp->v_data, &devfs_dirent, sizeof (devfs_dirent))) { dprintf(stderr, "can't read devfs_dirent at %p for pid %d\n", (void *)vp->v_data, Pid); return 0; } if (!KVM_READ(vp->v_mount, &mount, sizeof (mount))) { dprintf(stderr, "can't read mount at %p for pid %d\n", (void *)vp->v_mount, Pid); return 0; } if (!KVM_READ(devfs_dirent.de_vnode, &vnode, sizeof (vnode))) { dprintf(stderr, "can't read vnode at %p for pid %d\n", (void *)devfs_dirent.de_vnode, Pid); return 0; } fsp->fsid = (long)mount.mnt_stat.f_fsid.val[0]; fsp->fileid = devfs_dirent.de_inode; fsp->mode = (devfs_dirent.de_mode & ~S_IFMT) | S_IFCHR; fsp->size = 0; fsp->rdev = dev2udev(vnode.v_rdev); return 1; } int nfs_filestat(struct vnode *vp, struct filestat *fsp) { struct nfsnode nfsnode; mode_t mode; if (!KVM_READ(VTONFS(vp), &nfsnode, sizeof (nfsnode))) { dprintf(stderr, "can't read nfsnode at %p for pid %d\n", (void *)VTONFS(vp), Pid); return 0; } fsp->fsid = nfsnode.n_vattr.va_fsid; fsp->fileid = nfsnode.n_vattr.va_fileid; fsp->size = nfsnode.n_size; fsp->rdev = nfsnode.n_vattr.va_rdev; mode = (mode_t)nfsnode.n_vattr.va_mode; switch (vp->v_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; case VNON: case VBAD: + case VMARKER: return 0; }; fsp->mode = mode; return 1; } char * getmnton(struct mount *m) { static struct mount mount; static struct mtab { struct mtab *next; struct mount *m; char mntonname[MNAMELEN]; } *mhead = NULL; struct mtab *mt; for (mt = mhead; mt != NULL; mt = mt->next) if (m == mt->m) return (mt->mntonname); if (!KVM_READ(m, &mount, sizeof(struct mount))) { warnx("can't read mount table at %p", (void *)m); return (NULL); } if ((mt = malloc(sizeof (struct mtab))) == NULL) err(1, NULL); mt->m = m; bcopy(&mount.mnt_stat.f_mntonname[0], &mt->mntonname[0], MNAMELEN); mt->next = mhead; mhead = mt; return (mt->mntonname); } void pipetrans(struct pipe *pi, int i, int flag) { struct pipe pip; char rw[3]; PREFIX(i); /* fill in socket */ if (!KVM_READ(pi, &pip, sizeof(struct pipe))) { dprintf(stderr, "can't read pipe at %p\n", (void *)pi); goto bad; } printf("* pipe %8lx <-> %8lx", (u_long)pi, (u_long)pip.pipe_peer); printf(" %6d", (int)pip.pipe_buffer.cnt); rw[0] = '\0'; if (flag & FREAD) strcat(rw, "r"); if (flag & FWRITE) strcat(rw, "w"); printf(" %2s", rw); putchar('\n'); return; bad: printf("* error\n"); } void socktrans(struct socket *sock, int i) { static const char *stypename[] = { "unused", /* 0 */ "stream", /* 1 */ "dgram", /* 2 */ "raw", /* 3 */ "rdm", /* 4 */ "seqpak" /* 5 */ }; #define STYPEMAX 5 struct socket so; struct protosw proto; struct domain dom; struct inpcb inpcb; struct unpcb unpcb; int len; char dname[32]; PREFIX(i); /* fill in socket */ if (!KVM_READ(sock, &so, sizeof(struct socket))) { dprintf(stderr, "can't read sock at 
%p\n", (void *)sock); goto bad; } /* fill in protosw entry */ if (!KVM_READ(so.so_proto, &proto, sizeof(struct protosw))) { dprintf(stderr, "can't read protosw at %p", (void *)so.so_proto); goto bad; } /* fill in domain */ if (!KVM_READ(proto.pr_domain, &dom, sizeof(struct domain))) { dprintf(stderr, "can't read domain at %p\n", (void *)proto.pr_domain); goto bad; } if ((len = kvm_read(kd, (u_long)dom.dom_name, dname, sizeof(dname) - 1)) < 0) { dprintf(stderr, "can't read domain name at %p\n", (void *)dom.dom_name); dname[0] = '\0'; } else dname[len] = '\0'; if ((u_short)so.so_type > STYPEMAX) printf("* %s ?%d", dname, so.so_type); else printf("* %s %s", dname, stypename[so.so_type]); /* * protocol specific formatting * * Try to find interesting things to print. For tcp, the interesting * thing is the address of the tcpcb, for udp and others, just the * inpcb (socket pcb). For unix domain, its the address of the socket * pcb and the address of the connected pcb (if connected). Otherwise * just print the protocol number and address of the socket itself. * The idea is not to duplicate netstat, but to make available enough * information for further analysis. */ switch(dom.dom_family) { case AF_INET: case AF_INET6: getinetproto(proto.pr_protocol); if (proto.pr_protocol == IPPROTO_TCP ) { if (so.so_pcb) { if (kvm_read(kd, (u_long)so.so_pcb, (char *)&inpcb, sizeof(struct inpcb)) != sizeof(struct inpcb)) { dprintf(stderr, "can't read inpcb at %p\n", (void *)so.so_pcb); goto bad; } printf(" %lx", (u_long)inpcb.inp_ppcb); } } else if (so.so_pcb) printf(" %lx", (u_long)so.so_pcb); break; case AF_UNIX: /* print address of pcb and connected pcb */ if (so.so_pcb) { printf(" %lx", (u_long)so.so_pcb); if (kvm_read(kd, (u_long)so.so_pcb, (char *)&unpcb, sizeof(struct unpcb)) != sizeof(struct unpcb)){ dprintf(stderr, "can't read unpcb at %p\n", (void *)so.so_pcb); goto bad; } if (unpcb.unp_conn) { char shoconn[4], *cp; cp = shoconn; if (!(so.so_rcv.sb_state & SBS_CANTRCVMORE)) *cp++ = '<'; *cp++ = '-'; if (!(so.so_snd.sb_state & SBS_CANTSENDMORE)) *cp++ = '>'; *cp = '\0'; printf(" %s %lx", shoconn, (u_long)unpcb.unp_conn); } } break; default: /* print protocol number and socket address */ printf(" %d %lx", proto.pr_protocol, (u_long)sock); } printf("\n"); return; bad: printf("* error\n"); } /* * Read the cdev structure in the kernel in order to work out the * associated dev_t */ dev_t dev2udev(struct cdev *dev) { struct cdev_priv priv; struct cdev si; if (KVM_READ(dev, &si, sizeof si) && KVM_READ(si.si_priv, &priv, sizeof priv)) { return ((dev_t)priv.cdp_inode); } else { dprintf(stderr, "can't convert cdev *%p to a dev_t\n", dev); return -1; } } /* * getinetproto -- * print name of protocol number */ void getinetproto(int number) { static int isopen; struct protoent *pe; if (!isopen) setprotoent(++isopen); if ((pe = getprotobynumber(number)) != NULL) printf(" %s", pe->p_name); else printf(" %d", number); } int getfname(const char *filename) { struct stat statbuf; DEVS *cur; if (stat(filename, &statbuf)) { warn("%s", filename); return(0); } if ((cur = malloc(sizeof(DEVS))) == NULL) err(1, NULL); cur->next = devs; devs = cur; cur->ino = statbuf.st_ino; cur->fsid = statbuf.st_dev; cur->name = filename; return(1); } void usage(void) { (void)fprintf(stderr, "usage: fstat [-fmnv] [-M core] [-N system] [-p pid] [-u user] [file ...]\n"); exit(1); }