Index: head/sys/fs/specfs/spec_vnops.c
===================================================================
--- head/sys/fs/specfs/spec_vnops.c	(revision 105901)
+++ head/sys/fs/specfs/spec_vnops.c	(revision 105902)
@@ -1,952 +1,962 @@
 /*
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #include <sys/fcntl.h>
 #include <sys/vmmeter.h>
 #include <sys/tty.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 
 static int	spec_advlock(struct vop_advlock_args *);
 static int	spec_bmap(struct vop_bmap_args *);
 static int	spec_close(struct vop_close_args *);
 static int	spec_freeblks(struct vop_freeblks_args *);
 static int	spec_fsync(struct  vop_fsync_args *);
 static int	spec_getpages(struct vop_getpages_args *);
 static int	spec_ioctl(struct vop_ioctl_args *);
 static int	spec_kqfilter(struct vop_kqfilter_args *);
 static int	spec_open(struct vop_open_args *);
 static int	spec_poll(struct vop_poll_args *);
 static int	spec_print(struct vop_print_args *);
 static int	spec_read(struct vop_read_args *);
 static int	spec_strategy(struct vop_strategy_args *);
 static int	spec_write(struct vop_write_args *);
 
 vop_t **spec_vnodeop_p;
 static struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 	{ &vop_default_desc,		(vop_t *) vop_defaultop },
 	{ &vop_access_desc,		(vop_t *) vop_ebadf },
 	{ &vop_advlock_desc,		(vop_t *) spec_advlock },
 	{ &vop_bmap_desc,		(vop_t *) spec_bmap },
 	{ &vop_close_desc,		(vop_t *) spec_close },
 	{ &vop_create_desc,		(vop_t *) vop_panic },
 	{ &vop_freeblks_desc,		(vop_t *) spec_freeblks },
 	{ &vop_fsync_desc,		(vop_t *) spec_fsync },
 	{ &vop_getpages_desc,		(vop_t *) spec_getpages },
 	{ &vop_getwritemount_desc, 	(vop_t *) vop_stdgetwritemount },
 	{ &vop_ioctl_desc,		(vop_t *) spec_ioctl },
 	{ &vop_kqfilter_desc,		(vop_t *) spec_kqfilter },
 	{ &vop_lease_desc,		(vop_t *) vop_null },
 	{ &vop_link_desc,		(vop_t *) vop_panic },
 	{ &vop_mkdir_desc,		(vop_t *) vop_panic },
 	{ &vop_mknod_desc,		(vop_t *) vop_panic },
 	{ &vop_open_desc,		(vop_t *) spec_open },
 	{ &vop_pathconf_desc,		(vop_t *) vop_stdpathconf },
 	{ &vop_poll_desc,		(vop_t *) spec_poll },
 	{ &vop_print_desc,		(vop_t *) spec_print },
 	{ &vop_read_desc,		(vop_t *) spec_read },
 	{ &vop_readdir_desc,		(vop_t *) vop_panic },
 	{ &vop_readlink_desc,		(vop_t *) vop_panic },
 	{ &vop_reallocblks_desc,	(vop_t *) vop_panic },
 	{ &vop_reclaim_desc,		(vop_t *) vop_null },
 	{ &vop_remove_desc,		(vop_t *) vop_panic },
 	{ &vop_rename_desc,		(vop_t *) vop_panic },
 	{ &vop_rmdir_desc,		(vop_t *) vop_panic },
 	{ &vop_setattr_desc,		(vop_t *) vop_ebadf },
 	{ &vop_strategy_desc,		(vop_t *) spec_strategy },
 	{ &vop_symlink_desc,		(vop_t *) vop_panic },
 	{ &vop_write_desc,		(vop_t *) spec_write },
 	{ &vop_lock_desc,		(vop_t *) vop_nolock },
 	{ &vop_unlock_desc,		(vop_t *) vop_nounlock },
 	{ &vop_islocked_desc,		(vop_t *) vop_noislocked },
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc spec_vnodeop_opv_desc =
 	{ &spec_vnodeop_p, spec_vnodeop_entries };
 
 VNODEOP_SET(spec_vnodeop_opv_desc);
 
 int
 spec_vnoperate(ap)
 	struct vop_generic_args /* {
 		struct vnodeop_desc *a_desc;
 		<other random data follows, presumably>
 	} */ *ap;
 {
 	return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap));
 }
 
 static void spec_getpages_iodone(struct buf *bp);
 
 /*
  * Open a special file.
  */
 /* ARGSUSED */
 static int
 spec_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	dev_t dev = vp->v_rdev;
 	int error;
 	struct cdevsw *dsw;
 	const char *cp;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	/* Don't allow open if fs is mounted -nodev. */
 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
 		return (ENXIO);
 
 	dsw = devsw(dev);
 	if (dsw == NULL || dsw->d_open == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	/*
 	 * XXX: Disks get special billing here, but it is mostly wrong.
 	 * XXX: Disk partitions can overlap and the real checks should
 	 * XXX: take this into account, and consequently they need to
 	 * XXX: live in the disk slice code.  Some checks do.
 	 */
 	if (vn_isdisk(vp, NULL) && ap->a_cred != FSCRED &&
 	    (ap->a_mode & FWRITE)) {
 		/*
 		 * Never allow opens for write if the disk is mounted R/W.
 		 */
 		if (vp->v_rdev->si_mountpoint != NULL &&
 		    !(vp->v_rdev->si_mountpoint->mnt_flag & MNT_RDONLY))
 			return (EBUSY);
 
 		/*
 		 * When running in secure mode, do not allow opens
 		 * for writing if the disk is mounted.
 		 */
 		error = securelevel_ge(td->td_ucred, 1);
 		if (error && vfs_mountedon(vp))
 			return (error);
 
 		/*
 		 * When running in very secure mode, do not allow
 		 * opens for writing of any disks.
 		 */
 		error = securelevel_ge(td->td_ucred, 2);
 		if (error)
 			return (error);
 	}
 
 	/* XXX: Special casing of ttys for deadfs.  Probably redundant. */
 	if (dsw->d_flags & D_TTY)
 		vp->v_vflag |= VV_ISTTY;
 
 	VOP_UNLOCK(vp, 0, td);
 	if(dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	if (error)
 		return (error);
 
 	if (dsw->d_flags & D_TTY) {
 		if (dev->si_tty) {
 			struct tty *tp;
 			tp = dev->si_tty;
 			if (!tp->t_stop) {
 				printf("Warning:%s: no t_stop, using nottystop\n", devtoname(dev));
 				tp->t_stop = nottystop;
 			}
 		}
 	}
 
 	if (vn_isdisk(vp, NULL)) {
 		if (!dev->si_bsize_phys)
 			dev->si_bsize_phys = DEV_BSIZE;
 	}
 	if ((dsw->d_flags & D_DISK) == 0) {
 		cp = devtoname(dev);
 		if (*cp == '#' && (dsw->d_flags & D_NAGGED) == 0) {
 			printf("WARNING: driver %s should register devices with make_dev() (dev_t = \"%s\")\n",
 			    dsw->d_name, cp);
 			dsw->d_flags |= D_NAGGED;
 		}
 	}
 	return (error);
 }
 
 /*
  * Vnode op for read
  */
 /* ARGSUSED */
 static int
 spec_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct uio *uio;
 	dev_t dev;
 	int error, resid;
 	struct cdevsw *dsw;
 
 	vp = ap->a_vp;
 	dev = vp->v_rdev;
 	uio = ap->a_uio;
 	td = uio->uio_td;
 	resid = uio->uio_resid;
 
 	if (resid == 0)
 		return (0);
 
 	dsw = devsw(dev);
 	VOP_UNLOCK(vp, 0, td);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_read(dev, uio, ap->a_ioflag);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_read(dev, uio, ap->a_ioflag);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		vfs_timestamp(&dev->si_atime);
 	return (error);
 }
 
 /*
  * Vnode op for write
  */
 /* ARGSUSED */
 static int
 spec_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct uio *uio;
 	dev_t dev;
 	int error, resid;
 	struct cdevsw *dsw;
 
 	vp = ap->a_vp;
 	dev = vp->v_rdev;
 	dsw = devsw(dev);
 	uio = ap->a_uio;
 	td = uio->uio_td;
 	resid = uio->uio_resid;
 
 	VOP_UNLOCK(vp, 0, td);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_write(dev, uio, ap->a_ioflag);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_write(dev, uio, ap->a_ioflag);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		vfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	return (error);
 }
 
 /*
  * Device ioctl operation.
  */
 /* ARGSUSED */
 static int
 spec_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	dev_t dev;
 	int error;
 	struct cdevsw *dsw;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_ioctl(dev, ap->a_command,
 		    ap->a_data, ap->a_fflag, ap->a_td);
 		PICKUP_GIANT();
 	} else 
 		error = dsw->d_ioctl(dev, ap->a_command,
 		    ap->a_data, ap->a_fflag, ap->a_td);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 spec_poll(ap)
 	struct vop_poll_args /* {
 		struct vnode *a_vp;
 		int  a_events;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	dev_t dev;
 	struct cdevsw *dsw;
 	int error;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_poll(dev, ap->a_events, ap->a_td);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_poll(dev, ap->a_events, ap->a_td);
 	return(error);
 }
 
 /* ARGSUSED */
 static int
 spec_kqfilter(ap)
 	struct vop_kqfilter_args /* {
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap;
 {
 	dev_t dev;
 	struct cdevsw *dsw;
 	int error;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	if (!(dsw->d_flags & D_KQFILTER))
 		return (1);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_kqfilter(dev, ap->a_kn);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_kqfilter(dev, ap->a_kn);
 	return (error);
 }
 
 /*
  * Synch buffers associated with a block device
  */
 /* ARGSUSED */
 static int
 spec_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int  a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp;
 	struct buf *nbp;
-	int s;
-	int maxretry = 10000;	/* large, arbitrarily chosen */
+	int s, error = 0;
+	int maxretry = 100;	/* arbitrarily chosen retry limit */
 
 	if (!vn_isdisk(vp, NULL))
 		return (0);
 
 	VI_LOCK(vp);
 loop1:
 	/*
 	 * MARK/SCAN initialization to avoid infinite loops.
 	 */
 	s = splbio();
         TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
                 bp->b_flags &= ~B_SCANNED;
+		bp->b_error = 0;
 	}
 	splx(s);
 
 	/*
 	 * Flush all dirty buffers associated with a block device.
 	 */
 loop2:
 	s = splbio();
 	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp != NULL; bp = nbp) {
 		nbp = TAILQ_NEXT(bp, b_vnbufs);
 		if ((bp->b_flags & B_SCANNED) != 0)
 			continue;
 		VI_UNLOCK(vp);
 		bp->b_flags |= B_SCANNED;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 			VI_LOCK(vp);
 			continue;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("spec_fsync: not dirty");
 		if ((vp->v_vflag & VV_OBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
 			BUF_UNLOCK(bp);
 			vfs_bio_awrite(bp);
 			splx(s);
 		} else {
 			bremfree(bp);
 			splx(s);
 			bawrite(bp);
 		}
 		VI_LOCK(vp);
 		goto loop2;
 	}
 
 	/*
 	 * If synchronous the caller expects us to completely resolve all
 	 * dirty buffers in the system.  Wait for in-progress I/O to
 	 * complete (which could include background bitmap writes), then
 	 * retry if dirty blocks still exist.
 	 */
 	if (ap->a_waitfor == MNT_WAIT) {
 		while (vp->v_numoutput) {
 			vp->v_iflag |= VI_BWAIT;
 			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
 			    PRIBIO + 1, "spfsyn", 0);
 		}
 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
-			if (--maxretry != 0) {
+			/*
+			 * If any remaining dirty buffer recorded a write
+			 * error, stop at the first one and fail now rather
+			 * than trying endlessly to write them out.
+			 */
+			TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
+				if ((error = bp->b_error) != 0)
+					break;
+			if (error == 0 && --maxretry >= 0) {
 				splx(s);
 				goto loop1;
 			}
 			vprint("spec_fsync: giving up on dirty", vp);
+			error = EAGAIN;
 		}
 	}
 	VI_UNLOCK(vp);
 	splx(s);
-	return (0);
+	return (error);
 }
 
 /*
  * Mutex to use when delaying niced I/O bound processes in spec_strategy().
  */
 static struct mtx strategy_mtx;
 static void
 strategy_init(void)
 {
 
 	mtx_init(&strategy_mtx, "strategy", NULL, MTX_DEF);
 }
 SYSINIT(strategy, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, strategy_init, NULL)
 
 /*
  * Just call the device strategy routine
  */
 static int
 spec_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp;
 	struct vnode *vp;
 	struct mount *mp;
 	int error;
 	struct cdevsw *dsw;
 	struct thread *td = curthread;
 	
 	/*
 	 * Slow down disk requests for niced processes.
 	 */
 	if (td && td->td_ksegrp->kg_nice > 0) {
 		mtx_lock(&strategy_mtx);
 		msleep(&strategy_mtx, &strategy_mtx,
 		    PPAUSE | PCATCH | PDROP, "ioslow",
 		    td->td_ksegrp->kg_nice);
 	}
 	bp = ap->a_bp;
 	vp = ap->a_vp;
 	if (bp->b_iocmd == BIO_WRITE) {
 		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
 		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
 		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			panic("spec_strategy: bad I/O");
 		bp->b_flags &= ~B_VALIDSUSPWRT;
 		if (LIST_FIRST(&bp->b_dep) != NULL)
 			buf_start(bp);
 		mp_fixme("This should require the vnode lock.");
 		if ((vp->v_vflag & VV_COPYONWRITE) &&
 		    vp->v_rdev->si_copyonwrite &&
 		    (error = (*vp->v_rdev->si_copyonwrite)(vp, bp)) != 0 &&
 		    error != EOPNOTSUPP) {
 			bp->b_io.bio_error = error;
 			bp->b_io.bio_flags |= BIO_ERROR;
 			biodone(&bp->b_io);
 			return (0);
 		}
 	}
 	/*
 	 * Collect statistics on synchronous and asynchronous read
 	 * and write counts for disks that have associated filesystems.
 	 */
 	if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
 		if (bp->b_iocmd == BIO_WRITE) {
 			if (bp->b_lock.lk_lockholder == LK_KERNPROC)
 				mp->mnt_stat.f_asyncwrites++;
 			else
 				mp->mnt_stat.f_syncwrites++;
 		} else {
 			if (bp->b_lock.lk_lockholder == LK_KERNPROC)
 				mp->mnt_stat.f_asyncreads++;
 			else
 				mp->mnt_stat.f_syncreads++;
 		}
 	}
 	if (devsw(bp->b_dev) == NULL) {
 		bp->b_io.bio_error = ENXIO;
 		bp->b_io.bio_flags |= BIO_ERROR;
 		biodone(&bp->b_io);
 		return (0);
 	}
 	dsw = devsw(bp->b_dev);
 	KASSERT(dsw->d_strategy != NULL,
 	   ("No strategy on dev %s responsible for buffer %p\n",
 	   devtoname(bp->b_dev), bp));
 	
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		DEV_STRATEGY(bp, 0);
 		PICKUP_GIANT();
 	} else
 		DEV_STRATEGY(bp, 0);
 		
 	return (0);
 }
 
 static int
 spec_freeblks(ap)
 	struct vop_freeblks_args /* {
 		struct vnode *a_vp;
 		daddr_t a_addr;
 		daddr_t a_length;
 	} */ *ap;
 {
 	struct cdevsw *bsw;
 	struct buf *bp;
 
 	/*
 	 * XXX: This assumes that strategy does the deed right away.
 	 * XXX: this may not be TRTTD.
 	 */
 	bsw = devsw(ap->a_vp->v_rdev);
 	if ((bsw->d_flags & D_CANFREE) == 0)
 		return (0);
 	bp = geteblk(ap->a_length);
 	bp->b_iocmd = BIO_DELETE;
 	bp->b_dev = ap->a_vp->v_rdev;
 	bp->b_blkno = ap->a_addr;
 	bp->b_offset = dbtob(ap->a_addr);
 	bp->b_bcount = ap->a_length;
 	BUF_KERNPROC(bp);
 	DEV_STRATEGY(bp, 0);
 	return (0);
 }
 
 /*
  * Implement degenerate case where the block requested is the block
  * returned, and assume that the entire device is contiguous in regards
  * to the contiguous block range (runp and runb).
  */
 static int
 spec_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct vnode **a_vpp;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int runp = 0;
 	int runb = 0;
 
 	if (ap->a_vpp != NULL)
 		*ap->a_vpp = vp;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn;
 	if (vp->v_mount != NULL)
 		runp = runb = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize;
 	if (ap->a_runp != NULL)
 		*ap->a_runp = runp;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = runb;
 	return (0);
 }
 
 /*
  * Device close routine
  */
 /* ARGSUSED */
 static int
 spec_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	dev_t dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int error;
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 
 	/*
 	 * This needs to be rewritten to take the vp interlock into
 	 * consideration.
 	 */
 
 	dsw = devsw(dev);
 	oldvp = NULL;
 	sx_xlock(&proctree_lock);
 	if (td && vp == td->td_proc->p_session->s_ttyvp) {
 		SESS_LOCK(td->td_proc->p_session);
 		VI_LOCK(vp);
 		if (vcount(vp) == 2 && (vp->v_iflag & VI_XLOCK) == 0) {
 			td->td_proc->p_session->s_ttyvp = NULL;
 			oldvp = vp;
 		}
 		VI_UNLOCK(vp);
 		SESS_UNLOCK(td->td_proc->p_session);
 	}
 	sx_xunlock(&proctree_lock);
 	if (oldvp != NULL)
 		vrele(oldvp);
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_XLOCK) {
 		/* Forced close. */
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (vcount(vp) > 1) {
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	VI_UNLOCK(vp);
 	if (dsw->d_flags & D_NOGIANT) {
 		DROP_GIANT();
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 	return (error);
 }
 
 /*
  * Print out the contents of a special device vnode.
  */
 static int
 spec_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 
 	printf("tag %s, dev %s\n", ap->a_vp->v_tag,
 	       devtoname(ap->a_vp->v_rdev));
 	return (0);
 }
 
 /*
  * Special device advisory byte-level locks.
  */
 /* ARGSUSED */
 static int
 spec_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 
 	return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL);
 }
 
 static void
 spec_getpages_iodone(bp)
 	struct buf *bp;
 {
 
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 }
 
 static int
 spec_getpages(ap)
 	struct vop_getpages_args *ap;
 {
 	vm_offset_t kva;
 	int error;
 	int i, pcount, size, s;
 	daddr_t blkno;
 	struct buf *bp;
 	vm_page_t m;
 	vm_ooffset_t offset;
 	int toff, nextoff, nread;
 	struct vnode *vp = ap->a_vp;
 	int blksiz;
 	int gotreqpage;
 
 	GIANT_REQUIRED;
 
 	error = 0;
 	pcount = round_page(ap->a_count) / PAGE_SIZE;
 
 	/*
 	 * Calculate the offset of the transfer and do a sanity check.
 	 * FreeBSD currently only supports an 8 TB range due to b_blkno
 	 * being in DEV_BSIZE ( usually 512 ) byte chunks on call to
 	 * VOP_STRATEGY.  XXX
 	 */
 	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;
 	blkno = btodb(offset);
 
 	/*
 	 * Round up physical size for real devices.  We cannot round using
 	 * v_mount's block size data because v_mount has nothing to do with
 	 * the device.  i.e. it's usually '/dev'.  We need the physical block
 	 * size for the device itself.
 	 *
 	 * We can't use v_rdev->si_mountpoint because it only exists when the
 	 * block device is mounted.  However, we can use v_rdev.
 	 */
 
 	if (vn_isdisk(vp, NULL))
 		blksiz = vp->v_rdev->si_bsize_phys;
 	else
 		blksiz = DEV_BSIZE;
 
 	size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
 
 	bp = getpbuf(NULL);
 	kva = (vm_offset_t)bp->b_data;
 
 	/*
 	 * Map the pages to be read into the kva.
 	 */
 	pmap_qenter(kva, ap->a_m, pcount);
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = spec_getpages_iodone;
 
 	/* B_PHYS is not set, but it is nice to fill this in. */
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	bp->b_blkno = blkno;
 	bp->b_lblkno = blkno;
 	pbgetvp(ap->a_vp, bp);
 	bp->b_bcount = size;
 	bp->b_bufsize = size;
 	bp->b_resid = 0;
 	bp->b_runningbufspace = bp->b_bufsize;
 	runningbufspace += bp->b_runningbufspace;
 
 	cnt.v_vnodein++;
 	cnt.v_vnodepgsin += pcount;
 
 	/* Do the input. */
 	BUF_STRATEGY(bp);
 
 	s = splbio();
 
 	/* We definitely need to be at splbio here. */
 	while ((bp->b_flags & B_DONE) == 0)
 		tsleep(bp, PVM, "spread", 0);
 
 	splx(s);
 
 	if ((bp->b_ioflags & BIO_ERROR) != 0) {
 		if (bp->b_error)
 			error = bp->b_error;
 		else
 			error = EIO;
 	}
 
 	nread = size - bp->b_resid;
 
 	if (nread < ap->a_count) {
 		bzero((caddr_t)kva + nread,
 			ap->a_count - nread);
 	}
 	pmap_qremove(kva, pcount);
 
 	gotreqpage = 0;
 	vm_page_lock_queues();
 	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
 		nextoff = toff + PAGE_SIZE;
 		m = ap->a_m[i];
 
 		m->flags &= ~PG_ZERO;
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 		} else if (toff < nread) {
 			/*
 			 * Since this is a VM request, we have to supply the
 			 * unaligned offset to allow vm_page_set_validclean()
 			 * to zero sub-DEV_BSIZE'd portions of the page.
 			 */
 			vm_page_set_validclean(m, 0, nread - toff);
 		} else {
 			m->valid = 0;
 			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
 			/*
 			 * Just in case someone was asking for this page we
 			 * now tell them that it is ok to use.
 			 */
 			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
 				if (m->valid) {
 					if (m->flags & PG_WANTED) {
 						vm_page_activate(m);
 					} else {
 						vm_page_deactivate(m);
 					}
 					vm_page_wakeup(m);
 				} else {
 					vm_page_free(m);
 				}
 			} else {
 				vm_page_free(m);
 			}
 		} else if (m->valid) {
 			gotreqpage = 1;
 			/*
 			 * Since this is a VM request, we need to make the
 			 * entire page presentable by zeroing invalid sections.
 			 */
 			if (m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, FALSE);
 		}
 	}
 	vm_page_unlock_queues();
 	if (!gotreqpage) {
 		m = ap->a_m[ap->a_reqpage];
 		printf(
 	    "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
 			devtoname(bp->b_dev), error, bp, bp->b_vp);
 		printf(
 	    "               size: %d, resid: %ld, a_count: %d, valid: 0x%x\n",
 		    size, bp->b_resid, ap->a_count, m->valid);
 		printf(
 	    "               nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
 		    nread, ap->a_reqpage, (u_long)m->pindex, pcount);
 		/*
 		 * Free the buffer header back to the swap buffer pool.
 		 */
 		relpbuf(bp, NULL);
 		return VM_PAGER_ERROR;
 	}
 	/*
 	 * Free the buffer header back to the swap buffer pool.
 	 */
 	relpbuf(bp, NULL);
 	return VM_PAGER_OK;
 }
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c	(revision 105901)
+++ head/sys/kern/vfs_subr.c	(revision 105902)
@@ -1,3774 +1,3774 @@
 /*
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  * $FreeBSD$
  */
 
 /*
  * External virtual filesystem routines
  */
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
 
 static void	addalias(struct vnode *vp, dev_t nvp_rdev);
 static void	insmntque(struct vnode *vp, struct mount *mp);
 static void	vclean(struct vnode *vp, int flags, struct thread *td);
 static void	vlruvp(struct vnode *vp);
 static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
 		    int slpflag, int slptimeo, int *errorp);
 static int	vcanrecycle(struct vnode *vp, struct mount **vnmpp);
 
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, never decreased.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[9] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT,
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * Minimum number of free vnodes.  If there are fewer than this free vnodes,
  * getnewvnode() will return a newly allocated vnode.
  */
 static u_long wantfreevnodes = 25;
 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 /* Number of vnodes in the free list. */
 static u_long freevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 static int nameileafonly;
 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
 #ifdef ENABLE_VFS_IOOPT
 /* See NOTES for a description of this setting. */
 int vfs_ioopt;
 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
 #endif
 
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  */
 int	nfs_mount_type = -1;
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /*
  * For any iteration/modification of dev->si_hlist (linked through
  * v_specnext)
  */
 static struct mtx spechash_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, writes to mounted block
  * devices are delayed only about half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, vnode);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	vp->v_synclist
  *	syncer_delayno
  *	syncer_workitem_pending
  *	rushjob
  */
 static struct mtx sync_mtx;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  (It is normally not used in
  * getnewvnode(), as wantfreevnodes is normally nonzero.)
  *
  * XXX desiredvnodes is historical cruft and should not exist.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
     &desiredvnodes, 0, "Maximum number of vnodes");
 static int minvnodes;
 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &minvnodes, 0, "Minimum number of vnodes");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
     "Number of times the vnlru process ran without success");
 
 /* Hook for calling soft updates */
 int (*softdep_process_worklist_hook)(struct mount *);
 
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  */
 #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 
 /* Print lock violations */
 int vfs_badlock_print = 1;
 
 /* Panic on violation */
 int vfs_badlock_panic = 1;
 
 /* Check for interlock across VOPs */
 int vfs_badlock_mutex = 1;
 
 /*
  * Common reporting path for the lock assertions below: optionally log
  * the violation, then optionally drop into the debugger.
  *
  *	msg	description of what is wrong with the lock state
  *	str	caller-supplied tag, printed first
  *	vp	vnode the violation was detected on
  */
 static void
 vfs_badlock(char *msg, char *str, struct vnode *vp)
 {
 	/* Log first so the text is on the console before any stop. */
 	if (vfs_badlock_print != 0) {
 		printf("%s: %p %s\n", str, vp, msg);
 	}
 	/* Note: this knob calls Debugger() rather than panic(). */
 	if (vfs_badlock_panic != 0) {
 		Debugger("Lock violation.\n");
 	}
 }
 
 /*
  * Complain if the current thread owns vp's interlock.  Does nothing
  * when interlock checking (vfs_badlock_mutex) is switched off.
  */
 void
 assert_vi_unlocked(struct vnode *vp, char *str)
 {
 	if (!vfs_badlock_mutex)
 		return;
 	if (mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain if the current thread does not own vp's interlock.  Does
  * nothing when interlock checking (vfs_badlock_mutex) is switched off.
  */
 void
 assert_vi_locked(struct vnode *vp, char *str)
 {
 	if (!vfs_badlock_mutex)
 		return;
 	if (!mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain if the vnode lock of vp is not held.  NULL vnodes and the
  * vnode types exempted by IGNORE_LOCK() are not checked.
  */
 void
 assert_vop_locked(struct vnode *vp, char *str)
 {
 	if (vp == NULL || IGNORE_LOCK(vp))
 		return;
 	/* Queried with a NULL thread, unlike the other assertions. */
 	if (VOP_ISLOCKED(vp, NULL) == 0)
 		vfs_badlock("is not locked but should be", str, vp);
 }
 
 void
 assert_vop_unlocked(struct vnode *vp, char *str)
 {
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 void
 assert_vop_elocked(struct vnode *vp, char *str)
 {
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 
 void
 assert_vop_elocked_other(struct vnode *vp, char *str)
 {
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 		vfs_badlock("is not exclusive locked by another thread",
 		    str, vp);
 }
 
 void
 assert_vop_slocked(struct vnode *vp, char *str)
 {
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 		vfs_badlock("is not locked shared but should be", str, vp);
 }
 
 /*
  * Lock-state checks run before VOP_RENAME.
  *
  *	ap	pointer to the struct vop_rename_args of the call
  *
  * No interlock may be held on any of the vnodes involved.  The source
  * directory and source vnode are expected to be unlocked (unless they
  * are the same vnode as their target counterpart), and the target
  * directory and, if present, the target vnode are expected to be
  * locked.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from) */
 	if (a->a_tdvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
 	/* The vnode tested here is fvp, so the message must name fvp. */
 	if (a->a_tvp != a->a_fvp)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked.\n");
 
 	/* Check the target */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
 
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
 }
 
 /*
  * Lock-state check run before VOP_STRATEGY: the buffer handed to the
  * strategy routine must be locked, except for cluster containers.
  */
 void
 vop_strategy_pre(void *ap)
 {
 	struct vop_strategy_args *a = ap;
 	struct buf *bp = a->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO
 	 * container, so there is nothing to verify for them.
 	 */
 	if (bp->b_flags & B_CLUSTER)
 		return;
 
 	/* A lock reference on the buffer means all is well. */
 	if (BUF_REFCNT(bp) >= 1)
 		return;
 
 	if (vfs_badlock_print)
 		printf("VOP_STRATEGY: bp is not locked but should be.\n");
 	if (vfs_badlock_panic)
 		Debugger("Lock violation.\n");
 }
 
 /*
  * Lock-state checks run before VOP_LOOKUP: the directory being
  * searched must be locked and its interlock must not be held.
  */
 void
 vop_lookup_pre(void *ap)
 {
 	struct vop_lookup_args *a = ap;
 	struct vnode *dir = a->a_dvp;
 
 	ASSERT_VI_UNLOCKED(dir, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dir, "VOP_LOOKUP");
 }
 
 /*
  * Lock-state checks run after VOP_LOOKUP.
  *
  *	ap	pointer to the struct vop_lookup_args of the call
  *	rc	return value of the lookup
  */
 void
 vop_lookup_post(void *ap, int rc)
 {
 	struct vop_lookup_args *a = ap;
 	struct vnode *parent = a->a_dvp;
 	struct vnode *child = *(a->a_vpp);
 	int cn_flags = a->a_cnp->cn_flags;
 
 	ASSERT_VI_UNLOCKED(parent, "VOP_LOOKUP");
 
 	/*
 	 * The directory has to stay locked when this was the last path
 	 * component and LOCKPARENT was requested, or when the lookup
 	 * failed.  Otherwise it must have been unlocked, unless the
 	 * lookup returned the directory itself.
 	 */
 	if ((cn_flags & LOCKPARENT) != 0 && (cn_flags & ISLASTCN) != 0) {
 		ASSERT_VOP_LOCKED(parent, "VOP_LOOKUP (LOCKPARENT)");
 	} else if (rc != 0) {
 		ASSERT_VOP_LOCKED(parent, "VOP_LOOKUP (error)");
 	} else if (parent != child) {
 		ASSERT_VOP_UNLOCKED(parent, "VOP_LOOKUP (dvp)");
 	}
 
 	/* PDIRUNLOCK advertises that the directory was unlocked. */
 	if ((cn_flags & PDIRUNLOCK) != 0) {
 		ASSERT_VOP_UNLOCKED(parent, "VOP_LOOKUP (PDIRUNLOCK)");
 	}
 }
 
 /*
  * Lock-state checks run before VOP_UNLOCK: the vnode must be locked,
  * and when LK_INTERLOCK is passed the interlock must be held too.
  */
 void
 vop_unlock_pre(void *ap)
 {
 	struct vop_unlock_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
 	}
 
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 }
 
 /*
  * Lock-state check run after VOP_UNLOCK: an interlock handed in with
  * LK_INTERLOCK must have been released.  The return code is not
  * examined.
  */
 void
 vop_unlock_post(void *ap, int rc)
 {
 	struct vop_unlock_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
 	}
 }
 
 /*
  * Lock-state check run before VOP_LOCK: the interlock must be held
  * exactly when the caller passes LK_INTERLOCK.
  */
 void
 vop_lock_pre(void *ap)
 {
 	struct vop_lock_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 	} else {
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	}
 }
 
 /*
  * Lock-state checks run after VOP_LOCK: the interlock must be
  * released and the vnode lock held.  The return code is not examined.
  */
 void
 vop_lock_post(void *ap, int rc)
 {
 	struct vop_lock_args *a = ap;
 
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 }
 
 void
 v_addpollinfo(struct vnode *vp)
 {
 	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
 	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 }
 
 /*
  * Initialize the vnode management data structures.
  */
 static void
 vntblinit(void *dummy __unused)
 {
 
 	/* Mutexes protecting the mount and vnode bookkeeping. */
 	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
 	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 
 	/* Vnode targets are derived from maxproc and the page count. */
 	desiredvnodes = maxproc + cnt.v_page_count / 4;
 	minvnodes = desiredvnodes / 4;
 
 	/* Free list and the allocation zones. */
 	TAILQ_INIT(&vnode_free_list);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	/*
 	 * Initialize the filesystem syncer.  The queue count is then
 	 * reset to match the mask reported back by hashinit().
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 	    &syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Interlock is not released on failure.
  */
 int
 vfs_busy(mp, flags, interlkp, td)
 	struct mount *mp;
 	int flags;
 	struct mtx *interlkp;
 	struct thread *td;
 {
 	int lkflags;
 
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT)
 			return (ENOENT);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
 		return (ENOENT);
 	}
 	lkflags = LK_SHARED | LK_NOPAUSE;
 	if (interlkp)
 		lkflags |= LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
 		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
  * Free a busy filesystem.
  */
 void
 vfs_unbusy(struct mount *mp, struct thread *td)
 {
 
 	/* Release the lock on mp->mnt_lock taken by vfs_busy(). */
 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 }
 
 /*
  * Lookup a mount point by filesystem identifier.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		/* Both words of the fsid have to match. */
 		if (mp->mnt_stat.f_fsid.val[0] != fsid->val[0])
 			continue;
 		if (mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* mp is NULL here when the walk ran off the end of the list. */
 	return (mp);
 }
 
 /*
  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  * will be used to create fake device numbers for stat().  Also try (but
  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.  Starting
  * the search one past where the previous search terminated is both a
  * micro-optimization and a defense against returning the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
 	fsid_t candidate;
 	int mtype;
 
 	mtx_lock(&mntid_mtx);
 	mtype = mp->mnt_vfc->vfc_typenum;
 	candidate.val[1] = mtype;
 	/* Low 8 bits of the type number go into bits 24-31 of the minor. */
 	mtype = (mtype & 0xFF) << 24;
 
 	/*
 	 * Keep advancing mntid_base until no mounted filesystem uses the
 	 * resulting fsid.  The high byte of the counter lands in bits
 	 * 16-23 and the low byte in bits 0-7.
 	 */
 	do {
 		candidate.val[0] = makeudev(255,
 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 	} while (vfs_getvfs(&candidate) != NULL);
 
 	mp->mnt_stat.f_fsid.val[0] = candidate.val[0];
 	mp->mnt_stat.f_fsid.val[1] = candidate.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Get a current timestamp.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 	int precision;
 
 	/* Sample the knob once so a single policy is applied. */
 	precision = timestamp_precision;
 
 	if (precision == TSP_SEC) {
 		/* Whole seconds only. */
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 	} else if (precision == TSP_HZ) {
 		getnanotime(tsp);
 	} else if (precision == TSP_USEC) {
 		/* Go through a timeval to drop sub-microsecond digits. */
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 	} else {
 		/* TSP_NSEC and any out-of-range setting. */
 		nanotime(tsp);
 	}
 }
 
 /*
  * Set vnode attributes to VNOVAL
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* The type and the vaflags word have their own "unset" values. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Every timestamp field. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Ownership, permissions and link count. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_flags = VNOVAL;
 
 	/* Identity. */
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_gen = VNOVAL;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * The buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
 vlrureclaim(struct mount *mp, int count)
 {
 	struct vnode *vp;
 	int reclaimed, trigger, usevnodes;
 
 	/*
 	 * Work out the resident-page threshold above which a vnode is
 	 * left alone.  Guard against a nonsensical desiredvnodes so the
 	 * division cannot blow up.  The aim is to free vnodes, not
 	 * memory, so vnodes with many resident pages are skipped.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes <= 0)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 
 	reclaimed = 0;
 	mtx_lock(&mntvnode_mtx);
 	for (; count != 0; --count) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		if (vp == NULL)
 			break;
 
 		/* Rotate the vnode to the tail so the scan moves on. */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 
 		if (vp->v_type == VNON || vp->v_type == VBAD)
 			continue;
 		if (!VI_TRYLOCK(vp))
 			continue;
 
 		/* Not a candidate, or too many resident pages. */
 		if (!(VMIGHTFREE(vp)) ||         /* critical path opt */
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count >= trigger)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 
 		/* The list mutex is dropped around the vgonel() call. */
 		mtx_unlock(&mntvnode_mtx);
 		vgonel(vp, curthread);
 		reclaimed++;
 		mtx_lock(&mntvnode_mtx);
 	}
 	mtx_unlock(&mntvnode_mtx);
 	return reclaimed;
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Main loop of the "vnlru" kernel process (started through vnlru_kp).
  * While the number of in-use vnodes (numvnodes - freevnodes) is above
  * 90% of desiredvnodes, walk the mount list and ask vlrureclaim() to
  * reclaim up to 10 vnodes per mount point; otherwise sleep on
  * vnlruproc until getnewvnode() issues a wakeup.  Never returns.
  */
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	int s;
 	int done;
 	struct proc *p = vnlruproc;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
 
 	mtx_lock(&Giant);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
 	s = splbio();
 	for (;;) {
 		kthread_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
 		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
 			mtx_unlock(&vnode_free_list_mtx);
 			/* Re-arm the wakeup in getnewvnode(), then wait. */
 			vnlruproc_sig = 0;
 			tsleep(vnlruproc, PVFS, "vlruwt", 0);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			/*
 			 * On failure vfs_busy() leaves mountlist_mtx held
 			 * (see its header comment), so the list can be
 			 * walked directly.
 			 */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			/*
 			 * NOTE(review): on success the interlock appears to
 			 * have been released through LK_INTERLOCK, which is
 			 * why it is re-taken below before touching the list
 			 * again -- confirm against lockmgr().
 			 */
 			done += vlrureclaim(mp, 10);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
 				printf("vnlru process getting nowhere..\n");
 			else if (vnlru_nowhere == 5)
 				printf("vnlru process messages stopped.\n");
 #endif
 			/* Nothing reclaimed: count it and back off for 3s. */
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		}
 	}
 	/* Not reached: the loop above has no exit. */
 	splx(s);
 }
 
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Check to see if a free vnode can be recycled. If it can,
  * return it locked with the vn lock, but not interlock. Also
  * get the vn_start_write lock. Otherwise indicate the error.
  */
 /*
  *	vp	candidate vnode taken from the free list
  *	vnmpp	receives the mount point passed to vn_start_write()
  *
  * Returns 0 with the vnode lock held and the write reference from
  * vn_start_write() outstanding; the caller must eventually call
  * vn_finished_write(*vnmpp).  On any error return the vnode lock has
  * been dropped and no write reference is left outstanding.
  */
 static int
 vcanrecycle(struct vnode *vp, struct mount **vnmpp)
 {
 	struct thread *td = curthread;
 	vm_object_t object;
 	int error;
 
 	/* Don't recycle if we can't get the interlock */
 	if (!VI_TRYLOCK(vp))
 		return (EWOULDBLOCK);
 
 	/* We should be able to immediately acquire this */
 	/* XXX This looks like it should panic if it fails */
 	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) {
 		if (VOP_ISLOCKED(vp, td))
 			panic("vcanrecycle: locked vnode");
 		return (EWOULDBLOCK);
 	}
 
 	/*
 	 * Don't recycle if its filesystem is being suspended.  No write
 	 * reference was obtained in this case, so only the vnode lock
 	 * has to be dropped.
 	 */
 	if (vn_start_write(vp, vnmpp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0, td);
 		return (EBUSY);
 	}
 
 	/*
 	 * Don't recycle if we still have cached pages.
 	 */
 	if (VOP_GETVOBJECT(vp, &object) == 0 &&
 	     (object->resident_page_count ||
 	      object->ref_count)) {
 		error = EBUSY;
 		goto done;
 	}
 	if (LIST_FIRST(&vp->v_cache_src)) {
 		/*
 		 * note: nameileafonly sysctl is temporary,
 		 * for debugging only, and will eventually be
 		 * removed.
 		 */
 		if (nameileafonly > 0) {
 			/*
 			 * Do not reuse namei-cached directory
 			 * vnodes that have cached
 			 * subdirectories.
 			 */
 			if (cache_leaf_test(vp) < 0) {
 				error = EISDIR;
 				goto done;
 			}
 		} else if (nameileafonly < 0 ||
 			    vmiodirenable == 0) {
 			/*
 			 * Do not reuse namei-cached directory
 			 * vnodes if nameileafonly is -1 or
 			 * if VMIO backing for directories is
 			 * turned off (otherwise we reuse them
 			 * too quickly).
 			 */
 			error = EBUSY;
 			goto done;
 		}
 	}
 	return (0);
 done:
 	/*
 	 * Every jump to this label happens after vn_start_write()
 	 * succeeded, so the write reference must be given back here;
 	 * the caller only calls vn_finished_write() on success.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(*vnmpp);
 	return (error);
 }
 
 /*
  * Return the next vnode from the free list.
  */
 /*
  *	tag	stored in v_tag and used as the vnode lock name
  *	mp	mount point whose vnode list the vnode is put on (may be
  *		NULL, see insmntque())
  *	vops	operations vector stored in v_op
  *	vpp	receives the vnode
  *
  * Either recycles a vnode from the free list or allocates a new one
  * from vnode_zone.  The vnode is handed back with v_usecount 1 and
  * v_type VNON.  Always returns 0.
  */
 int
 getnewvnode(tag, mp, vops, vpp)
 	const char *tag;
 	struct mount *mp;
 	vop_t **vops;
 	struct vnode **vpp;
 {
 	int s;
 	struct thread *td = curthread;	/* XXX */
 	struct vnode *vp = NULL;
 	struct mount *vnmp;
 
 	/* NOTE(review): no matching splx(s) is visible in this function. */
 	s = splbio();
 	mtx_lock(&vnode_free_list_mtx);
 
 	/*
 	 * Try to reuse vnodes if we hit the max.  This situation only
 	 * occurs in certain large-memory (2G+) situations.  We cannot
 	 * attempt to directly reclaim vnodes due to nasty recursion
 	 * problems.
 	 */
 	if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
 		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
 		wakeup(vnlruproc);
 	}
 
 	/*
 	 * Attempt to reuse a vnode already on the free list, allocating
 	 * a new vnode if we can't find one or if we have not reached a
 	 * good minimum for good LRU performance.
 	 */
 
 	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
 		int error;
 		int count;
 
 		/* Examine at most freevnodes candidates from the head. */
 		for (count = 0; count < freevnodes; count++) {
 			vp = TAILQ_FIRST(&vnode_free_list);
 
 			KASSERT(vp->v_usecount == 0, 
 			    ("getnewvnode: free vnode isn't"));
 
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 			/*
 			 * We have to drop the free list mtx to avoid lock
 			 * order reversals with interlock.
 			 */
 			mtx_unlock(&vnode_free_list_mtx);
 			error = vcanrecycle(vp, &vnmp);
 			mtx_lock(&vnode_free_list_mtx);
 			if (error == 0)
 				break;
 			/* Not recyclable now: requeue at the tail. */
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			vp = NULL;
 		}
 	}
 	if (vp) {
 		/*
 		 * Recycle path: vcanrecycle() returned the vnode locked
 		 * and with a write reference on vnmp outstanding.
 		 */
 		freevnodes--;
 		mtx_unlock(&vnode_free_list_mtx);
 
 		cache_purge(vp);
 		VI_LOCK(vp);
 		vp->v_iflag |= VI_DOOMED;
 		vp->v_iflag &= ~VI_FREE;
 		if (vp->v_type != VBAD) {
 			VOP_UNLOCK(vp, 0, td);
 			/*
 			 * The interlock is re-taken right after vgonel(),
 			 * which implies vgonel() returns with it released.
 			 */
 			vgonel(vp, td);
 			VI_LOCK(vp);
 		} else {
 			VOP_UNLOCK(vp, 0, td);
 		}
 		vn_finished_write(vnmp);
 
 #ifdef INVARIANTS
 		{
 			if (vp->v_data)
 				panic("cleaned vnode isn't");
 			if (vp->v_numoutput)
 				panic("Clean vnode has pending I/O's");
 			if (vp->v_writecount != 0)
 				panic("Non-zero write count");
 		}
 #endif
 		if (vp->v_pollinfo) {
 			mtx_destroy(&vp->v_pollinfo->vpi_lock);
 			uma_zfree(vnodepoll_zone, vp->v_pollinfo);
 		}
 		vp->v_pollinfo = NULL;
 #ifdef MAC
 		mac_destroy_vnode(vp);
 #endif
 		/* Reset the per-use state left over from the old identity. */
 		vp->v_iflag = 0;
 		vp->v_vflag = 0;
 		vp->v_lastw = 0;
 		vp->v_lasta = 0;
 		vp->v_cstart = 0;
 		vp->v_clen = 0;
 		vp->v_socket = 0;
 		/* Re-create the vnode lock under the new tag. */
 		lockdestroy(vp->v_vnlock);
 		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
 		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
 		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
 	} else {
 		/* Allocation path: a zeroed vnode from vnode_zone. */
 		numvnodes++;
 		mtx_unlock(&vnode_free_list_mtx);
 
 		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 		VI_LOCK(vp);
 		vp->v_dd = vp;
 		vp->v_vnlock = &vp->v_lock;
 		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
 		cache_purge(vp);
 		LIST_INIT(&vp->v_cache_src);
 		TAILQ_INIT(&vp->v_cache_dst);
 	}
 
 	/* Common initialization; both paths arrive with the interlock held. */
 	TAILQ_INIT(&vp->v_cleanblkhd);
 	TAILQ_INIT(&vp->v_dirtyblkhd);
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	*vpp = vp;
 	vp->v_usecount = 1;
 	vp->v_data = 0;
 	vp->v_cachedid = -1;
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_init_vnode(vp);
 #endif
 	insmntque(vp, mp);
 
 	return (0);
 }
 
 /*
  * Move a vnode from one mount queue to another.
  */
 static void
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	mtx_lock(&mntvnode_mtx);
 
 	/* Unhook the vnode from the list of its current mount, if any. */
 	if (vp->v_mount != NULL)
 		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
 
 	/*
 	 * Record the new mount point; a NULL mp leaves the vnode on no
 	 * list at all.
 	 */
 	vp->v_mount = mp;
 	if (mp != NULL)
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 
 	mtx_unlock(&mntvnode_mtx);
 }
 
 /*
  * Update outstanding I/O count and do wakeup if requested.
  */
 void
 vwakeup(bp)
 	register struct buf *bp;
 {
 	register struct vnode *vp;
 
 	bp->b_flags &= ~B_WRITEINPROG;
 	if ((vp = bp->b_vp)) {
 		VI_LOCK(vp);
 		vp->v_numoutput--;
 		if (vp->v_numoutput < 0)
 			panic("vwakeup: neg numoutput");
 		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
 			vp->v_iflag &= ~VI_BWAIT;
 			wakeup(&vp->v_numoutput);
 		}
 		VI_UNLOCK(vp);
 	}
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  */
 /*
  *	vp		vnode whose buffers are flushed; must be locked
  *	flags		V_SAVE writes dirty data out first; V_NORMAL and
  *			V_ALT restrict which buffers flushbuflist() touches
  *	cred, td	passed on to VOP_FSYNC() when V_SAVE is set
  *	slpflag,
  *	slptimeo	sleep priority flags and timeout for the waits
  *
  * Returns 0 on success, or the error from an interrupted sleep, from
  * VOP_FSYNC() or from flushbuflist().
  */
 int
 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
 	struct vnode *vp;
 	int flags;
 	struct ucred *cred;
 	struct thread *td;
 	int slpflag, slptimeo;
 {
 	struct buf *blist;
 	int s, error;
 	vm_object_t object;
 
 	GIANT_REQUIRED;
 
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 
 	VI_LOCK(vp);
 	if (flags & V_SAVE) {
 		s = splbio();
 		/* Let writes already in flight drain first. */
 		while (vp->v_numoutput) {
 			vp->v_iflag |= VI_BWAIT;
 			error = msleep(&vp->v_numoutput, VI_MTX(vp),
 			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
 			if (error) {
 				VI_UNLOCK(vp);
 				splx(s);
 				return (error);
 			}
 		}
 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 			splx(s);
 			/* The interlock is not held across VOP_FSYNC(). */
 			VI_UNLOCK(vp);
 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			VI_LOCK(vp);
 			s = splbio();
 			if (vp->v_numoutput > 0 ||
 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
 				panic("vinvalbuf: dirty bufs");
 		}
 		splx(s);
 	}
 	s = splbio();
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	for (error = 0;;) {
 		/*
 		 * Restart from the clean list whenever flushbuflist()
 		 * reports that it found buffers; stop once both lists
 		 * yield nothing, or on error.
 		 */
 		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
 			if (error)
 				break;
 			continue;
 		}
 		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
 		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
 			if (error)
 				break;
 			continue;
 		}
 		break;
 	}
 	if (error) {
 		splx(s);
 		VI_UNLOCK(vp);
 		return (error);
 	}
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		while (vp->v_numoutput > 0) {
 			vp->v_iflag |= VI_BWAIT;
 			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
 		}
 		VI_UNLOCK(vp);
 		if (VOP_GETVOBJECT(vp, &object) == 0) {
 			/* The sleep call below is the body of this while. */
 			while (object->paging_in_progress)
 			vm_object_pip_sleep(object, "vnvlbx");
 		}
 		VI_LOCK(vp);
 	} while (vp->v_numoutput > 0);
 	VI_UNLOCK(vp);
 
 	splx(s);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (VOP_GETVOBJECT(vp, &object) == 0) {
 		vm_object_page_remove(object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 	}
 
 #ifdef INVARIANTS
 	/* Without V_ALT/V_NORMAL both buffer lists must now be empty. */
 	VI_LOCK(vp);
 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
 	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
 		panic("vinvalbuf: flush failed");
 	VI_UNLOCK(vp);
 #endif
 	return (0);
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * Walks the vnode buffer list starting at blist.  Buffers excluded by the
  * V_NORMAL/V_ALT selection in flags are skipped; every other buffer that
  * is visited is counted.  A delayed-write buffer that is not invalid is
  * written out when V_SAVE is set; any other buffer is invalidated and
  * released.
  *
  * The vnode interlock must be held on entry (asserted) and is held again
  * on return, but it is dropped while each buffer is examined.
  *
  * The scan stops early when a buffer was busy and had to be waited for,
  * or when a write was started.  If the wait for a busy buffer returns
  * anything other than ENOLCK, that value is stored in *errorp.
  *
  * Returns the number of buffers counted.
  */
 static int
 flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
 	struct buf *blist;
 	int flags;
 	struct vnode *vp;
 	int slpflag, slptimeo;
 	int *errorp;
 {
 	struct buf *bp, *nbp;
 	int found, error;
 
 	ASSERT_VI_LOCKED(vp, "flushbuflist");
 
 	for (found = 0, bp = blist; bp; bp = nbp) {
 		/* Fetch the successor while the interlock is still held. */
 		nbp = TAILQ_NEXT(bp, b_vnbufs);
 		VI_UNLOCK(vp);
 		/*
 		 * V_NORMAL skips buffers flagged BX_ALTDATA, V_ALT skips
 		 * buffers that are not.
 		 */
 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 			VI_LOCK(vp);
 			continue;
 		}
 		found += 1;
 		/*
 		 * The buffer is busy: wait for it using the caller's sleep
 		 * flag and timeout, then leave so the caller can rescan.
 		 * NOTE(review): with LK_SLEEPFAIL the lock is presumably
 		 * never left held here -- confirm against lockmgr.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL,
 			    "flushbuf", slpflag, slptimeo);
 			if (error != ENOLCK)
 				*errorp = error;
 			goto done;
 		}
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.  Note that vfs_bio_awrite expects
 		 * buffers to reside on a queue, while BUF_WRITE and
 		 * brelse do not.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 			(flags & V_SAVE)) {
 
 			if (bp->b_vp == vp) {
 				if (bp->b_flags & B_CLUSTEROK) {
 					BUF_UNLOCK(bp);
 					vfs_bio_awrite(bp);
 				} else {
 					bremfree(bp);
 					bp->b_flags |= B_ASYNC;
 					BUF_WRITE(bp);
 				}
 			} else {
 				/* Buffer belongs to another vnode. */
 				bremfree(bp);
 				(void) BUF_WRITE(bp);
 			}
 			goto done;
 		}
 		/* Nothing worth saving: invalidate and release the buffer. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		VI_LOCK(vp);
 	}
 	return (found);
 done:
 	/* Early exit: retake the interlock, which was dropped in the loop. */
 	VI_LOCK(vp);
 	return (found);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  */
 int
 vtruncbuf(vp, cred, td, length, blksize)
 	register struct vnode *vp;
 	struct ucred *cred;
 	struct thread *td;
 	off_t length;
 	int blksize;
 {
 	register struct buf *bp;
 	struct buf *nbp;
 	int s, anyfreed;
 	int trunclbn;
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	s = splbio();
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 restart:
 	VI_LOCK(vp);
 	anyfreed = 1;
 	for (;anyfreed;) {
 		anyfreed = 0;
 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			VI_UNLOCK(vp);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					bp->b_flags |= (B_INVAL | B_RELBUF);
 					bp->b_flags &= ~B_ASYNC;
 					brelse(bp);
 					anyfreed = 1;
 				}
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI))) {
 					goto restart;
 				}
 			}
 			VI_LOCK(vp);
 		}
 
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			VI_UNLOCK(vp);
 			if (bp->b_lblkno >= trunclbn) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					bp->b_flags |= (B_INVAL | B_RELBUF);
 					bp->b_flags &= ~B_ASYNC;
 					brelse(bp);
 					anyfreed = 1;
 				}
 				if (nbp &&
 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 				    (nbp->b_vp != vp) ||
 				    (nbp->b_flags & B_DELWRI) == 0)) {
 					goto restart;
 				}
 			}
 			VI_LOCK(vp);
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 			VI_UNLOCK(vp);
 			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 					goto restart;
 				} else {
 					bremfree(bp);
 					if (bp->b_vp == vp) {
 						bp->b_flags |= B_ASYNC;
 					} else {
 						bp->b_flags &= ~B_ASYNC;
 					}
 					BUF_WRITE(bp);
 				}
 				VI_LOCK(vp);
 				goto restartsync;
 			}
 			VI_LOCK(vp);
 		}
 	}
 	
 	while (vp->v_numoutput > 0) {
 		vp->v_iflag |= VI_BWAIT;
 		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
 	}
 	VI_UNLOCK(vp);
 	splx(s);
 
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * buf_splay() - splay tree core for the clean/dirty list of buffers in
  * 		 a vnode.
  *
  *	NOTE: We have to deal with the special case of a background bitmap
  *	buffer, a situation where two buffers will have the same logical
  *	block offset.  We want (1) only the foreground buffer to be accessed
  *	in a lookup and (2) must differentiate between the foreground and
  *	background buffer in the splay tree algorithm because the splay
  *	tree cannot normally handle multiple entities with the same 'index'.
  *	We accomplish this by adding differentiating flags to the splay tree's
  *	numerical domain.
  *
  *	The key is the pair (lblkno, xflags & BX_BKGRDMARKER).  Returns NULL
  *	when root is NULL; otherwise returns the new root of the reorganized
  *	tree, which is either the node matching the key or the node at which
  *	the search stopped.  Callers must store the returned root back.
  */
 static
 struct buf *
 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 {
 	/*
 	 * dummy is a scratch header: dummy.b_right collects the tree of
 	 * nodes smaller than the key and dummy.b_left the tree of nodes
 	 * larger than the key.
 	 */
 	struct buf dummy;
 	struct buf *lefttreemax, *righttreemin, *y;
 
 	if (root == NULL)
 		return (NULL);
 	lefttreemax = righttreemin = &dummy;
 	for (;;) {
 		if (lblkno < root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			/* Key sorts before root: descend to the left. */
 			if ((y = root->b_left) == NULL)
 				break;
 			if (lblkno < y->b_lblkno) {
 				/* Rotate right. */
 				root->b_left = y->b_right;
 				y->b_right = root;
 				root = y;
 				if ((y = root->b_left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->b_left = root;
 			righttreemin = root;
 		} else if (lblkno > root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 			/* Key sorts after root: descend to the right. */
 			if ((y = root->b_right) == NULL)
 				break;
 			if (lblkno > y->b_lblkno) {
 				/* Rotate left. */
 				root->b_right = y->b_left;
 				y->b_left = root;
 				root = y;
 				if ((y = root->b_right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->b_right = root;
 			lefttreemax = root;
 		} else {
 			/* Exact match on both block number and marker. */
 			break;
 		}
 		root = y;
 	}
 	/* Assemble the new root. */
 	lefttreemax->b_right = root->b_left;
 	righttreemin->b_left = root->b_right;
 	root->b_left = dummy.b_right;
 	root->b_right = dummy.b_left;
 	return (root);
 }
 
 /*
  * Remove bp from the clean or dirty list of its vnode (bp->b_vp), both
  * from the splay tree and from the tail queue, and clear the BX_VNDIRTY
  * and BX_VNCLEAN flags.  BX_VNDIRTY selects the dirty list; anything else
  * is treated as being on the clean list.  The vnode interlock must be
  * held (asserted).
  */
 static
 void
 buf_vlist_remove(struct buf *bp)
 {
 	struct vnode *vp = bp->b_vp;
 	struct buf *root;
 
 	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
 	if (bp->b_xflags & BX_VNDIRTY) {
 		/* Bring bp to the root of the dirty tree if it is not there. */
 		if (bp != vp->v_dirtyblkroot) {
 			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
 			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
 		}
 		/*
 		 * Join the two subtrees: splaying the left subtree with
 		 * bp's own key yields a root without a right child, which
 		 * then adopts bp's right subtree.
 		 */
 		if (bp->b_left == NULL) {
 			root = bp->b_right;
 		} else {
 			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 			root->b_right = bp->b_right;
 		}
 		vp->v_dirtyblkroot = root;
 		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
 	} else {
 		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
 		/* Same procedure as above, applied to the clean tree. */
 		if (bp != vp->v_cleanblkroot) {
 			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
 			KASSERT(root == bp, ("splay lookup failed during clean remove"));
 		}
 		if (bp->b_left == NULL) {
 			root = bp->b_right;
 		} else {
 			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 			root->b_right = bp->b_right;
 		}
 		vp->v_cleanblkroot = root;
 		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
 	}
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list using a
  * splay tree algorithm.
  *
  * xflags is OR-ed into bp->b_xflags; BX_VNDIRTY selects the dirty list,
  * anything else the clean list.  The tree is splayed around bp's key and
  * bp becomes the new root, with the old root as its left or right child.
  * bp is placed in the tail queue next to the old root so that the queue
  * stays in key order.  The vnode interlock must be held (asserted).
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static 
 void
 buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
 {
 	struct buf *root;
 
 	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY) {
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
 		if (root == NULL) {
 			/* Empty tree: bp is the only node. */
 			bp->b_left = NULL;
 			bp->b_right = NULL;
 			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
 		} else if (bp->b_lblkno < root->b_lblkno ||
 		    (bp->b_lblkno == root->b_lblkno &&
 		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			/* bp sorts before the old root. */
 			bp->b_left = root->b_left;
 			bp->b_right = root;
 			root->b_left = NULL;
 			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
 		} else {
 			/* bp sorts after (or equal to) the old root. */
 			bp->b_right = root->b_right;
 			bp->b_left = root;
 			root->b_right = NULL;
 			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd, 
 			    root, bp, b_vnbufs);
 		}
 		vp->v_dirtyblkroot = bp;
 	} else {
 		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
 		/* Same procedure as above, applied to the clean list. */
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
 		if (root == NULL) {
 			bp->b_left = NULL;
 			bp->b_right = NULL;
 			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 		} else if (bp->b_lblkno < root->b_lblkno ||
 		    (bp->b_lblkno == root->b_lblkno &&
 		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			bp->b_left = root->b_left;
 			bp->b_right = root;
 			root->b_left = NULL;
 			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
 		} else {
 			bp->b_right = root->b_right;
 			bp->b_left = root;
 			root->b_right = NULL;
 			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd, 
 			    root, bp, b_vnbufs);
 		}
 		vp->v_cleanblkroot = bp;
 	}
 }
 
 #ifndef USE_BUFHASH
 
 /*
  * Find the buffer for logical block lblkno of vp by splaying the clean
  * tree and then, if that fails, the dirty tree.  Buffers carrying
  * BX_BKGRDMARKER (shadow buffers used in background bitmap writes) are
  * never returned.  Both tree roots are updated as a side effect of the
  * lookup.
  *
  * Giant and the vnode interlock must be held.  Returns the buffer, or
  * NULL if neither tree holds a foreground buffer for the block.
  *
  * This code isn't quite as efficient as it could be because we maintain
  * two sorted lists and do not know which list the block resides in.
  */
 struct buf *
 gbincore(struct vnode *vp, daddr_t lblkno)
 {
 	struct buf *found;
 
 	GIANT_REQUIRED;
 	ASSERT_VI_LOCKED(vp, "gbincore");
 
 	/* Try the clean tree; the splay leaves the closest node at the root. */
 	found = buf_splay(lblkno, 0, vp->v_cleanblkroot);
 	vp->v_cleanblkroot = found;
 	if (found != NULL && found->b_lblkno == lblkno &&
 	    (found->b_xflags & BX_BKGRDMARKER) == 0)
 		return (found);
 
 	/* Not there: repeat the lookup on the dirty tree. */
 	found = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
 	vp->v_dirtyblkroot = found;
 	if (found != NULL && found->b_lblkno == lblkno &&
 	    (found->b_xflags & BX_BKGRDMARKER) == 0)
 		return (found);
 
 	return (NULL);
 }
 
 #endif
 
 /*
  * Attach buffer bp to vnode vp.
  *
  * The buffer must not belong to a vnode yet and must not be on a clean
  * or dirty list (both asserted).  A hold is taken on the vnode, the
  * buffer's vnode and device fields are filled in, and the buffer is put
  * on the vnode's clean list.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	int ipl;
 
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	VI_LOCK(vp);
 
 	/* The buffer keeps the vnode held for as long as it is attached. */
 	vholdl(vp);
 	bp->b_vp = vp;
 	bp->b_dev = vn_todev(vp);
 
 	/* A freshly attached buffer starts out on the clean list. */
 	ipl = splbio();
 	buf_vlist_add(bp, vp, BX_VNCLEAN);
 	splx(ipl);
 
 	VI_UNLOCK(vp);
 }
 
 /*
  * Detach buffer bp from the vnode it belongs to.
  *
  * The buffer is taken off the vnode's clean or dirty list if it is on
  * one.  If the vnode is on the syncer worklist and has no dirty buffers
  * left, it is removed from the worklist.  The hold on the vnode is
  * dropped and the buffer's vnode and object pointers are cleared.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct vnode *oldvp;
 	int ipl;
 
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	oldvp = bp->b_vp;
 	ipl = splbio();
 	VI_LOCK(oldvp);
 
 	/* Delete from the old vnode list, if on one. */
 	if ((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != 0)
 		buf_vlist_remove(bp);
 
 	/* Nothing dirty remains: the syncer no longer needs this vnode. */
 	if ((oldvp->v_iflag & VI_ONWORKLST) != 0 &&
 	    TAILQ_EMPTY(&oldvp->v_dirtyblkhd)) {
 		oldvp->v_iflag &= ~VI_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(oldvp, v_synclist);
 		mtx_unlock(&sync_mtx);
 	}
 
 	vdropl(oldvp);
 	VI_UNLOCK(oldvp);
 
 	bp->b_vp = NULL;
 	if (bp->b_object != NULL)
 		bp->b_object = NULL;
 	splx(ipl);
 }
 
 /*
  * Queue vnode vp on the syncer work queue, delay slots after the slot
  * the syncer will process next.
  *
  * A vnode that is already queued is moved to the new slot; otherwise it
  * is flagged VI_ONWORKLST.  The delay is clamped to syncer_maxdelay - 2.
  * The vnode interlock must be held (asserted).
  */
 static void
 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 {
 	int ipl, bucket;
 
 	ipl = splbio();
 	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
 
 	mtx_lock(&sync_mtx);
 
 	/* Unlink from the current slot, or mark as queued for the first time. */
 	if ((vp->v_iflag & VI_ONWORKLST) == 0) {
 		vp->v_iflag |= VI_ONWORKLST;
 	} else {
 		LIST_REMOVE(vp, v_synclist);
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	bucket = (syncer_delayno + delay) & syncer_mask;
 	LIST_INSERT_HEAD(&syncer_workitem_pending[bucket], vp, v_synclist);
 
 	mtx_unlock(&sync_mtx);
 
 	splx(ipl);
 }
 
 /*
  * The syncer kernel process: updateproc receives the process pointer,
  * and the SYSINIT below hands up_kp to kproc_start() so that sched_sync()
  * is started under the name "syncer".
  */
 struct  proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Runs forever with Giant held.  Each pass takes the work-queue slot at
  * syncer_delayno, advances syncer_delayno (wrapping at syncer_maxdelay),
  * lazily fsyncs every vnode queued in that slot, runs the soft updates
  * hook if one is installed, and then either continues immediately (while
  * rushjob is positive) or sleeps on lbolt if the pass took less than a
  * second.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *slp;
 	struct vnode *vp;
 	struct mount *mp;
 	long starttime;
 	int s;
 	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */
 
 	mtx_lock(&Giant);
 
 	/* Arrange for kproc_shutdown to be called at shutdown_pre_sync. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	for (;;) {
 		kthread_suspend_check(td->td_proc);
 
 		starttime = time_second;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 */
 		s = splbio();
 		mtx_lock(&sync_mtx);
 		slp = &syncer_workitem_pending[syncer_delayno];
 		syncer_delayno += 1;
 		if (syncer_delayno == syncer_maxdelay)
 			syncer_delayno = 0;
 		splx(s);
 
 		/* sync_mtx is held at the top of every iteration. */
 		while ((vp = LIST_FIRST(slp)) != NULL) {
 			mtx_unlock(&sync_mtx);
 			/*
 			 * Only sync the vnode if it is not locked and a
 			 * write can be started without waiting.
 			 */
 			if (VOP_ISLOCKED(vp, NULL) == 0 &&
 			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
 				VOP_UNLOCK(vp, 0, td);
 				vn_finished_write(mp);
 			}
 			s = splbio();
 			mtx_lock(&sync_mtx);
 			/*
 			 * The vnode is still at the head of this slot, so
 			 * it was not moved or removed above.
 			 */
 			if (LIST_FIRST(slp) == vp) {
 				mtx_unlock(&sync_mtx);
 				/*
 				 * Note: VFS vnodes can remain on the
 				 * worklist too with no dirty blocks, but
 				 * since sync_fsync() moves it to a different
 				 * slot we are safe.
 				 */
 				VI_LOCK(vp);
 				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
 				    !vn_isdisk(vp, NULL)) {
 					panic("sched_sync: fsync failed "
 					      "vp %p tag %s", vp, vp->v_tag);
 				}
 				/*
 				 * Put us back on the worklist.  The worklist
 				 * routine will remove us from our current
 				 * position and then add us back in at a later
 				 * position.
 				 */
 				vn_syncer_add_to_worklist(vp, syncdelay);
 				VI_UNLOCK(vp);
 				mtx_lock(&sync_mtx);
 			}
 			splx(s);
 		}
 		mtx_unlock(&sync_mtx);
 
 		/*
 		 * Do soft update processing.
 		 */
 		if (softdep_process_worklist_hook != NULL)
 			(*softdep_process_worklist_hook)(NULL);
 
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		mtx_lock(&sync_mtx);
 		if (rushjob > 0) {
 			rushjob -= 1;
 			mtx_unlock(&sync_mtx);
 			continue;
 		}
 		mtx_unlock(&sync_mtx);
 		/*
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (time_second == starttime)
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Ask the syncer daemon to hurry up.
  *
  * If the syncer's first thread is sleeping on lbolt it is made runnable
  * right away.  The rushjob counter is then raised by one, unless it has
  * already reached half of syncdelay: we never push the syncer to more
  * than half of its normal turn time, otherwise it could take over the
  * cpu.
  *
  * Returns 1 if rushjob was raised, 0 otherwise.
  * XXXKSE  only one update?
  */
 int
 speedup_syncer(void)
 {
 	struct thread *syncer_td;
 	int raised;
 
 	syncer_td = FIRST_THREAD_IN_PROC(updateproc);
 
 	/* Cut the syncer's pacing sleep short. */
 	mtx_lock_spin(&sched_lock);
 	if (syncer_td->td_wchan == &lbolt) {
 		unsleep(syncer_td);
 		TD_CLR_SLEEPING(syncer_td);
 		setrunnable(syncer_td);
 	}
 	mtx_unlock_spin(&sched_lock);
 
 	/* Record the request, subject to the half-syncdelay ceiling. */
 	raised = 0;
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob++;
 		stat_rush_requests++;
 		raised = 1;
 	}
 	mtx_unlock(&sync_mtx);
 
 	return (raised);
 }
 
 /*
  * Attach a p-buffer to a vnode.
  *
  * Only the buffer's vnode and device fields are set.  B_PAGING is set to
  * record that the vnode is not fully associated with the buffer, i.e.
  * the bp has not been linked into the vnode or ref-counted.  The buffer
  * must not already belong to a vnode (asserted).
  */
 void
 pbgetvp(struct vnode *vp, struct buf *bp)
 {
 
 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
 
 	bp->b_flags |= B_PAGING;
 	bp->b_vp = vp;
 	bp->b_dev = vn_todev(vp);
 }
 
 /*
  * Detach a p-buffer from its vnode: clear the vnode pointer and the
  * B_PAGING flag.  Panics if the buffer has a successor on a vnode buffer
  * list, which a p-buffer should never have.
  */
 void
 pbrelvp(struct buf *bp)
 {
 	struct vnode *vp;
 
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 
 	vp = bp->b_vp;
 
 	/* XXX REMOVE ME */
 	VI_LOCK(vp);
 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL)
 		panic("relpbuf(): b_vp was probably reassignbuf()d %p %x",
 		    bp, (int)bp->b_flags);
 	VI_UNLOCK(vp);
 
 	bp->b_flags &= ~B_PAGING;
 	bp->b_vp = NULL;
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  *
  * The buffer is removed from the list of its current vnode, if it is on
  * one, and inserted on newvp's dirty list when B_DELWRI is set or on its
  * clean list otherwise.  newvp is put on the syncer worklist when it
  * gains a dirty buffer and taken off it when it has none left.  Hold
  * counts are moved from the old vnode to newvp when the two differ.
  *
  * A NULL newvp is reported with printf() and otherwise ignored.  Buffers
  * flagged B_PAGING cannot be reassigned and cause a panic.
  */
 void
 reassignbuf(bp, newvp)
 	register struct buf *bp;
 	register struct vnode *newvp;
 {
 	struct vnode *oldvp;
 	int delay;
 	int s;
 
 	if (newvp == NULL) {
 		printf("reassignbuf: NULL");
 		return;
 	}
 	++reassignbufcalls;
 
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	s = splbio();
 	/*
 	 * Delete from old vnode list, if on one.
 	 *
 	 * The old vnode is remembered in a local because bp->b_vp may be
 	 * cleared below; the interlock must be released on the vnode it was
 	 * taken on, not through the (possibly NULL) bp->b_vp.
 	 */
 	oldvp = bp->b_vp;
 	VI_LOCK(oldvp);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 		buf_vlist_remove(bp);
 		if (oldvp != newvp) {
 			vdropl(oldvp);
 			bp->b_vp = NULL;	/* for clarification */
 		}
 	}
 	VI_UNLOCK(oldvp);
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	VI_LOCK(newvp);
 	if (bp->b_flags & B_DELWRI) {
 		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
 			/* Pick the syncer delay from the kind of vnode. */
 			switch (newvp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				if (newvp->v_rdev->si_mountpoint != NULL) {
 					delay = metadelay;
 					break;
 				}
 				/* FALLTHROUGH */
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(newvp, delay);
 		}
 		buf_vlist_add(bp, newvp, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, newvp, BX_VNCLEAN);
 
 		/* No dirty buffers left: take newvp off the worklist. */
 		if ((newvp->v_iflag & VI_ONWORKLST) &&
 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(newvp, v_synclist);
 			mtx_unlock(&sync_mtx);
 			newvp->v_iflag &= ~VI_ONWORKLST;
 		}
 	}
 	/* Attach to the new vnode and take the hold that goes with it. */
 	if (bp->b_vp != newvp) {
 		bp->b_vp = newvp;
 		vholdl(bp->b_vp);
 	}
 	VI_UNLOCK(newvp);
 	splx(s);
 }
 
 /*
  * Create a vnode for a device.
  * Used for mounting the root filesystem.
  *
  * On success 0 is returned and *vpp holds the vnode: either one supplied
  * by vfinddev() or a newly allocated VCHR vnode aliased to dev.  For
  * NODEV the result is ENXIO; if no vnode can be allocated the error from
  * getnewvnode() is returned.  In both failure cases *vpp is set to
  * NULLVP.
  */
 int
 bdevvp(dev_t dev, struct vnode **vpp)
 {
 	struct vnode *newvp;
 	int error;
 
 	if (dev == NODEV) {
 		*vpp = NULLVP;
 		return (ENXIO);
 	}
 
 	/* vfinddev() is handed vpp; a non-zero result means we are done. */
 	if (vfinddev(dev, VCHR, vpp))
 		return (0);
 
 	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &newvp);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	newvp->v_type = VCHR;
 	addalias(newvp, dev);
 	*vpp = newvp;
 	return (0);
 }
 
 /*
  * Adjust the use count of vp by delta (which may be negative).  For a
  * VCHR vnode the use count of its device is adjusted by the same amount,
  * under spechash_mtx.
  */
 static void
 v_incr_usecount(struct vnode *vp, int delta)
 {
 
 	vp->v_usecount += delta;
 	if (vp->v_type != VCHR)
 		return;
 
 	/* Keep the per-device total in step with the vnode's count. */
 	mtx_lock(&spechash_mtx);
 	vp->v_rdev->si_usecount += delta;
 	mtx_unlock(&spechash_mtx);
 }
 
 /*
  * Add vnode to the alias list hung off the dev_t.
  *
  * The reason for this gunk is that multiple vnodes can reference
  * the same physical device, so checking vp->v_usecount to see
  * how many users there are is inadequate; the v_usecount for
  * the vnodes need to be accumulated.  vcount() does that.
  *
  * nvp_rdev is the user-visible device number; it is converted with
  * udev2dev().  VBLK vnodes are returned untouched and any type other
  * than VBLK or VCHR causes a panic.
  *
  * Returns the vnode the caller should use from now on: normally nvp,
  * but when an existing vnode for the device without filesystem data is
  * found, that vnode takes over nvp's identity, nvp is released and
  * destroyed, and the existing vnode is returned instead.
  */
 struct vnode *
 addaliasu(nvp, nvp_rdev)
 	struct vnode *nvp;
 	udev_t nvp_rdev;
 {
 	struct vnode *ovp;
 	vop_t **ops;
 	dev_t dev;
 
 	if (nvp->v_type == VBLK)
 		return (nvp);
 	if (nvp->v_type != VCHR)
 		panic("addaliasu on non-special vnode");
 	dev = udev2dev(nvp_rdev, 0);
 	/*
 	 * Check to see if we have a bdevvp vnode with no associated
 	 * filesystem. If so, we want to associate the filesystem of
 	 * the new newly instigated vnode with the bdevvp vnode and
 	 * discard the newly created vnode rather than leaving the
 	 * bdevvp vnode lying around with no associated filesystem.
 	 */
 	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
 		addalias(nvp, dev);
 		return (nvp);
 	}
 	/*
 	 * Discard unneeded vnode, but save its node specific data.
 	 * Note that if there is a lock, it is carried over in the
 	 * node specific data to the replacement vnode.
 	 */
 	vref(ovp);
 	ovp->v_data = nvp->v_data;
 	ovp->v_tag = nvp->v_tag;
 	nvp->v_data = NULL;
 	/* Re-create ovp's lock with the parameters of nvp's lock. */
 	lockdestroy(ovp->v_vnlock);
 	lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
 	    nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
 	/* Swap the operations vectors of the two vnodes. */
 	ops = ovp->v_op;
 	ovp->v_op = nvp->v_op;
 	/* If nvp was locked, hand the locked state over to ovp. */
 	if (VOP_ISLOCKED(nvp, curthread)) {
 		VOP_UNLOCK(nvp, 0, curthread);
 		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
 	}
 	nvp->v_op = ops;
 	insmntque(ovp, nvp->v_mount);
 	vrele(nvp);
 	vgone(nvp);
 	return (ovp);
 }
 
 /*
  * Local helper that does the same job as addaliasu(), but for a dev_t
  * instead of a udev_t: point the VCHR vnode nvp at dev, put it on the
  * device's vnode list and add its use count to the device's use count.
  */
 static void
 addalias(struct vnode *nvp, dev_t dev)
 {
 
 	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
 
 	nvp->v_rdev = dev;
 
 	/* The list and the device use count are changed under spechash_mtx. */
 	VI_LOCK(nvp);
 	mtx_lock(&spechash_mtx);
 	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
 	dev->si_usecount += nvp->v_usecount;
 	mtx_unlock(&spechash_mtx);
 	VI_UNLOCK(nvp);
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it. The vnode lock bit is set if the
  * vnode is being eliminated in vgone. The process is awakened
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new filesystem type).
  *
  * If LK_INTERLOCK is set in flags the caller already holds the vnode
  * interlock; otherwise it is taken here.  The interlock is not held on
  * return.  A lock is only requested when flags contains a lock type
  * (LK_TYPE_MASK).
  *
  * Returns 0 on success, ENOENT if the vnode was being cleaned out by
  * another thread, or the error from vn_lock(), in which case the
  * reference taken here has been dropped again.
  */
 int
 vget(vp, flags, td)
 	register struct vnode *vp;
 	int flags;
 	struct thread *td;
 {
 	int error;
 
 	/*
 	 * If the vnode is in the process of being cleaned out for
 	 * another use, we wait for the cleaning to finish and then
 	 * return failure. Cleaning is determined by checking that
 	 * the VI_XLOCK flag is set.
 	 */
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	if (vp->v_iflag & VI_XLOCK && vp->v_vxproc != curthread) {
 		vp->v_iflag |= VI_XWANT;
 		/* PDROP: the interlock is not reacquired after the sleep. */
 		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
 		return (ENOENT);
 	}
 
 	v_incr_usecount(vp, 1);
 
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	if (flags & LK_TYPE_MASK) {
 		/* vn_lock() is passed the interlock via LK_INTERLOCK. */
 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
 			/*
 			 * must expand vrele here because we do not want
 			 * to call VOP_INACTIVE if the reference count
 			 * drops back to zero since it was never really
 			 * active. We must remove it from the free list
 			 * before sleeping so that multiple processes do
 			 * not try to recycle it.
 			 */
 			VI_LOCK(vp);
 			v_incr_usecount(vp, -1);
 			if (VSHOULDFREE(vp))
 				vfree(vp);
 			else
 				vlruvp(vp);
 			VI_UNLOCK(vp);
 		}
 		return (error);
 	}
 	/* No lock requested: just release the interlock. */
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Increase the reference count of a vnode.
  *
  * Takes the vnode interlock around the update, so the caller must not
  * already hold it.
  */
 void
 vref(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	v_incr_usecount(vp, 1);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Return the reference count of a vnode, sampled under the vnode
  * interlock.
  *
  * The value may be stale as soon as the interlock is released.  It is
  * only dependable when something other than the VI lock keeps other
  * processes from gaining references, for example when the caller holds
  * the only reference, or when the caller can tolerate stale data because
  * races are accounted for by some other means.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 	int snapshot;
 
 	VI_LOCK(vp);
 	snapshot = vp->v_usecount;
 	VI_UNLOCK(vp);
 
 	return (snapshot);
 }
 
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  *
  * The vnode interlock is taken here and is not held on return.  A use
  * count below one on entry is a fatal error (panic).
  */
 void
 vrele(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 
 	VI_LOCK(vp);
 
 	/* Skip this v_writecount check if we're going to panic below. */
 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
 	    ("vrele: missed vn_close"));
 
 	/* Not the last reference: just drop it. */
 	if (vp->v_usecount > 1) {
 
 		v_incr_usecount(vp, -1);
 		VI_UNLOCK(vp);
 
 		return;
 	}
 
 	if (vp->v_usecount == 1) {
 		v_incr_usecount(vp, -1);
 		/*
 		 * We must call VOP_INACTIVE with the node locked.
 		 * If we are doing a vput, the node is already locked,
 		 * but, in the case of vrele, we must explicitly lock
 		 * the vnode before calling VOP_INACTIVE.
 		 */
 		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
 			VOP_INACTIVE(vp, td);
 		/* Move the vnode to the free list if nothing else needs it. */
 		VI_LOCK(vp);
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		else
 			vlruvp(vp);
 		VI_UNLOCK(vp);
 
 	} else {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
 		VI_UNLOCK(vp);
 		panic("vrele: negative ref cnt");
 	}
 }
 
 /*
  * Release an already locked vnode.  This gives the same effect as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-acquiring the lock (as vrele() acquires the lock internally.)
  *
  * Giant must be held.  A use count below one on entry is a fatal error
  * (panic).
  */
 void
 vput(vp)
 	struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	GIANT_REQUIRED;
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 	VI_LOCK(vp);
 	/* Skip this v_writecount check if we're going to panic below. */
 	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
 	    ("vput: missed vn_close"));
 
 	/* Not the last reference: drop it and unlock the vnode. */
 	if (vp->v_usecount > 1) {
 		v_incr_usecount(vp, -1);
 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
 		return;
 	}
 
 	if (vp->v_usecount == 1) {
 		v_incr_usecount(vp, -1);
 		/*
 		 * We must call VOP_INACTIVE with the node locked.
 		 * If we are doing a vput, the node is already locked,
 		 * so we just need to release the vnode mutex.
 		 */
 		VI_UNLOCK(vp);
 		VOP_INACTIVE(vp, td);
 		/* Move the vnode to the free list if nothing else needs it. */
 		VI_LOCK(vp);
 		if (VSHOULDFREE(vp))
 			vfree(vp);
 		else
 			vlruvp(vp);
 		VI_UNLOCK(vp);
 
 	} else {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 }
 
 /*
  * Somebody doesn't want the vnode recycled.
  *
  * Wrapper around vholdl() that takes and releases the vnode interlock;
  * use vholdl() directly when the interlock is already held.
  */
 void
 vhold(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Raise the hold count of vp and mark the vnode busy if VSHOULDBUSY()
  * says so.  The callers in this file invoke it with the vnode interlock
  * held; vhold() is the variant that takes the interlock itself.
  */
 void
 vholdl(struct vnode *vp)
 {
 	int ipl;
 
 	ipl = splbio();
 	vp->v_holdcnt += 1;
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 	splx(ipl);
 }
 
 /*
  * Note that there is one less who cares about this vnode.  vdrop() is the
  * opposite of vhold().
  *
  * Wrapper around vdropl() that takes and releases the vnode interlock;
  * use vdropl() directly when the interlock is already held.
  */
 void
 vdrop(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vdropl(vp);
 	VI_UNLOCK(vp);
 }
 	
 /*
  * Lower the hold count of vp, then free the vnode if VSHOULDFREE() says
  * so or otherwise pass it to vlruvp().  Panics if the hold count is not
  * positive on entry.  The callers in this file invoke it with the vnode
  * interlock held; vdrop() is the variant that takes the interlock itself.
  */
 void
 vdropl(struct vnode *vp)
 {
 	int ipl;
 
 	ipl = splbio();
 
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt");
 	vp->v_holdcnt -= 1;
 
 	if (VSHOULDFREE(vp)) {
 		vfree(vp);
 	} else {
 		vlruvp(vp);
 	}
 
 	splx(ipl);
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  *
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error from
  * VFS_ROOT() when the root vnode cannot be obtained.
  */
 #ifdef DIAGNOSTIC
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 int
 vflush(mp, rootrefs, flags)
 	struct mount *mp;
 	int rootrefs;
 	int flags;
 {
 	struct thread *td = curthread;	/* XXX */
 	struct vnode *vp, *nvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
 			return (error);
 		vput(rootvp);
 
 	}
 	/*
 	 * mntvnode_mtx protects the walk of the mount's vnode list.  It is
 	 * dropped while a vnode is processed and retaken before moving on.
 	 */
 	mtx_lock(&mntvnode_mtx);
 loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
 		/*
 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
 		 * Start over if it has (it won't be on the list anymore).
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 		VI_LOCK(vp);
 		mtx_unlock(&mntvnode_mtx);
 		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
 		/*
 		 * Skip over a vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0, td);
 			mtx_lock(&mntvnode_mtx);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 			VI_LOCK(vp);
 
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, LK_INTERLOCK, td);
 				mtx_lock(&mntvnode_mtx);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 
 		/*
 		 * From here on only the interlock is held; the vnode lock
 		 * is released.
 		 */
 		VOP_UNLOCK(vp, 0, td);
 
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 */
 		if (vp->v_usecount == 0) {
 			vgonel(vp, td);
 			mtx_lock(&mntvnode_mtx);
 			continue;
 		}
 
 		/*
 		 * If FORCECLOSE is set, forcibly close the vnode. For block
 		 * or character devices, revert to an anonymous device. For
 		 * all other files, just kill them.
 		 */
 		if (flags & FORCECLOSE) {
 			if (vp->v_type != VCHR) {
 				vgonel(vp, td);
 			} else {
 				vclean(vp, 0, td);
 				VI_UNLOCK(vp);
 				vp->v_op = spec_vnodeop_p;
 				insmntque(vp, (struct mount *) 0);
 			}
 			mtx_lock(&mntvnode_mtx);
 			continue;
 		}
 #ifdef DIAGNOSTIC
 		if (busyprt)
 			vprint("vflush: busy vnode", vp);
 #endif
 		/* The vnode is in use and may not be forced: count it. */
 		VI_UNLOCK(vp);
 		mtx_lock(&mntvnode_mtx);
 		busy++;
 	}
 	mtx_unlock(&mntvnode_mtx);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			vgonel(rootvp, td);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Success: release the caller's rootrefs references to the root. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * This moves a now (likely recyclable) vnode to the end of the
  * mountlist.  XXX However, it is temporarily disabled until we
  * can clean up ffs_sync() and friends, which have loop restart
  * conditions which this code causes to operate O(N^2).
  *
  * With the body compiled out (#if 0) this function currently does
  * nothing.
  */
 static void
 vlruvp(struct vnode *vp)
 {
 #if 0
 	struct mount *mp;
 
 	if ((mp = vp->v_mount) != NULL) {
 		mtx_lock(&mntvnode_mtx);
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		mtx_unlock(&mntvnode_mtx);
 	}
 #endif
 }
 
 /*
  * Disassociate the underlying filesystem from a vnode.
  *
  * Must be entered with the vnode interlock held (asserted below) and
  * returns with the interlock held again; the callers visible here
  * (vflush, vgonel) drop it right after the call.  Panics if VI_XLOCK
  * is already set on entry.
  *
  * flags: with DOCLOSE the vnode's buffers are flushed (or tossed if the
  *	flush fails) and, for a vnode that was in use, VOP_CLOSE() is called.
  * td: thread handed through to the VOP and buffer routines.
  *
  * On return v_op points at dead_vnodeop_p, v_vnlock at the vnode's own
  * v_lock, v_tag is "none", VI_XLOCK is cleared and any thread that set
  * VI_XWANT has been woken.
  */
 static void
 vclean(vp, flags, td)
 	struct vnode *vp;
 	int flags;
 	struct thread *td;
 {
 	int active;
 
 	ASSERT_VI_LOCKED(vp, "vclean");
 	/*
 	 * Check to see if the vnode is in use. If so we have to reference it
 	 * before we clean it out so that its count cannot fall to zero and
 	 * generate a race against ourselves to recycle it.
 	 */
 	if ((active = vp->v_usecount))
 		v_incr_usecount(vp, 1);
 
 	/*
 	 * Prevent the vnode from being recycled or brought into use while we
 	 * clean it out.
 	 */
 	if (vp->v_iflag & VI_XLOCK)
 		panic("vclean: deadlock");
 	vp->v_iflag |= VI_XLOCK;
 	vp->v_vxproc = curthread;
 	/*
 	 * Even if the count is zero, the VOP_INACTIVE routine may still
 	 * have the object locked while it cleans it out. The VOP_LOCK
 	 * ensures that the VOP_INACTIVE routine is done with its work.
 	 * For active vnodes, it ensures that no other activity can
 	 * occur while the underlying object is being cleaned out.
 	 *
 	 * The interlock is passed in via LK_INTERLOCK; the code below
 	 * re-takes it with VI_LOCK(), so it is not held after this call.
 	 */
 	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	if (flags & DOCLOSE) {
 		struct buf *bp;
 		/* Peek at the dirty list under the interlock. */
 		VI_LOCK(vp);
 		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
 		VI_UNLOCK(vp);
 		/* Only wait out a write suspension if there is dirty data. */
 		if (bp != NULL)
 			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
 		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
 			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
 	}
 
 	VOP_DESTROYVOBJECT(vp);
 
 	/*
 	 * Any other processes trying to obtain this lock must first
 	 * wait for VI_XLOCK to clear, then call the new lock operation.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed. Note that the
 	 * VOP_INACTIVE will unlock the vnode.
 	 */
 	if (active) {
 		if (flags & DOCLOSE)
 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
 			panic("vclean: cannot relock.");
 		VOP_INACTIVE(vp, td);
 	}
 
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vclean: cannot reclaim");
 
 	if (active) {
 		/*
 		 * Inline copy of vrele() since VOP_INACTIVE
 		 * has already been called.  This drops the extra
 		 * reference taken at the top of the function.
 		 */
 		VI_LOCK(vp);
 		v_incr_usecount(vp, -1);
 		if (vp->v_usecount <= 0) {
 #ifdef DIAGNOSTIC
 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 				vprint("vclean: bad ref count", vp);
 				panic("vclean: ref cnt");
 			}
 #endif
 			vfree(vp);
 		}
 		VI_UNLOCK(vp);
 	}
 
 	cache_purge(vp);
 	/* From here on the interlock stays held until we return. */
 	VI_LOCK(vp);
 	if (VSHOULDFREE(vp))
 		vfree(vp);
 
 	/*
 	 * Done with purge, reset to the standard lock and
 	 * notify sleepers of the grim news.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = dead_vnodeop_p;
 	if (vp->v_pollinfo != NULL)
 		vn_pollgone(vp);
 	vp->v_tag = "none";
 	vp->v_iflag &= ~VI_XLOCK;
 	vp->v_vxproc = NULL;
 	if (vp->v_iflag & VI_XWANT) {
 		vp->v_iflag &= ~VI_XWANT;
 		wakeup(vp);
 	}
 }
 
 /*
  * Eliminate all activity associated with the requested vnode
  * and with all vnodes aliased to the requested vnode.
  */
 int
 vop_revoke(ap)
 	struct vop_revoke_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *vp, *vq;
 	dev_t dev;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	/*
 	 * If a vgone (or vclean) is already in progress,
 	 * wait until it is done and return.
 	 */
 	if (vp->v_iflag & VI_XLOCK) {
 		vp->v_iflag |= VI_XWANT;
 		msleep(vp, VI_MTX(vp), PINOD | PDROP,
 		    "vop_revokeall", 0);
 		return (0);
 	}
 	VI_UNLOCK(vp);
 	dev = vp->v_rdev;
 	for (;;) {
 		mtx_lock(&spechash_mtx);
 		vq = SLIST_FIRST(&dev->si_hlist);
 		mtx_unlock(&spechash_mtx);
 		if (!vq)
 			break;
 		vgone(vq);
 	}
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  * Release the passed interlock if the vnode will be recycled.
  */
 int
 vrecycle(vp, inter_lkp, td)
 	struct vnode *vp;
 	struct mtx *inter_lkp;
 	struct thread *td;
 {
 
 	VI_LOCK(vp);
 	if (vp->v_usecount == 0) {
 		if (inter_lkp) {
 			mtx_unlock(inter_lkp);
 		}
 		vgonel(vp, td);
 		return (1);
 	}
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  */
 void
 vgone(vp)
 	register struct vnode *vp;
 {
 	struct thread *td = curthread;	/* XXX */
 
 	VI_LOCK(vp);
 	vgonel(vp, td);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * The interlock must be held on entry (asserted) and has been released
  * by the time the normal path returns.  If VI_XLOCK is already set this
  * thread only sets VI_XWANT, sleeps on the vnode and returns without
  * doing any work itself (PDROP: presumably the interlock is not
  * reacquired after the sleep -- confirm against msleep(9)).
  *
  * td: thread passed on to vclean().
  */
 void
 vgonel(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	int s;
 
 	/*
 	 * If a vgone (or vclean) is already in progress,
 	 * wait until it is done and return.
 	 */
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	if (vp->v_iflag & VI_XLOCK) {
 		vp->v_iflag |= VI_XWANT;
 		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
 		return;
 	}
 
 	/*
 	 * Clean out the filesystem specific data.
 	 * vclean() comes back with the interlock held, so drop it here.
 	 */
 	vclean(vp, DOCLOSE, td);
 	VI_UNLOCK(vp);
 
 	/*
 	 * Delete from old mount point vnode list, if on one.
 	 */
 	if (vp->v_mount != NULL)
 		insmntque(vp, (struct mount *)0);
 	/*
 	 * If special device, remove it from special device alias list
 	 * if it is on one.  The device's use count is reduced by this
 	 * vnode's share before the vnode forgets the device.
 	 */
 	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
 		VI_LOCK(vp);
 		mtx_lock(&spechash_mtx);
 		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
 		vp->v_rdev->si_usecount -= vp->v_usecount;
 		mtx_unlock(&spechash_mtx);
 		VI_UNLOCK(vp);
 		vp->v_rdev = NULL;
 	}
 
 	/*
 	 * If it is on the freelist and not already at the head,
 	 * move it to the head of the list. The test of the
 	 * VI_DOOMED flag and the reference count of zero is because
 	 * it will be removed from the free list by getnewvnode,
 	 * but will not have its reference count incremented until
 	 * after calling vgone. If the reference count were
 	 * incremented first, vgone would (incorrectly) try to
 	 * close the previous instance of the underlying object.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
 		s = splbio();
 		mtx_lock(&vnode_free_list_mtx);
 		if (vp->v_iflag & VI_FREE) {
 			/* Already on the list: unlink so it can be re-queued. */
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		} else {
 			/* Newly free: account for it. */
 			vp->v_iflag |= VI_FREE;
 			freevnodes++;
 		}
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 		mtx_unlock(&vnode_free_list_mtx);
 		splx(s);
 	}
 
 	/* Mark the vnode as no longer representing any object type. */
 	vp->v_type = VBAD;
 	VI_UNLOCK(vp);
 }
 
 /*
  * Lookup a vnode by device number.
  *
  * Scans the device's alias list for the first vnode of the requested
  * type.  Returns 1 and stores the vnode in *vpp when one is found,
  * otherwise returns 0 and leaves *vpp untouched.
  */
 int
 vfinddev(dev, type, vpp)
 	dev_t dev;
 	enum vtype type;
 	struct vnode **vpp;
 {
 	struct vnode *vp;
 	int found;
 
 	found = 0;
 	mtx_lock(&spechash_mtx);
 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
 		if (vp->v_type != type)
 			continue;
 		*vpp = vp;
 		found = 1;
 		break;
 	}
 	mtx_unlock(&spechash_mtx);
 	return (found);
 }
 
 /*
  * Calculate the total number of references to a special device:
  * returns the use count kept on the vnode's device, read under
  * spechash_mtx.
  */
 int
 vcount(vp)
 	struct vnode *vp;
 {
 	int nrefs;
 
 	mtx_lock(&spechash_mtx);
 	nrefs = vp->v_rdev->si_usecount;
 	mtx_unlock(&spechash_mtx);
 	return (nrefs);
 }
 
 /*
  * Same as vcount(), but using the dev_t as argument.
  * Returns 0 when no vnode is on the device's alias list.
  */
 int
 count_dev(dev)
 	dev_t dev;
 {
 	struct vnode *first;
 
 	first = SLIST_FIRST(&dev->si_hlist);
 	return (first != NULL ? vcount(first) : 0);
 }
 
 /*
  * Print out a description of a vnode: tag, type, counts, the set flags
  * and lock state, followed by the filesystem's own VOP_PRINT() output
  * when the vnode carries private data.  label, if not NULL, is printed
  * as a prefix.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 
 void
 vprint(label, vp)
 	char *label;
 	struct vnode *vp;
 {
 	/* Large enough for every flag name below at once. */
 	char flagstr[96];
 
 /* Append "|NAME" to flagstr when NAME is set in vp->field. */
 #define	VPRINT_FLAG(field, flag) do {					\
 	if ((vp->field & (flag)) != 0)					\
 		strcat(flagstr, "|" #flag);				\
 } while (0)
 
 	if (label == NULL)
 		printf("%p: ", (void *)vp);
 	else
 		printf("%s: %p: ", label, (void *)vp);
 	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
 	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
 	    vp->v_writecount, vp->v_holdcnt);
 
 	flagstr[0] = '\0';
 	VPRINT_FLAG(v_vflag, VV_ROOT);
 	VPRINT_FLAG(v_vflag, VV_TEXT);
 	VPRINT_FLAG(v_vflag, VV_SYSTEM);
 	VPRINT_FLAG(v_iflag, VI_XLOCK);
 	VPRINT_FLAG(v_iflag, VI_XWANT);
 	VPRINT_FLAG(v_iflag, VI_BWAIT);
 	VPRINT_FLAG(v_iflag, VI_DOOMED);
 	VPRINT_FLAG(v_iflag, VI_FREE);
 	VPRINT_FLAG(v_vflag, VV_OBJBUF);
 #undef VPRINT_FLAG
 	/* Skip the leading '|' when printing. */
 	if (flagstr[0] != '\0')
 		printf(" flags (%s),", flagstr + 1);
 
 	lockmgr_printinfo(vp->v_vnlock);
 	printf("\n");
 	if (vp->v_data != NULL) {
 		printf("\t");
 		VOP_PRINT(vp);
 	}
 }
 
 #ifdef DDB
 #include <ddb/ddb.h>
 /*
  * List all of the locked vnodes in the system.
  * Called when debugging the kernel.
  *
  * Walks every mount that can be busied without waiting and prints, via
  * vprint(), each vnode on its list for which VOP_ISLOCKED() is non-zero.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct thread *td = curthread;	/* XXX */
 	struct mount *mp, *nmp;
 	struct vnode *vp;
 
 	printf("Locked vnodes\n");
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		/* Mounts that cannot be busied right now are skipped. */
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		mtx_lock(&mntvnode_mtx);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (VOP_ISLOCKED(vp, NULL))
 				vprint((char *)0, vp);
 		}
 		mtx_unlock(&mntvnode_mtx);
 		/*
 		 * Re-take the mount list lock before stepping to the next
 		 * mount and releasing this one.
 		 */
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 #endif
 
 /*
  * Fill in a struct xvfsconf based on a struct vfsconf.
  */
 static void
 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 {
 
 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
 	xvfsp->vfc_flags = vfsp->vfc_flags;
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp->vfc_vfsops = NULL;
 	xvfsp->vfc_next = NULL;
 }
 
 /*
  * sysctl handler: export an array holding one struct xvfsconf for each
  * entry on the vfsconf list.  Returns the SYSCTL_OUT() status.
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	struct xvfsconf *xvfsp;
 	int cnt, error, i;
 
 	/* First pass: count the entries to size the temporary array. */
 	for (cnt = 0, vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
 		cnt++;
 	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
 	/*
 	 * Handle the race that we will have here when struct vfsconf
 	 * will be locked down by using both cnt and checking vfc_next
 	 * against NULL to determine the end of the loop.  The race will
 	 * happen because we will have to unlock before calling malloc().
 	 * We are protected by Giant for now.
 	 */
 	for (i = 0, vfsp = vfsconf; vfsp != NULL && i < cnt;
 	    vfsp = vfsp->vfc_next, i++)
 		vfsconf2x(vfsp, &xvfsp[i]);
 	/* Only the i entries actually converted are copied out. */
 	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
 	free(xvfsp, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 /*
  * Top level filesystem related information gathering.
  *
  * Handler for the deprecated vfs.generic node; it logs a warning on
  * every call.  Returns ENOTDIR for a name of the wrong length and
  * EOPNOTSUPP for an unknown request or filesystem type number.
  */
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 
 	printf("WARNING: userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	if (name[1] == VFS_MAXTYPENUM) {
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	}
 	if (name[1] != VFS_CONF)
 		return (EOPNOTSUPP);
 
 	if (namelen != 3)
 		return (ENOTDIR);	/* overloaded */
 	/* Find the filesystem whose type number is name[2]. */
 	vfsp = vfsconf;
 	while (vfsp != NULL && vfsp->vfc_typenum != name[2])
 		vfsp = vfsp->vfc_next;
 	if (vfsp == NULL)
 		return (EOPNOTSUPP);
 	vfsconf2x(vfsp, &xvfsp);
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 
 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
 	"Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 /*
  * Old-style export of the configured filesystems: writes one
  * struct ovfsconf per vfsconf entry with SYSCTL_OUT() and stops at the
  * first error.  Returns 0 on success or that error.
  *
  * The record is zeroed before each fill so that the bytes strcpy()
  * leaves untouched after the name are not copied out uninitialized
  * from the kernel stack.
  */
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 
 #define KINFO_VNODESLOP		10
 /*
  * Dump vnode list (via sysctl).
  *
  * With no output buffer supplied, only a size estimate is reported.
  * Otherwise one struct xvnode is filled in for each vnode found on the
  * mounts that can be busied without waiting, and the collected records
  * are copied out.  Returns the SYSCTL_OUT() status.
  *
  * The temporary array has room for maxn records; the walk stops once
  * that many have been collected.  (The bound must be the element count,
  * not the byte length of the array, or the loop could write past the
  * end of the allocation.)
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct thread *td = req->td;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, maxn, n;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	maxn = numvnodes + KINFO_VNODESLOP;
 	len = maxn * sizeof *xvn;
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	sysctl_wire_old_buffer(req, 0);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 			continue;
 		mtx_lock(&mntvnode_mtx);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			/* Array full: stop collecting. */
 			if (n == maxn)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(id);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			/*
 			 * Type-specific part.  A vnode that is skipped
 			 * leaves slot n to be overwritten by the next one.
 			 */
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				xvn[n].xv_dev = vp->v_cachedfs;
 				xvn[n].xv_ino = vp->v_cachedid;
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		mtx_unlock(&mntvnode_mtx);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp, td);
 		if (n == maxn)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,xvnode", "");
 
 /*
  * Check to see if a filesystem is mounted on a block device.
  * Returns EBUSY when the vnode's device has a mount point recorded,
  * 0 otherwise.
  */
 int
 vfs_mountedon(vp)
 	struct vnode *vp;
 {
 
 	return (vp->v_rdev->si_mountpoint != NULL ? EBUSY : 0);
 }
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  */
 void
 vfs_unmountall()
 {
 	struct mount *mp;
 	struct thread *td;
 	int error;
 
 	if (curthread != NULL)
 		td = curthread;
 	else
 		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while(!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		error = dounmount(mp, MNT_FORCE, td);
 		if (error) {
 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
 			printf("unmount of %s failed (",
 			    mp->mnt_stat.f_mntonname);
 			if (error == EBUSY)
 				printf("BUSY)\n");
 			else
 				printf("%d)\n", error);
 		} else {
 			/* The unmount has removed mp from the mountlist */
 		}
 	}
 }
 
 /*
  * perform msync on all vnodes under a mount point
  * the mount point must be locked.
  *
  * mp:    mount whose vnode list is walked.
  * flags: MNT_WAIT cleans pages synchronously (OBJPC_SYNC) and also
  *	  processes vnodes that are currently locked; any other value
  *	  cleans with OBJPC_NOSYNC and skips locked vnodes.
  *
  * Requires Giant.  The walk restarts from the head of the list when the
  * list is seen to have changed underneath it, giving up once the
  * `tries' budget (5) is used up.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
 	int tries;
 
 	GIANT_REQUIRED;
 
 	tries = 5;
 	mtx_lock(&mntvnode_mtx);
 loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
 		/* The vnode moved to another mount: restart or give up. */
 		if (vp->v_mount != mp) {
 			if (--tries > 0)
 				goto loop;
 			break;
 		}
 		/* Remember the successor before any lock is dropped. */
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 
 		VI_LOCK(vp);
 		/* Skip vnodes that are being cleaned out. */
 		if (vp->v_iflag & VI_XLOCK) {	/* XXX: what if MNT_WAIT? */
 			VI_UNLOCK(vp);
 			continue;
 		}
 
 		if ((vp->v_iflag & VI_OBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/*
 			 * The list lock is dropped around vget() and the
 			 * page cleaning; the interlock is handed to vget()
 			 * via LK_INTERLOCK.
 			 */
 			mtx_unlock(&mntvnode_mtx);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					mtx_lock(&mntvnode_mtx);
 					continue;
 				}
 
 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 				}
 				vput(vp);
 			}
 			mtx_lock(&mntvnode_mtx);
 			/*
 			 * The successor changed while the list lock was
 			 * dropped: restart or give up.
 			 */
 			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
 				if (--tries > 0)
 					goto loop;
 				break;
 			}
 		} else
 			VI_UNLOCK(vp);
 	}
 	mtx_unlock(&mntvnode_mtx);
 }
 
 /*
  * Create the VM object needed for VMIO and mmap support.  This
  * is done for all VREG files in the system.  Some filesystems might
  * afford the additional metadata buffering capability of the
  * VMIO code by making the device node be VMIO mode also.
  *
  * vp must be locked when vfs_object_create is called.
  */
 int
 vfs_object_create(vp, td, cred)
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 {
 	GIANT_REQUIRED;
 	return (VOP_CREATEVOBJECT(vp, cred, td));
 }
 
 /*
  * Mark a vnode as free, putting it up for recycling.
  *
  * The vnode interlock must be held and the vnode must not already be
  * marked free (both asserted).  A vnode flagged VI_AGE goes to the
  * head of the free list, any other to the tail.
  */
 void
 vfree(vp)
 	struct vnode *vp;
 {
 	int ipl;
 
 	ASSERT_VI_LOCKED(vp, "vfree");
 	ipl = splbio();
 	mtx_lock(&vnode_free_list_mtx);
 	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
 	if ((vp->v_iflag & VI_AGE) == 0)
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	else
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	freevnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	/* Clear VI_AGE and mark the vnode free in one update. */
 	vp->v_iflag = (vp->v_iflag & ~VI_AGE) | VI_FREE;
 	splx(ipl);
 }
 
 /*
  * Opposite of vfree() - mark a vnode as in use.
  *
  * The vnode interlock must be held and the vnode must currently be
  * marked free (both asserted).  Takes it off the free list and clears
  * VI_FREE and VI_AGE.
  */
 void
 vbusy(vp)
 	struct vnode *vp;
 {
 	int ipl;
 
 	ipl = splbio();
 	ASSERT_VI_LOCKED(vp, "vbusy");
 	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
 
 	mtx_lock(&vnode_free_list_mtx);
 	freevnodes--;
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	mtx_unlock(&vnode_free_list_mtx);
 
 	vp->v_iflag &= ~(VI_FREE | VI_AGE);
 	splx(ipl);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  *
  * Returns the requested events that were already pending (and consumes
  * them), or 0 after registering the interest when none were pending.
  */
 int
 vn_pollrecord(vp, td, events)
 	struct vnode *vp;
 	struct thread *td;
 	short events;
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if ((vp->v_pollinfo->vpi_revents & events) == 0) {
 		/* Nothing pending yet: remember what the caller wants. */
 		vp->v_pollinfo->vpi_events |= events;
 		selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return 0;
 	}
 
 	/*
 	 * This leaves events we are not interested
 	 * in available for the other process which
 	 * presumably had requested them
 	 * (otherwise they would never have been
 	 * recorded).
 	 */
 	events &= vp->v_pollinfo->vpi_revents;
 	vp->v_pollinfo->vpi_revents &= ~events;
 
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return events;
 }
 
 /*
  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
  * it is possible for us to miss an event due to race conditions, but
  * that condition is expected to be rare, so for the moment it is the
  * preferred interface.
  *
  * Nothing is recorded unless some poller has asked for at least one
  * of the events.
  */
 void
 vn_pollevent(vp, events)
 	struct vnode *vp;
 	short events;
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if ((vp->v_pollinfo->vpi_events & events) == 0) {
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return;
 	}
 	/*
 	 * We clear vpi_events so that we don't
 	 * call selwakeup() twice if two events are
 	 * posted before the polling process(es) is
 	 * awakened.  This also ensures that we take at
 	 * most one selwakeup() if the polling process
 	 * is no longer interested.  However, it does
 	 * mean that only one event can be noticed at
 	 * a time.  (Perhaps we should only clear those
 	 * event bits which we note?) XXX
 	 */
 	vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
 	vp->v_pollinfo->vpi_revents |= events;
 	selwakeup(&vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 
 /*
  * Wake up anyone polling on vp because it is being revoked.
  * This depends on dead_poll() returning POLLHUP for correct
  * behavior.
  *
  * The caller must make sure vp->v_pollinfo is not NULL; it is
  * dereferenced without a check.
  */
 void
 vn_pollgone(vp)
 	struct vnode *vp;
 {
 
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	VN_KNOTE(vp, NOTE_REVOKE);
 	if (vp->v_pollinfo->vpi_events != 0) {
 		/* Drop the recorded interest and wake the pollers. */
 		vp->v_pollinfo->vpi_events = 0;
 		selwakeup(&vp->v_pollinfo->vpi_selinfo);
 	}
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 
 
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  *
  * The table below is the operations vector for syncer vnodes: any
  * operation not listed goes to vop_eopnotsupp, close is mapped to
  * nullop, locking uses the vop_std* routines and the rest are the
  * sync_* functions defined later in this file.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 static int	sync_print(struct vop_print_args *);
 
 /* Filled in from sync_vnodeop_entries via the VNODEOP_SET() below. */
 static vop_t **sync_vnodeop_p;
 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
 	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
 	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
 	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
 	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc sync_vnodeop_opv_desc =
 	{ &sync_vnodeop_p, sync_vnodeop_entries };
 
 VNODEOP_SET(sync_vnodeop_opv_desc);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  */
 int
 vfs_allocate_syncvnode(mp)
 	struct mount *mp;
 {
 	struct vnode *vp;
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode("vfs", mp, sync_vnodeop_p, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	VI_LOCK(vp);
 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Only MNT_LAZY requests do any work; every other a_waitfor value
  * returns 0 immediately.  The sync is also skipped, returning 0, when
  * the mount cannot be busied or a write cannot be started without
  * waiting.
  */
 static int
 sync_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct thread *td = ap->a_td;
-	int asyncflag;
+	int error, asyncflag;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	VI_LOCK(syncvp);
 	vn_syncer_add_to_worklist(syncvp, syncdelay);
 	VI_UNLOCK(syncvp);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp, td);
 		return (0);
 	}
 	/*
 	 * MNT_ASYNC is cleared for the duration of the sync and put back
 	 * afterwards if it was set.
 	 */
 	asyncflag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	vfs_msync(mp, MNT_NOWAIT);
-	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
+	error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
 	if (asyncflag)
 		mp->mnt_flag |= MNT_ASYNC;
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
-	return (0);
+	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced: unlock it and get rid of
  * it with vgone().  Always returns 0.
  */
 static int
 sync_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	VOP_UNLOCK(vp, 0, ap->a_td);
 	vgone(vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  * Clears the mount's mnt_syncer pointer and takes the vnode off the
  * syncer worklist if it is on it.  Always returns 0.
  *
  * Modifications to the worklist must be protected at splbio().
  */
 static int
 sync_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	int ipl;
 
 	vp = ap->a_vp;
 	ipl = splbio();
 	vp->v_mount->mnt_syncer = NULL;
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(vp, v_synclist);
 		mtx_unlock(&sync_mtx);
 		vp->v_iflag &= ~VI_ONWORKLST;
 	}
 	VI_UNLOCK(vp);
 	splx(ipl);
 
 	return (0);
 }
 
 /*
  * Print out a syncer vnode: a fixed label followed by the state of its
  * lock, when it has one.  Always returns 0.
  */
 static int
 sync_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct lock *lkp;
 
 	lkp = ap->a_vp->v_vnlock;
 	printf("syncer vnode");
 	if (lkp != NULL)
 		lockmgr_printinfo(lkp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * extract the dev_t from a VCHR
  * Any other vnode type yields NODEV.
  */
 dev_t
 vn_todev(vp)
 	struct vnode *vp;
 {
 
 	return (vp->v_type == VCHR ? vp->v_rdev : NODEV);
 }
 
 /*
  * Check if vnode represents a disk device
  */
 int
 vn_isdisk(vp, errp)
 	struct vnode *vp;
 	int *errp;
 {
 	struct cdevsw *cdevsw;
 
 	if (vp->v_type != VCHR) {
 		if (errp != NULL)
 			*errp = ENOTBLK;
 		return (0);
 	}
 	if (vp->v_rdev == NULL) {
 		if (errp != NULL)
 			*errp = ENXIO;
 		return (0);
 	}
 	cdevsw = devsw(vp->v_rdev);
 	if (cdevsw == NULL) {
 		if (errp != NULL)
 			*errp = ENXIO;
 		return (0);
 	}
 	if (!(cdevsw->d_flags & D_DISK)) {
 		if (errp != NULL)
 			*errp = ENOTBLK;
 		return (0);
 	}
 	if (errp != NULL)
 		*errp = 0;
 	return (1);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  *
  * Each NDF_NO_* bit in flags suppresses one release step; a step also
  * only runs when the lookup flags show that the resource is held.
  * Pointers whose reference is dropped are set to NULL.
  */
 void
 NDFREE(ndp, flags)
      struct nameidata *ndp;
      const uint flags;
 {
 	/* Pathname buffer. */
 	if ((flags & NDF_NO_FREE_PNBUF) == 0 &&
 	    (ndp->ni_cnd.cn_flags & HASBUF) != 0) {
 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 	/* Parent directory lock; skipped when parent and leaf coincide. */
 	if ((flags & NDF_NO_DVP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) != 0 &&
 	    ndp->ni_dvp != ndp->ni_vp) {
 		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
 	}
 	/* Parent directory reference. */
 	if ((flags & NDF_NO_DVP_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 		vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 	/* Leaf lock. */
 	if ((flags & NDF_NO_VP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) != 0 && ndp->ni_vp) {
 		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
 	}
 	/* Leaf reference. */
 	if ((flags & NDF_NO_VP_RELE) == 0 && ndp->ni_vp) {
 		vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 	/* Start directory reference. */
 	if ((flags & NDF_NO_STARTDIR_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART) != 0) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, credentials,
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure:
  * EPERM when VADMIN was requested, EACCES otherwise.
  */
 int
 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
 	enum vtype type;
 	mode_t file_mode;
 	uid_t file_uid;
 	gid_t file_gid;
 	mode_t acc_mode;
 	struct ucred *cred;
 	int *privused;
 {
 	mode_t dac_granted;
 	mode_t rbit, wbit, xbit;
 #ifdef CAPABILITIES
 	mode_t cap_granted;
 #endif
 
 	if (privused != NULL)
 		*privused = 0;
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 *
 	 * Exactly one permission class applies: owner, else group (first
 	 * match), else everyone else.  The owner additionally gets VADMIN.
 	 */
 	dac_granted = 0;
 	if (cred->cr_uid == file_uid) {
 		dac_granted |= VADMIN;
 		xbit = S_IXUSR;
 		rbit = S_IRUSR;
 		wbit = S_IWUSR;
 	} else if (groupmember(file_gid, cred)) {
 		xbit = S_IXGRP;
 		rbit = S_IRGRP;
 		wbit = S_IWGRP;
 	} else {
 		xbit = S_IXOTH;
 		rbit = S_IROTH;
 		wbit = S_IWOTH;
 	}
 	if ((file_mode & xbit) != 0)
 		dac_granted |= VEXEC;
 	if ((file_mode & rbit) != 0)
 		dac_granted |= VREAD;
 	if ((file_mode & wbit) != 0)
 		dac_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & dac_granted) == acc_mode)
 		return (0);
 
 	/* The mode bits were not enough; fall back on privilege. */
 	if (suser_cred(cred, PRISON_ROOT) == 0) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 
 #ifdef CAPABILITIES
 	/*
 	 * Build a capability mask to determine if the set of capabilities
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.
 	 * For each capability, if the capability is required, bitwise
 	 * or the request type onto the cap_granted mask.
 	 */
 	cap_granted = 0;
 
 	/*
 	 * For directories, use CAP_DAC_READ_SEARCH to satisfy
 	 * VEXEC requests, instead of CAP_DAC_EXECUTE.
 	 */
 	if ((acc_mode & VEXEC) != 0 && (dac_granted & VEXEC) == 0 &&
 	    cap_check(cred, NULL,
 	    type == VDIR ? CAP_DAC_READ_SEARCH : CAP_DAC_EXECUTE,
 	    PRISON_ROOT) == 0)
 		cap_granted |= VEXEC;
 
 	if ((acc_mode & VREAD) != 0 && (dac_granted & VREAD) == 0 &&
 	    cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT) == 0)
 		cap_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) != 0 && (dac_granted & VWRITE) == 0 &&
 	    cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT) == 0)
 		cap_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) != 0 && (dac_granted & VADMIN) == 0 &&
 	    cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT) == 0)
 		cap_granted |= VADMIN;
 
 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 #endif
 
 	if ((acc_mode & VADMIN) != 0)
 		return (EPERM);
 	return (EACCES);
 }
 
 /*
  * Credential check based on process requesting service, and per-attribute
  * permissions.
  *
  * Returns 0 when access is allowed, otherwise an errno: the result of
  * suser_cred() for the system namespace, of VOP_ACCESS() for the user
  * namespace, and EPERM for any other namespace.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace,
     struct ucred *cred, struct thread *td, int access)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly
 	 * manipulate system attributes.
 	 *
 	 * XXX What capability should apply here?
 	 * Probably CAP_SYS_SETFFLAG.
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		/* Potentially should be: return (EPERM); */
 		return (suser_cred(cred, 0));
 	}
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, access, cred, td));
 	return (EPERM);
 }
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c	(revision 105901)
+++ head/sys/kern/vfs_vnops.c	(revision 105902)
@@ -1,1151 +1,1156 @@
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  * $FreeBSD$
  */
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/sx.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 
 #include <machine/limits.h>
 
 static int vn_closefile(struct file *fp, struct thread *td);
 static int vn_ioctl(struct file *fp, u_long com, void *data, 
 		struct ucred *active_cred, struct thread *td);
 static int vn_read(struct file *fp, struct uio *uio, 
 		struct ucred *active_cred, int flags, struct thread *td);
 static int vn_poll(struct file *fp, int events, struct ucred *active_cred,
 		struct thread *td);
 static int vn_kqfilter(struct file *fp, struct knote *kn);
 static int vn_statfile(struct file *fp, struct stat *sb,
 		struct ucred *active_cred, struct thread *td);
 static int vn_write(struct file *fp, struct uio *uio, 
 		struct ucred *active_cred, int flags, struct thread *td);
 
 /*
  * File operations table for vnode-backed files.  The initializers are
  * positional; the handlers appear in the order read, write, ioctl, poll,
  * kqfilter, stat, close.
  */
 struct 	fileops vnops = {
 	vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
 	vn_statfile, vn_closefile
 };
 
 /*
  * Open the file described by ndp using the credential of the thread that
  * is recorded in the nameidata.  Thin wrapper around vn_open_cred().
  */
 int
 vn_open(ndp, flagp, cmode)
 	register struct nameidata *ndp;
 	int *flagp, cmode;
 {
 	struct ucred *cred;
 
 	cred = ndp->ni_cnd.cn_thread->td_ucred;
 	return (vn_open_cred(ndp, flagp, cmode, cred));
 }
 
 /*
  * Common code for vnode open operations.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  *
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  *
  * ndp:   nameidata for the path; cn_nameiop and cn_flags are filled in
  *        here before namei() is called.
  * flagp: in/out open flags.  On return *flagp holds the flags as adjusted
  *        here (O_TRUNC is cleared when the file was just created, O_CREAT
  *        is cleared when the file already existed).
  * cmode: mode given to a newly created file.
  * cred:  credential used for the MAC, access, getattr, open and create
  *        calls.
  *
  * Returns 0 on success; the vnode is left in ndp->ni_vp and is not
  * unlocked on the success path.  On failure an errno value is returned:
  * among others EEXIST (O_CREAT|O_EXCL on an existing file), EMLINK (target
  * is a symlink), EOPNOTSUPP (target is a socket) and EISDIR (write or
  * truncate of a directory).
  */
 int
 vn_open_cred(ndp, flagp, cmode, cred)
 	register struct nameidata *ndp;
 	int *flagp, cmode;
 	struct ucred *cred;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct thread *td = ndp->ni_cnd.cn_thread;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int mode, fmode, error;
 #ifdef LOOKUP_SHARED
 	int exclusive;	/* The current intended lock state */
 
 	exclusive = 0;
 #endif
 
 	/* Re-entered after waiting out a write suspension on the create path. */
 restart:
 	fmode = *flagp;
 	if (fmode & O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
 		/* Symlinks are followed only without O_EXCL and O_NOFOLLOW. */
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			/* The name does not exist yet: create a regular file. */
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				/*
 				 * Could not start the write without waiting:
 				 * drop the lookup results, wait (V_XSLEEP),
 				 * and redo the whole lookup.
 				 */
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				goto restart;
 			}
 #ifdef MAC
 			error = mac_check_vnode_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0) {
 #endif
 				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 						   &ndp->ni_cnd, vap);
 #ifdef MAC
 			}
 #endif
 			vput(ndp->ni_dvp);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
 			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
 			/* A file that was just created needs no truncation. */
 			fmode &= ~O_TRUNC;
 			vp = ndp->ni_vp;
 #ifdef LOOKUP_SHARED
 			exclusive = 1;
 #endif
 		} else {
 			/*
 			 * The name already exists.  Release the parent; when
 			 * parent and leaf are the same vnode only the
 			 * reference is dropped (vrele rather than vput).
 			 */
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			/* From here on this is handled as a plain open. */
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 #ifdef LOOKUP_SHARED
 		ndp->ni_cnd.cn_flags =
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
 		    LOCKSHARED | LOCKLEAF;
 #else
 		ndp->ni_cnd.cn_flags =
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
 #endif
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	if (vp->v_type == VLNK) {
 		error = EMLINK;
 		goto bad;
 	}
 	if (vp->v_type == VSOCK) {
 		error = EOPNOTSUPP;
 		goto bad;
 	}
 	/* Translate the open flags into the V* access bits checked below. */
 	mode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR) {
 			error = EISDIR;
 			goto bad;
 		}
 		mode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		mode |= VREAD;
 	if (fmode & O_APPEND)
 		mode |= VAPPEND;
 #ifdef MAC
 	error = mac_check_vnode_open(cred, vp, mode);
 	if (error)
 		goto bad;
 #endif
 	/*
 	 * O_CREAT is still set only when the file was created above; the
 	 * write and access checks are applied to pre-existing files only.
 	 */
 	if ((fmode & O_CREAT) == 0) {
 		if (mode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
 				goto bad;
 		}
 		if (mode) {
 		        error = VOP_ACCESS(vp, mode, cred, td);
 			if (error)
 				goto bad;
 		}
 	}
 	/*
 	 * Cache the fsid/fileid when they can be fetched.  A failure here is
 	 * not reported: error is overwritten by the VOP_OPEN call below.
 	 */
 	if ((error = VOP_GETATTR(vp, vap, cred, td)) == 0) {
 		vp->v_cachedfs = vap->va_fsid;
 		vp->v_cachedid = vap->va_fileid;
 	}
 	if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
 		goto bad;
 	/*
 	 * Make sure that a VM object is created for VMIO support.
 	 */
 	if (vn_canvmio(vp) == TRUE) {
 #ifdef LOOKUP_SHARED
 		int flock;
 
 		if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
 			VOP_LOCK(vp, LK_UPGRADE, td);
 		/*
 		 * In cases where the object is marked as dead object_create
 		 * will unlock and relock exclusive.  It is safe to call in
 		 * here with a shared lock because we only examine fields that
 		 * the shared lock guarantees will be stable.  In the UPGRADE
 		 * case it is not likely that anyone has used this vnode yet
 		 * so there will be no contention.  The logic after this call
 		 * restores the requested locking state.
 		 */
 #endif
 		if ((error = vfs_object_create(vp, td, cred)) != 0) {
 			/*
 			 * The vnode was already opened, so undo that here
 			 * instead of going through the "bad" exit.
 			 */
 			VOP_UNLOCK(vp, 0, td);
 			VOP_CLOSE(vp, fmode, cred, td);
 			NDFREE(ndp, NDF_ONLY_PNBUF);
 			vrele(vp);
 			*flagp = fmode;
 			return (error);
 		}
 #ifdef LOOKUP_SHARED
 		flock = VOP_ISLOCKED(vp, td);
 		if (!exclusive && flock == LK_EXCLUSIVE)
 			VOP_LOCK(vp, LK_DOWNGRADE, td);
 #endif
 	}
 
 	/* Account for the new writer; vn_close() undoes this. */
 	if (fmode & FWRITE)
 		vp->v_writecount++;
 	*flagp = fmode;
 	return (0);
 	/*
 	 * Common failure exit: free the pathname buffer, unlock and release
 	 * the vnode, and hand the adjusted flags back to the caller.
 	 */
 bad:
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	*flagp = fmode;
 	return (error);
 }
 
 /*
  * Check whether the given (locked) vnode may be opened for writing.
  * A vnode flagged VV_TEXT is in use as program text and cannot be
  * written: ETXTBSY is returned for it, 0 for everything else.
  */
 int
 vn_writechk(vp)
 	register struct vnode *vp;
 {
 	int is_text;
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	is_text = (vp->v_vflag & VV_TEXT) != 0;
 	return (is_text ? ETXTBSY : 0);
 }
 
 /*
  * Close a vnode: drop the writer count taken for FWRITE opens, call
  * VOP_CLOSE and release the vnode reference.
  */
 int
 vn_close(vp, flags, file_cred, td)
 	register struct vnode *vp;
 	int flags;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	int rv;
 
 	if ((flags & FWRITE) != 0)
 		vp->v_writecount--;
 	rv = VOP_CLOSE(vp, flags, file_cred, td);
 	if (rv != EAGAIN) {
 		vrele(vp);
 		return (rv);
 	}
 	/*
 	 * XXX - EAGAIN from VOP_CLOSE means that it already did the vrele
 	 * itself, so it must not be repeated here and the close is
 	 * reported as successful.  The correct thing to do would be to
 	 * have all VOP_CLOSE instances do the vrele.
 	 */
 	return (0);
 }
 
 /*
  * Sequential heuristic - update fp->f_seqcount according to whether this
  * request continues where the previous one stopped, and return the count
  * shifted left by 16 bits (0 when the access is not sequential).
  */
 static __inline
 int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	int sequential;
 
 	sequential = (uio->uio_offset == 0 && fp->f_seqcount > 0) ||
 	    uio->uio_offset == fp->f_nextoff;
 
 	if (!sequential) {
 		/* Not sequential: draw the count down quickly. */
 		fp->f_seqcount = (fp->f_seqcount > 1) ? 1 : 0;
 		return (0);
 	}
 
 	/*
 	 * XXX the request is measured in units of BKVASIZE, i.e. the
 	 * default filesystem block size is assumed.  Not true in general,
 	 * but still a pretty good indicator of how sequential the
 	 * operations are.  The count saturates at 127.
 	 */
 	fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
 	if (fp->f_seqcount >= 127)
 		fp->f_seqcount = 127;
 	return (fp->f_seqcount << 16);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  *
  * rw:          UIO_READ or UIO_WRITE.
  * vp:          vnode to operate on.
  * base, len:   buffer and byte count for the single iovec.
  * offset:      file offset of the transfer.
  * segflg:      address space of base (stored in uio_segflg).
  * ioflg:       IO_* flags passed to VOP_READ/VOP_WRITE.  IO_NODELOCKED
  *              means the caller already holds the vnode lock, so no
  *              locking or write-suspension accounting is done here.
  *              IO_NOMACCHECK skips the MAC check (MAC kernels only).
  * active_cred: credential used for the MAC check, and for the I/O itself
  *              when file_cred is NULL.
  * file_cred:   credential used for the I/O when non-NULL.
  * aresid:      if non-NULL receives the residual count; if NULL, a
  *              non-zero residual without any other error becomes EIO.
  * td:          calling thread.
  *
  * Returns 0 or an errno value.
  */
 int
 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
     aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	caddr_t base;
 	int len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	int *aresid;
 	struct thread *td;
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	int error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			/* Character devices skip the write-suspension gate. */
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				return (error);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		} else {
 			/*
 			 * XXX This should be LK_SHARED but I don't trust VFS
 			 * enough to leave it like that until it has been
 			 * reviewed further.
 			 */
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		}
 
 	}
 	/* Describe the transfer as a uio with a single iovec. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_check_vnode_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_check_vnode_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		/* Prefer the file's credential when one was supplied. */
 		if (file_cred)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (rw == UIO_READ)
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		else
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 	}
 	/* Without aresid the caller cannot see a short transfer: use EIO. */
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		/* mp is NULL for VCHR; vn_finished_write() accepts that. */
 		if (rw == UIO_WRITE)
 			vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  * request is split up into smaller chunks and we try to avoid saturating
  * the buffer cache while potentially holding a vnode locked, so we 
  * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
  * to give other processes a chance to lock the vnode (either other processes
  * core'ing the same binary, or unrelated processes scanning the directory).
  */
 int
 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
     file_cred, aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	caddr_t base;
 	int len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	int *aresid;
 	struct thread *td;
 {
 	int error = 0;
 
 	do {
 		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
 
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, aresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 		offset += chunk;
 		base += chunk;
 		uio_yield();
 	} while (len);
 	if (aresid)
 		*aresid += len;
 	return (error);
 }
 
 /*
  * File table vnode read routine.
  *
  * fp:          open file; fp->f_data is the vnode, fp->f_cred is the
  *              credential passed to VOP_READ.
  * uio:         describes the transfer; uio->uio_td must equal td.
  * active_cred: credential of the caller, used for the MAC check.
  * flags:       when FOF_OFFSET is set the offset already in uio is used
  *              and fp->f_offset is left alone; otherwise the transfer
  *              starts at fp->f_offset and advances it.
  * td:          calling thread.
  *
  * Runs with Giant held and the vnode locked exclusively.  Returns 0 or an
  * errno value.
  */
 static int
 vn_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct vnode *vp;
 	int error, ioflag;
 
 	mtx_lock(&Giant);
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	vp = (struct vnode *)fp->f_data;
 	/* Map per-file flags onto the IO_* flags understood by VOP_READ. */
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
 	/*
 	 * According to McKusick the vn lock is protecting f_offset here.
 	 * Once this field has its own lock we can acquire this shared.
 	 */
 	vn_lock(vp, LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 
 	/* The sequential-access hint occupies the bits above bit 15. */
 	ioflag |= sequential_heuristic(uio, fp);
 
 #ifdef MAC
 	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	/* Remember where this request ended for the next heuristic call. */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  *
  * fp:          open file; fp->f_data is the vnode, fp->f_cred is the
  *              credential passed to VOP_WRITE.
  * uio:         describes the transfer; uio->uio_td must equal td.
  * active_cred: credential of the caller, used for the MAC check.
  * flags:       when FOF_OFFSET is set the offset already in uio is used
  *              and fp->f_offset is left alone; otherwise the transfer
  *              starts at fp->f_offset and advances it.
  * td:          calling thread.
  *
  * Runs with Giant held and the vnode locked exclusively.  Returns 0 or an
  * errno value, including any error from vn_start_write().
  */
 static int
 vn_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int error, ioflag;
 
 	mtx_lock(&Giant);
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	vp = (struct vnode *)fp->f_data;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	/* Map per-file and per-mount flags onto IO_* flags for VOP_WRITE. */
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	/*
 	 * Character devices skip the write-suspension gate; mp then stays
 	 * NULL, which vn_finished_write() accepts.
 	 */
 	mp = NULL;
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		mtx_unlock(&Giant);
 		return (error);
 	}
 	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 	ioflag |= sequential_heuristic(uio, fp);
 #ifdef MAC
 	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	if ((flags & FOF_OFFSET) == 0)
 		fp->f_offset = uio->uio_offset;
 	/* Remember where this request ended for the next heuristic call. */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * File table vnode stat routine: lock the vnode behind fp, let vn_stat()
  * fill in sb, unlock again and pass vn_stat()'s result back.
  */
 static int
 vn_statfile(fp, sb, active_cred, td)
 	struct file *fp;
 	struct stat *sb;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp;
 	int rv;
 
 	vp = (struct vnode *)fp->f_data;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	rv = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	return (rv);
 }
 
 /*
  * Stat a vnode; implementation for the stat syscall
  *
  * vp:          vnode to describe.
  * sb:          stat buffer to fill in; it is zeroed first, so it may be
  *              partially filled when EBADF or EOVERFLOW is returned.
  * active_cred: credential used for the MAC check and for VOP_GETATTR.
  * file_cred:   credential passed to the MAC check only.
  * td:          calling thread; suser(td) decides whether st_gen is
  *              reported.
  *
  * Returns 0, an error from the MAC check or VOP_GETATTR, EBADF for an
  * unrecognised vnode type, or EOVERFLOW when the size exceeds OFF_MAX.
  */
 int
 vn_stat(vp, sb, active_cred, file_cred, td)
 	struct vnode *vp;
 	register struct stat *sb;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	register struct vattr *vap;
 	int error;
 	u_short mode;
 
 #ifdef MAC
 	error = mac_check_vnode_stat(active_cred, file_cred, vp);
 	if (error)
 		return (error);
 #endif
 
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, active_cred, td);
 	if (error)
 		return (error);
 
 	vp->v_cachedfs = vap->va_fsid;
 	vp->v_cachedid = vap->va_fileid;
 
 	/*
 	 * Zero the spare stat fields
 	 */
 	bzero(sb, sizeof *sb);
 
 	/*
 	 * Copy from vattr table
 	 */
 	if (vap->va_fsid != VNOVAL)
 		sb->st_dev = vap->va_fsid;
 	else
 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 	sb->st_ino = vap->va_fileid;
 	/* Build the mode locally; it is stored in sb after the switch. */
 	mode = vap->va_mode;
 	switch (vap->va_type) {
 	case VREG:
 		mode |= S_IFREG;
 		break;
 	case VDIR:
 		mode |= S_IFDIR;
 		break;
 	case VBLK:
 		mode |= S_IFBLK;
 		break;
 	case VCHR:
 		mode |= S_IFCHR;
 		break;
 	case VLNK:
 		mode |= S_IFLNK;
 		/* This is a cosmetic change, symlinks do not have a mode. */
 		/*
 		 * NOTE(review): both branches below modify sb->st_mode, which
 		 * is overwritten by "sb->st_mode = mode" right after the
 		 * switch, so they currently have no visible effect.  Confirm
 		 * whether they were meant to adjust "mode" instead.
 		 */
 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
 			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
 		else
 			sb->st_mode |= ACCESSPERMS;	/* 0777 */
 		break;
 	case VSOCK:
 		mode |= S_IFSOCK;
 		break;
 	case VFIFO:
 		mode |= S_IFIFO;
 		break;
 	default:
 		return (EBADF);
 	};
 	sb->st_mode = mode;
 	sb->st_nlink = vap->va_nlink;
 	sb->st_uid = vap->va_uid;
 	sb->st_gid = vap->va_gid;
 	sb->st_rdev = vap->va_rdev;
 	if (vap->va_size > OFF_MAX)
 		return (EOVERFLOW);
 	sb->st_size = vap->va_size;
 	sb->st_atimespec = vap->va_atime;
 	sb->st_mtimespec = vap->va_mtime;
 	sb->st_ctimespec = vap->va_ctime;
 	sb->st_birthtimespec = vap->va_birthtime;
 
 	/*
 	 * According to www.opengroup.org, the meaning of st_blksize is
 	 *   "a filesystem-specific preferred I/O block size for this
 	 *    object.  In some filesystem types, this may vary from file
 	 *    to file"
 	 * Default to PAGE_SIZE after much discussion.
 	 *
 	 * Regular files report va_blocksize.  Disks report the largest of
 	 * si_bsize_best, si_bsize_phys and BLKDEV_IOSIZE.
 	 */
 
 	if (vap->va_type == VREG) {
 		sb->st_blksize = vap->va_blocksize;
 	} else if (vn_isdisk(vp, NULL)) {
 		sb->st_blksize = vp->v_rdev->si_bsize_best;
 		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
 			sb->st_blksize = vp->v_rdev->si_bsize_phys;
 		if (sb->st_blksize < BLKDEV_IOSIZE)
 			sb->st_blksize = BLKDEV_IOSIZE;
 	} else {
 		sb->st_blksize = PAGE_SIZE;
 	}
 	
 	sb->st_flags = vap->va_flags;
 	/* The generation number is reported only when suser() returns 0. */
 	if (suser(td))
 		sb->st_gen = 0;
 	else
 		sb->st_gen = vap->va_gen;
 
 #if (S_BLKSIZE == 512)
 	/* Optimize this case */
 	sb->st_blocks = vap->va_bytes >> 9;
 #else
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 #endif
 	return (0);
 }
 
 /*
  * File table vnode ioctl routine.
  *
  * fp:          open file; fp->f_data is the vnode.
  * com:         ioctl command.
  * data:        command argument/result buffer.
  * active_cred: credential passed to VOP_GETATTR and VOP_IOCTL.
  * td:          calling thread.
  *
  * FIONREAD, FIONBIO and FIOASYNC are answered here for regular files and
  * directories, and FIODTYPE is answered here for every vnode type.  All
  * other commands go to VOP_IOCTL.  After a successful TIOCSCTTY the
  * session's controlling-terminal vnode is switched to this vnode.
  *
  * Returns 0 or an errno value; ENOIOCTL from VOP_IOCTL is turned into
  * ENOTTY.
  */
 static int
 vn_ioctl(fp, com, data, active_cred, td)
 	struct file *fp;
 	u_long com;
 	void *data;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	register struct vnode *vp = ((struct vnode *)fp->f_data);
 	struct vnode *vpold;
 	struct vattr vattr;
 	int error;
 
 	switch (vp->v_type) {
 
 	case VREG:
 	case VDIR:
 		if (com == FIONREAD) {
 			/* Bytes between the current offset and end of file. */
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			error = VOP_GETATTR(vp, &vattr, active_cred, td);
 			VOP_UNLOCK(vp, 0, td);
 			if (error)
 				return (error);
 			*(int *)data = vattr.va_size - fp->f_offset;
 			return (0);
 		}
 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
 			return (0);			/* XXX */
 		/* FALLTHROUGH */
 
 	/*
 	 * With the return below compiled out, "default" shares the code of
 	 * the FIFO/character/block device cases, so every vnode type ends
 	 * up in the generic path.
 	 */
 	default:
 #if 0
 		return (ENOTTY);
 #endif
 	case VFIFO:
 	case VCHR:
 	case VBLK:
 		if (com == FIODTYPE) {
 			if (vp->v_type != VCHR && vp->v_type != VBLK)
 				return (ENOTTY);
 			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
 			return (0);
 		}
 		error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td);
 		if (error == ENOIOCTL) {
 #ifdef DIAGNOSTIC
 			Debugger("ENOIOCTL leaked through");
 #endif
 			error = ENOTTY;
 		}
 		if (error == 0 && com == TIOCSCTTY) {
 
 			/* Do nothing if reassigning same control tty */
 			sx_slock(&proctree_lock);
 			if (td->td_proc->p_session->s_ttyvp == vp) {
 				sx_sunlock(&proctree_lock);
 				return (0);
 			}
 
 			/* Reference the new vnode before publishing it. */
 			vpold = td->td_proc->p_session->s_ttyvp;
 			VREF(vp);
 			SESS_LOCK(td->td_proc->p_session);
 			td->td_proc->p_session->s_ttyvp = vp;
 			SESS_UNLOCK(td->td_proc->p_session);
 
 			sx_sunlock(&proctree_lock);
 
 			/* Get rid of reference to old control tty */
 			if (vpold)
 				vrele(vpold);
 		}
 		return (error);
 	}
 }
 
 /*
  * File table vnode poll routine.
  *
  * fp:          open file; fp->f_data is the vnode, fp->f_cred is the
  *              credential passed to VOP_POLL.
  * events:      requested poll events, passed through to VOP_POLL.
  * active_cred: credential of the caller, used only for the MAC check.
  * td:          calling thread.
  *
  * Returns the result of VOP_POLL.  On MAC kernels a failing MAC check
  * returns its error value instead.
  */
 static int
 vn_poll(fp, events, active_cred, td)
 	struct file *fp;
 	int events;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp;
 #ifdef MAC
 	int error;
 #endif
 
 	vp = (struct vnode *)fp->f_data;
 #ifdef MAC
 	/* The vnode is locked only for the duration of the MAC check. */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
 	VOP_UNLOCK(vp, 0, td);
 	if (error)
 		return (error);
 #endif
 
 	return (VOP_POLL(vp, events, fp->f_cred, td));
 }
 
 /*
  * Check that the vnode is still valid, and if so
  * acquire requested lock.
  *
  * vp:    vnode to lock.
  * flags: LK_* lock request.  LK_INTERLOCK means the caller already holds
  *        the vnode interlock; LK_RETRY means keep trying until the lock
  *        is obtained.
  * td:    calling thread.
  *
  * When built with DEBUG_LOCKS the function is named debug_vn_lock and
  * additionally records the caller's filename and line in the vnode.
  *
  * Returns the result of VOP_LOCK, or ENOENT when the vnode is marked
  * VI_XLOCK by another thread and LK_RETRY was not requested.
  */
 int
 #ifndef	DEBUG_LOCKS
 vn_lock(vp, flags, td)
 #else
 debug_vn_lock(vp, flags, td, filename, line)
 #endif
 	struct vnode *vp;
 	int flags;
 	struct thread *td;
 #ifdef	DEBUG_LOCKS
 	const char *filename;
 	int line;
 #endif
 {
 	int error;
 
 	do {
 		if ((flags & LK_INTERLOCK) == 0)
 			VI_LOCK(vp);
 		if ((vp->v_iflag & VI_XLOCK) && vp->v_vxproc != curthread) {
 			/*
 			 * Another thread has the vnode marked VI_XLOCK:
 			 * announce interest and sleep until woken.
 			 */
 			vp->v_iflag |= VI_XWANT;
 			msleep(vp, VI_MTX(vp), PINOD, "vn_lock", 0);
 			error = ENOENT;
 			if ((flags & LK_RETRY) == 0) {
 				VI_UNLOCK(vp);
 				return (error);
 			}
 		} 
 #ifdef	DEBUG_LOCKS
 		vp->filename = filename;
 		vp->line = line;
 #endif
 		/*
 		 * lockmgr drops interlock before it will return for
 		 * any reason.  So force the code above to relock it.
 		 */
 		error = VOP_LOCK(vp, flags | LK_NOPAUSE | LK_INTERLOCK, td);
 		flags &= ~LK_INTERLOCK;
 	} while (flags & LK_RETRY && error != 0);
 	return (error);
 }
 
 /*
  * File table vnode close routine.  The file's operations vector is
  * switched to badfileops before the vnode itself is closed through
  * vn_close().
  */
 static int
 vn_closefile(fp, td)
 	struct file *fp;
 	struct thread *td;
 {
 	struct vnode *vp;
 
 	fp->f_ops = &badfileops;
 	vp = (struct vnode *)fp->f_data;
 	return (vn_close(vp, fp->f_flag, fp->f_cred, td));
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  */
 int
 vn_start_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
 	struct mount *mp;
 	int error;
 
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 	/*
 	 * Check on status of suspension.
 	 */
 	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		if (flags & V_NOWAIT)
 			return (EWOULDBLOCK);
 		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
 		    "suspfs", 0);
 		if (error)
 			return (error);
 	}
 	if (flags & V_XSLEEP)
 		return (0);
 	mp->mnt_writeopcount++;
 	return (0);
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_write_suspend_wait(vp, mp, flags)
 	struct vnode *vp;
 	struct mount *mp;
 	int flags;
 {
 	int error;
 
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
 		return (0);
 	if (flags & V_NOWAIT)
 		return (EWOULDBLOCK);
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
 	    "suspfs", 0));
 }
 
 /*
  * A filesystem write operation has completed.  Drop the count of
  * operations in progress and, if a suspension is pending and this was the
  * last one, wake up the suspender so the suspension can take effect.
  * A NULL mount point is ignored.
  */
 void
 vn_finished_write(mp)
 	struct mount *mp;
 {
 
 	if (mp != NULL) {
 		if (--mp->mnt_writeopcount < 0)
 			panic("vn_finished_write: neg cnt");
 		if (mp->mnt_writeopcount <= 0 &&
 		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
 			wakeup(&mp->mnt_writeopcount);
 	}
 }
 
 /*
  * Request a filesystem to suspend write operations.
+ *
+ * Returns 0 once the mount is marked MNTK_SUSPENDED, or at once when a
+ * suspension had already been requested.  If VFS_SYNC() fails, the
+ * suspension is undone with vfs_write_resume() and that error is
+ * returned.
  */
-void
+int
 vfs_write_suspend(mp)
 	struct mount *mp;
 {
 	struct thread *td = curthread;
+	int error;
 
 	if (mp->mnt_kern_flag & MNTK_SUSPEND)
-		return;
+		return (0);
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	/*
 	 * Writers still in progress: sleep on the channel that
 	 * vn_finished_write() wakes when the count drops to zero.
 	 */
 	if (mp->mnt_writeopcount > 0)
 		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
-	VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td);
+	if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) {
+		vfs_write_resume(mp);
+		return (error);
+	}
 	mp->mnt_kern_flag |= MNTK_SUSPENDED;
+	return (0);
 }
 
 /*
  * Request a filesystem to resume write operations.  Does nothing unless a
  * suspension was requested; otherwise both suspension flags are cleared
  * and sleepers on mnt_writeopcount and mnt_flag are woken up.
  */
 void
 vfs_write_resume(mp)
 	struct mount *mp;
 {
 	int suspending;
 
 	suspending = (mp->mnt_kern_flag & MNTK_SUSPEND) != 0;
 	if (suspending) {
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 	}
 }
 
 /*
  * kqueue support for files: hand the knote to the VOP_KQFILTER operation
  * of the vnode behind the file.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct vnode *vp;
 
 	vp = (struct vnode *)fp->f_data;
 	return (VOP_KQFILTER(vp, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp, 0, td);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	}
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	}
 
 	/* authorize attribute removal as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 
 	return (error);
 }
Index: head/sys/sys/vnode.h
===================================================================
--- head/sys/sys/vnode.h	(revision 105901)
+++ head/sys/sys/vnode.h	(revision 105902)
@@ -1,726 +1,726 @@
 /*
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 /*
  * XXX - compatibility until lockmgr() goes away or all the #includes are
  * updated.
  */
 #include <sys/lockmgr.h>
 
 #include <sys/queue.h>
 #include <sys/_label.h>
 #include <sys/_lock.h>
 #include <sys/lock.h>
 #include <sys/_mutex.h>
 #include <sys/mutex.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD };
 
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 TAILQ_HEAD(buflists, buf);
 
 typedef	int	vop_t(void *);
 struct namecache;
 
 /*
  * Poll bookkeeping for a vnode; struct vnode refers to one of these
  * through its v_pollinfo pointer.
  */
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	f - freelist mutex
  *	i - interlock
  *	m - mntvnodes mutex
  *	p - pollinfo lock
  *	s - spechash mutex
  *	S - syncer mutex
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * XXX Not all fields are locked yet and some fields that are marked are not
  * locked consistently.  This is a work in progress.
  */
 
 struct vnode {
 	struct	mtx v_interlock;		/* lock for "i" things */
 	u_long	v_iflag;			/* i vnode flags (see below) */
 	int	v_usecount;			/* i ref count of users */
 	long	v_numoutput;			/* i writes in progress */
 	struct thread *v_vxproc;		/* i thread owning VXLOCK */
 	int	v_holdcnt;			/* i page & buffer references */
 	struct	buflists v_cleanblkhd;		/* i SORTED clean blocklist */
 	struct buf	*v_cleanblkroot;	/* i clean buf splay tree  */
 	struct	buflists v_dirtyblkhd;		/* i SORTED dirty blocklist */
 	struct buf	*v_dirtyblkroot;	/* i dirty buf splay tree */
 	u_long	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
 	struct vm_object *v_object;		/* v Place to store VM object */
 	daddr_t	v_lastw;			/* v last write (write cluster) */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation (cluster) */
 	int	v_clen;				/* v length of current cluster */
 	union {
 		struct mount	*vu_mountedhere;/* v ptr to mounted vfs (VDIR) */
 		struct socket	*vu_socket;	/* v unix ipc (VSOCK) */
 		struct {
 			struct cdev	*vu_cdev; /* v device (VCHR, VBLK) */
 			SLIST_ENTRY(vnode) vu_specnext;	/* s device aliases */
 		} vu_spec;
 		struct fifoinfo	*vu_fifoinfo;	/* v fifo (VFIFO) */
 	} v_un;
 	TAILQ_ENTRY(vnode) v_freelist;		/* f vnode freelist */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 	LIST_ENTRY(vnode) v_synclist;		/* S dirty vnode list */
 	enum	vtype v_type;			/* u vnode type */
 	const char *v_tag;			/* u type of underlying data */
 	void	*v_data;			/* u private data for fs */
 	struct	lock v_lock;			/* u used if fs don't have one */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 	vop_t	**v_op;				/* u vnode operations vector */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	u_long	v_id;				/* c capability identifier */
 	struct	vnode *v_dd;			/* c .. vnode */
 	u_long	v_ddid;				/* c .. capability identifier */
 	struct vpollinfo *v_pollinfo;		/* p Poll events */
 	struct label v_label;			/* MAC label for vnode */
 #ifdef	DEBUG_LOCKS
 	const char *filename;			/* Source file doing locking */
 	int line;				/* Line number doing locking */
 #endif
 	udev_t	v_cachedfs;			/* cached fs id */
 	ino_t	v_cachedid;			/* cached file id */
 };
 #define	v_mountedhere	v_un.vu_mountedhere
 #define	v_socket	v_un.vu_socket
 #define	v_rdev		v_un.vu_spec.vu_cdev
 #define	v_specnext	v_un.vu_spec.vu_specnext
 #define	v_fifoinfo	v_un.vu_fifoinfo
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* socket, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		udev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			udev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /*
  * Call vn_pollevent() on vp, but only when the vnode has a pollinfo
  * structure attached and at least one bit of `events' is set in that
  * structure's vpi_events mask.  Both arguments are evaluated more than
  * once.
  */
 #define	VN_POLLEVENT(vp, events)				\
 	do {							\
 		if ((vp)->v_pollinfo != NULL && 		\
 		    (vp)->v_pollinfo->vpi_events & (events))	\
 			vn_pollevent((vp), (events));		\
 	} while (0)
 
 /*
  * Hand hint `b' to KNOTE() for the si_note list embedded in the vnode's
  * pollinfo; does nothing when the vnode has no pollinfo attached.
  * `vp' is parenthesized at every use so that any pointer-valued expression
  * may be passed; it is evaluated more than once.
  */
 #define VN_KNOTE(vp, b)						\
 	do {							\
 		if ((vp)->v_pollinfo != NULL)			\
 			KNOTE(&(vp)->v_pollinfo->vpi_selinfo.si_note, (b)); \
 	} while (0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  */
 #define	VI_XLOCK	0x0001	/* vnode is locked to change vtype */
 #define	VI_XWANT	0x0002	/* thread is waiting for vnode */
 #define	VI_BWAIT	0x0004	/* waiting for output to complete */
 #define	VI_OLOCK	0x0008	/* vnode is locked waiting for an object */
 #define	VI_OWANT	0x0010	/* a thread is waiting for VOLOCK */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_AGE		0x0040	/* Insert vnode at head of free list */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_OBJDIRTY	0x0400	/* object might be dirty */
 /*
  * XXX VI_ONWORKLST could be replaced with a check for NULL list elements
  * in v_synclist.
  */
 #define	VI_ONWORKLST	0x0200	/* On syncer work-list */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_OBJBUF	0x0008	/* Allocate buffers in VM object */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_TEXT		0x0020	/* vnode is a pure text prototype */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	short		va_nlink;	/* number of references to file */
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	udev_t		va_fsid;	/* filesystem id */
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	udev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_SYNC		0x0004		/* do I/O synchronously */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_NDELAY	0x0010		/* FNDELAY flag set in file table */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_ASYNC	0x0080		/* bawrite rather than bdwrite */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOWDRAIN	0x0200		/* do not block on wdrain */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
  */
 #define	VEXEC	000100		/* execute/search permission */
 #define	VWRITE	000200		/* write permission */
 #define	VREAD	000400		/* read permission */
 #define	VSVTX	001000		/* save swapped text even after use */
 #define	VSGID	002000		/* set group id on execution */
 #define	VSUID	004000		/* set user id on execution */
 #define	VADMIN	010000		/* permission to administer */
 #define	VSTAT	020000		/* permission to retrieve attrs */
 #define	VAPPEND	040000		/* permission to write/append */
 #define	VALLPERM	(VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	DOCLOSE		0x0008	/* vclean: close active files */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 
 #define	VREF(vp)	vref(vp)
 
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 #define	VNODEOP_SET(f) \
 	C_SYSINIT(f##init, SI_SUB_VFS, SI_ORDER_SECOND, vfs_add_vnodeops, &f); \
 	C_SYSUNINIT(f##uninit, SI_SUB_VFS, SI_ORDER_SECOND, vfs_rm_vnodeops, &f);
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	int prtactive;			/* nonzero to call vprint() */
 extern	struct vattr va_null;		/* predefined null vattr structure */
 extern	int vfs_ioopt;
 
 /*
  * Macro/function to check for client cache inconsistency w.r.t. leasing.
  */
 #define	LEASE_READ	0x1		/* Check lease for readers */
 #define	LEASE_WRITE	0x2		/* Check lease for modifiers */
 
 
 extern void	(*lease_updatetime)(int deltat);
 
 /* Requires interlock */
 #define	VSHOULDFREE(vp)	\
 	(!((vp)->v_iflag & (VI_FREE|VI_DOOMED)) && \
 	 !(vp)->v_holdcnt && !(vp)->v_usecount && \
 	 (!(vp)->v_object || \
 	  !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
 
 /* Requires interlock */
 #define VMIGHTFREE(vp) \
 	(!((vp)->v_iflag & (VI_FREE|VI_DOOMED|VI_XLOCK)) &&	\
 	 LIST_EMPTY(&(vp)->v_cache_src) && !(vp)->v_usecount)
 
 /* Requires interlock */
 #define	VSHOULDBUSY(vp)	\
 	(((vp)->v_iflag & VI_FREE) && \
 	 ((vp)->v_holdcnt || (vp)->v_usecount))
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #endif /* _KERNEL */
 
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	int	 vdesc_offset;		/* offset in vector,first for speed */
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 	/*
 	 * Finally, we've got a list of private data (about each operation)
 	 * for each transport layer.  (Support to manage this list is not
 	 * yet part of BSD.)
 	 */
 	caddr_t	*vdesc_transports;
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 /*
  * Interlock for scanning list of vnodes attached to a mountpoint
  */
 extern struct mtx mntvnode_mtx;
 
 /*
  * This macro is very helpful in defining those offsets in the vdesc struct.
  *
  * This is stolen from X11R4.  I ignored all the fancy stuff for
  * Crays, so if you decide to port this to such a serious machine,
  * you might want to consult Intrinsic.h's XtOffset{,Of,To}.
  */
 #define	VOPARG_OFFSET(p_type,field) \
 	((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL)))
 #define	VOPARG_OFFSETOF(s_type,field) \
 	VOPARG_OFFSET(s_type*,field)
 #define	VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \
 	((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET)))
 
 
 /*
  * This structure is used to configure the new vnodeops vector.
  */
 struct vnodeopv_entry_desc {
 	struct vnodeop_desc *opve_op;   /* which operation this is */
 	vop_t *opve_impl;		/* code implementing this operation */
 };
 struct vnodeopv_desc {
 			/* ptr to the ptr to the vector where op should go */
 	vop_t ***opv_desc_vector_p;
 	struct vnodeopv_entry_desc *opv_desc_ops;   /* null terminated list */
 };
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.
  */
 void assert_vi_locked(struct vnode *vp, char *str);
 void assert_vi_unlocked(struct vnode *vp, char *str);
 void assert_vop_unlocked(struct vnode *vp, char *str);
 void assert_vop_locked(struct vnode *vp, char *str);
 void assert_vop_slocked(struct vnode *vp, char *str);
 void assert_vop_elocked(struct vnode *vp, char *str);
 void assert_vop_elocked_other(struct vnode *vp, char *str);
 
 /* These are called from within the actual VOPs. */
 void vop_rename_pre(void *a);
 void vop_strategy_pre(void *a);
 void vop_lookup_pre(void *a);
 void vop_lookup_post(void *a, int rc);
 void vop_lock_pre(void *a);
 void vop_lock_post(void *a, int rc);
 void vop_unlock_pre(void *a);
 void vop_unlock_post(void *a, int rc);
 
 /*
  * Lock-assertion wrappers.  With DEBUG_VFS_LOCKS defined, each macro calls
  * the assert_*() routine of the matching name; without it every macro
  * expands to nothing, so its arguments are not evaluated.
  */
 #ifdef DEBUG_VFS_LOCKS
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 /* Must name assert_vop_elocked_other(), the routine actually declared. */
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_elocked_other((vp), (str))
 #define	ASSERT_VOP_SLOCKED(vp, str)	assert_vop_slocked((vp), (str))
 
 #else
 
 #define	ASSERT_VOP_LOCKED(vp, str)
 #define	ASSERT_VOP_UNLOCKED(vp, str)
 #define	ASSERT_VOP_ELOCKED(vp, str)
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str)
 #define	ASSERT_VOP_SLOCKED(vp, str)
 #define	ASSERT_VI_UNLOCKED(vp, str)
 #define	ASSERT_VI_LOCKED(vp, str)
 
 #endif
 
 /*
  * VOCALL calls an op given an ops vector.  We break it out because BSD's
  * vclean changes the ops vector and then wants to call ops with the old
  * vector.
  */
 #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP))
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP))
 #define VDESC(OP) (& __CONCAT(OP,_desc))
 #define VOFFSET(OP) (VDESC(OP)->vdesc_offset)
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 #include "vnode_if.h"
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vnode;
 
 extern int	(*lease_check_hook)(struct vop_lease_args *);
 extern int	(*softdep_fsync_hook)(struct vnode *);
 extern int	(*softdep_process_worklist_hook)(struct mount *);
 
 struct	vnode *addaliasu(struct vnode *vp, udev_t nvp_rdev);
 int	bdevvp(dev_t dev, struct vnode **vpp);
 /* cache_* may belong in namei.h. */
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp);
 void	cache_purge(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
 int	cache_leaf_test(struct vnode *vp);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, vop_t **vops,
 	    struct vnode **vpp);
 int	lease_check(struct vop_lease_args *ap);
 int	spec_vnoperate(struct vop_generic_args *);
 int	speedup_syncer(void);
 #define textvp_fullpath(p, rb, rfb) \
 	vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb)
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
 	    mode_t acc_mode, struct ucred *cred, int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, mode_t acc_mode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vfinddev(dev_t dev, enum vtype type, struct vnode **vpp);
 void	vfs_add_vnodeops(const void *);
 void	vfs_rm_vnodeops(const void *);
 int	vflush(struct mount *mp, int rootrefs, int flags);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
 void	vgonel(struct vnode *vp, struct thread *td);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 int	vinvalbuf(struct vnode *vp, int save, struct ucred *cred,
 	    struct thread *td, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
 	    off_t length, int blksize);
 void	vprint(char *label, struct vnode *vp);
 int	vrecycle(struct vnode *vp, struct mtx *inter_lkp,
 	    struct thread *td);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 void	vn_finished_write(struct mount *mp);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	vn_lock(struct vnode *vp, int flags, struct thread *td);
 #ifdef	DEBUG_LOCKS
 int	debug_vn_lock(struct vnode *vp, int flags, struct thread *p,
 	    const char *filename, int line);
 #define vn_lock(vp,flags,p) debug_vn_lock(vp,flags,p,__FILE__,__LINE__)
 #endif
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    struct ucred *cred);
 void	vn_pollevent(struct vnode *vp, int events);
 void	vn_pollgone(struct vnode *vp);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, int *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, int *aresid,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 dev_t	vn_todev(struct vnode *vp);
 int	vn_write_suspend_wait(struct vnode *vp, struct mount *mp,
 	    int flags);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 int	vfs_object_create(struct vnode *vp, struct thread *td,
 	    struct ucred *cred);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp);
-void	vfs_write_suspend(struct mount *mp);
+int	vfs_write_suspend(struct mount *mp);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdlock(struct vop_lock_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_noislocked(struct vop_islocked_args *);
 int	vop_nolock(struct vop_lock_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_nounlock(struct vop_unlock_args *);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_revoke(struct vop_revoke_args *);
 int	vop_sharedlock(struct vop_lock_args *);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_defaultop(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	vop_stdcreatevobject(struct vop_createvobject_args *ap);
 int	vop_stddestroyvobject(struct vop_destroyvobject_args *ap);
 int	vop_stdgetvobject(struct vop_getvobject_args *ap);
 
 void	vfree(struct vnode *);
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void	vbusy(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 extern	vop_t **default_vnodeop_p;
 extern	vop_t **spec_vnodeop_p;
 extern	vop_t **dead_vnodeop_p;
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 105901)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 105902)
@@ -1,1926 +1,1929 @@
 /*
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/stdint.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #define KERNCRED thread0.td_ucred
 #define DEBUG 1
 
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int ffs_copyonwrite(struct vnode *, struct buf *);
 static int readblock(struct buf *, ufs2_daddr_t);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
  * synchronously write out copied blocks before allowing the
  * originals to be modified. Because of the rather severe speed
  * penalty that this imposes, the following flag allows this
  * crash persistence to be disabled.
  */
 /* Off by default; adjustable at run time through the sysctl below. */
 int dopersistence = 0;
 
 /*
  * Debugging knobs, all exported read/write under the "debug" sysctl tree.
  */
 #ifdef DEBUG
 #include <sys/sysctl.h>
 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 /* When non-zero, ffs_snapshot() vprint()s each busy vnode it expunges. */
 int snapdebug = 0;
 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 /*
  * When non-zero, ffs_snapshot() times how long the filesystem stays
  * suspended and printf()s the result together with the redo count.
  */
 int collectsnapstats = 0;
 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 	0, "");
 #endif /* DEBUG */
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  *
  * mp is the mounted filesystem to snapshot and snapfile is the user-space
  * pathname (looked up with UIO_USERSPACE) of the snapshot file to create.
  *
  * Returns 0 on success. Fails with ENOSPC when every one of the FSMAXSNAP
  * slots in fs_snapinum is in use, EEXIST when snapfile already exists,
  * EXDEV when its directory is not on mp, or with the error returned by
  * whichever lookup, allocation, read, write or suspend step failed.
  * Once the file has been created, every failure leaves through "out",
  * where the file is truncated to zero length and its vnode is released.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	ufs2_daddr_t numblks, blkno;
 	int error, cg, snaploc;
 	int i, size, len, loc;
 	int flag = mp->mnt_flag;	/* restored at "out" */
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0;
 	int32_t *lp;
 	void *space;
 	daddr_t *listhd;
 	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
 	struct snaphead *snaphead;
 	struct thread *td = curthread;
 	struct inode *ip, *xp;
 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
 	struct nameidata nd;
 	struct mount *wrtmp;
 	struct vattr vat;
 	struct vnode *vp, *xvp, *nvp;
 	struct uio auio;
 	struct iovec aiov;
 
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
 	 *
 	 * NOTE(review): no serialization is actually performed here;
 	 * presumably this records work still to be done -- confirm.
 	 */
 	/*
 	 * Assign a snapshot slot in the superblock.
 	 */
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 	if (snaploc == FSMAXSNAP)
 		return (ENOSPC);
 	/*
 	 * Create the snapshot file.
 	 */
 restart:
 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		vput(nd.ni_vp);
 		error = EEXIST;
 	}
 	if (nd.ni_dvp->v_mount != mp)
 		error = EXDEV;
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		return (error);
 	}
 	VATTR_NULL(&vat);
 	vat.va_type = VREG;
 	vat.va_mode = S_IRUSR;
 	vat.va_vaflags |= VA_EXCLUSIVE;
 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
 	/*
 	 * If write access cannot be obtained with V_NOWAIT, release the
 	 * lookup state, call vn_start_write() again with V_XSLEEP | PCATCH
 	 * and then redo the lookup from scratch.
 	 */
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &wrtmp,
 		    V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 	vput(nd.ni_dvp);
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		return (error);
 	}
 	vp = nd.ni_vp;
 	ip = VTOI(vp);
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error)
 		goto out;
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP(ip, i_size) = ip->i_size;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	if ((error = readblock(bp, numblks - 1)) != 0)
 		goto out;
 	bawrite(bp);
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
 	 * operations on the filesystem. We would like to just release
 	 * the allocated buffers without writing them since they will
 	 * be filled in below once we are ready to go, but this upsets
 	 * the soft update code, so we go ahead and write the new buffers.
 	 *
 	 * Allocate all indirect blocks and mark all of them as not
 	 * needing to be copied.
 	 */
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 		if (error)
 			goto out;
 		bdwrite(ibp);
 	}
 	/*
 	 * Allocate copies for the superblock and its summary information.
 	 */
 	error = UFS_BALLOC(vp, lfragtosize(fs, fs->fs_sblockloc),
 	    fs->fs_sbsize, KERNCRED, 0, &nbp);
 	if (error)
 		goto out;
 	bawrite(nbp);
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	for (loc = 0; loc < len; loc++) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Allocate all cylinder group blocks.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bdwrite(nbp);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
 	 * filesystem is still active, we hope that only a few
 	 * cylinder groups will change between now and when we
 	 * suspend operations. Thus, we will be able to quickly
 	 * touch up the few cylinder groups that changed during
 	 * the suspension period.
 	 */
 	len = howmany(fs->fs_ncg, NBBY);
 	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
 	bzero(fs->fs_active, len);
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
 		    KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			goto out;
 		}
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (error)
 			goto out;
 	}
 	/*
 	 * Change inode to snapshot type file.
 	 */
 	ip->i_flags |= SF_SNAPSHOT;
 	DIP(ip, i_flags) = ip->i_flags;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * Ensure that the snapshot is completely on disk.
 	 */
 	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
 		goto out;
 	/*
 	 * All allocations are done, so we can now snapshot the system.
 	 *
 	 * Rescind nice scheduling while running with the filesystem suspended.
 	 */
 	if (td->td_ksegrp->kg_nice > 0) {
 		saved_nice = td->td_ksegrp->kg_nice;
 		td->td_ksegrp->kg_nice = 0;
 	}
 	/*
 	 * Suspend operation on filesystem.
 	 *
 	 * Our own write reference is dropped before each suspend attempt
 	 * and taken again if the filesystem did not end up suspended. If
 	 * the suspend call itself fails, the write reference is taken
 	 * again before bailing out so that the vn_finished_write() at
 	 * "out" stays balanced.
 	 */
 	for (;;) {
 		vn_finished_write(wrtmp);
-		vfs_write_suspend(vp->v_mount);
+		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
+			vn_start_write(NULL, &wrtmp, V_WAIT);
+			goto out;
+		}
 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 			break;
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
 	if (collectsnapstats)
 		nanotime(&starttime);
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 			continue;
 		redo++;
 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
 			KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			goto out1;
 		}
 		error = cgaccount(cg, vp, nbp, 2);
 		bawrite(nbp);
 		if (error)
 			goto out1;
 	}
 	/*
 	 * Grab a copy of the superblock and its summary information.
 	 * We delay writing it until the suspension is released below.
 	 */
 	error = bread(vp, fragstoblks(fs, fs->fs_sblockloc), fs->fs_bsize,
 	    KERNCRED, &sbp);
 	if (error) {
 		brelse(sbp);
 		sbp = NULL;
 		goto out1;
 	}
 	loc = blkoff(fs, lfragtosize(fs, fs->fs_sblockloc));
 	copy_fs = (struct fs *)(sbp->b_data + loc);
 	bcopy(fs, copy_fs, fs->fs_sbsize);
 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 		copy_fs->fs_clean = 1;
 	if (fs->fs_sbsize < SBLOCKSIZE)
 		bzero(&sbp->b_data[loc + fs->fs_sbsize],
 		    SBLOCKSIZE - fs->fs_sbsize);
 	size = blkroundup(fs, fs->fs_cssize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	copy_fs->fs_csp = space;
 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 	(char *)space += fs->fs_cssize;
 	/*
 	 * If the summary area does not end on a block boundary, read the
 	 * remaining fragments of its last block from the device so that
 	 * the copy is padded out to a whole block.
 	 */
 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
 	i = fs->fs_frag - loc % fs->fs_frag;
 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 	if (len > 0) {
 		if ((error = bread(ip->i_devvp,
 		    fsbtodb(fs, fs->fs_csaddr + loc),
 		    len, KERNCRED, &bp)) != 0) {
 			brelse(bp);
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		bcopy(bp->b_data, space, (u_int)len);
 		(char *)space += len;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	if (fs->fs_contigsumsize > 0) {
 		copy_fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 	/*
 	 * We must check for active files that have been unlinked
 	 * (e.g., with a zero link count). We have to expunge all
 	 * trace of these files from the snapshot so that they are
 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 	 * spec_strategy about writing on a suspended filesystem.
 	 * Note that we skip unlinked snapshot files as they will
 	 * be handled separately below.
 	 */
 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 	mtx_lock(&mntvnode_mtx);
 loop:
 	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
 		/*
 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
 		 * Start over if it has (it won't be on the list anymore).
 		 */
 		if (xvp->v_mount != mp)
 			goto loop;
 		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
 		mtx_unlock(&mntvnode_mtx);
 		mp_fixme("Unlocked GETATTR.");
 		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
 		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
 		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
 		    vat.va_nlink > 0)) {
 			mtx_lock(&mntvnode_mtx);
 			continue;
 		}
 		if (snapdebug)
 			vprint("ffs_snapshot: busy vnode", xvp);
 		/*
 		 * NOTE(review): this jumps back to "loop" without having
 		 * retaken mntvnode_mtx, which was dropped above -- confirm
 		 * whether the list walk is meant to restart unlocked.
 		 */
 		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
 			goto loop;
 		xp = VTOI(xvp);
 		/*
 		 * If there is a fragment, clear it here.
 		 */
 		blkno = 0;
 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 		if (loc < NDADDR) {
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len < fs->fs_bsize) {
 				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
 				    len, xp->i_number);
 				blkno = DIP(xp, i_db[loc]);
 				DIP(xp, i_db[loc]) = 0;
 			}
 		}
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY);
 		/* Put back the fragment pointer that was cleared above. */
 		if (blkno)
 			DIP(xp, i_db[loc]) = blkno;
 		if (!error)
 			error = ffs_freefile(copy_fs, vp, xp->i_number,
 			    xp->i_mode);
 		VOP_UNLOCK(xvp, 0, td);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		mtx_lock(&mntvnode_mtx);
 	}
 	mtx_unlock(&mntvnode_mtx);
 	/*
 	 * If there already exist snapshots on this filesystem, grab a
 	 * reference to their shared lock. If this is the first snapshot
 	 * on this filesystem, we need to allocate a lock for the snapshots
 	 * to share. In either case, acquire the snapshot lock and give
 	 * up our original private lock.
 	 */
 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
 	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
 		VI_LOCK(vp);
 		vp->v_vnlock = ITOV(xp)->v_vnlock;
 	} else {
 		struct lock *lkp;
 
 		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
 		    M_WAITOK);
 		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
 		    LK_CANRECURSE | LK_NOPAUSE);
 		VI_LOCK(vp);
 		vp->v_vnlock = lkp;
 	}
 	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
 	VI_LOCK(vp);
 	lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
 	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	fs->fs_snapinum[snaploc] = ip->i_number;
 	if (ip->i_nextsnap.tqe_prev != 0)
 		panic("ffs_snapshot: %d already on list", ip->i_number);
 	ASSERT_VOP_LOCKED(ip->i_devvp, "ffs_snapshot devvp");
 	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
 	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
 	ip->i_devvp->v_vflag |= VV_COPYONWRITE;
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 	vp->v_vflag |= VV_SYSTEM;
 out1:
 	/*
 	 * Resume operation on filesystem.
 	 */
 	vfs_write_resume(vp->v_mount);
 	vn_start_write(NULL, &wrtmp, V_WAIT);
 	if (collectsnapstats && starttime.tv_sec > 0) {
 		nanotime(&endtime);
 		timespecsub(&endtime, &starttime);
 		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
 		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 	}
 	/*
 	 * Every failure path that reaches out1 either never obtained sbp
 	 * or has already written it and set it to NULL, so a NULL sbp
 	 * means there is nothing further to finish.
 	 */
 	if (sbp == NULL)
 		goto out;
 	/*
 	 * Copy allocation information from all the snapshots in
 	 * this snapshot and then expunge them from its view.
 	 */
 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
 	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
 		if (xp == ip)
 			break;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP);
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Allocate the space for the list of preallocated snapshot blocks.
 	 * The first entry of the list holds its own length; i_snapblklist
 	 * is then used as a fill cursor and reset to listhd further below.
 	 */
 	ip->i_snaplistsize = fragstoblks(fs, dbtofsb(fs, DIP(ip,i_blocks))) + 1;
 	MALLOC(listhd, daddr_t *, ip->i_snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	ip->i_snapblklist = listhd;
 	*ip->i_snapblklist++ = ip->i_snaplistsize;
 	/*
 	 * Expunge the blocks used by the snapshots from the set of
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
 	else
 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(listhd, M_UFSMNT);
 		goto done;
 	}
 	/*
 	 * Write out the list of allocated blocks to the end of the snapshot.
 	 */
 	if (ip->i_snapblklist - listhd != ip->i_snaplistsize)
 		printf("Snaplist mismatch, got %jd should be %jd\n",
 		    (intmax_t)(ip->i_snapblklist - listhd),
 		    (intmax_t)ip->i_snaplistsize);
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)listhd;
 	aiov.iov_len = ip->i_snaplistsize * sizeof(daddr_t);
 	auio.uio_resid = aiov.iov_len;;
 	auio.uio_offset = ip->i_size;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(listhd, M_UFSMNT);
 		goto done;
 	}
 	ip->i_snapblklist = listhd;
 	/*
 	 * Write the superblock and its summary information
 	 * to the snapshot.
 	 */
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	space = copy_fs->fs_csp;
 	for (loc = 0; loc < len; loc++) {
 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			fs->fs_snapinum[snaploc] = 0;
 			FREE(listhd, M_UFSMNT);
 			ip->i_snapblklist = NULL;
 			goto done;
 		}
 		bcopy(space, nbp->b_data, fs->fs_bsize);
 		space = (char *)space + fs->fs_bsize;
 		bawrite(nbp);
 	}
 done:
 	/*
 	 * Release the private copy of the summary information and start
 	 * the write of the buffer holding the snapshot's superblock copy.
 	 */
 	free(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
 out:
 	/*
 	 * Common exit: restore the saved nice value, free the active
 	 * cylinder group map and restore the mount flags. On error the
 	 * snapshot file is truncated and its vnode released with vput();
 	 * on success the vnode is only unlocked.
 	 */
 	if (saved_nice > 0)
 		td->td_ksegrp->kg_nice = saved_nice;
 	if (fs->fs_active != 0) {
 		FREE(fs->fs_active, M_DEVBUF);
 		fs->fs_active = 0;
 	}
 	mp->mnt_flag = flag;
 	if (error)
 		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
 	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 	if (error)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(wrtmp);
 	return (error);
 }
 
 /*
  * Copy a cylinder group map. All the unallocated blocks are marked
  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  * if they are later written. If passno is one, then this is a first
  * pass, so only setting needs to be done. If passno is 2, then this
  * is a revision to a previous pass which must be undone as the
  * replacement pass is done.
  *
  * cg is the cylinder group number, vp the snapshot vnode and nbp the
  * snapshot buffer that receives the copy of the cylinder group block.
  * nbp is filled in but not released here; that is left to the caller.
  *
  * Returns 0 on success, EIO if the cylinder group fails its magic
  * number check, or the error from bread() or UFS_BALLOC().
  */
 static int
 cgaccount(cg, vp, nbp, passno)
 	int cg;
 	struct vnode *vp;
 	struct buf *nbp;
 	int passno;
 {
 	struct buf *bp, *ibp;
 	struct inode *ip;
 	struct cg *cgp;
 	struct fs *fs;
 	ufs2_daddr_t base, numblks;
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/* Read the cylinder group block from the underlying device. */
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, KERNCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return (EIO);
 	}
 	/*
 	 * Set this cylinder group's bit in the active map; ffs_snapshot()
 	 * tests the same bit to decide which groups need a second pass.
 	 */
 	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
 	/* Copy the group into the snapshot buffer, zeroing any tail. */
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
 		    fs->fs_bsize - fs->fs_cgsize);
 	if (passno == 2)
 		nbp->b_flags |= B_VALIDSUSPWRT;
 	/*
 	 * base is the first full block of this cylinder group and len the
 	 * number of blocks in it, clipped at the end of the filesystem.
 	 * loc counts blocks relative to the start of the group.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	len = howmany(fs->fs_fpg, fs->fs_frag);
 	base = cg * fs->fs_fpg / fs->fs_frag;
 	if (base + len >= numblks)
 		len = numblks - base - 1;
 	loc = 0;
 	/*
 	 * Blocks below NDADDR are tracked by the direct block pointers
 	 * held in the snapshot inode itself.
 	 */
 	if (base < NDADDR) {
 		for ( ; loc < NDADDR; loc++) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				DIP(ip, i_db[loc]) = BLK_NOCOPY;
 			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				DIP(ip, i_db[loc]) = 0;
 			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				panic("ffs_snapshot: lost direct block");
 		}
 	}
 	/*
 	 * The remaining blocks are tracked through the snapshot's indirect
 	 * blocks; fetch the one covering the current position.
 	 */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 	indiroff = (base + loc - NDADDR) % NINDIR(fs);
 	for ( ; loc < len; loc++, indiroff++) {
 		/* Ran off the end of this indirect block; get the next. */
 		if (indiroff >= NINDIR(fs)) {
 			if (passno == 2)
 				ibp->b_flags |= B_VALIDSUSPWRT;
 			bawrite(ibp);
 			error = UFS_BALLOC(vp,
 			    lblktosize(fs, (off_t)(base + loc)),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error) {
 				brelse(bp);
 				return (error);
 			}
 			indiroff = 0;
 		}
 		/* Same marking as for the direct blocks, per pointer width. */
 		if (ip->i_ump->um_fstype == UFS1) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				panic("ffs_snapshot: lost indirect block");
 			continue;
 		}
 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 		else if (passno == 2 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 		else if (passno == 1 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			panic("ffs_snapshot: lost indirect block");
 	}
 	bqrelse(bp);
 	if (passno == 2)
 		ibp->b_flags |= B_VALIDSUSPWRT;
 	bdwrite(ibp);
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  *
  * snapvp is the snapshot being built and cancelip the inode to remove
  * from its view. acctfunc is applied to every run of block pointers
  * owned by cancelip, with fs and expungetype (BLK_SNAP or BLK_NOCOPY)
  * passed through to it. Returns 0 or the first error encountered.
  */
 static int
 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs1_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Account for the pointers held in the inode itself. The range
 	 * runs from di_db[0] to di_ib[NIADDR], which relies on di_ib[]
 	 * immediately following di_db[] in the dinode.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 	    &cancelip->i_din1->di_ib[NIADDR], fs, 0, expungetype)))
 		return (error);
 	/*
 	 * Walk each level of indirection in turn. blksperindir is the
 	 * number of data blocks reached through one pointer at the current
 	 * level, lbn the (negative) logical block number of the indirect
 	 * block, rlbn the first data block it maps and len the number of
 	 * data blocks still to be covered.
 	 */
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = cancelip->i_din1->di_db[lbn];
 	} else {
 		/*
 		 * Look up the current pointer for lbn in the snapshot's
 		 * indirect block; P_COWINPROGRESS is set on the process
 		 * for the duration of the allocation call.
 		 */
 		td->td_proc->p_flag |= P_COWINPROGRESS;
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 	    fs->fs_bsize, KERNCRED, 0, &bp);
 	if (error)
 		return (error);
 	/*
 	 * NOTE(review): when readblock() fails, bp is returned to the
 	 * caller's error path without being released, unlike the matching
 	 * path in indiracct_ufs1(); this looks like a buffer leak -- confirm.
 	 */
 	if (blkno == 0 && (error = readblock(bp, lbn)))
 		return (error);
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 	bdwrite(bp);
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  */ 
 static int
 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs1_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs1_daddr_t last, *bap;
 	struct buf *bp;
 
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
 		panic("indiracct: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Run the snapshot accounting pass over a run of block pointers and,
  * provided that pass succeeded, follow it with the map accounting pass.
  * The first non-zero error is returned.
  */
 static int
 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rv;
 
 	rv = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rv == 0)
 		rv = mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rv);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs1_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din1->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs1_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Free, in the block maps of fs, every block named by the pointers in
  * [oldblkp, lastblkp); lblkno is the logical block number of the first
  * pointer. When expungetype is BLK_SNAP, the logical block number of
  * each real block address is also appended to the snapshot's
  * i_snapblklist.
  */
 static int
 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs1_daddr_t *cur, fsblk;
 	ufs_lbn_t curlbn;
 
 	curlbn = lblkno;
 	for (cur = oldblkp; cur < lastblkp; cur++, curlbn++) {
 		fsblk = *cur;
 		if (fsblk == 0 || fsblk == BLK_NOCOPY)
 			continue;
 		if (fsblk == BLK_SNAP) {
 			/* The pointer is a marker; derive the address. */
 			fsblk = blkstofrags(fs, curlbn);
 		} else if (expungetype == BLK_SNAP) {
 			*snapip->i_snapblklist++ = curlbn;
 		}
 		ffs_blkfree(fs, vp, fsblk, fs->fs_bsize, snapino);
 	}
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  *
  * snapvp is the snapshot being built and cancelip the inode to remove
  * from its view. acctfunc is applied to every run of block pointers
  * owned by cancelip, with fs and expungetype (BLK_SNAP or BLK_NOCOPY)
  * passed through to it. Returns 0 or the first error encountered.
  */
 static int
 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs2_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Account for the pointers held in the inode itself. The range
 	 * runs from di_db[0] to di_ib[NIADDR], which relies on di_ib[]
 	 * immediately following di_db[] in the dinode.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 	    &cancelip->i_din2->di_ib[NIADDR], fs, 0, expungetype)))
 		return (error);
 	/*
 	 * Walk each level of indirection in turn. blksperindir is the
 	 * number of data blocks reached through one pointer at the current
 	 * level, lbn the (negative) logical block number of the indirect
 	 * block, rlbn the first data block it maps and len the number of
 	 * data blocks still to be covered.
 	 */
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = cancelip->i_din2->di_db[lbn];
 	} else {
 		/*
 		 * Look up the current pointer for lbn in the snapshot's
 		 * indirect block; P_COWINPROGRESS is set on the process
 		 * for the duration of the allocation call.
 		 */
 		td->td_proc->p_flag |= P_COWINPROGRESS;
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 	    fs->fs_bsize, KERNCRED, 0, &bp);
 	if (error)
 		return (error);
 	/*
 	 * NOTE(review): when readblock() fails, bp is returned to the
 	 * caller's error path without being released, unlike the matching
 	 * path in indiracct_ufs1(); this looks like a buffer leak -- confirm.
 	 */
 	if (blkno == 0 && (error = readblock(bp, lbn)))
 		return (error);
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 	bdwrite(bp);
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * blkno is the disk address of the indirect block to process and lbn
  * the logical block number under which it is buffered on cancelvp;
  * rlbn is the logical block used to look the chain up with
  * ufs_getlbns(). level is the number of indirect levels remaining
  * below this block. remblks and blksperindir together bound how many
  * pointers of this block are handed to acctfunc. Returns 0 or the
  * first error encountered.
  */
 static int
 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs2_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs2_daddr_t last, *bap;
 	struct buf *bp;
 
 	/*
 	 * Cross-check the caller's idea of where this indirect block
 	 * lives against the chain computed for rlbn.
 	 */
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
 		panic("indiracct: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	/* Only go to the device when the buffer holds no valid data. */
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/*
 	 * Work on a private copy of the pointers so that the buffer
 	 * can be released before acctfunc and the recursion below run.
 	 */
 	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	/* The private copy is released on every exit path. */
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Do both snap accounting and map accounting.
  */
 static int
 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int error;
 
 	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 		return (error);
 	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  *
  * Each real block address found in [oldblkp, lastblkp) is used as an
  * index into snapshot vp, and the pointer found there is set to
  * expungetype. Zero, BLK_NOCOPY and BLK_SNAP entries are skipped.
  * lblkno is not used by this routine. Returns 0, or the error from
  * UFS_BALLOC if the indirect block holding a pointer cannot be
  * obtained.
  */
 static int
 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs2_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		/*
 		 * Locate the snapshot's own pointer for this block:
 		 * either a direct pointer in the inode or an entry in
 		 * an indirect block (ibp is only valid in that case).
 		 */
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din2->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs2_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			/* Nothing was modified; just drop the buffer. */
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			/* The slot must still be unclaimed. */
 			if (*blkp != 0)
 				panic("snapacct: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Account for a set of blocks allocated in a snapshot inode.
  *
  * Every entry in [oldblkp, lastblkp) other than zero and BLK_NOCOPY
  * is passed to ffs_blkfree(). A BLK_SNAP entry is freed at the
  * address that corresponds to its logical block number; any other
  * entry seen while expunging with BLK_SNAP has its logical block
  * number appended to the snapshot's block hints list.
  */
 static int
 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs2_daddr_t *cur;
 	ufs2_daddr_t daddr;
 
 	for (cur = oldblkp; cur < lastblkp; cur++, lblkno++) {
 		daddr = *cur;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP) {
 				daddr = blkstofrags(fs, lblkno);
 			} else if (expungetype == BLK_SNAP) {
 				*snapip->i_snapblklist++ = lblkno;
 			}
 			ffs_blkfree(fs, vp, daddr, fs->fs_bsize, snapino);
 		}
 	}
 	return (0);
 }
 
 /*
  * Decrement extra reference on snapshot when last name is removed.
  * It will not be freed until the last open reference goes away.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 	struct inode *xp;
 	struct fs *fs;
 	int snaploc;
 
 	/*
 	 * Find snapshot in incore list.
 	 */
 	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
 		if (xp == ip)
 			break;
 	if (xp == 0)
 		printf("ffs_snapgone: lost snapshot vnode %d\n",
 		    ip->i_number);
 	else
 		vrele(ITOV(ip));
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
 	fs = ip->i_fs;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
 			break;
 	if (snaploc < FSMAXSNAP) {
 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 			if (fs->fs_snapinum[snaploc] == 0)
 				break;
 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 		}
 		fs->fs_snapinum[snaploc - 1] = 0;
 	}
 }
 
 /*
  * Prepare a snapshot file for being removed.
  *
  * The snapshot is taken off the device's incore snapshot list (if it
  * is on it), its block hints list is freed, BLK_NOCOPY and BLK_SNAP
  * markers are cleared from its block pointers, blocks it had claimed
  * are offered to the remaining snapshots through ffs_snapblkfree(),
  * and finally the SF_SNAPSHOT flag is cleared from the inode.
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct vnode *devvp;
 	struct lock *lkp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	ufs2_daddr_t numblks, blkno, dblk;
 	int error, loc, last;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
 	if (ip->i_nextsnap.tqe_prev != 0) {
 		/*
 		 * Switch the vnode back from the lock shared by the
 		 * snapshots to its own v_lock: take v_lock, repoint
 		 * v_vnlock at it, then release the shared lock.
 		 */
 		VI_LOCK(vp);
 		lockmgr(&vp->v_lock, LK_INTERLOCK|LK_EXCLUSIVE, VI_MTX(vp), td);
 		VI_LOCK(vp);
 		lkp = vp->v_vnlock;
 		vp->v_vnlock = &vp->v_lock;
 		lockmgr(lkp, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
 		devvp = ip->i_devvp;
 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		ASSERT_VOP_LOCKED(devvp, "ffs_snapremove devvp");
 		/*
 		 * With no snapshots left the shared lock is destroyed
 		 * and the copy-on-write hook is removed from the device.
 		 */
 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
 			lockdestroy(lkp);
 			FREE(lkp, M_UFSMNT);
 			devvp->v_rdev->si_copyonwrite = 0;
 			devvp->v_vflag &= ~VV_COPYONWRITE;
 		}
 	}
 	/*
 	 * Get rid of its hints list.
 	 */
 	if (ip->i_snapblklist != NULL) {
 		FREE(ip->i_snapblklist, M_UFSMNT);
 		ip->i_snapblklist = NULL;
 	}
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 */
 	for (blkno = 1; blkno < NDADDR; blkno++) {
 		dblk = DIP(ip, i_db[blkno]);
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP(ip, i_db[blkno]) = 0;
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 		     ip->i_number))) {
 			/* Another snapshot took over the claimed block. */
 			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
 			DIP(ip, i_db[blkno]) = 0;
 		}
 	}
 	/*
 	 * Repeat the above for the pointers held in indirect blocks,
 	 * one indirect block at a time. An indirect block that cannot
 	 * be obtained is skipped.
 	 *
 	 * NOTE(review): the claimed-block test below compares against
 	 * blkstofrags(fs, blkno), i.e. the first block mapped by this
 	 * indirect block, not blkno + loc, and "last" is derived from
 	 * fs->fs_size - blkno -- confirm both are intended.
 	 */
 	numblks = howmany(ip->i_size, fs->fs_bsize);
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		for (loc = 0; loc < last; loc++) {
 			/* UFS1 indirect blocks hold ufs1_daddr_t entries. */
 			if (ip->i_ump->um_fstype == UFS1) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
 				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 				     fs->fs_bsize, ip->i_number))) {
 					ip->i_din1->di_blocks -=
 					    btodb(fs->fs_bsize);
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				}
 				continue;
 			}
 			/* Otherwise the entries are ufs2_daddr_t. */
 			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
 			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 			     fs->fs_bsize, ip->i_number))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			}
 		}
 		bawrite(ibp);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP(ip, i_flags) = ip->i_flags;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
  * uncopied part of one of the snapshots. It will be freed if it is
  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
  * If a fragment is being freed, then all snapshots that care about
  * it must make a copy since a snapshot file can only claim full sized
  * blocks. Note that if more than one snapshot file maps the block,
  * we can pick one at random to claim it. Since none of the snapshots
  * can change, we are assured that they will all see the same unmodified
  * image. When deleting a snapshot file (see ffs_snapremove above), we
  * must push any of these claimed blocks to one of the other snapshots
  * that maps it. These claimed blocks are easily identified as they will
  * have a block number equal to their logical block number within the
  * snapshot. A copied block can never have this property because they
  * must always have been allocated from a BLK_NOCOPY location.
  *
  * bno is the address of the block or fragment being freed, size its
  * length in bytes, and inum the inode it belonged to (used here only
  * in debugging output). devvp is the device vnode whose snapshot list
  * is consulted.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct inode *ip;
 	struct vnode *vp;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t blkno;
 	int indiroff = 0, error = 0, claimedblk = 0;
 	struct snaphead *snaphead;
 
 	lbn = fragstoblks(fs, bno);
 	snaphead = &devvp->v_rdev->si_snapshots;
 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			/*
 			 * The pointer lives in an indirect block; ibp
 			 * stays held until one of the branches below
 			 * releases or writes it.
 			 */
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			td->td_proc->p_flag |= P_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_proc->p_flag &= ~P_COWINPROGRESS;
 			VOP_UNLOCK(vp, 0, td);
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 		}
 		/*
 		 * Check to see if block needs to be copied.
 		 */
 		if (blkno == 0) {
 			/*
 			 * A block that we map is being freed. If it has not
 			 * been claimed yet, we will claim or copy it (below).
 			 */
 			claimedblk = 1;
 		} else if (blkno == BLK_SNAP) {
 			/*
 			 * No previous snapshot claimed the block,
 			 * so it will be freed and become a BLK_NOCOPY
 			 * (don't care) for us.
 			 */
 			if (claimedblk)
 				panic("snapblkfree: inconsistent block type");
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			if (lbn < NDADDR) {
 				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			}
 			VOP_UNLOCK(vp, 0, td);
 			continue;
 		} else /* BLK_NOCOPY or default */ {
 			/*
 			 * If the snapshot has already copied the block
 			 * (default), or does not care about the block,
 			 * it is not needed.
 			 */
 			if (lbn >= NDADDR)
 				bqrelse(ibp);
 			continue;
 		}
 		/*
 		 * If this is a full size block, we will just grab it
 		 * and assign it to the snapshot inode. Otherwise we
 		 * will proceed to copy it. See explanation for this
 		 * routine as to why only a single snapshot needs to
 		 * claim this block.
 		 */
 		if (size == fs->fs_bsize) {
 #ifdef DEBUG
 			if (snapdebug)
 				printf("%s %d lbn %jd from inum %d\n",
 				    "Grabonremove: snapino", ip->i_number,
 				    (intmax_t)lbn, inum);
 #endif
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			if (lbn < NDADDR) {
 				DIP(ip, i_db[lbn]) = bno;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			}
 			DIP(ip, i_blocks) += btodb(size);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			VOP_UNLOCK(vp, 0, td);
 			/* Claimed: tell the caller not to free the block. */
 			return (1);
 		}
 		if (lbn >= NDADDR)
 			bqrelse(ibp);
 		/*
 		 * Allocate the block into which to do the copy. Note that this
 		 * allocation will never require any additional allocations for
 		 * the snapshot inode.
 		 */
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		td->td_proc->p_flag |= P_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
 		if (error) {
 			VOP_UNLOCK(vp, 0, td);
 			break;
 		}
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 			    "Copyonremove: snapino ", ip->i_number,
 			    (intmax_t)lbn, "for inum", inum, size,
 			    (intmax_t)cbp->b_blkno);
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			VOP_UNLOCK(vp, 0, td);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(cbp, lbn)) != 0) {
 			/* The read failed; write out a zeroed copy instead. */
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			VOP_UNLOCK(vp, 0, td);
 			break;
 		}
 		VOP_UNLOCK(vp, 0, td);
 		/*
 		 * Keep the first copy around as the source for any
 		 * further snapshots; it is written after the loop.
 		 */
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			VOP_UNLOCK(vp, 0, td);
 		}
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 */
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  *
  * Each inode listed in the superblock's snapshot table is fetched
  * and checked. Entries that are not snapshots, or that are in the
  * old snapshot format, are dropped from the table. For the rest the
  * block hints list is read in, the vnode is switched over to the
  * lock shared by all snapshots of the filesystem, the inode is put
  * on the device's snapshot list and copy-on-write is enabled on the
  * device. A snapshot whose hints list cannot be read is released
  * again and left inactive.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs = ump->um_fs;
 	struct thread *td = curthread;
 	struct snaphead *snaphead;
 	struct vnode *vp;
 	struct inode *ip, *xp;
 	struct uio auio;
 	struct iovec aiov;
 	void *listhd;
 	char *reason;
 	int error, snaploc, loc;
 
 	/*
 	 * XXX The following needs to be set before UFS_TRUNCATE or
 	 * VOP_READ can be called.
 	 */
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	/*
 	 * Process each snapshot listed in the superblock.
 	 */
 	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		/* The table is dense, so an empty slot ends it. */
 		if (fs->fs_snapinum[snaploc] == 0)
 			return;
 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
 		    LK_EXCLUSIVE, &vp)) != 0){
 			printf("ffs_snapshot_mount: vget failed %d\n", error);
 			continue;
 		}
 		ip = VTOI(vp);
 		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
 				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
 			/*
 			 * Close the gap in the table and revisit this
 			 * slot, which now holds the next entry.
 			 */
 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 				if (fs->fs_snapinum[loc] == 0)
 					break;
 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 			}
 			fs->fs_snapinum[loc - 1] = 0;
 			snaploc--;
 			continue;
 		}
 		/*
 		 * Allocate the space for the block hints list.
 		 * The list length is read first; it is stored in the
 		 * snapshot file at the offset computed below.
 		 */
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		aiov.iov_base = (void *)&ip->i_snaplistsize;
 		aiov.iov_len = sizeof(ip->i_snaplistsize);
 		auio.uio_resid = aiov.iov_len;
 		auio.uio_offset =
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_rw = UIO_READ;
 		auio.uio_td = td;
 		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 			/*
 			 * Give back the lock and reference obtained
 			 * from VFS_VGET; the snapshot is not activated.
 			 */
 			vput(vp);
 			continue;
 		}
 		MALLOC(listhd, void *, ip->i_snaplistsize * sizeof(daddr_t),
 		    M_UFSMNT, M_WAITOK);
 		/*
 		 * Now read the whole list, starting again at the length
 		 * word that was just consumed.
 		 */
 		auio.uio_iovcnt = 1;
 		aiov.iov_base = listhd;
 		aiov.iov_len = ip->i_snaplistsize * sizeof (daddr_t);
 		auio.uio_resid = aiov.iov_len;
 		auio.uio_offset -= sizeof(ip->i_snaplistsize);
 		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 			printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 			FREE(listhd, M_UFSMNT);
 			/* As above: do not leak the locked vnode. */
 			vput(vp);
 			continue;
 		}
 		ip->i_snapblklist = (daddr_t *)listhd;
 		/*
 		 * If there already exist snapshots on this filesystem, grab a
 		 * reference to their shared lock. If this is the first snapshot
 		 * on this filesystem, we need to allocate a lock for the
 		 * snapshots to share. In either case, acquire the snapshot
 		 * lock and give up our original private lock.
 		 */
 		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
 			VI_LOCK(vp);
 			vp->v_vnlock = ITOV(xp)->v_vnlock;
 		} else {
 			struct lock *lkp;
 
 			MALLOC(lkp, struct lock *, sizeof(struct lock),
 			    M_UFSMNT, M_WAITOK);
 			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
 			    LK_CANRECURSE | LK_NOPAUSE);
 			VI_LOCK(vp);
 			vp->v_vnlock = lkp;
 		}
 		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
 		VI_LOCK(vp);
 		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
 		/*
 		 * Link it onto the active snapshot list.
 		 */
 		if (ip->i_nextsnap.tqe_prev != 0)
 			panic("ffs_snapshot_mount: %d already on list",
 			    ip->i_number);
 		else
 			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
 		vp->v_vflag |= VV_SYSTEM;
 		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
 		ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_mount");
 		ump->um_devvp->v_vflag |= VV_COPYONWRITE;
 		/*
 		 * Only the lock is dropped here; the vnode stays
 		 * referenced while the snapshot is on the list.
 		 */
 		VOP_UNLOCK(vp, 0, td);
 	}
 }
 
 /*
  * Disassociate snapshot files when unmounting.
  *
  * Every snapshot is taken off the device's list and handed back its
  * private vnode lock, its block hints list is freed, and snapshots
  * that still have a name lose the reference held on their vnode.
  * The lock the snapshots shared is then destroyed and copy-on-write
  * is switched off for the device.
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct snaphead *head = &ump->um_devvp->v_rdev->si_snapshots;
 	struct lock *sharedlk = NULL;
 	struct inode *snapip;
 	struct vnode *snapvp;
 
 	for (;;) {
 		snapip = TAILQ_FIRST(head);
 		if (snapip == 0)
 			break;
 		snapvp = ITOV(snapip);
 		/* Remember the shared lock, then revert to v_lock. */
 		sharedlk = snapvp->v_vnlock;
 		snapvp->v_vnlock = &snapvp->v_lock;
 		TAILQ_REMOVE(head, snapip, i_nextsnap);
 		if (snapip->i_snapblklist != NULL) {
 			FREE(snapip->i_snapblklist, M_UFSMNT);
 			snapip->i_snapblklist = NULL;
 		}
 		snapip->i_nextsnap.tqe_prev = 0;
 		if (snapip->i_effnlink > 0)
 			vrele(snapvp);
 	}
 	/* At least one snapshot existed, so the shared lock does too. */
 	if (sharedlk != NULL) {
 		lockdestroy(sharedlk);
 		FREE(sharedlk, M_UFSMNT);
 	}
 	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_unmount");
 	ump->um_devvp->v_rdev->si_copyonwrite = 0;
 	ump->um_devvp->v_vflag &= ~VV_COPYONWRITE;
 }
 
 /*
  * Check for need to copy block that is about to be written,
  * copying the block if necessary.
  */
 static int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snaphead *snaphead;
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp = 0;
 	ufs2_daddr_t lbn, blkno;
 	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;
 
 	if (td->td_proc->p_flag & P_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
 	snaphead = &devvp->v_rdev->si_snapshots;
 	ip = TAILQ_FIRST(snaphead);
 	fs = ip->i_fs;
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in UFS_BALLOC.
 		 */
 		if (bp->b_vp == vp)
 			continue;
 	retry:
 		/*
 		 * First check to see if it is in the preallocated list.
 		 * By doing this check we avoid several potential deadlocks.
 		 */
 		lower = 1;
 		upper = ip->i_snaplistsize - 1;
 		while (lower <= upper) {
 			mid = (lower + upper) / 2;
 			if (ip->i_snapblklist[mid] == lbn)
 				break;
 			if (ip->i_snapblklist[mid] < lbn)
 				lower = mid + 1;
 			else
 				upper = mid - 1;
 		}
 		if (lower <= upper)
 			continue;
 		/*
 		 * Check to see if block needs to be copied. We do not have
 		 * to hold the snapshot lock while doing this lookup as it
 		 * will never require any additional allocations for the
 		 * snapshot inode.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			td->td_proc->p_flag |= P_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_proc->p_flag &= ~P_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 			bqrelse(ibp);
 		}
 #ifdef DIAGNOSTIC
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		if (blkno != 0)
 			continue;
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 		if (snapshot_locked == 0 &&
 		    vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td) != 0)
 			goto retry;
 		snapshot_locked = 1;
 		td->td_proc->p_flag |= P_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %d lbn %jd for ",
 			    ip->i_number, (intmax_t)lbn);
 			if (bp->b_vp == devvp)
 				printf("fs metadata");
 			else
 				printf("inum %d", VTOI(bp->b_vp)->i_number);
 			printf(" lblkno %jd to blkno %jd\n",
 			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 		}
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			break;
 		}
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 	}
 	if (snapshot_locked)
 		VOP_UNLOCK(vp, 0, td);
 	return (error);
 }
 
 /*
  * Read the specified block into the given buffer.
  * Much of this boiler-plate comes from bwrite().
  */
 static int
 readblock(bp, lbn)
 	struct buf *bp;
 	ufs2_daddr_t lbn;
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct thread *td = curthread;
 	struct inode *ip = VTOI(bp->b_vp);
 
 	aiov.iov_base = bp->b_data;
 	aiov.iov_len = bp->b_bcount;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 	auio.uio_resid = bp->b_bcount;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	return (physio(ip->i_devvp->v_rdev, &auio, 0));
 }
Index: head/sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c	(revision 105901)
+++ head/sys/ufs/ffs/ffs_vfsops.c	(revision 105902)
@@ -1,1477 +1,1481 @@
 /*
  * Copyright (c) 1989, 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
  * $FreeBSD$
  */
 
 #include "opt_mac.h"
 #include "opt_quota.h"
 #include "opt_ufs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/stdint.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/disk.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 
 /* Malloc type handed to ffs_mountfs() by ffs_mount() below. */
 static MALLOC_DEFINE(M_FFSNODE, "FFS node", "FFS vnode private part");
 
 /*
  * Forward declarations for routines defined later in this file.
  * Note that ffs_reload is the only one here without "static".
  */
 static int	ffs_sbupdate(struct ufsmount *, int);
        int	ffs_reload(struct mount *,struct ucred *,struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
 		    ufs2_daddr_t);
 static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
 static vfs_init_t ffs_init;
 static vfs_uninit_t ffs_uninit;
 static vfs_extattrctl_t ffs_extattrctl;
 
 /*
  * Filesystem operations vector.  The initializer is positional, so the
  * entries must stay in the member order of struct vfsops.
  */
 static struct vfsops ufs_vfsops = {
 	ffs_mount,
 	ufs_start,
 	ffs_unmount,
 	ufs_root,
 	ufs_quotactl,
 	ffs_statfs,
 	ffs_sync,
 	ffs_vget,
 	ffs_fhtovp,
 	vfs_stdcheckexp,
 	ffs_vptofh,
 	ffs_init,
 	ffs_uninit,
 	ffs_extattrctl,
 };
 
 /* Hand the vector above to VFS_SET under the name "ufs". */
 VFS_SET(ufs_vfsops, ufs, 0);
 
 /*
  * ffs_mount
  *
  * Called when mounting local physical media
  *
  * PARAMETERS:
  *		mountroot
  *			mp	mount point structure
  *			path	NULL (flag for root mount!!!)
  *			data	<unused>
  *			ndp	<unused>
  *			td	thread (user credentials check [statfs])
  *
  *		mount
  *			mp	mount point structure
  *			path	path to mount point
  *			data	pointer to argument struct in user space
  *			ndp	mount point namei() return (used for
  *				credentials on reload), reused to look
  *				up block device.
  *			td	thread (user credentials check)
  *
  * RETURNS:	0	Success
  *		!0	error number (errno.h)
  *
  * LOCK STATE:
  *
  *		ENTRY
  *			mount point is locked
  *		EXIT
  *			mount point is locked
  *
  * NOTES:
  *		A NULL path can be used for a flag since the mount
  *		system call will fail with EFAULT in copyinstr in
  *		namei() if it is a genuine NULL from the user.
  *
  *		With MNT_UPDATE set the routine handles, in order:
  *		read-write to read-only downgrade, MNT_RELOAD,
  *		read-only to read-write upgrade, export requests
  *		(args.fspec == 0) and snapshot requests (MNT_SNAPSHOT).
  */
 int
 ffs_mount(mp, path, data, ndp, td)
         struct mount		*mp;	/* mount struct pointer*/
         char			*path;	/* path to mount point*/
         caddr_t			data;	/* arguments to FS specific mount*/
         struct nameidata	*ndp;	/* mount point credentials*/
         struct thread		*td;	/* process requesting mount*/
 {
 	size_t size;
 	struct vnode *devvp;
 	struct ufs_args args;
 	struct ufsmount *ump = 0;
 	struct fs *fs;
 	int error, flags;
 	mode_t accessmode;
 
 	/*
 	 * Use NULL path to indicate we are mounting the root filesystem.
 	 */
 	if (path == NULL) {
 		if ((error = bdevvp(rootdev, &rootvp))) {
 			printf("ffs_mountroot: can't find rootvp\n");
 			return (error);
 		}
 
 		if ((error = ffs_mountfs(rootvp, mp, td, M_FFSNODE)) != 0)
 			return (error);
 		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 		return (0);
 	}
 
 	/*
 	 * Mounting non-root filesystem or updating a filesystem
 	 */
 	if ((error = copyin(data, (caddr_t)&args, sizeof(struct ufs_args)))!= 0)
 		return (error);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOUFS(mp);
 		fs = ump->um_fs;
 		devvp = ump->um_devvp;
 		/*
 		 * Downgrade: currently read-write, read-only requested.
 		 * Every exit from this branch after vn_start_write()
 		 * succeeds is paired with vn_finished_write().
 		 */
 		if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			/*
 			 * Flush any dirty data.
 			 */
-			VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td);
+			if ((error = VFS_SYNC(mp, MNT_WAIT,
+			    td->td_proc->p_ucred, td)) != 0) {
+				vn_finished_write(mp);
+				return (error);
+			}
 			/*
 			 * Check for and optionally get rid of files open
 			 * for writing.
 			 */
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			if (mp->mnt_flag & MNT_SOFTDEP) {
 				error = softdep_flushfiles(mp, flags, td);
 			} else {
 				error = ffs_flushfiles(mp, flags, td);
 			}
 			if (error) {
 				vn_finished_write(mp);
 				return (error);
 			}
 			/* Report and discard any leftover pending counts. */
 			if (fs->fs_pendingblocks != 0 ||
 			    fs->fs_pendinginodes != 0) {
 				printf("%s: %s: blocks %jd files %d\n",
 				    fs->fs_fsmnt, "update error",
 				    (intmax_t)fs->fs_pendingblocks,
 				    fs->fs_pendinginodes);
 				fs->fs_pendingblocks = 0;
 				fs->fs_pendinginodes = 0;
 			}
 			/*
 			 * Mark read-only (and clean unless a problem flag
 			 * is set) and write the superblock; both fields
 			 * are reset if the write fails.
 			 */
 			fs->fs_ronly = 1;
 			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 				fs->fs_clean = 1;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) {
 				fs->fs_ronly = 0;
 				fs->fs_clean = 0;
 				vn_finished_write(mp);
 				return (error);
 			}
 			vn_finished_write(mp);
 		}
 		if ((mp->mnt_flag & MNT_RELOAD) &&
 		    (error = ffs_reload(mp, ndp->ni_cnd.cn_cred, td)) != 0)
 			return (error);
 		/* Upgrade: currently read-only, read-write requested. */
 		if (fs->fs_ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			if (suser(td)) {
 				vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 				if ((error = VOP_ACCESS(devvp, VREAD | VWRITE,
 				    td->td_ucred, td)) != 0) {
 					VOP_UNLOCK(devvp, 0, td);
 					return (error);
 				}
 				VOP_UNLOCK(devvp, 0, td);
 			}
 			/*
 			 * An unclean filesystem may only go read-write when
 			 * forced, or when soft updates are in use and no
 			 * fsck has been requested.
 			 */
 			fs->fs_flags &= ~FS_UNCLEAN;
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
 				if ((mp->mnt_flag & MNT_FORCE) ||
 				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
 				     (fs->fs_flags & FS_DOSOFTDEP))) {
 					printf("WARNING: %s was not %s\n",
 					   fs->fs_fsmnt, "properly dismounted");
 				} else {
 					printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 					    fs->fs_fsmnt);
 					return (EPERM);
 				}
 			}
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			/*
 			 * NOTE(review): unlike the downgrade path above,
 			 * fs_ronly and fs_clean are not restored if the
 			 * superblock write below fails.
 			 */
 			fs->fs_ronly = 0;
 			fs->fs_clean = 0;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) {
 				vn_finished_write(mp);
 				return (error);
 			}
 			/* check to see if we need to start softdep */
 			if ((fs->fs_flags & FS_DOSOFTDEP) &&
 			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
 				vn_finished_write(mp);
 				return (error);
 			}
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 			vn_finished_write(mp);
 		}
 		/*
 		 * Soft updates is incompatible with "async",
 		 * so if we are doing softupdates stop the user
 		 * from setting the async flag in an update.
 		 * Softdep_mount() clears it in an initial mount 
 		 * or ro->rw remount.
 		 */
 		if (mp->mnt_flag & MNT_SOFTDEP)
 			mp->mnt_flag &= ~MNT_ASYNC;
 		/*
 		 * If not updating name, process export requests.
 		 */
 		if (args.fspec == 0)
 			return (vfs_export(mp, &args.export));
 		/*
 		 * If this is a snapshot request, take the snapshot.
 		 */
 		if (mp->mnt_flag & MNT_SNAPSHOT)
 			return (ffs_snapshot(mp, args.fspec));
 	}
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible block device.
 	 */
 	NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, td);
 	if ((error = namei(ndp)) != 0)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 	if (!vn_isdisk(devvp, &error)) {
 		vrele(devvp);
 		return (error);
 	}
 
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	if (suser(td)) {
 		accessmode = VREAD;
 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
 			accessmode |= VWRITE;
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 		if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){
 			vput(devvp);
 			return (error);
 		}
 		VOP_UNLOCK(devvp, 0, td);
 	}
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/*
 		 * Update only
 		 *
 		 * If it's not the same vnode, or at least the same device
 		 * then it's not correct.
 		 */
 
 		if (devvp != ump->um_devvp &&
 		    devvp->v_rdev != ump->um_devvp->v_rdev)
 			error = EINVAL;	/* needs translation */
 		/* The looked-up vnode was only needed for the comparison. */
 		vrele(devvp);
 		if (error)
 			return (error);
 	} else {
 		/*
 		 * New mount
 		 *
 		 * We need the name for the mount point (also used for
 		 * "last mounted on") copied in. If an error occurs,
 		 * the mount point is discarded by the upper level code.
 		 * Note that vfs_mount() populates f_mntonname for us.
 		 */
 		if ((error = ffs_mountfs(devvp, mp, td, M_FFSNODE)) != 0) {
 			vrele(devvp);
 			return (error);
 		}
 	}
 	/*
 	 * Save "mounted from" device name info for mount point (NULL pad).
 	 *
 	 * NOTE(review): the copyinstr() result is not checked, and size
 	 * is used unconditionally by the bzero() that follows.
 	 */
 	copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
 	bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
 	/*
 	 * Initialize filesystem stat information in mount struct.
 	 */
 	(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 	return (0);
 }
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
  * the root filesystem and finding things to fix). The filesystem must
  * be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
  *	2) re-read superblock from disk.
  *	3) re-read summary information from disk.
  *	4) invalidate all inactive vnodes.
  *	5) invalidate all cached file data.
  *	6) re-read inode data for all active vnodes.
  *
  * Parameters:
  *	mp	mount point to reload
  *	cred	credentials passed to vinvalbuf()
  *	td	calling thread, used for vnode locking
  *
  * Returns 0 on success, EINVAL if the mount is not read-only, EIO if
  * the superblock read back fails its sanity checks, or the error from
  * a failed bread().  Panics if cached buffers cannot be invalidated.
  */
 int
 ffs_reload(mp, cred, td)
 	struct mount *mp;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct vnode *vp, *nvp, *devvp;
 	struct inode *ip;
 	void *space;
 	struct buf *bp;
 	struct fs *fs, *newfs;
 	dev_t dev;
 	ufs2_daddr_t sblockloc;
 	int i, blks, size, error;
 	int32_t *lp;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		return (EINVAL);
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOUFS(mp)->um_devvp;
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 	error = vinvalbuf(devvp, 0, cred, td, 0, 0);
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		panic("ffs_reload: dirty1");
 
 	/* NOTE(review): dev is not referenced again in this function. */
 	dev = devvp->v_rdev;
 
 	/*
 	 * Only VMIO the backing device if the backing device is a real
 	 * block device.
 	 */
 	if (vn_isdisk(devvp, NULL)) {
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 		vfs_object_create(devvp, td, td->td_ucred);
 		/* XXX Why lock only to release immediately?? */
 		mtx_lock(&devvp->v_interlock);
 		VOP_UNLOCK(devvp, LK_INTERLOCK, td);
 	}
 
 	/*
 	 * Step 2: re-read superblock from disk.
 	 *
 	 * NOTE(review): the error return below (and the one in step 3)
 	 * does not brelse(bp); confirm bread() releases the buffer
 	 * itself on failure.
 	 */
 	fs = VFSTOUFS(mp)->um_fs;
 	if ((error = bread(devvp, fsbtodb(fs, fs->fs_sblockloc), fs->fs_sbsize,
 	    NOCRED, &bp)) != 0)
 		return (error);
 	newfs = (struct fs *)bp->b_data;
 	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
 	     newfs->fs_magic != FS_UFS2_MAGIC) ||
 	    newfs->fs_bsize > MAXBSIZE ||
 	    newfs->fs_bsize < sizeof(struct fs)) {
 			brelse(bp);
 			return (EIO);		/* XXX needs translation */
 	}
 	/*
 	 * Copy pointer fields back into superblock before copying in	XXX
 	 * new superblock. These should really be in the ufsmount.	XXX
 	 * Note that important parameters (eg fs_ncg) are unchanged.
 	 */
 	newfs->fs_csp = fs->fs_csp;
 	newfs->fs_maxcluster = fs->fs_maxcluster;
 	newfs->fs_contigdirs = fs->fs_contigdirs;
 	newfs->fs_active = fs->fs_active;
 	/* Saved because the bcopy() below overwrites fs->fs_sblockloc. */
 	sblockloc = fs->fs_sblockloc;
 	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
 	brelse(bp);
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: reload pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 
 	/*
 	 * Step 3: re-read summary information from disk.
 	 * The data is read one block at a time into the existing
 	 * fs_csp area; the last read may be a partial block.
 	 */
 	blks = howmany(fs->fs_cssize, fs->fs_fsize);
 	space = fs->fs_csp;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
 		    NOCRED, &bp);
 		if (error)
 			return (error);
 		bcopy(bp->b_data, space, (u_int)size);
 		space = (char *)space + size;
 		brelse(bp);
 	}
 	/*
 	 * We no longer know anything about clusters per cylinder group.
 	 */
 	if (fs->fs_contigsumsize > 0) {
 		lp = fs->fs_maxcluster;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 
 	/*
 	 * Walk the mount's vnode list.  mntvnode_mtx is dropped while a
 	 * vnode is being processed and retaken at the bottom of the loop
 	 * body; whenever a vnode has left the mount, has been recycled,
 	 * or cannot be obtained with vget(), the walk restarts from the
 	 * head of the list.
 	 */
 loop:
 	mtx_lock(&mntvnode_mtx);
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
 		if (vp->v_mount != mp) {
 			mtx_unlock(&mntvnode_mtx);
 			goto loop;
 		}
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		mtx_unlock(&mntvnode_mtx);
 		/*
 		 * Step 4: invalidate all inactive vnodes.
 		 */
 		if (vrecycle(vp, NULL, td))
 			goto loop;
 		/*
 		 * Step 5: invalidate all cached file data.
 		 */
 		/* XXX Why lock only to release immediately? */
 		mtx_lock(&vp->v_interlock);
 		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
 			goto loop;
 		}
 		if (vinvalbuf(vp, 0, cred, td, 0, 0))
 			panic("ffs_reload: dirty2");
 		/*
 		 * Step 6: re-read inode data for all active vnodes.
 		 */
 		ip = VTOI(vp);
 		error =
 		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		    (int)fs->fs_bsize, NOCRED, &bp);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 		ffs_load_inode(bp, ip, NULL, fs, ip->i_number);
 		ip->i_effnlink = ip->i_nlink;
 		brelse(bp);
 		vput(vp);
 		mtx_lock(&mntvnode_mtx);
 	}
 	mtx_unlock(&mntvnode_mtx);
 	return (0);
 }
 
 /*
  * Possible superblock locations ordered from most to least likely.
  * ffs_mountfs() walks the array until it reaches an entry equal to -1,
  * divides each entry by DEV_BSIZE to get the block to read, and passes
  * it to numfrags() to derive the expected fs_sblockloc.
  */
 static int sblock_try[] = SBLOCKSEARCH;
 
 /*
  * Common code for mount and mountroot
  *
  * Parameters:
  *	devvp		vnode of the device holding the filesystem
  *	mp		mount point being set up
  *	td		calling thread; may be NULL, in which case NOCRED
  *			is used as the credential
  *	malloctype	stored in the new ufsmount's um_malloctype
  *
  * Returns 0 on success or an errno value.  Failures before the device
  * has been opened return directly; later failures go through the
  * "out" label, which closes the device, releases any held buffer and
  * frees the ufsmount and its superblock copy.
  */
 int
 ffs_mountfs(devvp, mp, td, malloctype)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct thread *td;
 	struct malloc_type *malloctype;
 {
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct fs *fs;
 	dev_t dev;
 	void *space;
 	ufs2_daddr_t sblockloc;
 	int error, i, blks, size, ronly;
 	int32_t *lp;
 	struct ucred *cred;
 	size_t strsize;
 	int ncount;
 
 	dev = devvp->v_rdev;
 	cred = td ? td->td_ucred : NOCRED;
 	/*
 	 * Disallow multiple mounts of the same device.
 	 * Disallow mounting of a device that is currently in use
 	 * (except for root, which might share swap device for miniroot).
 	 * Flush out any old buffers remaining from a previous use.
 	 */
 	error = vfs_mountedon(devvp);
 	if (error)
 		return (error);
 	ncount = vcount(devvp);
 
 	if (ncount > 1 && devvp != rootvp)
 		return (EBUSY);
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 	error = vinvalbuf(devvp, V_SAVE, cred, td, 0, 0);
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Only VMIO the backing device if the backing device is a real
 	 * block device.
 	 * Note that it is optional that the backing device be VMIOed.  This
 	 * increases the opportunity for metadata caching.
 	 */
 	if (vn_isdisk(devvp, NULL)) {
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 		vfs_object_create(devvp, td, cred);
 		/* XXX Why lock only to release immediately?? */
 		mtx_lock(&devvp->v_interlock);
 		VOP_UNLOCK(devvp, LK_INTERLOCK, td);
 	}
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 	/*
 	 * XXX: We don't re-VOP_OPEN in FREAD|FWRITE mode if the filesystem
 	 * XXX: is subsequently remounted, so open it FREAD|FWRITE from the
 	 * XXX: start to avoid getting trashed later on.
 	 */
 #ifdef notyet
 	error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, td);
 #else
 	error = VOP_OPEN(devvp, FREAD|FWRITE, FSCRED, td);
 #endif
 	VOP_UNLOCK(devvp, 0, td);
 	if (error)
 		return (error);
 	/* Take the device's I/O size limit if it has one; cap at MAXPHYS. */
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	/* The "out" label tests bp and ump, so they must start out NULL. */
 	bp = NULL;
 	ump = NULL;
 	fs = NULL;
 	sblockloc = 0;
 	/*
 	 * Try reading the superblock in each of its possible locations.
 	 * A UFS2 superblock is accepted only if it records the location
 	 * it was read from; a UFS1 superblock is accepted at any of them.
 	 */
 	for (i = 0; sblock_try[i] != -1; i++) {
 		if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE,
 		    cred, &bp)) != 0)
 			goto out;
 		fs = (struct fs *)bp->b_data;
 		sblockloc = numfrags(fs, sblock_try[i]);
 		if ((fs->fs_magic == FS_UFS1_MAGIC ||
 		     (fs->fs_magic == FS_UFS2_MAGIC &&
 		      fs->fs_sblockloc == sblockloc)) &&
 		    fs->fs_bsize <= MAXBSIZE &&
 		    fs->fs_bsize >= sizeof(struct fs))
 			break;
 		brelse(bp);
 		bp = NULL;
 	}
 	if (sblock_try[i] == -1) {
 		error = EINVAL;		/* XXX needs translation */
 		goto out;
 	}
 	fs->fs_fmod = 0;
 	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indices */
 	fs->fs_flags &= ~FS_UNCLEAN;
 	/*
 	 * An unclean filesystem is accepted read-only, when forced, or
 	 * when soft updates are in use and no fsck has been requested;
 	 * otherwise the mount is refused with EPERM.
 	 */
 	if (fs->fs_clean == 0) {
 		fs->fs_flags |= FS_UNCLEAN;
 		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
 		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
 		     (fs->fs_flags & FS_DOSOFTDEP))) {
 			printf(
 "WARNING: %s was not properly dismounted\n",
 			    fs->fs_fsmnt);
 		} else {
 			printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 			    fs->fs_fsmnt);
 			error = EPERM;
 			goto out;
 		}
 		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
 		    (mp->mnt_flag & MNT_FORCE)) {
 			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
 			    (intmax_t)fs->fs_pendingblocks,
 			    fs->fs_pendinginodes);
 			fs->fs_pendingblocks = 0;
 			fs->fs_pendinginodes = 0;
 		}
 	}
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: mount pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	/*
 	 * Allocate the ufsmount and a private copy of the superblock,
 	 * and select the UFS1 or UFS2 block allocator.
 	 */
 	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
 	ump->um_malloctype = malloctype;
 	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT,
 	    M_WAITOK);
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_fstype = UFS1;
 		ump->um_balloc = ffs_balloc_ufs1;
 	} else {
 		ump->um_fstype = UFS2;
 		ump->um_balloc = ffs_balloc_ufs2;
 	}
 	ump->um_blkatoff = ffs_blkatoff;
 	ump->um_truncate = ffs_truncate;
 	ump->um_update = ffs_update;
 	ump->um_valloc = ffs_valloc;
 	ump->um_vfree = ffs_vfree;
 	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
 	/*
 	 * NOTE(review): presumably keeps a short superblock from staying
 	 * cached in a SBLOCKSIZE-sized buffer -- confirm.
 	 */
 	if (fs->fs_sbsize < SBLOCKSIZE)
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	bp = NULL;
 	/* From here on fs refers to the private copy, not the buffer. */
 	fs = ump->um_fs;
 	ffs_oldfscompat_read(fs, ump, sblockloc);
 	fs->fs_ronly = ronly;
 	/*
 	 * A single allocation holds, back to back: the summary
 	 * information (fs_cssize bytes), then if fs_contigsumsize > 0
 	 * one int32_t per cylinder group (fs_maxcluster), then one
 	 * u_int8_t per cylinder group (fs_contigdirs).
 	 */
 	size = fs->fs_cssize;
 	blks = howmany(size, fs->fs_fsize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	size += fs->fs_ncg * sizeof(u_int8_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	fs->fs_csp = space;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
 		    cred, &bp)) != 0) {
 			/* "out" does not free fs_csp, so do it here. */
 			free(fs->fs_csp, M_UFSMNT);
 			goto out;
 		}
 		bcopy(bp->b_data, space, (u_int)size);
 		space = (char *)space + size;
 		brelse(bp);
 		bp = NULL;
 	}
 	if (fs->fs_contigsumsize > 0) {
 		fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 		space = lp;
 	}
 	size = fs->fs_ncg * sizeof(u_int8_t);
 	fs->fs_contigdirs = (u_int8_t *)space;
 	bzero(fs->fs_contigdirs, size);
 	fs->fs_active = NULL;
 	mp->mnt_data = (qaddr_t)ump;
 	/*
 	 * Use the filesystem id from the superblock unless either half
 	 * is zero or the id is already in use; then get a new one.
 	 */
 	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
 	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
 	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || 
 	    vfs_getvfs(&mp->mnt_stat.f_fsid)) 
 		vfs_getnewfsid(mp);
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	mp->mnt_flag |= MNT_LOCAL;
 	/*
 	 * Each of the next two if statements takes its body from
 	 * whichever preprocessor branch is compiled in.
 	 */
 	if ((fs->fs_flags & FS_MULTILABEL) != 0)
 #ifdef MAC
 		mp->mnt_flag |= MNT_MULTILABEL;
 #else
 		printf(
 "WARNING: %s: multilabel flag on fs but no MAC support\n",
 		    fs->fs_fsmnt);
 #endif
 	if ((fs->fs_flags & FS_ACLS) != 0)
 #ifdef UFS_ACL
 		mp->mnt_flag |= MNT_ACLS;
 #else
 		printf(
 "WARNING: %s: ACLs flag on fs but no ACLs support\n",
 		    fs->fs_fsmnt);
 #endif
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
 	ump->um_nindir = fs->fs_nindir;
 	ump->um_bptrtodb = fs->fs_fsbtodb;
 	ump->um_seqinc = fs->fs_frag;
 	for (i = 0; i < MAXQUOTAS; i++)
 		ump->um_quotas[i] = NULLVP;
 #ifdef UFS_EXTATTR
 	ufs_extattr_uepm_init(&ump->um_extattr);
 #endif
 	devvp->v_rdev->si_mountpoint = mp;
 
 	/*
 	 * Set FS local "last mounted on" information (NULL pad)
 	 */
 	copystr(	mp->mnt_stat.f_mntonname,	/* mount point*/
 			fs->fs_fsmnt,			/* copy area*/
 			sizeof(fs->fs_fsmnt) - 1,	/* max size*/
 			&strsize);			/* real size*/
 	bzero( fs->fs_fsmnt + strsize, sizeof(fs->fs_fsmnt) - strsize);
 
 	if( mp->mnt_flag & MNT_ROOTFS) {
 		/*
 		 * Root mount; update timestamp in mount structure.
 		 * this will be used by the common root mount code
 		 * to update the system clock.
 		 */
 		mp->mnt_time = fs->fs_time;
 	}
 
 	/*
 	 * Read-write mount: start soft updates and snapshots if the
 	 * superblock asks for them, then mark the filesystem not clean
 	 * and write the superblock.  The result of that write is
 	 * deliberately discarded.
 	 */
 	if (ronly == 0) {
 		if ((fs->fs_flags & FS_DOSOFTDEP) &&
 		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
 			free(fs->fs_csp, M_UFSMNT);
 			goto out;
 		}
 		if (fs->fs_snapinum[0] != 0)
 			ffs_snapshot_mount(mp);
 		fs->fs_fmod = 1;
 		fs->fs_clean = 0;
 		(void) ffs_sbupdate(ump, MNT_WAIT);
 	}
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART
 	/*
 	 *
 	 * Auto-starting does the following:
 	 *	- check for /.attribute in the fs, and extattr_start if so
 	 *	- for each file in .attribute, enable that file with
 	 * 	  an attribute of the same name.
 	 * Not clear how to report errors -- probably eat them.
 	 * This would all happen while the filesystem was busy/not
 	 * available, so would effectively be "atomic".
 	 */
 	(void) ufs_extattr_autostart(mp, td);
 #endif /* !UFS_EXTATTR_AUTOSTART */
 #endif /* !UFS_EXTATTR */
 	return (0);
 	/*
 	 * Common failure exit for everything after the device was opened.
 	 */
 out:
 	devvp->v_rdev->si_mountpoint = NULL;
 	if (bp)
 		brelse(bp);
 	/* XXX: see comment above VOP_OPEN */
 #ifdef notyet
 	(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, td);
 #else
 	(void)VOP_CLOSE(devvp, FREAD|FWRITE, cred, td);
 #endif
 	if (ump) {
 		free(ump->um_fs, M_UFSMNT);
 		free(ump, M_UFSMNT);
 		mp->mnt_data = (qaddr_t)0;
 	}
 	return (error);
 }
 
 #include <sys/sysctl.h>
 /*
  * debug.bigcgs: read-write integer knob, zero by default.  While it is
  * non-zero, ffs_oldfscompat_read() saves fs_cgsize in fs_save_cgsize
  * and replaces it with fs_bsize, and ffs_oldfscompat_write() restores
  * the saved value.
  */
 int bigcgs = 0;
 SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
 
 /*
  * Sanity checks for loading old filesystem superblocks.
  * See ffs_oldfscompat_write below for unwound actions.
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 static void
 ffs_oldfscompat_read(fs, ump, sblockloc)
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t sblockloc;
 {
 	off_t maxfilesize;
 
 	/*
 	 * If not yet done, update UFS1 superblock with new wider fields.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC &&
 	    fs->fs_sblockloc != sblockloc) {
 		fs->fs_maxbsize = fs->fs_bsize;
 		fs->fs_sblockloc = sblockloc;
 		fs->fs_time = fs->fs_old_time;
 		fs->fs_size = fs->fs_old_size;
 		fs->fs_dsize = fs->fs_old_dsize;
 		fs->fs_csaddr = fs->fs_old_csaddr;
 		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
 		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
 		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
 		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC &&
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {
 		fs->fs_maxfilesize = (u_quad_t) 1LL << 39;
 		fs->fs_qbmask = ~fs->fs_bmask;
 		fs->fs_qfmask = ~fs->fs_fmask;
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
 		maxfilesize = (u_int64_t)0x40000000 * fs->fs_bsize - 1;
 		if (fs->fs_maxfilesize > maxfilesize)
 			fs->fs_maxfilesize = maxfilesize;
 	}
 	/* Compatibility for old filesystems */
 	if (fs->fs_avgfilesize <= 0)
 		fs->fs_avgfilesize = AVFILESIZ;
 	if (fs->fs_avgfpdir <= 0)
 		fs->fs_avgfpdir = AFPDIR;
 	if (bigcgs) {
 		fs->fs_save_cgsize = fs->fs_cgsize;
 		fs->fs_cgsize = fs->fs_bsize;
 	}
 }
 
 /*
  * Reverse the in-core adjustments made when the superblock was read,
  * so that the old-format fields are current again.
  * See ffs_oldfscompat_read above for details.
  *
  *	fs	in-core superblock to adjust
  *	ump	mount structure holding the saved UFS1 maximum file size
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 static void
 ffs_oldfscompat_write(fs, ump)
 	struct fs *fs;
 	struct ufsmount *ump;
 {
 
 	/* debug.bigcgs: put the stashed cylinder group size back. */
 	if (bigcgs) {
 		fs->fs_cgsize = fs->fs_save_cgsize;
 		fs->fs_save_cgsize = 0;
 	}
 
 	/* Nothing else to unwind unless this is a UFS1 filesystem. */
 	if (fs->fs_magic != FS_UFS1_MAGIC)
 		return;
 
 	/*
 	 * Copy back UFS2 updated fields that UFS1 inspects.
 	 */
 	fs->fs_maxfilesize = ump->um_savedmaxfilesize;
 	fs->fs_old_time = fs->fs_time;
 	fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
 	fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
 	fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
 	fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
 }
 
 /*
  * unmount system call
  */
 int
 ffs_unmount(mp, mntflags, td)
 	struct mount *mp;
 	int mntflags;
 	struct thread *td;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, flags;
 
 	flags = 0;
 	if (mntflags & MNT_FORCE) {
 		flags |= FORCECLOSE;
 	}
 #ifdef UFS_EXTATTR
 	if ((error = ufs_extattr_stop(mp, td))) {
 		if (error != EOPNOTSUPP)
 			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
 			    error);
 	} else {
 		ufs_extattr_uepm_destroy(&ump->um_extattr);
 	}
 #endif
 	if (mp->mnt_flag & MNT_SOFTDEP) {
 		if ((error = softdep_flushfiles(mp, flags, td)) != 0)
 			return (error);
 	} else {
 		if ((error = ffs_flushfiles(mp, flags, td)) != 0)
 			return (error);
 	}
 	fs = ump->um_fs;
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("%s: unmount pending error: blocks %jd files %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	if (fs->fs_ronly == 0) {
 		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
 		error = ffs_sbupdate(ump, MNT_WAIT);
 		if (error) {
 			fs->fs_clean = 0;
 			return (error);
 		}
 	}
 	ump->um_devvp->v_rdev->si_mountpoint = NULL;
 
 	vinvalbuf(ump->um_devvp, V_SAVE, NOCRED, td, 0, 0);
 	/* XXX: see comment above VOP_OPEN */
 #ifdef notyet
 	error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE,
 		NOCRED, td);
 #else
 	error = VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED, td);
 #endif
 
 	vrele(ump->um_devvp);
 
 	free(fs->fs_csp, M_UFSMNT);
 	free(fs, M_UFSMNT);
 	free(ump, M_UFSMNT);
 	mp->mnt_data = (qaddr_t)0;
 	mp->mnt_flag &= ~MNT_LOCAL;
 	return (error);
 }
 
 /*
  * Flush out all the files in a filesystem.
  */
 int
 ffs_flushfiles(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	int error;
 
 	ump = VFSTOUFS(mp);
 #ifdef QUOTA
 	if (mp->mnt_flag & MNT_QUOTA) {
 		int i;
 		error = vflush(mp, 0, SKIPSYSTEM|flags);
 		if (error)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if (ump->um_quotas[i] == NULLVP)
 				continue;
 			quotaoff(td, mp, i);
 		}
 		/*
 		 * Here we fall through to vflush again to ensure
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
 #endif
 	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
 	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
 		if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0)
 			return (error);
 		ffs_snapshot_unmount(mp);
 		/*
 		 * Here we fall through to vflush again to ensure
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
         /*
 	 * Flush all the files.
 	 */
 	if ((error = vflush(mp, 0, flags)) != 0)
 		return (error);
 	/*
 	 * Flush filesystem metadata.
 	 */
 	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
 	error = VOP_FSYNC(ump->um_devvp, td->td_ucred, MNT_WAIT, td);
 	VOP_UNLOCK(ump->um_devvp, 0, td);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 int
 ffs_statfs(mp, sbp, td)
 	struct mount *mp;
 	struct statfs *sbp;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	struct fs *fs;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
 		panic("ffs_statfs");
 	sbp->f_bsize = fs->fs_fsize;
 	sbp->f_iosize = fs->fs_bsize;
 	sbp->f_blocks = fs->fs_dsize;
 	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
 	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
 	    dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
 	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
 	if (sbp != &mp->mnt_stat) {
 		sbp->f_type = mp->mnt_vfc->vfc_typenum;
 		bcopy((caddr_t)mp->mnt_stat.f_mntonname,
 			(caddr_t)&sbp->f_mntonname[0], MNAMELEN);
 		bcopy((caddr_t)mp->mnt_stat.f_mntfromname,
 			(caddr_t)&sbp->f_mntfromname[0], MNAMELEN);
 	}
 	return (0);
 }
 
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
  * initiate the writing of the super block if it has been modified.
  *
  * Parameters:
  *	mp	mount point to sync
  *	waitfor	MNT_WAIT, MNT_LAZY or another wait mode; MNT_WAIT makes
  *		vnode locking blocking and enables the rescans below
  *	cred	credentials passed to VOP_FSYNC()
  *	td	calling thread
  *
  * Returns 0, or the most recent error recorded from VOP_FSYNC(),
  * softdep_flushworklist() or ffs_sbupdate().  Panics if a read-only
  * filesystem has a modified superblock.
  *
  * Note: we are always called with the filesystem marked `MPBUSY'.
  */
 int
 ffs_sync(mp, waitfor, cred, td)
 	struct mount *mp;
 	int waitfor;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct vnode *nvp, *vp, *devvp;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, count, wait, lockreq, allerror = 0;
 
 	fs = ump->um_fs;
 	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
 		printf("fs = %s\n", fs->fs_fsmnt);
 		panic("ffs_sync: rofs mod");
 	}
 	/*
 	 * Write back each (modified) inode.
 	 * Unless the caller asked to wait, vnodes whose lock cannot be
 	 * taken immediately are skipped (LK_NOWAIT).
 	 */
 	wait = 0;
 	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
 	if (waitfor == MNT_WAIT) {
 		wait = 1;
 		lockreq = LK_EXCLUSIVE;
 	}
 	mtx_lock(&mntvnode_mtx);
 	/* Every "goto loop" arrives here with mntvnode_mtx held. */
 loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
 		/*
 		 * If the vnode that we are about to sync is no longer
 		 * associated with this mount point, start over.
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
 
 		/*
 		 * Depend on mntvnode_mtx to keep things stable enough
 		 * for a quick test.  Since there might be hundreds of
 		 * thousands of vnodes, we cannot afford even a subroutine
 		 * call unless there's a good chance that we have work to do.
 		 */
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		ip = VTOI(vp);
 		if (vp->v_type == VNON || ((ip->i_flag &
 		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
 		    TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
 			continue;
 		}
 		/*
 		 * The list lock is dropped around the actual work.
 		 * Character device vnodes only get their inode updated;
 		 * everything else is locked and fsync'ed.
 		 */
 		if (vp->v_type != VCHR) {
 			mtx_unlock(&mntvnode_mtx);
 			if ((error = vget(vp, lockreq, td)) != 0) {
 				mtx_lock(&mntvnode_mtx);
 				if (error == ENOENT)
 					goto loop;
 			} else {
 				if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0)
 					allerror = error;
 				VOP_UNLOCK(vp, 0, td);
 				vrele(vp);
 				mtx_lock(&mntvnode_mtx);
 			}
 		} else {
 			mtx_unlock(&mntvnode_mtx);
 			UFS_UPDATE(vp, wait);
 			mtx_lock(&mntvnode_mtx);
 		}
 		/*
 		 * If the list changed behind vp while the lock was
 		 * dropped, the saved successor is stale: start over.
 		 */
 		if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp)
 			goto loop;
 	}
 	mtx_unlock(&mntvnode_mtx);
 	/*
 	 * Force stale filesystem control information to be flushed.
 	 */
 	if (waitfor == MNT_WAIT) {
 		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
 			allerror = error;
 		/* Flushed work items may create new vnodes to clean */
-		if (count) {
+		if (allerror == 0 && count) {
 			mtx_lock(&mntvnode_mtx);
 			goto loop;
 		}
 	}
 #ifdef QUOTA
 	qsync(mp);
 #endif
 	/*
 	 * Sync the device vnode itself if it has writes in progress or
 	 * dirty buffers.  The vnode interlock taken by VI_LOCK() is
 	 * either handed to vn_lock() (LK_INTERLOCK) or dropped in the
 	 * else branch.
 	 */
 	devvp = ump->um_devvp;
 	VI_LOCK(devvp);
 	if (waitfor != MNT_LAZY &&
 	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
 		if ((error = VOP_FSYNC(devvp, cred, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(devvp, 0, td);
-		if (waitfor == MNT_WAIT) {
+		if (allerror == 0 && waitfor == MNT_WAIT) {
 			mtx_lock(&mntvnode_mtx);
 			goto loop;
 		}
 	} else
 		VI_UNLOCK(devvp);
 	/*
 	 * Write back modified superblock.
 	 */
 	if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0)
 		allerror = error;
 	return (allerror);
 }
 
 int
 ffs_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 	struct thread *td = curthread; 		/* XXX */
 	struct fs *fs;
 	struct inode *ip;
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct vnode *vp;
 	dev_t dev;
 	int error;
 
 	/*
 	 * Return, through *vpp, the vnode for inode number ino on mount mp.
 	 * An inode already present in the hash is returned as-is; otherwise
 	 * a new vnode/inode pair is created and filled in from disk.
 	 * Returns 0 on success or an error number; every failure path after
 	 * the initial hash lookup sets *vpp to NULL.
 	 */
 	ump = VFSTOUFS(mp);
 	dev = ump->um_dev;
 
 	/*
 	 * We do not lock vnode creation as it is believed to be too
 	 * expensive for such rare case as simultaneous creation of vnode
 	 * for same ino by different processes. We just allow them to race
 	 * and check later to decide who wins. Let the race begin!
 	 */
 	if ((error = ufs_ihashget(dev, ino, flags, vpp)) != 0)
 		return (error);
 	if (*vpp != NULL)
 		return (0);
 
 	/*
 	 * If this MALLOC() is performed after the getnewvnode()
 	 * it might block, leaving a vnode with a NULL v_data to be
 	 * found by ffs_sync() if a sync happens to fire right then,
 	 * which will cause a panic because ffs_sync() blindly
 	 * dereferences vp->v_data (as well it should).
 	 */
 	MALLOC(ip, struct inode *, sizeof(struct inode), 
 	    ump->um_malloctype, M_WAITOK);
 
 	/* Allocate a new vnode/inode. */
 	error = getnewvnode("ufs", mp, ffs_vnodeop_p, &vp);
 	if (error) {
 		/* No vnode was obtained, so only the inode memory is released. */
 		*vpp = NULL;
 		FREE(ip, ump->um_malloctype);
 		return (error);
 	}
 	/* Start from an all-zero inode before wiring it to the vnode. */
 	bzero((caddr_t)ip, sizeof(struct inode));
 	/*
 	 * FFS supports recursive locking.
 	 */
 	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
 	vp->v_data = ip;
 	ip->i_vnode = vp;
 	ip->i_ump = ump;
 	ip->i_fs = fs = ump->um_fs;
 	ip->i_dev = dev;
 	ip->i_number = ino;
 #ifdef QUOTA
 	{
 		int i;
 		for (i = 0; i < MAXQUOTAS; i++)
 			ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 	/*
 	 * Exclusively lock the vnode before adding to hash. Note, that we
 	 * must not release nor downgrade the lock (despite flags argument
 	 * says) till it is fully initialized.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, (struct mtx *)0, td);
 
 	/*
 	 * Atomically (in terms of ufs_hash operations) check the hash for
 	 * duplicate of vnode being created and add it to the hash. If a
 	 * duplicate vnode was found, it will be vget()ed from hash for us.
 	 */
 	if ((error = ufs_ihashins(ip, flags, vpp)) != 0) {
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 
 	/* We lost the race, then throw away our vnode and return existing */
 	if (*vpp != NULL) {
 		vput(vp);
 		return (0);
 	}
 
 	/* Read in the disk contents for the inode, copy into the inode. */
 	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
 	    (int)fs->fs_bsize, NOCRED, &bp);
 	if (error) {
 		/*
 		 * The inode does not contain anything useful, so it would
 		 * be misleading to leave it on its hash chain. With mode
 		 * still zero, it will be unlinked and returned to the free
 		 * list by vput().
 		 */
 		brelse(bp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	ffs_load_inode(bp, ip, ump->um_malloctype, fs, ino);
 	/*
 	 * With soft updates the effective link count is left to
 	 * softdep_load_inodeblock(); otherwise it is the on-disk link count.
 	 */
 	if (DOINGSOFTDEP(vp))
 		softdep_load_inodeblock(ip);
 	else
 		ip->i_effnlink = ip->i_nlink;
 	bqrelse(bp);
 
 	/*
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
 	error = ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp);
 	if (error) {
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	/*
 	 * Finish inode initialization now that aliasing has been resolved.
 	 */
 	ip->i_devvp = ump->um_devvp;
 	VREF(ip->i_devvp);
 	/*
 	 * Set up a generation number for this inode if it does not
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
 		/* The "+ 1" keeps the result nonzero even if random() / 2 is 0. */
 		ip->i_gen = random() / 2 + 1;
 		/*
 		 * On a read-only mount the new number stays in-core only:
 		 * the inode is not marked modified and the dinode copy is
 		 * left untouched.
 		 */
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			ip->i_flag |= IN_MODIFIED;
 			DIP(ip, i_gen) = ip->i_gen;
 		}
 	}
 	/*
 	 * Ensure that uid and gid are correct. This is a temporary
 	 * fix until fsck has been changed to do the update.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
 		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
 		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
 	}						/* XXX */
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
  * - call ffs_vget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the given client host has export rights and return
  *   those rights via exflagsp and credanonp
  */
 int
 ffs_fhtovp(mp, fhp, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	struct vnode **vpp;
 {
 	struct fs *sblock = VFSTOUFS(mp)->um_fs;
 	struct ufid *handle = (struct ufid *)fhp;
 
 	/*
 	 * Only inode numbers in [ROOTINO, fs_ncg * fs_ipg) are handed on
 	 * to ufs_fhtovp(); anything outside that range is reported as a
 	 * stale handle.
 	 */
 	if (handle->ufid_ino >= ROOTINO &&
 	    handle->ufid_ino < sblock->fs_ncg * sblock->fs_ipg)
 		return (ufs_fhtovp(mp, handle, vpp));
 	return (ESTALE);
 }
 
 /*
  * Vnode pointer to File handle
  */
 /* ARGSUSED */
 int
 ffs_vptofh(vp, fhp)
 	struct vnode *vp;
 	struct fid *fhp;
 {
 	struct inode *node = VTOI(vp);
 	struct ufid *handle = (struct ufid *)fhp;
 
 	/*
 	 * Fill in the UFS view of the caller's file handle from the inode:
 	 * inode number, generation number and the handle length.
 	 * Always returns 0.
 	 */
 	handle->ufid_ino = node->i_number;
 	handle->ufid_gen = node->i_gen;
 	handle->ufid_len = sizeof(struct ufid);
 	return (0);
 }
 
 /*
  * Initialize the filesystem.
  */
 static int
 ffs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 	int error;
 
 	/*
 	 * softdep_initialize() runs first; the value returned to the
 	 * caller is whatever ufs_init() reports.
 	 */
 	softdep_initialize();
 	error = ufs_init(vfsp);
 	return (error);
 }
 
 /*
  * Undo the work of ffs_init().
  */
 static int
 ffs_uninit(vfsp)
 	struct vfsconf *vfsp;
 {
 	int error = ufs_uninit(vfsp);
 
 	/*
 	 * softdep_uninitialize() runs after ufs_uninit(); the value
 	 * returned to the caller is the one ufs_uninit() produced.
 	 */
 	softdep_uninitialize();
 	return (error);
 }
 
 /*
  * Write a superblock and associated information back to disk.
  */
 static int
 ffs_sbupdate(mp, waitfor)
 	struct ufsmount *mp;
 	int waitfor;
 {
 	struct fs *fs = mp->um_fs;
 	struct buf *bp;
 	int blks;
 	void *space;
 	int i, size, error, allerror = 0;
 
 	/*
 	 * First write back the summary information.
 	 */
 	blks = howmany(fs->fs_cssize, fs->fs_fsize);
 	space = fs->fs_csp;
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
 		    size, 0, 0);
 		bcopy(space, bp->b_data, (u_int)size);
 		space = (char *)space + size;
 		if (waitfor != MNT_WAIT)
 			bawrite(bp);
 		else if ((error = bwrite(bp)) != 0)
 			allerror = error;
 	}
 	/*
 	 * Now write back the superblock itself. If any errors occurred
 	 * up to this point, then fail so that the superblock avoids
 	 * being written out as clean.
 	 */
 	if (allerror)
 		return (allerror);
 	bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_sblockloc),
 	    (int)fs->fs_sbsize, 0, 0);
 	fs->fs_fmod = 0;
 	fs->fs_time = time_second;
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
 	if (waitfor != MNT_WAIT)
 		bawrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		allerror = error;
 	return (allerror);
 }
 
 static int
 ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
 	int attrnamespace, const char *attrname, struct thread *td)
 {
 	int error;
 
 	/*
 	 * Forward the request unchanged: to ufs_extattrctl() when the
 	 * kernel is built with UFS_EXTATTR, to vfs_stdextattrctl()
 	 * otherwise.
 	 */
 #ifdef UFS_EXTATTR
 	error = ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname, td);
 #else
 	error = vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname, td);
 #endif
 	return (error);
 }