Index: stable/12/sys/fs/ext2fs/ext2_inode.c =================================================================== --- stable/12/sys/fs/ext2fs/ext2_inode.c (revision 349304) +++ stable/12/sys/fs/ext2fs/ext2_inode.c (revision 349305) @@ -1,644 +1,644 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. If we write now, then clear both IN_MODIFIED and * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is * set, then wait for the write to complete. 
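 *
 * As a minimal illustrative sketch (not taken from this file) of how the
 * flag contract above is normally exercised by a caller:
 *
 *	ip->i_flag |= IN_CHANGE | IN_UPDATE;	// metadata was touched
 *	error = ext2_update(vp, 1);		// wait for the inode write
 *
 * Passing waitfor == 0 instead leaves the inode buffer delayed-written via
 * bdwrite(); the same happens for waitfor != 0 on an async-mounted file
 * system (DOINGASYNC).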
*/ int ext2_update(struct vnode *vp, int waitfor) { struct m_ext2fs *fs; struct buf *bp; struct inode *ip; int error; ASSERT_VOP_ELOCKED(vp, "ext2_update"); ext2_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ip->i_e2fs; if (fs->e2fs_ronly) return (0); if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); return (error); } error = ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number))); if (error) { brelse(bp); return (error); } if (waitfor && !DOINGASYNC(vp)) return (bwrite(bp)); else { bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, int level, e4fs_daddr_t *countp) { struct buf *bp; struct m_ext2fs *fs = ip->i_e2fs; struct vnode *vp; e2fs_daddr_t *bap, *copy; int i, nblocks, error = 0, allerror = 0; e2fs_lbn_t nb, nlbn, last; e4fs_daddr_t blkcount, factor, blocksreleased = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->e2fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0, 0); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { bp->b_iocmd = BIO_READ; if (bp->b_bcount > bp->b_bufsize) panic("ext2_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (e2fs_daddr_t *)bp->b_data; copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->e2fs_bsize); bzero((caddr_t)&bap[last + 1], (NINDIR(fs) - (last + 1)) * sizeof(e2fs_daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (int32_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ext2_blkfree(ip, nb, fs->e2fs_bsize); blocksreleased += nblocks; } /* * Recursively free last partial block. 
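 *
 * A worked example of the partial case handled next, assuming 4096-byte
 * blocks (so NINDIR(fs) == 1024): at the DOUBLE level factor == 1024, so a
 * lastbn of 1500 keeps whole entries up to last = 1500 / 1024 = 1 in the
 * loop above and then recurses into entry 1 with lastbn % factor == 476 to
 * trim the partially kept single indirect block.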
*/ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } } free(copy, M_TEMP); *countp = blocksreleased; return (allerror); } /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ static int ext2_ind_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; e4fs_daddr_t lastblock; struct inode *oip; e4fs_daddr_t bn, lbn, lastiblock[EXT2_NIADDR], indir_lbn[EXT2_NIADDR]; uint32_t oldblks[EXT2_NDADDR + EXT2_NIADDR]; uint32_t newblks[EXT2_NDADDR + EXT2_NIADDR]; struct m_ext2fs *fs; struct buf *bp; int offset, size, level; e4fs_daddr_t count, nblocks, blocksreleased = 0; int error, i, allerror; off_t osize; #ifdef INVARIANTS struct bufobj *bo; #endif oip = VTOI(ovp); #ifdef INVARIANTS bo = &ovp->v_bufobj; #endif fs = oip->i_e2fs; osize = oip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) return (EFBIG); vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zero'ed in case it ever becomes accessible again because * of subsequent file growth. */ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - EXT2_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->e2fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below.
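 *
 * For example, assuming 4096-byte blocks, EXT2_NDADDR == 12 and
 * NINDIR(fs) == 1024: truncating to a length of 1 MiB gives
 * lastblock = 255, lastiblock[SINGLE] = 255 - 12 = 243 and
 * lastiblock[DOUBLE] = 243 - 1024 = -781 (TRIPLE more negative still), so
 * the single indirect chain is kept only partially while the double and
 * triple indirect trees are released entirely.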
*/ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[EXT2_NDADDR + level] = oip->i_ib[level]; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } } for (i = 0; i < EXT2_NDADDR; i++) { oldblks[i] = oip->i_db[i]; if (i > lastblock) oip->i_db[i] = 0; } oip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ext2_update(ovp, !DOINGASYNC(ovp)); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < EXT2_NDADDR; i++) { newblks[i] = oip->i_db[i]; oip->i_db[i] = oldblks[i]; } for (i = 0; i < EXT2_NIADDR; i++) { newblks[EXT2_NDADDR + i] = oip->i_ib[i]; oip->i_ib[i] = oldblks[EXT2_NDADDR + i]; } oip->i_size = osize; - error = vtruncbuf(ovp, cred, length, (int)fs->e2fs_bsize); + error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error && (allerror == 0)) allerror = error; vnode_pager_setsize(ovp, length); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -EXT2_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->e2fs_fsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = EXT2_NDADDR - 1; i > lastblock; i--) { long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ext2_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[EXT2_NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); for (i = 0; i < EXT2_NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); BO_LOCK(bo); if (length == 0 && (bo->bo_dirty.bv_cnt != 0 || bo->bo_clean.bv_cnt != 0)) panic("itrunc3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. 
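 *
 * Note that blocksreleased and i_blocks are kept in DEV_BSIZE (512-byte)
 * units, so freeing one 4096-byte file system block contributes
 * btodb(4096) == 8; the clamp to zero below is purely a sanity measure
 * against accounting drift.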
*/ oip->i_size = length; if (oip->i_blocks >= blocksreleased) oip->i_blocks -= blocksreleased; else /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); return (allerror); } static int ext2_ext_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; int32_t lastblock; struct m_ext2fs *fs; struct inode *oip; struct buf *bp; uint32_t lbn, offset; int error, size; off_t osize; oip = VTOI(ovp); fs = oip->i_e2fs; osize = oip->i_size; if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) { return (EFBIG); } vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); } lastblock = (length + fs->e2fs_bsize - 1) / fs->e2fs_bsize; error = ext4_ext_remove_space(oip, lastblock, flags, cred, td); if (error) return (error); offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) { return (error); } oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } oip->i_size = osize; - error = vtruncbuf(ovp, cred, length, (int)fs->e2fs_bsize); + error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error) return (error); vnode_pager_setsize(ovp, length); oip->i_size = length; oip->i_flag |= IN_CHANGE | IN_UPDATE; error = ext2_update(ovp, !DOINGASYNC(ovp)); return (error); } /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ext2_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct inode *ip; int error; ASSERT_VOP_LOCKED(vp, "ext2_truncate"); if (length < 0) return (EINVAL); ip = VTOI(vp); if (vp->v_type == VLNK && ip->i_size < vp->v_mount->mnt_maxsymlinklen) { #ifdef INVARIANTS if (length != 0) panic("ext2_truncate: partial truncate of symlink"); #endif bzero((char *)&ip->i_shortlink, (u_int)ip->i_size); ip->i_size = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(vp, 1)); } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(vp, 0)); } if (ip->i_flag & IN_E4EXTENTS) error = ext2_ext_truncate(vp, length, flags, cred, td); else error = ext2_ind_truncate(vp, length, flags, cred, td); return (error); } /* * discard preallocated blocks */ int ext2_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct thread *td = ap->a_td; int mode, error = 0; /* * Ignore inodes related to stale file handles. 
*/ if (ip->i_mode == 0) goto out; if (ip->i_nlink <= 0) { ext2_extattr_free(ip); error = ext2_truncate(vp, (off_t)0, 0, NOCRED, td); if (!(ip->i_flag & IN_E4EXTENTS)) ip->i_rdev = 0; mode = ip->i_mode; ip->i_mode = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; ext2_vfree(vp, ip->i_number, mode); } if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) ext2_update(vp, 0); out: /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ if (ip->i_mode == 0) vrecycle(vp); return (error); } /* * Reclaim an inode so that it can be used for other purposes. */ int ext2_reclaim(struct vop_reclaim_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; ip = VTOI(vp); if (ip->i_flag & IN_LAZYMOD) { ip->i_flag |= IN_MODIFIED; ext2_update(vp, 0); } vfs_hash_remove(vp); free(vp->v_data, M_EXT2NODE); vp->v_data = 0; vnode_destroy_vobject(vp); return (0); } Index: stable/12/sys/fs/fuse/fuse_io.c =================================================================== --- stable/12/sys/fs/fuse/fuse_io.c (revision 349304) +++ stable/12/sys/fs/fuse/fuse_io.c (revision 349305) @@ -1,809 +1,809 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_io.h" #define FUSE_DEBUG_MODULE IO #include "fuse_debug.h" static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) { struct fuse_filehandle *fufh; int err, directio; MPASS(vp->v_type == VREG || vp->v_type == VDIR); err = fuse_filehandle_getrw(vp, (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); if (err) { printf("FUSE: io dispatch: filehandles are closed\n"); return err; } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). 
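 *
 * For illustration, this is where FOPEN_DIRECT_IO originates: a daemon opts
 * a file into direct I/O from its open handler. The handler below is a
 * hypothetical libfuse-style sketch (xmp_open is not part of this tree):
 *
 *	static int xmp_open(const char *path, struct fuse_file_info *fi)
 *	{
 *		fi->direct_io = 1;	// becomes FOPEN_DIRECT_IO in the open reply
 *		return 0;
 *	}
 *
 * fuse_vnode_open() then latches that flag into FN_DIRECTIO on the vnode.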
*/ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); switch (uio->uio_rw) { case UIO_READ: if (directio) { FS_DEBUG("direct read of vnode %ju via file handle %ju\n", (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { FS_DEBUG("buffered read of vnode %ju\n", (uintmax_t)VTOILLU(vp)); err = fuse_read_biobackend(vp, uio, cred, fufh); } break; case UIO_WRITE: if (directio) { FS_DEBUG("direct write of vnode %ju via file handle %ju\n", (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); } else { FS_DEBUG("buffered write of vnode %ju\n", (uintmax_t)VTOILLU(vp)); err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); } break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } return (err); } static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct buf *bp; daddr_t lbn; int bcount; int err = 0, n = 0, on = 0; off_t filesize; const int biosize = fuse_iosize(vp); FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); bcount = MIN(MAXBSIZE, biosize); filesize = VTOFUD(vp)->filesize; do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); /* * Obtain the buffer cache block. Figure out the buffer size * when we are at EOF. If we are modifying the size of the * buffer based on an EOF condition we need to hold * nfs_rslock() through obtaining the buffer to prevent * a potential writer-appender from messing with n_size. * Otherwise we may accidentally truncate the buffer and * lose dirty data. * * Note that bcount is *not* DEV_BSIZE aligned. */ if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; } bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (!bp) return (EINTR); /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); err = fuse_io_strategy(vp, bp); if (err) { brelse(bp); return (err); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); if (n > 0) { FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," " saying %d was asked for\n", n, bp->b_data + on, n + (int)bp->b_resid); err = uiomove(bp->b_data + on, n, uio); } brelse(bp); FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", err, uio->uio_resid, n); } while (err == 0 && uio->uio_resid > 0 && n > 0); return (err); } static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; if (uio->uio_resid == 0) return (0); fdisp_init(&fdi, 0); /* * XXX In "normal" case we use an intermediate kernel buffer for * transmitting data from daemon's context to ours. Eventually, we should * get rid of this. 
Anyway, if the target uio lives in sysspace (we are * called from pageops), and the input data doesn't need kernel-side * processing (we are not called from readdir) we can already invoke * an optimized, "peer-to-peer" I/O routine. */ while (uio->uio_resid > 0) { fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", (uintmax_t)fri->fh, (uintmax_t)fri->offset, (uintmax_t)fri->size); if ((err = fdisp_wait_answ(&fdi))) goto out; FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " "resid=%zd offset=%ju\n", fri->size, fdi.iosize, uio->uio_resid, (uintmax_t)uio->uio_offset); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; if (fdi.iosize < fri->size) break; } out: fdisp_destroy(&fdi); return (err); } static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_write_in *fwi; struct fuse_dispatcher fdi; size_t chunksize; int diff; int err = 0; if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, fvdat->filesize); fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { chunksize = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), chunksize, uio))) break; if ((err = fdisp_wait_answ(&fdi))) break; diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; if (diff < 0) { err = EINVAL; break; } uio->uio_resid += diff; uio->uio_offset -= diff; if (uio->uio_offset > fvdat->filesize) - fuse_vnode_setsize(vp, cred, uio->uio_offset); + fuse_vnode_setsize(vp, uio->uio_offset); } fdisp_destroy(&fdi); return (err); } static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; int bcount; int n, on, err = 0; const int biosize = fuse_iosize(vp); KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", uio->uio_resid, uio->uio_offset, fvdat->filesize); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, fvdat->filesize); /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return err. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); n = MIN((unsigned)(biosize - on), uio->uio_resid); FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", (uintmax_t)lbn, on, n, (uintmax_t)uio->uio_offset, uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. 
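 *
 * A worked example of the sizing below, assuming biosize == 65536:
 * appending 100 bytes to a file whose size is 65600 gives lbn = 1, on = 64
 * and n = 100, so the block is obtained with bcount = on = 64 (its
 * pre-append contents) and only grown to bcount = on + n = 164 with
 * allocbuf() after the new file size has been recorded.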
*/ if (uio->uio_offset == fvdat->filesize && n) { /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; FS_DEBUG("getting block from OS, bcount %d\n", bcount); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp != NULL) { long save; - err = fuse_vnode_setsize(vp, cred, + err = fuse_vnode_setsize(vp, uio->uio_offset + n); if (err) { brelse(bp); break; } save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < fvdat->filesize) { if ((off_t)(lbn + 1) * biosize < fvdat->filesize) bcount = biosize; else bcount = fvdat->filesize - (off_t)lbn *biosize; } FS_DEBUG("getting block from OS, bcount %d\n", bcount); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp && uio->uio_offset + n > fvdat->filesize) { - err = fuse_vnode_setsize(vp, cred, + err = fuse_vnode_setsize(vp, uio->uio_offset + n); if (err) { brelse(bp); break; } } } if (!bp) { err = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); fuse_io_strategy(vp, bp); if ((err = bp->b_error)) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { FS_DEBUG("FUSE append race @%lx:%d\n", (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { /* * Yes, we mean it. Write out everything to "storage" * immediately, without hesitation. 
(Apart from other * reasons: the only way to know if a write is valid * if its actually written out.) */ bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; break; } goto again; } err = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } err = bwrite(bp); if (err) break; } while (uio->uio_resid > 0 && n > 0); if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) fuse_vnode_savesize(vp, cred); return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; int error = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), bp->b_bcount); error = fuse_filehandle_getrw(vp, (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; return (error); } cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = curthread; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); if ((!error && uiop->uio_resid) || (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && uiop->uio_offset >= fvdat->cached_attrs.va_size)) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. 
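 *
 * For instance, if b_bcount is 4096 and the daemon returns only 1000 bytes
 * with no error, uio_resid is left at 3096: nread below becomes 1000, the
 * trailing 3096 bytes of the buffer are bzero()ed, and the read completes
 * as if the hole were backed by real zeroes.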
*/ int nread = bp->b_bcount - uiop->uio_resid; int left = uiop->uio_resid; if (error != 0) { printf("FUSE: Fix broken io: offset %ju, " " resid %zd, file size %ju/%ju\n", (uintmax_t)uiop->uio_offset, uiop->uio_resid, fvdat->filesize, fvdat->cached_attrs.va_size); error = 0; } if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); } /* * Setup for actual write */ if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > fvdat->filesize) bp->b_dirtyend = fvdat->filesize - (off_t)bp->b_blkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); if (error == EINTR || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; bufdone(bp); return (error); } int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) { return (vn_fsync_buf(vp, waitfor)); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int fuse_io_invalbuf(struct vnode *vp, struct thread *td) { struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); while (fvdat->flag & FN_FLUSHINPROG) { struct proc *p = td->td_proc; if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) error = EINTR; PROC_UNLOCK(p); } if (error == EINTR) return EINTR; } fvdat->flag |= FN_FLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return (error); } Index: stable/12/sys/fs/fuse/fuse_node.c =================================================================== --- stable/12/sys/fs/fuse/fuse_node.c (revision 349304) +++ stable/12/sys/fs/fuse/fuse_node.c (revision 349305) @@ -1,402 +1,402 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_io.h" #include "fuse_ipc.h" #define FUSE_DEBUG_MODULE VNOPS #include "fuse_debug.h" MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data"); static int fuse_node_count = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, node_count, CTLFLAG_RD, &fuse_node_count, 0, "Count of FUSE vnodes"); int fuse_data_cache_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, data_cache_enable, CTLFLAG_RW, &fuse_data_cache_enable, 0, "enable caching of FUSE file data (including dirty data)"); int fuse_data_cache_invalidate = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, data_cache_invalidate, CTLFLAG_RW, &fuse_data_cache_invalidate, 0, "If non-zero, discard cached clean file data when there are no active file" " users"); int fuse_mmap_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, mmap_enable, CTLFLAG_RW, &fuse_mmap_enable, 0, "If non-zero, and data_cache_enable is also non-zero, enable mmap(2) of " "FUSE files"); int fuse_refresh_size = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, refresh_size, CTLFLAG_RW, &fuse_refresh_size, 0, "If non-zero, and no dirty file extension data is buffered, fetch file " "size before write operations"); int fuse_sync_resize = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, sync_resize, CTLFLAG_RW, &fuse_sync_resize, 0, "If a cached write extended a file, inform FUSE filesystem of the changed" "size immediately subsequent to the issued writes"); int fuse_fix_broken_io = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, fix_broken_io, CTLFLAG_RW, &fuse_fix_broken_io, 0, "If non-zero, print a diagnostic warning if a userspace filesystem returns" " EIO on reads of recently extended portions of files"); static void fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat, uint64_t nodeid, enum vtype vtyp) { int i; fvdat->nid = nodeid; if (nodeid == FUSE_ROOT_ID) { vp->v_vflag |= VV_ROOT; } vp->v_type = vtyp; vp->v_data = fvdat; for (i = 0; i < FUFH_MAXTYPE; i++) fvdat->fufh[i].fh_type = FUFH_INVALID; atomic_add_acq_int(&fuse_node_count, 1); } void fuse_vnode_destroy(struct vnode *vp) { struct fuse_vnode_data *fvdat = vp->v_data; vp->v_data = NULL; free(fvdat, M_FUSEVN); atomic_subtract_acq_int(&fuse_node_count, 1); } static int fuse_vnode_cmp(struct vnode *vp, void *nidp) { return (VTOI(vp) != *((uint64_t *)nidp)); } static uint32_t __inline fuse_vnode_hash(uint64_t id) { return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); } static int fuse_vnode_alloc(struct mount *mp, struct thread *td, uint64_t nodeid, enum vtype vtyp, struct vnode **vpp) { struct fuse_vnode_data *fvdat; struct vnode *vp2; int err = 0; FS_DEBUG("been asked for vno #%ju\n", (uintmax_t)nodeid); if (vtyp == VNON) { return EINVAL; } *vpp = NULL; err = vfs_hash_get(mp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, vpp, fuse_vnode_cmp, &nodeid); if (err) return (err); if (*vpp) { MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL); FS_DEBUG("vnode taken from hash\n"); return (0); } fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO); err = getnewvnode("fuse", mp, &fuse_vnops, vpp); if (err) { free(fvdat, M_FUSEVN); return (err); } lockmgr((*vpp)->v_vnlock, LK_EXCLUSIVE, NULL); fuse_vnode_init(*vpp, fvdat, nodeid, vtyp); err = insmntque(*vpp, mp); ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); if (err) { free(fvdat, M_FUSEVN); *vpp = NULL; 
return (err); } err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, &vp2, fuse_vnode_cmp, &nodeid); if (err) return (err); if (vp2 != NULL) { *vpp = vp2; return (0); } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); return (0); } int fuse_vnode_get(struct mount *mp, uint64_t nodeid, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp) { struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread); int err = 0; debug_printf("dvp=%p\n", dvp); err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp); if (err) { return err; } if (dvp != NULL) { MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); fuse_vnode_setparent(*vpp, dvp); } if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0) { ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get"); ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get"); cache_enter(dvp, *vpp, cnp); } /* * In userland, libfuse uses cached lookups for dot and dotdot entries, * thus it does not really bump the nlookup counter for forget. * Follow the same semantics and avoid bumping it in order to keep * nlookup counters consistent. */ if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 && (cnp->cn_namelen != 1 || cnp->cn_nameptr[0] != '.'))) VTOFUD(*vpp)->nlookup++; return 0; } void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td) { /* * Function is called for every vnode open. * Merge fuse_open_flags; it may be 0. */ /* * Ideally speaking, direct io should be enabled on * fd's, but there is no way of providing that in * this implementation. * * Also cannot think of a reason why two * different fd's on the same vnode would want to * have DIRECT_IO turned on and off. But the Linux- * based implementation works on an fd, not an * inode, and provides such a feature. * * XXXIP: Handle fd based DIRECT_IO */ if (fuse_open_flags & FOPEN_DIRECT_IO) { ASSERT_VOP_ELOCKED(vp, __func__); VTOFUD(vp)->flag |= FN_DIRECTIO; fuse_io_invalbuf(vp, td); } else { if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0) fuse_io_invalbuf(vp, td); VTOFUD(vp)->flag &= ~FN_DIRECTIO; } if (vnode_vtype(vp) == VREG) { /* XXXIP prevent getattr, by using cached node size */ vnode_create_vobject(vp, 0, td); } } int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct thread *td = curthread; struct fuse_filehandle *fufh = NULL; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; int err = 0; FS_DEBUG("inode=%ju size=%ju\n", (uintmax_t)VTOI(vp), (uintmax_t)fvdat->filesize); ASSERT_VOP_ELOCKED(vp, "fuse_io_extend"); if (fuse_isdeadfs(vp)) { return EBADF; } if (vnode_vtype(vp) == VDIR) { return EISDIR; } if (vfs_isrdonly(vnode_mount(vp))) { return EROFS; } if (cred == NULL) { cred = td->td_ucred; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; /* Truncate to a new value.
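 *
 * In protocol terms this builds a FUSE_SETATTR request whose valid mask
 * carries FATTR_SIZE (plus FATTR_FH when a writable handle is cached),
 * which is how an extension recorded under FN_SIZECHANGE is finally pushed
 * to the daemon; on success the flag is cleared below.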
*/ fsai->size = fvdat->filesize; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0) fvdat->flag &= ~FN_SIZECHANGE; return err; } void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct vattr va; if ((fvdat->flag & FN_SIZECHANGE) != 0 || (fuse_refresh_size == 0 && fvdat->filesize != 0)) return; VOP_GETATTR(vp, &va, cred); FS_DEBUG("refreshed file size: %jd\n", (intmax_t)VTOFUD(vp)->filesize); } int -fuse_vnode_setsize(struct vnode *vp, struct ucred *cred, off_t newsize) +fuse_vnode_setsize(struct vnode *vp, off_t newsize) { struct fuse_vnode_data *fvdat = VTOFUD(vp); off_t oldsize; int err = 0; FS_DEBUG("inode=%ju oldsize=%ju newsize=%ju\n", (uintmax_t)VTOI(vp), (uintmax_t)fvdat->filesize, (uintmax_t)newsize); ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize"); oldsize = fvdat->filesize; fvdat->filesize = newsize; fvdat->flag |= FN_SIZECHANGE; if (newsize < oldsize) { - err = vtruncbuf(vp, cred, newsize, fuse_iosize(vp)); + err = vtruncbuf(vp, newsize, fuse_iosize(vp)); } vnode_pager_setsize(vp, newsize); return err; } Index: stable/12/sys/fs/fuse/fuse_node.h =================================================================== --- stable/12/sys/fs/fuse/fuse_node.h (revision 349304) +++ stable/12/sys/fs/fuse/fuse_node.h (revision 349305) @@ -1,133 +1,133 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FUSE_NODE_H_ #define _FUSE_NODE_H_ #include #include #include "fuse_file.h" #define FN_REVOKED 0x00000020 #define FN_FLUSHINPROG 0x00000040 #define FN_FLUSHWANT 0x00000080 #define FN_SIZECHANGE 0x00000100 #define FN_DIRECTIO 0x00000200 struct fuse_vnode_data { /** self **/ uint64_t nid; /** parent **/ /* XXXIP very likely to be stale, it's not updated in rename() */ uint64_t parent_nid; /** I/O **/ struct fuse_filehandle fufh[FUFH_MAXTYPE]; /** flags **/ uint32_t flag; /** meta **/ struct vattr cached_attrs; off_t filesize; uint64_t nlookup; enum vtype vtype; }; #define VTOFUD(vp) \ ((struct fuse_vnode_data *)((vp)->v_data)) #define VTOI(vp) (VTOFUD(vp)->nid) #define VTOVA(vp) (&(VTOFUD(vp)->cached_attrs)) #define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0)) #define FUSE_NULL_ID 0 extern struct vop_vector fuse_vnops; static __inline void fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp) { if (dvp != NULL && vp->v_type == VDIR) { MPASS(dvp->v_type == VDIR); VTOFUD(vp)->parent_nid = VTOI(dvp); } } void fuse_vnode_destroy(struct vnode *vp); int fuse_vnode_get(struct mount *mp, uint64_t nodeid, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp); void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td); void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred); int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred); -int fuse_vnode_setsize(struct vnode *vp, struct ucred *cred, off_t newsize); +int fuse_vnode_setsize(struct vnode *vp, off_t newsize); #endif /* _FUSE_NODE_H_ */ Index: stable/12/sys/fs/fuse/fuse_vnops.c =================================================================== --- stable/12/sys/fs/fuse/fuse_vnops.c (revision 349304) +++ stable/12/sys/fs/fuse/fuse_vnops.c (revision 349305) @@ -1,2391 +1,2391 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. 
nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_param.h" #include "fuse_io.h" #include #define FUSE_DEBUG_MODULE VNOPS #include "fuse_debug.h" /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; struct vop_vector fuse_vnops = { .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, }; static u_long fuse_lookup_cache_hits = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, &fuse_lookup_cache_hits, 0, ""); static u_long fuse_lookup_cache_misses = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, &fuse_lookup_cache_misses, 0, ""); int fuse_lookup_cache_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, &fuse_lookup_cache_enable, 0, ""); /* * XXX: This feature is highly experimental and can bring to instabilities, * needs revisiting before to be enabled by default. 
*/ static int fuse_reclaim_revoked = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, reclaim_revoked, CTLFLAG_RW, &fuse_reclaim_revoked, 0, ""); int fuse_pbuf_freecnt = -1; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN, 0) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } bzero(&facp, sizeof(facp)); err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); FS_DEBUG2G("err=%d accmode=0x%x\n", err, accmode); return err; } /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; fufh_type_t fufh_type; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if (vnode_isdir(vp)) { if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return 0; } if (fflag & IO_NDELAY) { return 0; } fufh_type = fuse_filehandle_xlate_from_fflags(fflag); if (!fuse_filehandle_valid(vp, fufh_type)) { int i; for (i = 0; i < FUFH_MAXTYPE; i++) if (fuse_filehandle_valid(vp, i)) break; if (i == FUFH_MAXTYPE) panic("FUSE: fufh type %d found to be invalid in close" " (fflag=0x%x)\n", fufh_type, fflag); } if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred); } return 0; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_open_in *foi; struct fuse_entry_out *feo; struct fuse_dispatcher fdi; struct fuse_dispatcher *fdip = &fdi; int err; struct mount *mp = vnode_mount(dvp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); uint64_t x_fh_id; uint32_t x_open_flags; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } bzero(&fdi, sizeof(fdi)); /* XXX: Will we ever want devices ? 
*/ if ((vap->va_type != VREG)) { printf("fuse_vnop_create: unsupported va_type %d\n", vap->va_type); return (EINVAL); } debug_printf("parent nid = %ju, mode = %x\n", (uintmax_t)parentnid, mode); fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); if (!fsess_isimpl(mp, FUSE_CREATE)) { debug_printf("eh, daemon doesn't implement create?\n"); return (EINVAL); } fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); foi = fdip->indata; foi->mode = mode; foi->flags = O_CREAT | O_RDWR; memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_CREATE); debug_printf("create: got err=%d from daemon\n", err); goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, VREG))) { goto out; } err = fuse_vnode_get(mp, feo->nodeid, dvp, vpp, cnp, VREG); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = OFLAGS(mode); fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick); return err; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fdip->answ = feo + 1; x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); fuse_vnode_open(*vpp, x_open_flags, td); cache_purge_negative(dvp); out: fdisp_destroy(fdip); return err; } /* * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux * version of FUSE also has a FUSE_FLUSH method. * * On Linux, fsync() synchronizes a file's complete in-core state with that * on disk. The call is not supposed to return until the system has completed * that action or until an error is detected. * * Linux also has an fdatasync() call that is similar to fsync() but is not * required to update the metadata such as access time and modification time. */ /* struct vnop_fsync_args { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); int type, err = 0; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; if (!fsess_isimpl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { goto out; } for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { fuse_internal_fsync(vp, td, NULL, fufh); } } out: return 0; } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); int err = 0; int dataflags; struct fuse_dispatcher fdi; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. 
*/ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; debug_printf("fuse_getattr b: returning ENOTCONN\n"); return err; } else { goto fake; } } fdisp_init(&fdi, 0); if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { if ((err == ENOTCONN) && vnode_isvroot(vp)) { /* see comment at similar place in fuse_statfs() */ fdisp_destroy(&fdi); goto fake; } if (err == ENOENT) { fuse_internal_vnode_disappear(vp); } goto out; } cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); if (vap != VTOVA(vp)) { memcpy(vap, VTOVA(vp), sizeof(*vap)); } if (vap->va_type != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; goto out; } if ((fvdat->flag & FN_SIZECHANGE) != 0) vap->va_size = fvdat->filesize; if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { /* * This is for those cases when the file size changed without us * knowing, and we want to catch up. */ off_t new_filesize = ((struct fuse_attr_out *) fdi.answ)->attr.size; if (fvdat->filesize != new_filesize) { - fuse_vnode_setsize(vp, cred, new_filesize); + fuse_vnode_setsize(vp, new_filesize); } } debug_printf("fuse_getattr e: returning 0\n"); out: fdisp_destroy(&fdi); return err; fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type, need_flush = 1; FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL); } if (fuse_data_cache_invalidate || (fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, type, td, NULL); } } if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { vrecycle(vp); } return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } if (vap->va_nlink >= FUSE_LINK_MAX) { return EMLINK; } fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); out: fdisp_destroy(&fdi); return err; } /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = 
flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; enum fuse_opcode op; uint64_t nid; struct fuse_access_param facp; FS_DEBUG2G("parent_inode=%ju - %*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) { return ENOTDIR; } if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { return EROFS; } /* * We do access check prior to doing anything else only in the case * when we are at fs root (we'd like to say, "we are at the first * component", but that's not exactly the same... nevermind). * See further comments at further access checks. */ bzero(&facp, sizeof(facp)); if (vnode_isvroot(dvp)) { /* early permission check hack */ if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { return err; } } if (flags & ISDOTDOT) { nid = VTOFUD(dvp)->parent_nid; if (nid == 0) { return ENOENT; } fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (fuse_lookup_cache_enable) { err = cache_lookup(dvp, vpp, cnp, NULL, NULL); switch (err) { case -1: /* positive match */ atomic_add_acq_long(&fuse_lookup_cache_hits, 1); return 0; case 0: /* no match in cache */ atomic_add_acq_long(&fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ /* fall through */ default: return err; } } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); op = FUSE_LOOKUP; calldaemon: fdisp_make(&fdi, op, mp, nid, td, cred); if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; } lookup_err = fdisp_wait_answ(&fdi); if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; if (!nid) { /* * zero nodeid is the same as "not found", * but it's also cacheable (which we keep * keep on doing not as of writing this) */ lookup_err = ENOENT; } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { fdisp_destroy(&fdi); return lookup_err; } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { if ((nameiop == CREATE || nameiop == RENAME) && islastcn /* && directory dvp has not been removed */ ) { if (vfs_isrdonly(mp)) { err = EROFS; goto out; } #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Possibly record the position of a slot in the * directory large enough for the new component name. * This can be recorded in the vnode private data for * dvp. Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; goto out; } /* Consider inserting name into cache. */ /* * No we can't use negative caching, as the fs * changes are out of our control. * False positives' falseness turns out just as things * go by, but false negatives' falseness doesn't. * (and aiding the caching mechanism with extra control * mechanisms comes quite close to beating the whole purpose * caching...) 
*/ #if 0 if ((cnp->cn_flags & MAKEENTRY) != 0) { FS_DEBUG("inserting NULL into cache\n"); cache_enter(dvp, NULL, cnp); } #endif err = ENOENT; goto out; } else { /* !lookup_err */ struct fuse_entry_out *feo = NULL; struct fuse_attr *fattr = NULL; if (op == FUSE_GETATTR) { fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { feo = (struct fuse_entry_out *)fdi.answ; fattr = &(feo->attr); } /* * If deleting, and at end of pathname, return parameters * which can be used to remove file. If the wantparent flag * isn't set, we return only the directory, otherwise we go on * and lock the inode, being careful with ".". */ if (nameiop == DELETE && islastcn) { /* * Check for write access on directory. */ facp.xuid = fattr->uid; facp.facc_flags |= FACCESS_STICKY; err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); facp.facc_flags &= ~FACCESS_XQUERIES; if (err) { goto out; } if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(dvp->v_mount, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) goto out; *vpp = vp; } /* * Save the name for use in VOP_RMDIR and VOP_REMOVE * later. */ cnp->cn_flags |= SAVENAME; goto out; } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && wantparent && islastcn) { #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Check for "." */ if (nid == VTOI(dvp)) { err = EISDIR; goto out; } err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } *vpp = vp; /* * Save the name for use in VOP_RENAME later. */ cnp->cn_flags |= SAVENAME; goto out; } if (flags & ISDOTDOT) { struct mount *mp; int ltype; /* * Expanded copy of vn_vget_ino() so that * fuse_vnode_get() can be used. */ mp = dvp->v_mount; ltype = VOP_ISLOCKED(dvp); err = vfs_busy(mp, MBF_NOWAIT); if (err != 0) { vfs_ref(mp); VOP_UNLOCK(dvp, 0); err = vfs_busy(mp, 0); vn_lock(dvp, ltype | LK_RETRY); vfs_rel(mp); if (err) goto out; if ((dvp->v_iflag & VI_DOOMED) != 0) { err = ENOENT; vfs_unbusy(mp); goto out; } } VOP_UNLOCK(dvp, 0); err = fuse_vnode_get(vnode_mount(dvp), nid, NULL, &vp, cnp, IFTOVT(fattr->mode)); vfs_unbusy(mp); vn_lock(dvp, ltype | LK_RETRY); if ((dvp->v_iflag & VI_DOOMED) != 0) { if (err == 0) vput(vp); err = ENOENT; } if (err) goto out; *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } fuse_vnode_setparent(vp, dvp); *vpp = vp; } if (op == FUSE_GETATTR) { cache_attrs(*vpp, (struct fuse_attr_out *)fdi.answ); } else { cache_attrs(*vpp, (struct fuse_entry_out *)fdi.answ); } /* Insert name into cache if appropriate. */ /* * Nooo, caching is evil. With caching, we can't avoid stale * information taking over the playground (cached info is not * just positive/negative, it does have qualitative aspects, * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when * walking down along cached path components, and that's not * any cheaper than FUSE_LOOKUP. This might change with * implementing kernel side attr caching, but... In Linux, * lookup results are not cached, and the daemon is bombarded * with FUSE_LOOKUPS on and on. This shows that by design, the * daemon is expected to handle frequent lookup queries * efficiently, do its caching in userspace, and so on. 
* * So just leave the name cache alone. */ /* * Well, now I know, Linux caches lookups, but with a * timeout... So it's the same thing as attribute caching: * we can deal with it when implement timeouts. */ #if 0 if (cnp->cn_flags & MAKEENTRY) { cache_enter(dvp, *vpp, cnp); } #endif } out: if (!lookup_err) { /* No lookup error; need to clean up. */ if (err) { /* Found inode; exit with no vnode. */ if (op == FUSE_LOOKUP) { fuse_internal_forget_send(vnode_mount(dvp), td, cred, nid, 1); } fdisp_destroy(&fdi); return err; } else { #ifndef NO_EARLY_PERM_CHECK_HACK if (!islastcn) { /* * We have the attributes of the next item * *now*, and it's a fact, and we do not * have to do extra work for it (ie, beg the * daemon), and it neither depends on such * accidental things like attr caching. So * the big idea: check credentials *now*, * not at the beginning of the next call to * lookup. * * The first item of the lookup chain (fs root) * won't be checked then here, of course, as * its never "the next". But go and see that * the root is taken care about at the very * beginning of this function. * * Now, given we want to do the access check * this way, one might ask: so then why not * do the access check just after fetching * the inode and its attributes from the * daemon? Why bother with producing the * corresponding vnode at all if something * is not OK? We know what's the deal as * soon as we get those attrs... There is * one bit of info though not given us by * the daemon: whether his response is * authoritative or not... His response should * be ignored if something is mounted over * the dir in question. But that can be * known only by having the vnode... */ int tmpvtype = vnode_vtype(*vpp); bzero(&facp, sizeof(facp)); /*the early perm check hack */ facp.facc_flags |= FACCESS_VA_VALID; if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { err = ENOTDIR; } if (!err && !vnode_mountedhere(*vpp)) { err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); } if (err) { if (tmpvtype == VLNK) FS_DEBUG("weird, permission error with a symlink?\n"); vput(*vpp); *vpp = NULL; } } #endif } } fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { return (EINVAL); } /* struct vnop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; fufh_type_t fufh_type; struct fuse_vnode_data *fvdat; int error, isdir = 0; int32_t fuse_open_flags; FS_DEBUG2G("inode=%ju mode=0x%x\n", (uintmax_t)VTOI(vp), mode); if (fuse_isdeadfs(vp)) { return ENXIO; } fvdat = VTOFUD(vp); if (vnode_isdir(vp)) { isdir = 1; } fuse_open_flags = 0; if (isdir) { fufh_type = FUFH_RDONLY; } else { fufh_type = 
fuse_filehandle_xlate_from_fflags(mode); /* * For WRONLY opens, force DIRECT_IO. This is necessary * since writing a partial block through the buffer cache * will result in a read of the block and that read won't * be allowed by the WRONLY open. */ if (fufh_type == FUFH_WRONLY || (fvdat->flag & FN_DIRECTIO) != 0) fuse_open_flags = FOPEN_DIRECT_IO; } if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) { fuse_vnode_open(vp, fuse_open_flags, td); return 0; } error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); return error; } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; FS_DEBUG2G("inode=%ju offset=%jd resid=%zd\n", (uintmax_t)VTOI(vp), uio->uio_offset, uio->uio_resid); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct fuse_iov cookediov; int err = 0; int freefufh = 0; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { FS_DEBUG("calling readdir() before open()"); err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); freefufh = 1; } else { err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); } if (err) { return (err); } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, fufh, &cookediov); fiov_teardown(&cookediov); if (freefufh) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; */ static int 
fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", type); fuse_filehandle_close(vp, type, td, NULL); } } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } fuse_vnode_setparent(vp, NULL); cache_purge(vp); vfs_hash_remove(vp); vnode_destroy_vobject(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(vp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; int err = 0; FS_DEBUG2G("from: inode=%ju name=%*s -> to: inode=%ju name=%*s\n", (uintmax_t)VTOI(fvp), (int)fcnp->cn_namelen, fcnp->cn_nameptr, (uintmax_t)(tvp == NULL ? -1 : VTOI(tvp)), (int)tcnp->cn_namelen, tcnp->cn_nameptr); if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { FS_DEBUG("cross-device rename: %s -> %s\n", fcnp->cn_nameptr, (tcnp != NULL ? tcnp->cn_nameptr : "(NULL)")); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. 
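 *
 * The check referred to above amounts to walking from the target's parent
 * directory up toward the root and failing if the directory being renamed
 * is met on the way.  A minimal sketch, assuming a hypothetical userspace
 * node type with a parent pointer rather than kernel vnodes:
 *
 *	#include <errno.h>
 *
 *	struct node { struct node *parent; };
 *
 *	static int
 *	rename_loop_check(struct node *src_dir, struct node *dst_dir)
 *	{
 *		struct node *n;
 *
 *		for (n = dst_dir; n != NULL; n = n->parent)
 *			if (n == src_dir)
 *				return (EINVAL);
 *		return (0);
 *	}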
*/ data = fuse_get_mpdata(vnode_mount(tdvp)); sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if ((tvp != NULL) && vnode_isdir(tvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; struct fuse_access_param facp; int err = 0; enum vtype vtyp; int sizechanged = 0; uint64_t newsize = 0; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; bzero(&facp, sizeof(facp)); facp.xuid = vap->va_uid; facp.xgid = vap->va_gid; if (vap->va_uid != (uid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->uid = vap->va_uid; fsai->valid |= FATTR_UID; } if (vap->va_gid != (gid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->gid = vap->va_gid; fsai->valid |= FATTR_GID; } if (vap->va_size != VNOVAL) { struct fuse_filehandle *fufh = NULL; /*Truncate to a new value. */ fsai->size = vap->va_size; sizechanged = 1; newsize = vap->va_size; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } } if (vap->va_atime.tv_sec != VNOVAL) { fsai->atime = vap->va_atime.tv_sec; fsai->atimensec = vap->va_atime.tv_nsec; fsai->valid |= FATTR_ATIME; } if (vap->va_mtime.tv_sec != VNOVAL) { fsai->mtime = vap->va_mtime.tv_sec; fsai->mtimensec = vap->va_mtime.tv_nsec; fsai->valid |= FATTR_MTIME; } if (vap->va_mode != (mode_t)VNOVAL) { fsai->mode = vap->va_mode & ALLPERMS; fsai->valid |= FATTR_MODE; } if (!fsai->valid) { goto out; } vtyp = vnode_vtype(vp); if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { err = EISDIR; goto out; } if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { err = EROFS; goto out; } if (fsai->valid & ~FATTR_SIZE) { /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ /*XXX */ err = 0; } facp.facc_flags &= ~FACCESS_XQUERIES; if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && vap->va_vaflags & VA_UTIMES_NULL) { err = fuse_internal_access(vp, VWRITE, &facp, td, cred); } if (err) goto out; if ((err = fdisp_wait_answ(&fdi))) goto out; vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); if (vnode_vtype(vp) != vtyp) { if (vnode_vtype(vp) == VNON && vtyp != VNON) { debug_printf("FUSE: Dang! 
vnode_vtype is VNON and vtype isn't.\n"); } else { /* * STALE vnode, ditch * * The vnode has changed its type "behind our back". There's * nothing really we can do, so let us just force an internal * revocation and tell the caller to try again, if interested. */ fuse_internal_vnode_disappear(vp); err = EAGAIN; } } if (!err && !sizechanged) { cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); } out: fdisp_destroy(&fdi); if (!err && sizechanged) { - fuse_vnode_setsize(vp, cred, newsize); + fuse_vnode_setsize(vp, newsize); VTOFUD(vp)->flag &= ~FN_SIZECHANGE; } return err; } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; fuse_trace_printf_vnop(); if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return ENXIO; } if (bp->b_iocmd == BIO_WRITE) fuse_vnode_refreshsize(vp, NOCRED); (void)fuse_io_strategy(vp, bp); /* * This is a dangerous function. If returns error, that might mean a * panic. We prefer pretty much anything over being forced to panic * by a malicious daemon (a demon?). So we just return 0 anyway. You * should never mind this: this function has its own error * propagation mechanism via the argument buffer, so * not-that-melodramatic residents of the call chain still will be * able to know what to do. */ return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. 
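 *
 * Concretely, the payload built below is the new entry's name,
 * NUL-terminated, followed by the link target, NUL-terminated.  A
 * standalone sketch of the same packing, with a made-up helper name:
 *
 *	#include <stdlib.h>
 *	#include <string.h>
 *
 *	static char *
 *	pack_symlink_payload(const char *name, const char *target,
 *	    size_t *lenp)
 *	{
 *		size_t nlen = strlen(name) + 1;
 *		size_t tlen = strlen(target) + 1;
 *		char *buf = malloc(nlen + tlen);
 *
 *		if (buf == NULL)
 *			return (NULL);
 *		memcpy(buf, name, nlen);
 *		memcpy(buf + nlen, target, tlen);
 *		*lenp = nlen + tlen;
 *		return (buf);
 *	}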
*/ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return ENXIO; } fuse_vnode_refreshsize(vp, cred); if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to getpages")); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; npages = ap->a_count; if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } /* * If the last page is partially valid, just return it and allow * the pager to zero-out the blanks. Partially valid pages can * only occur at the file EOF. * * XXXGL: is that true for FUSE, which is a local filesystem, * but still somewhat disconnected from the kernel? */ VM_OBJECT_WLOCK(vp->v_object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(vp->v_object); /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (error && (uio.uio_resid == count)) { FS_DEBUG("error %d\n", error); return VM_PAGER_ERROR; } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(vp->v_object); fuse_vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error occurred * we may have hit a zero-fill section. 
We simply * leave valid set to 0. */ ; } } fuse_vm_page_unlock_queues(); out: VM_OBJECT_WUNLOCK(vp->v_object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); } /* struct vnop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; }; */ static int fuse_vnop_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; vm_ooffset_t fsize; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to putpages")); fsize = vp->v_object->un_pager.vnp.vnp_size; td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; /* * When putting pages, do not extend file past EOF. */ if (offset + count > fsize) { count = fsize - offset; if (count < 0) count = 0; } /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; VM_OBJECT_WLOCK(pages[i]->object); vm_page_undirty(pages[i]); VM_OBJECT_WUNLOCK(pages[i]->object); } } return rtvals[0]; } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. 
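 *
 * The same probe-then-fetch convention is visible from userland through
 * extattr_get_file(2), where a NULL buffer asks only for the size.  A
 * minimal sketch (error handling trimmed; the size may change between
 * the two calls):
 *
 *	#include <sys/types.h>
 *	#include <sys/extattr.h>
 *	#include <stdlib.h>
 *
 *	static void *
 *	get_user_xattr(const char *path, const char *name, ssize_t *sizep)
 *	{
 *		ssize_t sz;
 *		void *buf;
 *
 *		sz = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name,
 *		    NULL, 0);
 *		if (sz < 0 || (buf = malloc(sz)) == NULL)
 *			return (NULL);
 *		*sizep = extattr_get_file(path, EXTATTR_NAMESPACE_USER,
 *		    name, buf, sz);
 *		return (buf);
 *	}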
*/ if (uio == NULL) get_xattr_in->size = 0; else get_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*get_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_GETXATTR); debug_printf("getxattr: got err=%d from daemon\n", err); goto out; } get_xattr_out = fdi.answ; if (ap->a_size != NULL) *ap->a_size = get_xattr_out->size; if (uio != NULL) err = uiomove(fdi.answ, fdi.iosize, uio); out: fdisp_destroy(&fdi); return (err); } /* struct vop_setextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_setxattr_in *set_xattr_in; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*set_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len, uio->uio_resid, uio); if (err != 0) { debug_printf("setxattr: got error %d from uiomove\n", err); goto out; } err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_SETXATTR); debug_printf("setxattr: got err=%d from daemon\n", err); goto out; } out: fdisp_destroy(&fdi); return (err); } /* * The Linux / FUSE extended attribute list is simply a collection of * NUL-terminated strings. The FreeBSD extended attribute list is a single * byte length followed by a non-NUL terminated string. So, this allows * conversion of the Linux / FUSE format to the FreeBSD format in place. * Linux attribute names are reported with the namespace as a prefix (e.g. * "user.attribute_name"), but in FreeBSD they are reported without the * namespace prefix (e.g. "attribute_name"). So, we're going from: * * user.attr_name1\0user.attr_name2\0 * * to: * * attr_name1attr_name2 * * Where "" is a single byte number of characters in the attribute name. 
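 *
 * A worked example with two made-up user-namespace attributes, "md5" and
 * "tags":
 *
 *   Linux / FUSE input, names NUL-terminated, namespace prefix kept:
 *       'u' 's' 'e' 'r' '.' 'm' 'd' '5' \0
 *       'u' 's' 'e' 'r' '.' 't' 'a' 'g' 's' \0        (list_len = 19)
 *
 *   FreeBSD output, one length byte, then the bare name, no NUL:
 *        3  'm' 'd' '5'
 *        4  't' 'a' 'g' 's'                           (bsd_list_len = 9)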
* * Args: * prefix - exattr namespace prefix string * list, list_len - input list with namespace prefixes * bsd_list, bsd_list_len - output list compatible with bsd vfs */ static int fuse_xattrlist_convert(char *prefix, const char *list, int list_len, char *bsd_list, int *bsd_list_len) { int len, pos, dist_to_next, prefix_len; pos = 0; *bsd_list_len = 0; prefix_len = strlen(prefix); while (pos < list_len && list[pos] != '\0') { dist_to_next = strlen(&list[pos]) + 1; if (bcmp(&list[pos], prefix, prefix_len) == 0 && list[pos + prefix_len] == extattr_namespace_separator) { len = dist_to_next - (prefix_len + sizeof(extattr_namespace_separator)) - 1; if (len >= EXTATTR_MAXNAMELEN) return (ENAMETOOLONG); bsd_list[*bsd_list_len] = len; memcpy(&bsd_list[*bsd_list_len + 1], &list[pos + prefix_len + sizeof(extattr_namespace_separator)], len); *bsd_list_len += len + 1; } pos += dist_to_next; } return (0); } /* struct vop_listextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_listxattr_in *list_xattr_in; struct fuse_listxattr_out *list_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; size_t len; char *prefix; char *attr_str; char *bsd_list = NULL; char *linux_list; int bsd_list_len; int linux_list_len; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) return (ENXIO); /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + 1; fdisp_init(&fdi, sizeof(*list_xattr_in) + len); fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); /* * Retrieve Linux / FUSE compatible list size. */ list_xattr_in = fdi.indata; list_xattr_in->size = 0; attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_LISTXATTR); debug_printf("listextattr: got err=%d from daemon\n", err); goto out; } list_xattr_out = fdi.answ; linux_list_len = list_xattr_out->size; if (linux_list_len == 0) { if (ap->a_size != NULL) *ap->a_size = linux_list_len; goto out; } /* * Retrieve Linux / FUSE compatible list values. */ fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out); attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) goto out; linux_list = fdi.answ; linux_list_len = fdi.iosize; /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. 
*/ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_REMOVEXATTR); debug_printf("removexattr: got err=%d from daemon\n", err); } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } Index: stable/12/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- stable/12/sys/fs/msdosfs/msdosfs_denode.c (revision 349304) +++ stable/12/sys/fs/msdosfs/msdosfs_denode.c (revision 349305) @@ -1,620 +1,620 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSDOSFSNODE, "msdosfs_node", "MSDOSFS vnode private part"); static int de_vncmpf(struct vnode *vp, void *arg) { struct denode *de; uint64_t *a; a = arg; de = VTODE(vp); return (de->de_inode != *a); } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * depp - returns the address of the gotten denode. */ int deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct denode **depp) { int error; uint64_t inode; struct mount *mntp = pmp->pm_mountp; struct direntry *direntptr; struct denode *ldep; struct vnode *nvp, *xvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n", pmp, dirclust, diroffset, depp); #endif /* * On FAT32 filesystems, root is a (more or less) normal * directory */ if (FAT32(pmp) && dirclust == MSDOSFSROOT) dirclust = pmp->pm_rootdirblk; /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. For root dir (if not FAT32) use cluster * MSDOSFSROOT, offset MSDOSFSROOT_OFS * * NOTE: The check for de_refcnt > 0 below insures the denode being * examined does not represent an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset; error = vfs_hash_get(mntp, inode, LK_EXCLUSIVE, curthread, &nvp, de_vncmpf, &inode); if (error) return (error); if (nvp != NULL) { *depp = VTODE(nvp); KASSERT((*depp)->de_dirclust == dirclust, ("wrong dirclust")); KASSERT((*depp)->de_diroffset == diroffset, ("wrong diroffset")); return (0); } ldep = malloc(sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK | M_ZERO); /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. 
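 *
 * As a concrete instance of the hash key computed above: with a
 * hypothetical 4096-byte cluster size, the directory entry at offset 96
 * of cluster 10 yields
 *
 *	inode = (uint64_t)4096 * 10 + 96 = 41056
 *
 * i.e. the key is simply the byte address of the directory entry.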
*/ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode("msdosfs", mntp, &msdosfs_vnodeops, &nvp); if (error) { *depp = NULL; free(ldep, M_MSDOSFSNODE); return error; } nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL); fc_purge(ldep, 0); /* init the FAT cache for this denode */ error = insmntque(nvp, mntp); if (error != 0) { free(ldep, M_MSDOSFSNODE); *depp = NULL; return (error); } error = vfs_hash_insert(nvp, inode, LK_EXCLUSIVE, curthread, &xvp, de_vncmpf, &inode); if (error) { *depp = NULL; return (error); } if (xvp != NULL) { *depp = xvp->v_data; return (0); } ldep->de_pmp = pmp; ldep->de_refcnt = 1; /* * Copy the directory entry into the denode area of the vnode. */ if ((dirclust == MSDOSFSROOT || (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ nvp->v_vflag |= VV_ROOT; /* should be further down XXX */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that fattime2timespec() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { /* * The denode does not contain anything useful, so * it would be wrong to leave it on its hash chain. * Arrange for vput() to just forget about it. */ ldep->de_Name[0] = SLOT_DELETED; vput(nvp); *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. 
*/ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { #ifdef MSDOSFS_DEBUG printf("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster); #endif ldep->de_StartCluster = dirclust; } nvp->v_type = VDIR; if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { #ifdef MSDOSFS_DEBUG printf("deget(): pcbmap returned %d\n", error); #endif } } } else nvp->v_type = VREG; ldep->de_modrev = init_va_filerev(); *depp = ldep; return (0); } int deupdat(struct denode *dep, int waitfor) { struct direntry dir; struct timespec ts; struct buf *bp; struct direntry *dirp; int error; if (DETOV(dep)->v_mount->mnt_flag & MNT_RDONLY) { dep->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS | DE_MODIFIED); return (0); } vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); if ((dep->de_flag & DE_MODIFIED) == 0 && waitfor == 0) return (0); dep->de_flag &= ~DE_MODIFIED; if (DETOV(dep)->v_vflag & VV_ROOT) return (EINVAL); if (dep->de_refcnt <= 0) return (0); error = readde(dep, &bp, &dirp); if (error) return (error); DE_EXTERNALIZE(&dir, dep); if (bcmp(dirp, &dir, sizeof(dir)) == 0) { if (waitfor == 0 || (bp->b_flags & B_DELWRI) == 0) { brelse(bp); return (0); } } else *dirp = dir; if ((DETOV(dep)->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else bdwrite(bp); return (error); } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) { #ifdef MSDOSFS_DEBUG printf("detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); #endif return (EINVAL); } if (dep->de_FileSize < length) { vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. 
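 *
 * A worked example with a hypothetical 4096-byte cluster size: truncating
 * a 20480-byte (5-cluster) file to length 9000 keeps file-relative
 * clusters 0-2 (de_clcount(pmp, 9000) == 3), so pcbmap() is asked for
 * cluster index 2, whose FAT entry becomes the new CLUST_EOFE; the chain
 * hanging off it (clusters 3 and 4) is freed, and the tail of that last
 * kept cluster from offset 9000 & pm_crbomask == 808 onward is zeroed.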
 */
	if (length == 0) {
		chaintofree = dep->de_StartCluster;
		dep->de_StartCluster = 0;
		eofentry = ~0;
	} else {
		error = pcbmap(dep, de_clcount(pmp, length) - 1, 0,
		    &eofentry, 0);
		if (error) {
#ifdef MSDOSFS_DEBUG
			printf("detrunc(): pcbmap fails %d\n", error);
#endif
			return (error);
		}
	}

	fc_purge(dep, de_clcount(pmp, length));

	/*
	 * If the new length is not a multiple of the cluster size then we
	 * must zero the tail end of the new last cluster in case it
	 * becomes part of the file again because of a seek.
	 */
	if ((boff = length & pmp->pm_crbomask) != 0) {
		if (isadir) {
			bn = cntobn(pmp, eofentry);
			error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster,
			    NOCRED, &bp);
		} else {
			error = bread(DETOV(dep), de_cluster(pmp, length),
			    pmp->pm_bpcluster, cred, &bp);
		}
		if (error) {
#ifdef MSDOSFS_DEBUG
			printf("detrunc(): bread fails %d\n", error);
#endif
			return (error);
		}
		memset(bp->b_data + boff, 0, pmp->pm_bpcluster - boff);
		if ((flags & IO_SYNC) != 0)
			bwrite(bp);
		else
			bdwrite(bp);
	}

	/*
	 * Write out the updated directory entry.  Even if the update fails
	 * we free the trailing clusters.
	 */
	dep->de_FileSize = length;
	if (!isadir)
		dep->de_flag |= DE_UPDATE | DE_MODIFIED;
-	allerror = vtruncbuf(DETOV(dep), cred, length, pmp->pm_bpcluster);
+	allerror = vtruncbuf(DETOV(dep), length, pmp->pm_bpcluster);
#ifdef MSDOSFS_DEBUG
	if (allerror)
		printf("detrunc(): vtruncbuf error %d\n", allerror);
#endif
	error = deupdat(dep, !DOINGASYNC((DETOV(dep))));
	if (error != 0 && allerror == 0)
		allerror = error;
#ifdef MSDOSFS_DEBUG
	printf("detrunc(): allerror %d, eofentry %lu\n",
	    allerror, eofentry);
#endif

	/*
	 * If we need to break the cluster chain for the file then do it
	 * now.
	 */
	if (eofentry != ~0) {
		error = fatentry(FAT_GET_AND_SET, pmp, eofentry,
		    &chaintofree, CLUST_EOFE);
		if (error) {
#ifdef MSDOSFS_DEBUG
			printf("detrunc(): fatentry errors %d\n", error);
#endif
			return (error);
		}
		fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1),
		    eofentry);
	}

	/*
	 * Now free the clusters removed from the file because of the
	 * truncation.
	 */
	if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree))
		freeclusterchain(pmp, chaintofree);

	return (allerror);
}

/*
 * Extend the file described by dep to length specified by length.
 */
int
deextend(struct denode *dep, u_long length, struct ucred *cred)
{
	struct msdosfsmount *pmp = dep->de_pmp;
	u_long count;
	int error;

	/*
	 * The root of a DOS filesystem cannot be extended.
	 */
	if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp))
		return (EINVAL);

	/*
	 * Directories cannot be extended.
	 */
	if (dep->de_Attributes & ATTR_DIRECTORY)
		return (EISDIR);

	if (length <= dep->de_FileSize)
		panic("deextend: file too large");

	/*
	 * Compute the number of clusters to allocate.
	 */
	count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize);
	if (count > 0) {
		if (count > pmp->pm_freeclustercount)
			return (ENOSPC);
		error = extendfile(dep, count, NULL, NULL, DE_CLEAR);
		if (error) {
			/* truncate the added clusters away again */
			(void) detrunc(dep, dep->de_FileSize, 0, cred);
			return (error);
		}
	}
	dep->de_FileSize = length;
	dep->de_flag |= DE_UPDATE | DE_MODIFIED;
	return (deupdat(dep, !DOINGASYNC(DETOV(dep))));
}

/*
 * Move a denode to its correct hash queue after the file it represents has
 * been moved to a new directory.
 */
void
reinsert(struct denode *dep)
{
	struct vnode *vp;

	/*
	 * Fix up the denode cache.  If the denode is for a directory,
	 * there is nothing to do since the hash is based on the starting
	 * cluster of the directory file and that hasn't changed.
If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ #if 0 if (dep->de_Attributes & ATTR_DIRECTORY) return; #endif vp = DETOV(dep); dep->de_inode = (uint64_t)dep->de_pmp->pm_bpcluster * dep->de_dirclust + dep->de_diroffset; vfs_hash_rehash(vp, dep->de_inode); } int msdosfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * Remove the denode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the denode. */ #if 0 /* XXX */ dep->de_flag = 0; #endif free(dep, M_MSDOSFSNODE); vp->v_data = NULL; return (0); } int msdosfs_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif /* * Ignore denodes related to stale file handles. */ if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) goto out; /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %llx, MNT_RDONLY %llx\n", dep, dep->de_refcnt, (unsigned long long)vp->v_mount->mnt_flag, (unsigned long long)MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); out: /* * If we are done with the denode, reclaim it * so that it can be reused immediately. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vrefcnt(vp), dep->de_Name[0]); #endif if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) vrecycle(vp); return (error); } Index: stable/12/sys/fs/nfsclient/nfs.h =================================================================== --- stable/12/sys/fs/nfsclient/nfs.h (revision 349304) +++ stable/12/sys/fs/nfsclient/nfs.h (revision 349305) @@ -1,129 +1,128 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NFSCLIENT_NFS_H_ #define _NFSCLIENT_NFS_H_ #if defined(_KERNEL) #ifndef NFS_TPRINTF_INITIAL_DELAY #define NFS_TPRINTF_INITIAL_DELAY 12 #endif #ifndef NFS_TPRINTF_DELAY #define NFS_TPRINTF_DELAY 30 #endif /* * Nfs version macros. */ #define NFS_ISV3(v) \ (VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV3) #define NFS_ISV4(v) \ (VFSTONFS((v)->v_mount)->nm_flag & NFSMNT_NFSV4) #define NFS_ISV34(v) \ (VFSTONFS((v)->v_mount)->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) #ifdef NFS_DEBUG extern int nfs_debug; #define NFS_DEBUG_ASYNCIO 1 /* asynchronous i/o */ #define NFS_DEBUG_WG 2 /* server write gathering */ #define NFS_DEBUG_RC 4 /* server request caching */ #define NFS_DPF(cat, args) \ do { \ if (nfs_debug & NFS_DEBUG_##cat) printf args; \ } while (0) #else #define NFS_DPF(cat, args) #endif /* * NFS iod threads can be in one of these three states once spawned. * NFSIOD_NOT_AVAILABLE - Cannot be assigned an I/O operation at this time. * NFSIOD_AVAILABLE - Available to be assigned an I/O operation. * NFSIOD_CREATED_FOR_NFS_ASYNCIO - Newly created for nfs_asyncio() and * will be used by the thread that called nfs_asyncio(). */ enum nfsiod_state { NFSIOD_NOT_AVAILABLE = 0, NFSIOD_AVAILABLE = 1, NFSIOD_CREATED_FOR_NFS_ASYNCIO = 2, }; /* * Function prototypes. 
 */
-int ncl_meta_setsize(struct vnode *, struct ucred *, struct thread *,
-    u_quad_t);
+int ncl_meta_setsize(struct vnode *, struct thread *, u_quad_t);
void ncl_doio_directwrite(struct buf *);
int ncl_bioread(struct vnode *, struct uio *, int, struct ucred *);
int ncl_biowrite(struct vnode *, struct uio *, int, struct ucred *);
int ncl_vinvalbuf(struct vnode *, int, struct thread *, int);
int ncl_asyncio(struct nfsmount *, struct buf *, struct ucred *,
    struct thread *);
int ncl_doio(struct vnode *, struct buf *, struct ucred *,
    struct thread *, int);
void ncl_nhinit(void);
void ncl_nhuninit(void);
void ncl_nodelock(struct nfsnode *);
void ncl_nodeunlock(struct nfsnode *);
int ncl_getattrcache(struct vnode *, struct vattr *);
int ncl_readrpc(struct vnode *, struct uio *, struct ucred *);
int ncl_writerpc(struct vnode *, struct uio *, struct ucred *, int *,
    int *, int);
int ncl_readlinkrpc(struct vnode *, struct uio *, struct ucred *);
int ncl_readdirrpc(struct vnode *, struct uio *, struct ucred *,
    struct thread *);
int ncl_readdirplusrpc(struct vnode *, struct uio *, struct ucred *,
    struct thread *);
int ncl_writebp(struct buf *, int, struct thread *);
int ncl_commit(struct vnode *, u_quad_t, int, struct ucred *,
    struct thread *);
void ncl_clearcommit(struct mount *);
int ncl_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
    struct thread *);
int ncl_init(struct vfsconf *);
int ncl_uninit(struct vfsconf *);
void ncl_nfsiodnew(void);
void ncl_nfsiodnew_tq(__unused void *, int);

#endif	/* _KERNEL */

#endif /* _NFSCLIENT_NFS_H_ */
Index: stable/12/sys/fs/nfsclient/nfs_clbio.c
===================================================================
--- stable/12/sys/fs/nfsclient/nfs_clbio.c	(revision 349304)
+++ stable/12/sys/fs/nfsclient/nfs_clbio.c	(revision 349305)
@@ -1,1874 +1,1874 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int newnfs_directio_allow_mmap; extern struct nfsstatsv1 nfsstatsv1; extern struct mtx ncl_iod_mutex; extern int ncl_numasync; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern int newnfs_directio_enable; extern int nfs_keep_dirty_on_error; int ncl_pbuf_freecnt = -1; /* start out unlimited */ static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td); static int nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag); /* * Vnode op for VM getpages. */ SYSCTL_DECL(_vfs_nfs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Use buffer pager instead of direct readrpc call"); static daddr_t ncl_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (off / vp->v_bufobj.bo_bsize); } static int ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn) { struct nfsnode *np; u_quad_t nsize; int biosize, bcount; np = VTONFS(vp); mtx_lock(&np->n_mtx); nsize = np->n_size; mtx_unlock(&np->n_mtx); biosize = vp->v_bufobj.bo_bsize; bcount = biosize; if ((off_t)lbn * biosize >= nsize) bcount = 0; else if ((off_t)(lbn + 1) * biosize > nsize) bcount = nsize - (off_t)lbn * biosize; return (bcount); } int ncl_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; vm_object_t object; vm_page_t *pages; struct nfsnode *np; vp = ap->a_vp; np = VTONFS(vp); td = curthread; cred = curthread->td_ucred; nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; npages = ap->a_count; if ((object = vp->v_object) == NULL) { printf("ncl_getpages: called with non-merged cache vnode\n"); return (VM_PAGER_ERROR); } if (newnfs_directio_enable && !newnfs_directio_allow_mmap) { mtx_lock(&np->n_mtx); if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); printf("ncl_getpages: called on non-cacheable vnode\n"); return (VM_PAGER_ERROR); } else mtx_unlock(&np->n_mtx); } mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); /* We'll never get here for v4, because we always have fsinfo */ (void)ncl_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); if (use_buf_pager) return (vfs_bio_getpages(vp, pages, npages, ap->a_rbehind, ap->a_rahead, ncl_gbp_getblkno, ncl_gbp_getblksz)); /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. * * XXXGL: is that true for NFS, where short read can occur??? */ VM_OBJECT_WLOCK(object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(object); /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. 
*/ bp = getpbuf(&ncl_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = ncl_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); relpbuf(bp, &ncl_pbuf_freecnt); if (error && (uio.uio_resid == count)) { printf("ncl_getpages: error %d\n", error); return (VM_PAGER_ERROR); } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(object); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error * occurred we may have hit a zero-fill * section. We leave valid set to 0, and page * is freed by vm_page_readahead_finish() if * its index is not equal to requested, or * page is zeroed and set valid by * vm_pager_get_pages() for requested page. */ ; } } out: VM_OBJECT_WUNLOCK(object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); } /* * Vnode op for VM putpages. */ int ncl_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; np = VTONFS(vp); td = curthread; /* XXX */ /* Set the cred to n_writecred for the write rpcs. */ if (np->n_writecred != NULL) cred = crhold(np->n_writecred); else cred = crhold(curthread->td_ucred); /* XXX */ nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); mtx_lock(&np->n_mtx); if (newnfs_directio_enable && !newnfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); printf("ncl_putpages: called on noncache-able vnode\n"); mtx_lock(&np->n_mtx); } /* * When putting pages, do not extend file past EOF. 
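 *
 * For example, with a hypothetical np->n_size of 10000 and a putpages
 * request at offset 8192 with count 8192, count is clamped below to
 * 10000 - 8192 = 1808, so the write RPC stops at the current EOF.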
*/ if (offset + count > np->n_size) { count = np->n_size - offset; if (count < 0) count = 0; } mtx_unlock(&np->n_mtx); for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_ERROR; VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = unmapped_buf; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_NOCOPY; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync), cred); crfree(cred); if (error == 0 || !nfs_keep_dirty_on_error) { vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid, np->n_size - offset, npages * PAGE_SIZE); } return (rtvals[0]); } /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. * For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, * you may have lost data cache consistency with the * server, so flush all of the file's data out of the cache. * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to * NFS_ATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ static inline int nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) { int error = 0; struct vattr vattr; struct nfsnode *np = VTONFS(vp); bool old_lock; /* * Ensure the exclusove access to the node before checking * whether the cache is consistent. */ old_lock = ncl_excl_start(vp); mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); ncl_invaldir(vp); error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1); if (error != 0) goto out; } np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; mtx_unlock(&np->n_mtx); } else { mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; mtx_lock(&np->n_mtx); if ((np->n_flag & NSIZECHANGED) || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { mtx_unlock(&np->n_mtx); if (vp->v_type == VDIR) ncl_invaldir(vp); error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1); if (error != 0) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; np->n_flag &= ~NSIZECHANGED; } mtx_unlock(&np->n_mtx); } out: ncl_excl_finish(vp, old_lock); return (error); } /* * Vnode op for read using bio */ int ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) { struct nfsnode *np = VTONFS(vp); int biosize, i; struct buf *bp, *rabp; struct thread *td; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; int bcount; int seqcount; int nra, error = 0, n = 0, on = 0; off_t tmp_off; KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode")); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ return (EINVAL); td = uio->uio_td; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); mtx_lock(&nmp->nm_mtx); } if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0) (void) newnfs_iosize(nmp); tmp_off = 
uio->uio_offset + uio->uio_resid; if (vp->v_type != VDIR && (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) { mtx_unlock(&nmp->nm_mtx); return (EFBIG); } mtx_unlock(&nmp->nm_mtx); if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) /* No caching/ no readaheads. Just read data into the user buffer */ return ncl_readrpc(vp, uio, cred); biosize = vp->v_bufobj.bo_bsize; seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); error = nfs_bioread_check_cons(vp, td, cred); if (error) return error; do { u_quad_t nsize; mtx_lock(&np->n_mtx); nsize = np->n_size; mtx_unlock(&np->n_mtx); switch (vp->v_type) { case VREG: NFSINCRGLOBAL(nfsstatsv1.biocache_reads); lbn = uio->uio_offset / biosize; on = uio->uio_offset - (lbn * biosize); /* * Start the read ahead(s), as required. */ if (nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { rabn = lbn + 1 + nra; if (incore(&vp->v_bufobj, rabn) == NULL) { rabp = nfs_getcacheblk(vp, rabn, biosize, td); if (!rabp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (ncl_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); break; } } else { brelse(rabp); } } } } /* Note that bcount is *not* DEV_BSIZE aligned. */ bcount = biosize; if ((off_t)lbn * biosize >= nsize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > nsize) { bcount = nsize - (off_t)lbn * biosize; } bp = nfs_getcacheblk(vp, lbn, bcount, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); return (error); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); break; case VLNK: NFSINCRGLOBAL(nfsstatsv1.biocache_readlinks); bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); return (error); } } n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); on = 0; break; case VDIR: NFSINCRGLOBAL(nfsstatsv1.biocache_readdirs); if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { return (0); } lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { ncl_invaldir(vp); error = ncl_vinvalbuf(vp, 0, td, 1); /* * Yuck! The directory has been modified on the * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. 
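 * (For instance, if lbn were 3, blocks 0, 1 and 2 below are read and
 * released purely to regenerate their offset cookies, and only block 3,
 * the one actually wanted, is kept.)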
* * Leave the last bp intact unless there is an error. * Loop back up to the while if the error is another * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) return (0); bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); /* * no error + B_INVAL == directory EOF, * use the block. */ if (error == 0 && (bp->b_flags & B_INVAL)) break; } /* * An error will throw away the block and the * for loop will break out. If no error and this * is not the block we want, we throw away the * block and go for the next one via the for loop. */ if (error || i < lbn) brelse(bp); } } /* * The above while is repeated if we hit another cookie * error. If we hit an error and it wasn't a cookie error, * we give up. */ if (error) return (error); } /* * If not eof and read aheads are enabled, start one. * (You need the current block first, so that you have the * directory offset cookie of the next block.) */ if (nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && incore(&vp->v_bufobj, lbn + 1) == NULL) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (ncl_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); } } else { brelse(rabp); } } } /* * Unlike VREG files, whos buffer size ( bp->b_bcount ) is * chopped for the EOF condition, we cannot tell how large * NFS directories are going to be until we hit EOF. So * an NFS directory buffer is *not* chopped to its EOF. Now, * it just so happens that b_resid will effectively chop it * to EOF. *BUT* this information is lost if the buffer goes * away and is reconstituted into a B_CACHE state ( due to * being VMIO ) later. So we keep track of the directory eof * in np->n_direofoffset and chop it off as an extra step * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; break; default: printf(" ncl_bioread: type %x unexpected\n", vp->v_type); bp = NULL; break; } if (n > 0) { error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio); } if (vp->v_type == VLNK) n = 0; if (bp != NULL) brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); return (error); } /* * The NFS write path cannot handle iovecs with len > 1. So we need to * break up iovecs accordingly (restricting them to wsize). * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). * For the ASYNC case, 2 copies are needed. The first a copy from the * user buffer to a staging buffer and then a second copy from the staging * buffer to mbufs. This can be optimized by copying from the user buffer * directly into mbufs and passing the chain down, but that requires a * fair amount of re-working of the relevant codepaths (and can be done * later). 
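 *
 * A minimal sketch of the chunking step (simplified; the iovec advancing
 * and error handling below are omitted, wsize is the mount's nm_wsize):
 *
 *	while (uiop->uio_resid > 0) {
 *		size = MIN(uiop->uio_resid, wsize);
 *		size = MIN(uiop->uio_iov->iov_len, size);
 *		... hand exactly size bytes to one FILESYNC write RPC (SYNC
 *		    case) or to one staging pbuf queued for an nfsiod (ASYNC
 *		    case);
 *		uiop->uio_offset += size;
 *		uiop->uio_resid -= size;
 *	}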
*/ static int nfs_directio_write(vp, uiop, cred, ioflag) struct vnode *vp; struct uio *uiop; struct ucred *cred; int ioflag; { int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct thread *td = uiop->uio_td; int size; int wsize; mtx_lock(&nmp->nm_mtx); wsize = nmp->nm_wsize; mtx_unlock(&nmp->nm_mtx); if (ioflag & IO_SYNC) { int iomode, must_commit; struct uio uio; struct iovec iov; do_sync: while (uiop->uio_resid > 0) { size = MIN(uiop->uio_resid, wsize); size = MIN(uiop->uio_iov->iov_len, size); iov.iov_base = uiop->uio_iov->iov_base; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = uiop->uio_offset; uio.uio_resid = size; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; iomode = NFSWRITE_FILESYNC; error = ncl_writerpc(vp, &uio, cred, &iomode, &must_commit, 0); KASSERT((must_commit == 0), ("ncl_directio_write: Did not commit write")); if (error) return (error); uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } else { struct uio *t_uio; struct iovec *t_iov; struct buf *bp; /* * Break up the write into blocksize chunks and hand these * over to nfsiod's for write back. * Unfortunately, this incurs a copy of the data. Since * the user could modify the buffer before the write is * initiated. * * The obvious optimization here is that one of the 2 copies * in the async write path can be eliminated by copying the * data here directly into mbufs and passing the mbuf chain * down. But that will require a fair amount of re-working * of the code and can be done if there's enough interest * in NFS directio access. */ while (uiop->uio_resid > 0) { size = MIN(uiop->uio_resid, wsize); size = MIN(uiop->uio_iov->iov_len, size); bp = getpbuf(&ncl_pbuf_freecnt); t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); t_iov->iov_len = size; t_uio->uio_iov = t_iov; t_uio->uio_iovcnt = 1; t_uio->uio_offset = uiop->uio_offset; t_uio->uio_resid = size; t_uio->uio_segflg = UIO_SYSSPACE; t_uio->uio_rw = UIO_WRITE; t_uio->uio_td = td; KASSERT(uiop->uio_segflg == UIO_USERSPACE || uiop->uio_segflg == UIO_SYSSPACE, ("nfs_directio_write: Bad uio_segflg")); if (uiop->uio_segflg == UIO_USERSPACE) { error = copyin(uiop->uio_iov->iov_base, t_iov->iov_base, size); if (error != 0) goto err_free; } else /* * UIO_SYSSPACE may never happen, but handle * it just in case it does. 
*/ bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); bp->b_flags |= B_DIRECT; bp->b_iocmd = BIO_WRITE; if (cred != NOCRED) { crhold(cred); bp->b_wcred = cred; } else bp->b_wcred = NOCRED; bp->b_caller1 = (void *)t_uio; bp->b_vp = vp; error = ncl_asyncio(nmp, bp, NOCRED, td); err_free: if (error) { free(t_iov->iov_base, M_NFSDIRECTIO); free(t_iov, M_NFSDIRECTIO); free(t_uio, M_NFSDIRECTIO); bp->b_vp = NULL; relpbuf(bp, &ncl_pbuf_freecnt); if (error == EINTR) return (error); goto do_sync; } uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } return (0); } /* * Vnode op for write using bio */ int ncl_write(struct vop_write_args *ap) { int biosize; struct uio *uio = ap->a_uio; struct thread *td = uio->uio_td; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount, noncontig_write, obcount; int bp_cached, n, on, error = 0, error1, wouldcommit; size_t orig_resid, local_resid; off_t orig_size, tmp_off; KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("ncl_write proc")); if (vp->v_type != VREG) return (EIO); mtx_lock(&np->n_mtx); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; mtx_unlock(&np->n_mtx); return (np->n_error); } else mtx_unlock(&np->n_mtx); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); mtx_lock(&nmp->nm_mtx); } if (nmp->nm_wsize == 0) (void) newnfs_iosize(nmp); mtx_unlock(&nmp->nm_mtx); /* * Synchronously flush pending buffers if we are in synchronous * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ /* * Require non-blocking, synchronous writes to * dirty files to inform the program it needs * to fsync(2) explicitly. */ if (ioflag & IO_NDELAY) return (EAGAIN); #endif np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag & IO_VMIO) != 0 ? V_VMIO : 0), td, 1); if (error != 0) return (error); } else mtx_unlock(&np->n_mtx); } orig_resid = uio->uio_resid; mtx_lock(&np->n_mtx); orig_size = np->n_size; mtx_unlock(&np->n_mtx); /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
*/ if (ioflag & IO_APPEND) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) return (error); mtx_lock(&np->n_mtx); uio->uio_offset = np->n_size; mtx_unlock(&np->n_mtx); } if (uio->uio_offset < 0) return (EINVAL); tmp_off = uio->uio_offset + uio->uio_resid; if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset) return (EFBIG); if (uio->uio_resid == 0) return (0); if (newnfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) return nfs_directio_write(vp, uio, cred, ioflag); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters */ if (vn_rlimit_fsize(vp, uio, td)) return (EFBIG); biosize = vp->v_bufobj.bo_bsize; /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return error. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ wouldcommit = 0; if (!(ioflag & IO_SYNC)) { int nflag; mtx_lock(&np->n_mtx); nflag = np->n_flag; mtx_unlock(&np->n_mtx); if (nflag & NMODIFIED) { BO_LOCK(&vp->v_bufobj); if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { if (bp->b_flags & B_NEEDCOMMIT) wouldcommit += bp->b_bcount; } } BO_UNLOCK(&vp->v_bufobj); } } do { if (!(ioflag & IO_SYNC)) { wouldcommit += biosize; if (wouldcommit > nmp->nm_wcommitsize) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag & IO_VMIO) != 0 ? V_VMIO : 0), td, 1); if (error != 0) return (error); wouldcommit = biosize; } } NFSINCRGLOBAL(nfsstatsv1.biocache_writes); lbn = uio->uio_offset / biosize; on = uio->uio_offset - (lbn * biosize); n = MIN((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ mtx_lock(&np->n_mtx); if ((np->n_flag & NHASBEENLOCKED) == 0 && (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0) noncontig_write = 1; else noncontig_write = 0; if ((uio->uio_offset == np->n_size || (noncontig_write != 0 && lbn == (np->n_size / biosize) && uio->uio_offset + n > np->n_size)) && n) { mtx_unlock(&np->n_mtx); /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ obcount = np->n_size - (lbn * biosize); bp = nfs_getcacheblk(vp, lbn, obcount, td); if (bp != NULL) { long save; mtx_lock(&np->n_mtx); np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); mtx_unlock(&np->n_mtx); save = bp->b_flags & B_CACHE; bcount = on + n; allocbuf(bp, bcount); bp->b_flags |= save; if (noncontig_write != 0 && on > obcount) vfs_bio_bzero_buf(bp, obcount, on - obcount); } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. 
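 *
 * For instance, with illustrative numbers biosize = 8192,
 * np->n_size = 20000 and a 100 byte write at offset 16384 (lbn = 2,
 * on = 0, n = 100): bcount starts at 100, 2 * 8192 + 100 is still below
 * n_size but 3 * 8192 is past EOF, so bcount becomes 20000 - 16384 = 3616
 * and the buffer is sized to the existing EOF fragment rather than a
 * full block.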
*/ bcount = on + n; if ((off_t)lbn * biosize + bcount < np->n_size) { if ((off_t)(lbn + 1) * biosize < np->n_size) bcount = biosize; else bcount = np->n_size - (off_t)lbn * biosize; } mtx_unlock(&np->n_mtx); bp = nfs_getcacheblk(vp, lbn, bcount, td); mtx_lock(&np->n_mtx); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } mtx_unlock(&np->n_mtx); } if (!bp) { error = newnfs_sigintr(nmp, td); if (!error) error = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ bp_cached = 1; if (on == 0 && n == bcount) { if ((bp->b_flags & B_CACHE) == 0) bp_cached = 0; bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); mtx_lock(&np->n_mtx); np->n_flag |= NMODIFIED; mtx_unlock(&np->n_mtx); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { printf("NFS append race @%lx:%d\n", (long)bp->b_blkno * DEV_BSIZE, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * If there has been a file lock applied to this file * or vfs.nfs.old_noncontig_writing is set, do the following: * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * If vfs.nfs.old_noncontig_writing is not set and there has * not been file locking done on this file: * Relax coherency a bit for the sake of performance and * expand the current dirty region to contain the new * write even if it means we mark some non-dirty data as * dirty. */ if (noncontig_write == 0 && bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { if (bwrite(bp) == EINTR) { error = EINTR; break; } goto again; } local_resid = uio->uio_resid; error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio); if (error != 0 && !bp_cached) { /* * This block has no other content then what * possibly was written by the faulty uiomove. * Release it, forgetting the data pages, to * prevent the leak of uninitialized data to * usermode. 
*/ bp->b_ioflags |= BIO_ERROR; brelse(bp); uio->uio_offset -= local_resid - uio->uio_resid; uio->uio_resid = local_resid; break; } /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); /* * Get the partial update on the progress made from * uiomove, if an error occurred. */ if (error != 0) n = local_resid - uio->uio_resid; /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n > 0) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } /* * If IO_SYNC do bwrite(). * * IO_INVAL appears to be unused. The idea appears to be * to turn off caching in this case. Very odd. XXX */ if ((ioflag & IO_SYNC)) { if (ioflag & IO_INVAL) bp->b_flags |= B_NOCACHE; error1 = bwrite(bp); if (error1 != 0) { if (error == 0) error = error1; break; } } else if ((n + on) == biosize || (ioflag & IO_ASYNC) != 0) { bp->b_flags |= B_ASYNC; (void) ncl_writebp(bp, 0, NULL); } else { bdwrite(bp); } if (error != 0) break; } while (uio->uio_resid > 0 && n > 0); if (error != 0) { if (ioflag & IO_UNIT) { VATTR_NULL(&vattr); vattr.va_size = orig_size; /* IO_SYNC is handled implicitely */ (void)VOP_SETATTR(vp, &vattr, cred); uio->uio_offset -= orig_resid - uio->uio_resid; uio->uio_resid = orig_resid; } } return (error); } /* * Get an nfs cache block. * * Allocate a new one if the block isn't currently in the cache * and return the block marked busy. If the calling process is * interrupted by a signal for an interruptible mount point, return * NULL. * * The caller must carefully deal with the possible B_INVAL state of * the buffer. ncl_doio() clears B_INVAL (and ncl_asyncio() clears it * indirectly), so synchronous reads can be issued without worrying about * the B_INVAL state. We have to be a little more careful when dealing * with writes (see comments in nfs_write()) when extending a file past * its EOF. */ static struct buf * nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td) { struct buf *bp; struct mount *mp; struct nfsmount *nmp; mp = vp->v_mount; nmp = VFSTONFS(mp); if (nmp->nm_flag & NFSMNT_INT) { sigset_t oldset; newnfs_set_sigmask(td, &oldset); bp = getblk(vp, bn, size, PCATCH, 0, 0); newnfs_restore_sigmask(td, &oldset); while (bp == NULL) { if (newnfs_sigintr(nmp, td)) return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz, 0); } } else { bp = getblk(vp, bn, size, 0, 0, 0); } if (vp->v_type == VREG) bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE); return (bp); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) { struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, slpflag, slptimeo; bool old_lock; ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf"); if ((nmp->nm_flag & NFSMNT_INT) == 0) intrflg = 0; if (NFSCL_FORCEDISM(nmp->nm_mountp)) intrflg = 1; if (intrflg) { slpflag = PCATCH; slptimeo = 2 * hz; } else { slpflag = 0; slptimeo = 0; } old_lock = ncl_excl_start(vp); if (old_lock) flags |= V_ALLOWCLEAN; /* * Now, flush as required. 
*/ if ((flags & (V_SAVE | V_VMIO)) == V_SAVE && vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); /* * If the page clean was interrupted, fail the invalidation. * Not doing so, we run the risk of losing dirty pages in the * vinvalbuf() call below. */ if (intrflg && (error = newnfs_sigintr(nmp, td))) goto out; } error = vinvalbuf(vp, flags, slpflag, 0); while (error) { if (intrflg && (error = newnfs_sigintr(nmp, td))) goto out; error = vinvalbuf(vp, flags, 0, slptimeo); } if (NFSHASPNFS(nmp)) { nfscl_layoutcommit(vp, td); /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ mtx_lock(&np->n_mtx); np->n_attrstamp = 0; } else mtx_lock(&np->n_mtx); if (np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; mtx_unlock(&np->n_mtx); out: ncl_excl_finish(vp, old_lock); return error; } /* * Initiate asynchronous I/O. Return an error if no nfsiods are available. * This is mainly to avoid queueing async I/O requests when the nfsiods * are all hung on a dead server. * * Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp * is eventually dequeued by the async daemon, ncl_doio() *will*. */ int ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td) { int iod; int gotiod; int slpflag = 0; int slptimeo = 0; int error, error2; /* * Commits are usually short and sweet so lets save some cpu and * leave the async daemons for more important rpc's (such as reads * and writes). * * Readdirplus RPCs do vget()s to acquire the vnodes for entries * in the directory in order to update attributes. This can deadlock * with another thread that is waiting for async I/O to be done by * an nfsiod thread while holding a lock on one of these vnodes. * To avoid this deadlock, don't allow the async nfsiod threads to * perform Readdirplus RPCs. */ mtx_lock(&ncl_iod_mutex); if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && (nmp->nm_bufqiods > ncl_numasync / 2)) || (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) { mtx_unlock(&ncl_iod_mutex); return(EIO); } again: if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; /* * Find a free iod to process this request. */ for (iod = 0; iod < ncl_numasync; iod++) if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) { gotiod = TRUE; break; } /* * Try to create one if none are free. */ if (!gotiod) ncl_nfsiodnew(); else { /* * Found one, so wake it up and tell it which * mount to process. */ NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n", iod, nmp)); ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE; ncl_iodmount[iod] = nmp; nmp->nm_bufqiods++; wakeup(&ncl_iodwant[iod]); } /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. */ if (!gotiod) { if (nmp->nm_bufqiods > 0) { NFS_DPF(ASYNCIO, ("ncl_asyncio: %d iods are already processing mount %p\n", nmp->nm_bufqiods, nmp)); gotiod = TRUE; } } /* * If we have an iod which can process the request, then queue * the buffer. */ if (gotiod) { /* * Ensure that the queue never grows too large. We still want * to asynchronize so we block rather then return EIO. 
*/ while (nmp->nm_bufqlen >= 2*ncl_numasync) { NFS_DPF(ASYNCIO, ("ncl_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = newnfs_msleep(td, &nmp->nm_bufq, &ncl_iod_mutex, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { error2 = newnfs_sigintr(nmp, td); if (error2) { mtx_unlock(&ncl_iod_mutex); return (error2); } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* * We might have lost our iod while sleeping, * so check and loop if necessary. */ goto again; } /* We might have lost our nfsiod */ if (nmp->nm_bufqiods == 0) { NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); goto again; } if (bp->b_iocmd == BIO_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); } else { if (bp->b_wcred == NOCRED && cred != NOCRED) bp->b_wcred = crhold(cred); } if (bp->b_flags & B_REMFREE) bremfreef(bp); BUF_KERNPROC(bp); TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { mtx_lock(&(VTONFS(bp->b_vp))->n_mtx); VTONFS(bp->b_vp)->n_flag |= NMODIFIED; VTONFS(bp->b_vp)->n_directio_asyncwr++; mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx); } mtx_unlock(&ncl_iod_mutex); return (0); } mtx_unlock(&ncl_iod_mutex); /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n")); return (EIO); } void ncl_doio_directwrite(struct buf *bp) { int iomode, must_commit; struct uio *uiop = (struct uio *)bp->b_caller1; char *iov_base = uiop->uio_iov->iov_base; iomode = NFSWRITE_FILESYNC; uiop->uio_td = NULL; /* NULL since we're in nfsiod */ ncl_writerpc(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit, 0); KASSERT((must_commit == 0), ("ncl_doio_directwrite: Did not commit write")); free(iov_base, M_NFSDIRECTIO); free(uiop->uio_iov, M_NFSDIRECTIO); free(uiop, M_NFSDIRECTIO); if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { struct nfsnode *np = VTONFS(bp->b_vp); mtx_lock(&np->n_mtx); if (NFSHASPNFS(VFSTONFS(vnode_mount(bp->b_vp)))) { /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ np->n_attrstamp = 0; } np->n_directio_asyncwr--; if (np->n_directio_asyncwr == 0) { np->n_flag &= ~NMODIFIED; if ((np->n_flag & NFSYNCWAIT)) { np->n_flag &= ~NFSYNCWAIT; wakeup((caddr_t)&np->n_directio_asyncwr); } } mtx_unlock(&np->n_mtx); } bp->b_vp = NULL; relpbuf(bp, &ncl_pbuf_freecnt); } /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. */ int ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td, int called_from_strategy) { struct uio *uiop; struct nfsnode *np; struct nfsmount *nmp; int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; struct proc *p = td ? td->td_proc : NULL; uint8_t iocmd; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. 
*/ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp)); iocmd = bp->b_iocmd; if (iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; NFSINCRGLOBAL(nfsstatsv1.read_bios); error = ncl_readrpc(vp, uiop, cr); if (!error) { if (uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; ssize_t left = uiop->uio_resid; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } } /* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */ if (p && vp->v_writecount <= -1) { mtx_lock(&np->n_mtx); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) { mtx_unlock(&np->n_mtx); PROC_LOCK(p); killproc(p, "text file modification"); PROC_UNLOCK(p); } else mtx_unlock(&np->n_mtx); } break; case VLNK: uiop->uio_offset = (off_t)0; NFSINCRGLOBAL(nfsstatsv1.readlink_bios); error = ncl_readlinkrpc(vp, uiop, cr); break; case VDIR: NFSINCRGLOBAL(nfsstatsv1.readdir_bios); uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { error = ncl_readdirplusrpc(vp, uiop, cr, td); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = ncl_readdirrpc(vp, uiop, cr, td); /* * end-of-directory sets B_INVAL but does not generate an * error. */ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; default: printf("ncl_doio: type %x unexpected\n", vp->v_type); break; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { int retv; off_t off; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, td); if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; bufdone(bp); return (0); } if (retv == NFSERR_STALEWRITEVERF) { ncl_clearcommit(vp->v_mount); } } /* * Setup for actual write */ mtx_lock(&np->n_mtx); if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; mtx_unlock(&np->n_mtx); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; NFSINCRGLOBAL(nfsstatsv1.write_bios); if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSWRITE_UNSTABLE; else iomode = NFSWRITE_FILESYNC; error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit, called_from_strategy); /* * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole * cluster. We can do this even if the buffer is not 100% * dirty (relative to the NFS blocksize), so we optimize the * append-to-file-case. 
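 *
 * For example, a run of full-block unstable writes leaves each buffer
 * with B_NEEDCOMMIT and B_CLUSTEROK set, so a later commit pass can
 * cover the whole contiguous range with one commit rpc instead of one
 * per buffer.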
* * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be * cleared because write clustering only works for commit * rpc's, not for the data portion of the write). */ if (!error && iomode == NFSWRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. * * EIO is returned by ncl_writerpc() to indicate a recoverable * write error and is handled as above, except that * B_EINTR isn't set. One cause of this is a stale stateid * error for the RPC that indicates recovery is required, * when called with called_from_strategy != 0. * * If the buffer is marked B_PAGING, it does not reside on * the vp's paging queues so we cannot call bdirty(). The * bp in this case is not an NFS cache block so we should * be safe. XXX * * The logic below breaks up errors into recoverable and * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE * and keep the buffer around for potential write retries. * For the latter (eg ESTALE), we toss the buffer away (B_INVAL) * and save the error in the nfsnode. This is less than ideal * but necessary. Keeping such buffers around could potentially * cause buffer exhaustion eventually (they can never be written * out, so will get constantly be re-dirtied). It also causes * all sorts of vfs panics. For non-recoverable write errors, * also invalidate the attrcache, so we'll be forced to go over * the wire for this object, returning an error to user on next * call (most of the time). */ if (error == EINTR || error == EIO || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = np->n_error = error; mtx_lock(&np->n_mtx); np->n_flag |= NWRITEERR; np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); mtx_unlock(&np->n_mtx); } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; if (must_commit) ncl_clearcommit(vp->v_mount); bufdone(bp); return (error); } /* * Used to aid in handling ftruncate() operations on the NFS client side. * Truncation creates a number of special problems for NFS. We have to * throw away VM pages and buffer cache buffers that are beyond EOF, and * we have to properly handle VM pages or (potentially dirty) buffers * that straddle the truncation point. 
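 *
 * For example, with a hypothetical biosize of 8192 and a truncation to
 * nsize = 10000: vtruncbuf() gets rid of the buffers wholly past the new
 * end (lbn 2 and up), while the straddling buffer is handled below via
 * lbn = 10000 / 8192 = 1 and bufsize = 10000 - 8192 = 1808, so it is
 * re-fetched at the shorter size and released with B_RELBUF.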
*/ int -ncl_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) +ncl_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize) { struct nfsnode *np = VTONFS(vp); u_quad_t tsize; int biosize = vp->v_bufobj.bo_bsize; int error = 0; mtx_lock(&np->n_mtx); tsize = np->n_size; np->n_size = nsize; mtx_unlock(&np->n_mtx); if (nsize < tsize) { struct buf *bp; daddr_t lbn; int bufsize; /* * vtruncbuf() doesn't get the buffer overlapping the * truncation point. We may have a B_DELWRI and/or B_CACHE * buffer that now needs to be truncated. */ - error = vtruncbuf(vp, cred, nsize, biosize); + error = vtruncbuf(vp, nsize, biosize); lbn = nsize / biosize; bufsize = nsize - (lbn * biosize); bp = nfs_getcacheblk(vp, lbn, bufsize, td); if (!bp) return EINTR; if (bp->b_dirtyoff > bp->b_bcount) bp->b_dirtyoff = bp->b_bcount; if (bp->b_dirtyend > bp->b_bcount) bp->b_dirtyend = bp->b_bcount; bp->b_flags |= B_RELBUF; /* don't leave garbage around */ brelse(bp); } else { vnode_pager_setsize(vp, nsize); } return(error); } Index: stable/12/sys/fs/nfsclient/nfs_clvnops.c =================================================================== --- stable/12/sys/fs/nfsclient/nfs_clvnops.c (revision 349304) +++ stable/12/sys/fs/nfsclient/nfs_clvnops.c (revision 349305) @@ -1,3530 +1,3529 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from nfs_vnops.c 8.16 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); /* * vnode op calls for Sun NFS version 2, 3 and 4 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_nfsclient_accesscache_flush_probe_func_t dtrace_nfscl_accesscache_flush_done_probe; uint32_t nfscl_accesscache_flush_done_id; dtrace_nfsclient_accesscache_get_probe_func_t dtrace_nfscl_accesscache_get_hit_probe, dtrace_nfscl_accesscache_get_miss_probe; uint32_t nfscl_accesscache_get_hit_id; uint32_t nfscl_accesscache_get_miss_id; dtrace_nfsclient_accesscache_load_probe_func_t dtrace_nfscl_accesscache_load_done_probe; uint32_t nfscl_accesscache_load_done_id; #endif /* !KDTRACE_HOOKS */ /* Defs */ #define TRUE 1 #define FALSE 0 extern struct nfsstatsv1 nfsstatsv1; extern int nfsrv_useacl; extern int nfscl_debuglevel; MALLOC_DECLARE(M_NEWNFSREQ); static vop_read_t nfsfifo_read; static vop_write_t nfsfifo_write; static vop_close_t nfsfifo_close; static int nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *, struct thread *); static vop_lookup_t nfs_lookup; static vop_create_t nfs_create; static vop_mknod_t nfs_mknod; static vop_open_t nfs_open; static vop_pathconf_t nfs_pathconf; static vop_close_t nfs_close; static vop_access_t nfs_access; static vop_getattr_t nfs_getattr; static vop_setattr_t nfs_setattr; static vop_read_t nfs_read; static vop_fsync_t nfs_fsync; static vop_remove_t nfs_remove; static vop_link_t nfs_link; static vop_rename_t nfs_rename; static vop_mkdir_t nfs_mkdir; static vop_rmdir_t nfs_rmdir; static vop_symlink_t nfs_symlink; static vop_readdir_t nfs_readdir; static vop_strategy_t nfs_strategy; static int nfs_lookitup(struct vnode *, char *, int, struct ucred *, struct thread *, struct nfsnode **); static int nfs_sillyrename(struct vnode *, struct vnode *, struct componentname *); static vop_access_t nfsspec_access; static vop_readlink_t nfs_readlink; static vop_print_t nfs_print; static vop_advlock_t nfs_advlock; static vop_advlockasync_t nfs_advlockasync; static vop_getacl_t nfs_getacl; static vop_setacl_t nfs_setacl; /* * Global vfs data structures for nfs */ struct vop_vector newnfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = nfs_access, .vop_advlock = nfs_advlock, .vop_advlockasync = nfs_advlockasync, .vop_close = nfs_close, .vop_create = nfs_create, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_getpages = ncl_getpages, .vop_putpages = ncl_putpages, .vop_inactive = ncl_inactive, .vop_link = nfs_link, .vop_lookup = nfs_lookup, .vop_mkdir = nfs_mkdir, .vop_mknod = nfs_mknod, .vop_open = nfs_open, .vop_pathconf = nfs_pathconf, .vop_print = nfs_print, .vop_read = nfs_read, .vop_readdir = nfs_readdir, .vop_readlink = nfs_readlink, .vop_reclaim = ncl_reclaim, .vop_remove = nfs_remove, .vop_rename = nfs_rename, .vop_rmdir = nfs_rmdir, .vop_setattr = nfs_setattr, .vop_strategy = nfs_strategy, .vop_symlink = nfs_symlink, .vop_write = ncl_write, .vop_getacl = nfs_getacl, .vop_setacl = nfs_setacl, }; struct vop_vector newnfs_fifoops = { .vop_default = &fifo_specops, .vop_access = nfsspec_access, .vop_close = nfsfifo_close, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_inactive = ncl_inactive, .vop_pathconf = nfs_pathconf, .vop_print = nfs_print, .vop_read = 
nfsfifo_read, .vop_reclaim = ncl_reclaim, .vop_setattr = nfs_setattr, .vop_write = nfsfifo_write, }; static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap); static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, int namelen, struct ucred *cred, struct thread *td); static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td); static int nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, struct sillyrename *sp); /* * Global variables */ SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfs_prime_access_cache = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW, &nfs_prime_access_cache, 0, "Prime NFS ACCESS cache when fetching attributes"); static int newnfs_commit_on_close = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW, &newnfs_commit_on_close, 0, "write+commit on close, else only write"); static int nfs_clean_pages_on_close = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW, &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close"); int newnfs_directio_enable = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW, &newnfs_directio_enable, 0, "Enable NFS directio"); int nfs_keep_dirty_on_error; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_keep_dirty_on_error, CTLFLAG_RW, &nfs_keep_dirty_on_error, 0, "Retry pageout if error returned"); /* * This sysctl allows other processes to mmap a file that has been opened * O_DIRECT by a process. In general, having processes mmap the file while * Direct IO is in progress can lead to Data Inconsistencies. But, we allow * this by default to prevent DoS attacks - to prevent a malicious user from * opening up files O_DIRECT preventing other users from mmap'ing these * files. "Protected" environments where stricter consistency guarantees are * required can disable this knob. The process that opened the file O_DIRECT * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not * meaningful. */ int newnfs_directio_allow_mmap = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW, &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens"); #define NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY \ | NFSACCESS_EXTEND | NFSACCESS_EXECUTE \ | NFSACCESS_DELETE | NFSACCESS_LOOKUP) /* * SMP Locking Note : * The list of locks after the description of the lock is the ordering * of other locks acquired with the lock held. * np->n_mtx : Protects the fields in the nfsnode. VM Object Lock VI_MTX (acquired indirectly) * nmp->nm_mtx : Protects the fields in the nfsmount. rep->r_mtx * ncl_iod_mutex : Global lock, protects shared nfsiod state. * nfs_reqq_mtx : Global lock, protects the nfs_reqq list. nmp->nm_mtx rep->r_mtx * rep->r_mtx : Protects the fields in an nfsreq. 
*/ static int nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td, struct ucred *cred, u_int32_t *retmode) { int error = 0, attrflag, i, lrupos; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); struct nfsvattr nfsva; error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag, &rmode, NULL); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { lrupos = 0; mtx_lock(&np->n_mtx); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (np->n_accesscache[i].uid == cred->cr_uid) { np->n_accesscache[i].mode = rmode; np->n_accesscache[i].stamp = time_second; break; } if (i > 0 && np->n_accesscache[i].stamp < np->n_accesscache[lrupos].stamp) lrupos = i; } if (i == NFS_ACCESSCACHESIZE) { np->n_accesscache[lrupos].uid = cred->cr_uid; np->n_accesscache[lrupos].mode = rmode; np->n_accesscache[lrupos].stamp = time_second; } mtx_unlock(&np->n_mtx); if (retmode != NULL) *retmode = rmode; KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } #ifdef KDTRACE_HOOKS if (error != 0) KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0, error); #endif return (error); } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int error = 0, i, gotahit; u_int32_t mode, wmode, rmode; int v34 = NFS_ISV34(vp); struct nfsnode *np = VTONFS(vp); /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS | VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL | VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3 or v4, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v34) { if (ap->a_accmode & VREAD) mode = NFSACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_accmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (ap->a_accmode & VAPPEND) mode |= NFSACCESS_EXTEND; if (ap->a_accmode & VEXEC) mode |= NFSACCESS_EXECUTE; if (ap->a_accmode & VDELETE) mode |= NFSACCESS_DELETE; } else { if (ap->a_accmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (ap->a_accmode & VAPPEND) mode |= NFSACCESS_EXTEND; if (ap->a_accmode & VEXEC) mode |= NFSACCESS_LOOKUP; if (ap->a_accmode & VDELETE) mode |= NFSACCESS_DELETE; if (ap->a_accmode & VDELETE_CHILD) mode |= NFSACCESS_MODIFY; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSACCESS_READ | NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_EXECUTE | NFSACCESS_DELETE | NFSACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? 
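The ACCESS cache consulted here is a small per-node array keyed by uid, refreshed on each successful ACCESS reply and replaced LRU-style when full; a cached answer is only definite while it is younger than the configured timeout and covers every requested bit. A simplified model in plain C, with the size, timeout handling and time source as placeholders:

#include <sys/types.h>
#include <stdint.h>
#include <time.h>

#define ACCESSCACHESIZE 8               /* stand-in for NFS_ACCESSCACHESIZE */

struct acc_entry {
        uid_t uid;
        uint32_t mode;                  /* NFSACCESS_* bits granted */
        time_t stamp;                   /* time of last refresh */
};

/* Record a fresh ACCESS result, reusing the uid's slot or evicting the LRU. */
static void
acc_store(struct acc_entry *c, uid_t uid, uint32_t mode)
{
        int i, lru = 0;

        for (i = 0; i < ACCESSCACHESIZE; i++) {
                if (c[i].uid == uid)
                        break;
                if (i > 0 && c[i].stamp < c[lru].stamp)
                        lru = i;
        }
        if (i == ACCESSCACHESIZE)
                i = lru;                /* evict the oldest slot */
        c[i].uid = uid;
        c[i].mode = mode;
        c[i].stamp = time(NULL);
}

/* A cached answer is definite only if it is young enough and covers 'mode'. */
static int
acc_hit(const struct acc_entry *c, uid_t uid, uint32_t mode, int timeout)
{
        for (int i = 0; i < ACCESSCACHESIZE; i++)
                if (c[i].uid == uid)
                        return (time(NULL) < c[i].stamp + timeout &&
                            (c[i].mode & mode) == mode);
        return (0);
}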
*/ gotahit = 0; mtx_lock(&np->n_mtx); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) { if (time_second < (np->n_accesscache[i].stamp + nfsaccess_cache_timeout) && (np->n_accesscache[i].mode & mode) == mode) { NFSINCRGLOBAL(nfsstatsv1.accesscache_hits); gotahit = 1; } break; } } mtx_unlock(&np->n_mtx); #ifdef KDTRACE_HOOKS if (gotahit != 0) KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp, ap->a_cred->cr_uid, mode); else KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp, ap->a_cred->cr_uid, mode); #endif if (gotahit == 0) { /* * Either a no, or a don't know. Go to the wire. */ NFSINCRGLOBAL(nfsstatsv1.accesscache_misses); error = nfs34_access_otw(vp, wmode, ap->a_td, ap->a_cred, &rmode); if (!error && (rmode & mode) != mode) error = EACCES; } return (error); } else { if ((error = nfsspec_access(ap)) != 0) { return (error); } /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. */ mtx_lock(&np->n_mtx); if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; mtx_unlock(&np->n_mtx); aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = ap->a_td; if (vp->v_type == VREG) error = ncl_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = ncl_readdirrpc(vp, &auio, ap->a_cred, ap->a_td); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = ncl_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } else mtx_unlock(&np->n_mtx); return (error); } } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; int error; int fmode = ap->a_mode; struct ucred *cred; vm_object_t obj; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) return (EOPNOTSUPP); /* * For NFSv4, we need to do the Open Op before cache validation, * so that we conform to RFC3530 Sec. 9.3.1. */ if (NFS_ISV4(vp)) { error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td); if (error) { error = nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); return (error); } } /* * Now, if this Open will be doing reading, re-validate/flush the * cache, so that Close/Open coherency is maintained. 
*/ mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } mtx_lock(&np->n_mtx); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); if (vp->v_type == VDIR) np->n_direofoffset = 0; mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; } else { mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } mtx_lock(&np->n_mtx); if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) || NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { if (vp->v_type == VDIR) np->n_direofoffset = 0; mtx_unlock(&np->n_mtx); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; } } /* * If the object has >= 1 O_DIRECT active opens, we disable caching. */ if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { if (np->n_directio_opens == 0) { mtx_unlock(&np->n_mtx); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } mtx_lock(&np->n_mtx); np->n_flag |= NNONCACHE; } np->n_directio_opens++; } /* If opened for writing via NFSv4.1 or later, mark that for pNFS. */ if (NFSHASPNFS(VFSTONFS(vp->v_mount)) && (fmode & FWRITE) != 0) np->n_flag |= NWRITEOPENED; /* * If this is an open for writing, capture a reference to the * credentials, so they can be used by ncl_putpages(). Using * these write credentials is preferable to the credentials of * whatever thread happens to be doing the VOP_PUTPAGES() since * the write RPCs are less likely to fail with EACCES. */ if ((fmode & FWRITE) != 0) { cred = np->n_writecred; np->n_writecred = crhold(ap->a_cred); } else cred = NULL; mtx_unlock(&np->n_mtx); if (cred != NULL) crfree(cred); vnode_create_vobject(vp, vattr.va_size, ap->a_td); /* * If the text file has been mmap'd, flush any dirty pages to the * buffer cache and then... * Make sure all writes are pushed to the NFS server. If this is not * done, the modify time of the file can change while the text * file is being executed. This will cause the process that is * executing the text file to be terminated. */ if (vp->v_writecount <= -1) { if ((obj = vp->v_object) != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(obj); } /* Now, flush the buffer cache. */ ncl_flush(vp, MNT_WAIT, curthread, 0, 0); /* And, finally, make sure that n_mtime is up to date. */ np = VTONFS(vp); mtx_lock(&np->n_mtx); np->n_mtime = np->n_vattr.na_mtime; mtx_unlock(&np->n_mtx); } return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. 
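The open-time revalidation above boils down to one comparison: a locally modified node always flushes and then records fresh attributes, while an unmodified node fetches attributes and invalidates the cache only when the server's change attribute or mtime no longer matches what was cached at the last open/close. A condensed decision sketch, with the struct fields standing in for the nfsnode and vattr members used above:

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct attr_snapshot {
        struct timespec mtime;
        uint64_t change;        /* NFSv4 change attribute (va_filerev) */
};

/*
 * Decide whether the buffer cache must be invalidated when a file is
 * opened, following the close-to-open rules sketched by nfs_open().
 */
static bool
must_invalidate(bool locally_modified, bool is_v4,
    const struct attr_snapshot *cached, const struct attr_snapshot *fresh)
{
        if (locally_modified)
                return (true);
        if (is_v4 && cached->change != fresh->change)
                return (true);
        return (cached->mtime.tv_sec != fresh->mtime.tv_sec ||
            cached->mtime.tv_nsec != fresh->mtime.tv_nsec);
}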
By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. * (NFS Version 3 provides weak cache consistency data in the reply that * should be sufficient to detect and handle this case.) * * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * or commit them (this satisfies 1 and 2 except for the * case where the server crashes after this close but * before the commit RPC, which is felt to be "good * enough". Changing the last argument to ncl_flush() to * a 1 would force a commit operation, if it is felt a * commit is necessary now. * for NFS Version 4 - flush the dirty buffers and commit them, if * nfscl_mustflush() says this is necessary. * It is necessary if there is no write delegation held, * in order to satisfy open/close coherency. * If the file isn't cached on local stable storage, * it may be necessary in order to detect "out of space" * errors from the server, if the write delegation * issued by the server doesn't allow the file to grow. */ /* ARGSUSED */ static int nfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsvattr nfsva; struct ucred *cred; int error = 0, ret, localcred = 0; int fmode = ap->a_fflag; if (NFSCL_FORCEDISM(vp->v_mount)) return (0); /* * During shutdown, a_cred isn't valid, so just use root. */ if (ap->a_cred == NOCRED) { cred = newnfs_getcred(); localcred = 1; } else { cred = ap->a_cred; } if (vp->v_type == VREG) { /* * Examine and clean dirty pages, regardless of NMODIFIED. * This closes a major hole in close-to-open consistency. * We want to push out all dirty pages (and buffers) on * close, regardless of whether they were dirtied by * mmap'ed writes or via write(). */ if (nfs_clean_pages_on_close && vp->v_object) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); if (NFS_ISV3(vp)) { /* * Under NFSv3 we have dirty buffers to dispose of. We * must flush them to the NFS server. We have the option * of waiting all the way through the commit rpc or just * waiting for the initial write. The default is to only * wait through the initial write so the data is in the * server's cache, which is roughly similar to the state * a standard disk subsystem leaves the file in on close(). * * We cannot clear the NMODIFIED bit in np->n_flag due to * potential races with other processes, and certainly * cannot clear it if we don't commit. * These races occur when there is no longer the old * traditional vnode locking implemented for Vnode Ops. */ int cm = newnfs_commit_on_close ? 1 : 0; error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0); /* np->n_flag &= ~NMODIFIED; */ } else if (NFS_ISV4(vp)) { if (nfscl_mustflush(vp) != 0) { int cm = newnfs_commit_on_close ? 1 : 0; error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0); /* * as above w.r.t races when clearing * NMODIFIED. 
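The per-version close policy described in the comment above can be read as a three-way choice. A compact illustration; the enum and helper names are invented for the sketch, commit_on_close mirrors the vfs.nfs.commit_on_close sysctl, and must_flush corresponds to nfscl_mustflush() being true (no write delegation held):

enum nfs_ver { NFS_V2, NFS_V3, NFS_V4 };
enum close_action {
        CLOSE_INVALIDATE,       /* v2: flush and invalidate dirty buffers */
        CLOSE_FLUSH,            /* write dirty buffers, leave them cached */
        CLOSE_FLUSH_COMMIT,     /* write and commit */
        CLOSE_NOTHING           /* v4 with a write delegation held */
};

/* Pick what to do with dirty buffers at close time. */
static enum close_action
close_policy(enum nfs_ver ver, int commit_on_close, int must_flush)
{
        switch (ver) {
        case NFS_V2:
                return (CLOSE_INVALIDATE);
        case NFS_V3:
                return (commit_on_close ? CLOSE_FLUSH_COMMIT : CLOSE_FLUSH);
        case NFS_V4:
                if (!must_flush)
                        return (CLOSE_NOTHING);
                return (commit_on_close ? CLOSE_FLUSH_COMMIT : CLOSE_FLUSH);
        }
        return (CLOSE_FLUSH);
}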
* np->n_flag &= ~NMODIFIED; */ } } else { error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); } mtx_lock(&np->n_mtx); } /* * Invalidate the attribute cache in all cases. * An open is going to fetch fresh attrs any way, other procs * on this node that have file open will be forced to do an * otw attr fetch, but this is safe. * --> A user found that their RPC count dropped by 20% when * this was commented out and I can't see any requirement * for it, so I've disabled it when negative lookups are * enabled. (What does this have to do with negative lookup * caching? Well nothing, except it was reported by the * same user that needed negative lookup caching and I wanted * there to be a way to disable it to see if it * is the cause of some caching/coherency issue that might * crop up.) */ if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } mtx_unlock(&np->n_mtx); } if (NFS_ISV4(vp)) { /* * Get attributes so "change" is up to date. */ if (error == 0 && nfscl_mustflush(vp) != 0 && vp->v_type == VREG && (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) { ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva, NULL); if (!ret) { np->n_change = nfsva.na_filerev; (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 0); } } /* * and do the close. */ ret = nfsrpc_close(vp, 0, ap->a_td); if (!error && ret) error = ret; if (error) error = nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); } if (newnfs_directio_enable) KASSERT((np->n_directio_asyncwr == 0), ("nfs_close: dirty unflushed (%d) directio buffers\n", np->n_directio_asyncwr)); if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { mtx_lock(&np->n_mtx); KASSERT((np->n_directio_opens > 0), ("nfs_close: unexpectedly value (0) of n_directio_opens\n")); np->n_directio_opens--; if (np->n_directio_opens == 0) np->n_flag &= ~NNONCACHE; mtx_unlock(&np->n_mtx); } if (localcred) NFSFREECRED(cred); return (error); } /* * nfs getattr call from vfs. */ static int nfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; /* XXX */ struct nfsnode *np = VTONFS(vp); int error = 0; struct nfsvattr nfsva; struct vattr *vap = ap->a_vap; struct vattr vattr; /* * Update local times for special files. */ mtx_lock(&np->n_mtx); if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; mtx_unlock(&np->n_mtx); /* * First look in the cache. */ if (ncl_getattrcache(vp, &vattr) == 0) { vap->va_type = vattr.va_type; vap->va_mode = vattr.va_mode; vap->va_nlink = vattr.va_nlink; vap->va_uid = vattr.va_uid; vap->va_gid = vattr.va_gid; vap->va_fsid = vattr.va_fsid; vap->va_fileid = vattr.va_fileid; vap->va_size = vattr.va_size; vap->va_blocksize = vattr.va_blocksize; vap->va_atime = vattr.va_atime; vap->va_mtime = vattr.va_mtime; vap->va_ctime = vattr.va_ctime; vap->va_gen = vattr.va_gen; vap->va_flags = vattr.va_flags; vap->va_rdev = vattr.va_rdev; vap->va_bytes = vattr.va_bytes; vap->va_filerev = vattr.va_filerev; /* * Get the local modify time for the case of a write * delegation. 
*/ nfscl_deleggetmodtime(vp, &vap->va_mtime); return (0); } if (NFS_ISV34(vp) && nfs_prime_access_cache && nfsaccess_cache_timeout > 0) { NFSINCRGLOBAL(nfsstatsv1.accesscache_misses); nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL); if (ncl_getattrcache(vp, ap->a_vap) == 0) { nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime); return (0); } } error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva, NULL); if (!error) error = nfscl_loadattrcache(&vp, &nfsva, vap, NULL, 0, 0); if (!error) { /* * Get the local modify time for the case of a write * delegation. */ nfscl_deleggetmodtime(vp, &vap->va_mtime); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * nfs setattr call. */ static int nfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct thread *td = curthread; /* XXX */ struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags and marking of atimes are not supported. */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * We run vnode_pager_setsize() early (why?), * we must set np->n_size now to avoid vinvalbuf * V_SAVE races that might setsize a lower * value. */ mtx_lock(&np->n_mtx); tsize = np->n_size; mtx_unlock(&np->n_mtx); - error = ncl_meta_setsize(vp, ap->a_cred, td, - vap->va_size); + error = ncl_meta_setsize(vp, td, vap->va_size); mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { tsize = np->n_size; mtx_unlock(&np->n_mtx); error = ncl_vinvalbuf(vp, vap->va_size == 0 ? 0 : V_SAVE, td, 1); if (error != 0) { vnode_pager_setsize(vp, tsize); return (error); } /* * Call nfscl_delegmodtime() to set the modify time * locally, as required. */ nfscl_delegmodtime(vp); } else mtx_unlock(&np->n_mtx); /* * np->n_size has already been set to vap->va_size * in ncl_meta_setsize(). We must set it again since * nfs_loadattrcache() could be called through * ncl_meta_setsize() and could modify np->n_size. */ mtx_lock(&np->n_mtx); np->n_vattr.na_size = np->n_size = vap->va_size; mtx_unlock(&np->n_mtx); } } else { mtx_lock(&np->n_mtx); if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG) { mtx_unlock(&np->n_mtx); error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == EINTR || error == EIO) return (error); } else mtx_unlock(&np->n_mtx); } error = nfs_setattrrpc(vp, vap, ap->a_cred, td); if (error && vap->va_size != VNOVAL) { mtx_lock(&np->n_mtx); np->n_size = np->n_vattr.na_size = tsize; vnode_pager_setsize(vp, tsize); mtx_unlock(&np->n_mtx); } return (error); } /* * Do an nfs setattr rpc. 
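Before the SETATTR RPC below goes out, nfs_setattr() above guards the size change with a simple save/restore: the old size is remembered before ncl_meta_setsize(), and if the RPC later fails both the node and the VM pager are rolled back to it. A schematic of that pattern; the struct, meta_setsize, pager_setsize and setattr_rpc are placeholders, not the real kernel interfaces:

#include <stdint.h>

struct node_size {
        uint64_t n_size;        /* client's idea of the file size */
};

/* Placeholder hooks; a real client does buffer and pager work here. */
static void meta_setsize(struct node_size *np, uint64_t ns) { np->n_size = ns; }
static void pager_setsize(struct node_size *np, uint64_t ns) { (void)np; (void)ns; }
static int setattr_rpc(uint64_t ns) { (void)ns; return (0); }

/* Apply a truncate/extend locally, then roll back if the server rejects it. */
static int
set_size_with_rollback(struct node_size *np, uint64_t newsize)
{
        uint64_t tsize = np->n_size;    /* remember for rollback */
        int error;

        meta_setsize(np, newsize);
        error = setattr_rpc(newsize);
        if (error != 0) {
                np->n_size = tsize;     /* undo the optimistic resize */
                pager_setsize(np, tsize);
        }
        return (error);
}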
*/ static int nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { struct nfsnode *np = VTONFS(vp); int error, ret, attrflag, i; struct nfsvattr nfsva; if (NFS_ISV34(vp)) { mtx_lock(&np->n_mtx); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) np->n_accesscache[i].stamp = 0; np->n_flag |= NDELEGMOD; mtx_unlock(&np->n_mtx); KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp); } error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag, NULL); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid); return (error); } /* * nfs lookup call, one step at a time... * First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct mount *mp = dvp->v_mount; int flags = cnp->cn_flags; struct vnode *newvp; struct nfsmount *nmp; struct nfsnode *np, *newnp; int error = 0, attrflag, dattrflag, ltype, ncticks; struct thread *td = cnp->cn_thread; struct nfsfh *nfhp; struct nfsvattr dnfsva, nfsva; struct vattr vattr; struct timespec nctime; *vpp = NULLVP; if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); nmp = VFSTONFS(mp); np = VTONFS(dvp); /* For NFSv4, wait until any remove is done. */ mtx_lock(&np->n_mtx); while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) { np->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0); } mtx_unlock(&np->n_mtx); if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) return (error); error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks); if (error > 0 && error != ENOENT) return (error); if (error == -1) { /* * Lookups of "." are special and always return the * current directory. cache_lookup() already handles * associated locking bookkeeping, etc. */ if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { /* XXX: Is this really correct? */ if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } /* * We only accept a positive hit in the cache if the * change time of the file matches our cached copy. * Otherwise, we discard the cache entry and fallback * to doing a lookup RPC. We also only trust cache * entries for less than nm_nametimeo seconds. * * To better handle stale file handles and attributes, * clear the attribute cache of this node if it is a * leaf component, part of an open() call, and not * locally modified before fetching the attributes. * This should allow stale file handles to be detected * here where we can fall back to a LOOKUP RPC to * recover rather than having nfs_open() detect the * stale file handle and failing open(2) with ESTALE. 
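A cached positive lookup is only believed under the conditions listed above: either a delegation is held for the node, or the entry is younger than nm_nametimeo and a fresh GETATTR shows the file's change time still equal to the one stored with the cache entry. The check, reduced to a predicate with illustrative names:

#include <stdbool.h>
#include <time.h>

/*
 * Trust a positive name-cache entry if a delegation covers the node,
 * or if the entry is young enough and the file's ctime has not moved
 * since the entry was made; otherwise the entry is purged and a
 * LOOKUP RPC is issued.
 */
static bool
trust_positive_entry(bool have_delegation, unsigned age_ticks, unsigned hz,
    unsigned nametimeo, const struct timespec *cur_ctime,
    const struct timespec *entry_ctime)
{
        if (have_delegation)
                return (true);
        if (age_ticks >= nametimeo * hz)
                return (false);
        return (cur_ctime->tv_sec == entry_ctime->tv_sec &&
            cur_ctime->tv_nsec == entry_ctime->tv_nsec);
}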
*/ newvp = *vpp; newnp = VTONFS(newvp); if (!(nmp->nm_flag & NFSMNT_NOCTO) && (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && !(newnp->n_flag & NMODIFIED)) { mtx_lock(&newnp->n_mtx); newnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); mtx_unlock(&newnp->n_mtx); } if (nfscl_nodeleg(newvp, 0) == 0 || ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) && VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 && timespeccmp(&vattr.va_ctime, &nctime, ==))) { NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); if (dvp != newvp) vput(newvp); else vrele(newvp); *vpp = NULLVP; } else if (error == ENOENT) { if (dvp->v_iflag & VI_DOOMED) return (ENOENT); /* * We only accept a negative hit in the cache if the * modification time of the parent directory matches * the cached copy in the name cache entry. * Otherwise, we discard all of the negative cache * entries for this directory. We also only trust * negative cache entries for up to nm_negnametimeo * seconds. */ if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) && VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 && timespeccmp(&vattr.va_mtime, &nctime, ==)) { NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); return (ENOENT); } cache_purge_negative(dvp); } error = 0; newvp = NULLVP; NFSINCRGLOBAL(nfsstatsv1.lookupcache_misses); error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); if (error) { if (newvp != NULLVP) { vput(newvp); *vpp = NULLVP; } if (error != ENOENT) { if (NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* The requested file was not found. */ if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN)) { /* * XXX: UFS does a full VOP_ACCESS(dvp, * VWRITE) here instead of just checking * MNT_RDONLY. */ if (mp->mnt_flag & MNT_RDONLY) return (EROFS); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } if ((cnp->cn_flags & MAKEENTRY) != 0 && dattrflag) { /* * Cache the modification time of the parent * directory from the post-op attributes in * the name cache entry. The negative cache * entry will be ignored once the directory * has changed. Don't bother adding the entry * if the directory has already changed. */ mtx_lock(&np->n_mtx); if (timespeccmp(&np->n_vattr.na_mtime, &dnfsva.na_mtime, ==)) { mtx_unlock(&np->n_mtx); cache_enter_time(dvp, NULL, cnp, &dnfsva.na_mtime, NULL); } else mtx_unlock(&np->n_mtx); } return (ENOENT); } /* * Handle RENAME case... 
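The negative-caching rule applied above is symmetric with the positive one: an ENOENT entry is believed only while it is younger than nm_negnametimeo and the parent directory's modification time still matches the one recorded with the entry; otherwise all negative entries for that directory are discarded and the lookup goes over the wire. A small predicate capturing the check, with illustrative field names:

#include <stdbool.h>
#include <time.h>

/* 'age_ticks' is the age of the cache entry in scheduler ticks. */
static bool
trust_negative_entry(unsigned age_ticks, unsigned hz, unsigned negnametimeo,
    const struct timespec *parent_mtime, const struct timespec *entry_mtime)
{
        if (age_ticks >= negnametimeo * hz)
                return (false);         /* too old to trust */
        return (parent_mtime->tv_sec == entry_mtime->tv_sec &&
            parent_mtime->tv_nsec == entry_mtime->tv_nsec);
}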
*/ if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) { if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); return (EISDIR); } error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); *vpp = newvp; cnp->cn_flags |= SAVENAME; return (0); } if (flags & ISDOTDOT) { ltype = NFSVOPISLOCKED(dvp); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); NFSVOPUNLOCK(dvp, 0); error = vfs_busy(mp, 0); NFSVOPLOCK(dvp, ltype | LK_RETRY); vfs_rel(mp); if (error == 0 && (dvp->v_iflag & VI_DOOMED)) { vfs_unbusy(mp); error = ENOENT; } if (error != 0) return (error); } NFSVOPUNLOCK(dvp, 0); error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL, cnp->cn_lkflags); if (error == 0) newvp = NFSTOV(np); vfs_unbusy(mp); if (newvp != dvp) NFSVOPLOCK(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) { if (newvp == dvp) vrele(newvp); else vput(newvp); } error = ENOENT; } if (error != 0) return (error); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); VREF(dvp); newvp = dvp; if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } else { error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL, cnp->cn_lkflags); if (error) return (error); newvp = NFSTOV(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && !(np->n_flag & NMODIFIED)) { /* * Flush the attribute cache when opening a * leaf node to ensure that fresh attributes * are fetched in nfs_open() since we did not * fetch attributes from the LOOKUP reply. */ mtx_lock(&np->n_mtx); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); mtx_unlock(&np->n_mtx); } } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) && attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0)) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, newvp->v_type != VDIR ? NULL : &dnfsva.na_ctime); *vpp = newvp; return (0); } /* * nfs read call. * Just call ncl_bioread() to do the work. */ static int nfs_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VREG: return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); case VDIR: return (EISDIR); default: return (EOPNOTSUPP); } } /* * nfs readlink call */ static int nfs_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by ncl_doio() from below the buffer cache. 
*/ int ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int error, ret, attrflag; struct nfsvattr nfsva; error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva, &attrflag, NULL); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs read rpc call * Ditto above */ int ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int error, ret, attrflag; struct nfsvattr nfsva; struct nfsmount *nmp; nmp = VFSTONFS(vnode_mount(vp)); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp)) error = nfscl_doiods(vp, uiop, NULL, NULL, NFSV4OPEN_ACCESSREAD, 0, cred, uiop->uio_td); NFSCL_DEBUG(4, "readrpc: aft doiods=%d\n", error); if (error != 0) error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva, &attrflag, NULL); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs write call */ int ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *iomode, int *must_commit, int called_from_strategy) { struct nfsvattr nfsva; int error, attrflag, ret; struct nfsmount *nmp; nmp = VFSTONFS(vnode_mount(vp)); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp)) error = nfscl_doiods(vp, uiop, iomode, must_commit, NFSV4OPEN_ACCESSWRITE, 0, cred, uiop->uio_td); NFSCL_DEBUG(4, "writerpc: aft doiods=%d\n", error); if (error != 0) error = nfsrpc_write(vp, uiop, iomode, must_commit, cred, uiop->uio_td, &nfsva, &attrflag, NULL, called_from_strategy); if (attrflag) { if (VTONFS(vp)->n_flag & ND_NFSV4) ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 1, 1); else ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (ret && !error) error = ret; } if (DOINGASYNC(vp)) *iomode = NFSWRITE_FILESYNC; if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. 
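For the NFSv2 kludge just described, the file type and device number have to ride in fields the CREATE call already carries: the IFMT bits of the mode encode the type and the size field carries rdev, with 0xffffffff used for fifos and sockets. A tiny encoder showing the idea; struct v2_create_args is invented for the sketch and is not the real wire format:

#include <sys/stat.h>
#include <stdint.h>

struct v2_create_args {
        uint32_t mode;          /* permission bits plus IFMT type bits */
        uint32_t size;          /* overloaded to carry rdev for devices */
};

/* Encode a mknod request into NFSv2 CREATE-style arguments, as above. */
static int
encode_v2_mknod(mode_t type, mode_t perm, uint32_t rdev,
    struct v2_create_args *out)
{
        switch (type) {
        case S_IFCHR:
        case S_IFBLK:
                out->size = rdev;
                break;
        case S_IFIFO:
        case S_IFSOCK:
                out->size = 0xffffffff; /* "no device" marker */
                break;
        default:
                return (-1);            /* only device-like types supported */
        }
        out->mode = (uint32_t)(type | perm);
        return (0);
}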
*/ static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { struct nfsvattr nfsva, dnfsva; struct vnode *newvp = NULL; struct nfsnode *np = NULL, *dnp; struct nfsfh *nfhp; struct vattr vattr; int error = 0, attrflag, dattrflag; u_int32_t rdev; if (vap->va_type == VCHR || vap->va_type == VBLK) rdev = vap->va_rdev; else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = 0xffffffff; else return (EOPNOTSUPP); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return (error); error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, rdev, vap->va_type, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (!error) { if (!nfhp) (void) nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (nfhp) error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread, &np, NULL, LK_EXCLUSIVE); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); if (!error) { newvp = NFSTOV(np); if (attrflag != 0) { error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); if (error != 0) vput(newvp); } } if (!error) { *vpp = newvp; } else if (NFS_ISV4(dvp)) { error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid, vap->va_gid); } dnp = VTONFS(dvp); mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } mtx_unlock(&dnp->n_mtx); return (error); } /* * nfs mknod vop * just call nfs_mknodrpc() to do the work. */ /* ARGSUSED */ static int nfs_mknod(struct vop_mknod_args *ap) { return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap)); } static struct mtx nfs_cverf_mtx; MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex", MTX_DEF); static nfsquad_t nfs_get_cverf(void) { static nfsquad_t cverf; nfsquad_t ret; static int cverf_initialized = 0; mtx_lock(&nfs_cverf_mtx); if (cverf_initialized == 0) { cverf.lval[0] = arc4random(); cverf.lval[1] = arc4random(); cverf_initialized = 1; } else cverf.qval++; ret = cverf; mtx_unlock(&nfs_cverf_mtx); return (ret); } /* * nfs file create call */ static int nfs_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; struct nfsmount *nmp; struct nfsvattr dnfsva, nfsva; struct nfsfh *nfhp; nfsquad_t cverf; int error = 0, attrflag, dattrflag, fmode = 0; struct vattr vattr; /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return (error); if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; dnp = VTONFS(dvp); nmp = VFSTONFS(vnode_mount(dvp)); again: /* For NFSv4, wait until any remove is done. 
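nfs_get_cverf() above produces the verifier that NFSv3/v4 exclusive CREATE relies on: a 64-bit value seeded randomly once and simply incremented afterwards, so every exclusive create issued by this client is distinct. A userland equivalent using arc4random() as the kernel version does, with a plain pthread mutex standing in for nfs_cverf_mtx:

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

static pthread_mutex_t cverf_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Return a unique 64-bit create verifier: random seed, then a counter. */
static uint64_t
get_cverf(void)
{
        static uint64_t cverf;
        static int initialized;
        uint64_t ret;

        pthread_mutex_lock(&cverf_mtx);
        if (!initialized) {
                cverf = ((uint64_t)arc4random() << 32) | arc4random();
                initialized = 1;
        } else
                cverf++;
        ret = cverf;
        pthread_mutex_unlock(&cverf_mtx);
        return (ret);
}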
*/ mtx_lock(&dnp->n_mtx); while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) { dnp->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0); } mtx_unlock(&dnp->n_mtx); cverf = nfs_get_cverf(); error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, cverf, fmode, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (!error) { if (nfhp == NULL) (void) nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (nfhp != NULL) error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread, &np, NULL, LK_EXCLUSIVE); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); if (!error) { newvp = NFSTOV(np); if (attrflag == 0) error = nfsrpc_getattr(newvp, cnp->cn_cred, cnp->cn_thread, &nfsva, NULL); if (error == 0) error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } if (error) { if (newvp != NULL) { vput(newvp); newvp = NULL; } if (NFS_ISV34(dvp) && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } } else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) { if (nfscl_checksattr(vap, &nfsva)) { error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred, cnp->cn_thread, &nfsva, &attrflag, NULL); if (error && (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL)) { /* try again without setting uid/gid */ vap->va_uid = (uid_t)VNOVAL; vap->va_gid = (uid_t)VNOVAL; error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred, cnp->cn_thread, &nfsva, &attrflag, NULL); } if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); if (error != 0) vput(newvp); } } if (!error) { if ((cnp->cn_flags & MAKEENTRY) && attrflag) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL); *ap->a_vpp = newvp; } else if (NFS_ISV4(dvp)) { error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid, vap->va_gid); } mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } mtx_unlock(&dnp->n_mtx); return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs_sillyrename() to set it up * else * do the remove rpc */ static int nfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name")); KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount")); if (vp->v_type == VDIR) error = EPERM; else if (vrefcnt(vp) == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. 
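The removal policy spelled out above reduces to a short decision: directories are refused, a file that nobody else holds open (or whose earlier sillyrename already left an extra link) is removed over the wire, and anything still in use is sillyrenamed so its data survives until the last close. As a predicate, with usecount, has_sillyrename and nlink standing in for the vnode and vattr fields:

enum remove_action { REMOVE_EPERM, REMOVE_RPC, REMOVE_SILLYRENAME };

/* Decide how an unlink should be carried out, mirroring nfs_remove(). */
static enum remove_action
remove_policy(int is_dir, int usecount, int has_sillyrename, int nlink)
{
        if (is_dir)
                return (REMOVE_EPERM);
        if (usecount == 1 || (has_sillyrename && nlink > 1))
                return (REMOVE_RPC);    /* safe to remove it now */
        return (REMOVE_SILLYRENAME);    /* keep data alive until last close */
}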
*/ error = ncl_vinvalbuf(vp, 0, cnp->cn_thread, 1); if (error != EINTR && error != EIO) /* Do the rpc */ error = nfs_removerpc(dvp, vp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); mtx_lock(&np->n_mtx); np->n_attrstamp = 0; mtx_unlock(&np->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); return (error); } /* * nfs file remove rpc called from nfs_inactive */ int ncl_removeit(struct sillyrename *sp, struct vnode *vp) { /* * Make sure that the directory vnode is still valid. * XXX we should lock sp->s_dvp here. */ if (sp->s_dvp->v_type == VBAD) return (0); return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen, sp->s_cred, NULL)); } /* * Nfs remove rpc, called from nfs_remove() and ncl_removeit(). */ static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, int namelen, struct ucred *cred, struct thread *td) { struct nfsvattr dnfsva; struct nfsnode *dnp = VTONFS(dvp); int error = 0, dattrflag; mtx_lock(&dnp->n_mtx); dnp->n_flag |= NREMOVEINPROG; mtx_unlock(&dnp->n_mtx); error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva, &dattrflag, NULL); mtx_lock(&dnp->n_mtx); if ((dnp->n_flag & NREMOVEWANT)) { dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG); mtx_unlock(&dnp->n_mtx); wakeup((caddr_t)dnp); } else { dnp->n_flag &= ~NREMOVEINPROG; mtx_unlock(&dnp->n_mtx); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } mtx_unlock(&dnp->n_mtx); if (error && NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs file rename call */ static int nfs_rename(struct vop_rename_args *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct nfsnode *fnp = VTONFS(ap->a_fvp); struct nfsnode *tdnp = VTONFS(ap->a_tdvp); struct nfsv4node *newv4 = NULL; int error; KASSERT((tcnp->cn_flags & HASBUF) != 0 && (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name")); /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (fvp == tvp) { printf("nfs_rename: fvp == tvp (can't happen)\n"); error = 0; goto out; } if ((error = NFSVOPLOCK(fvp, LK_EXCLUSIVE)) != 0) goto out; /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. * * Skip the rename operation if the fsync fails, this can happen * due to the server's volume being full, when we pushed out data * that was written back to our cache earlier. Not checking for * this condition can result in potential (silent) data loss. 
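The flush-before-rename rule explained above is easy to state as code: both the source and, if present, the destination vnode are fsynced, and the rename is abandoned on the first failure so a full server volume cannot silently discard the delayed writes. A sketch of that ordering; vnode_fsync() and rename_rpc() are placeholders for VOP_FSYNC() and the RENAME RPC:

#include <stddef.h>

/* Placeholder hooks; a real client does the VOP_FSYNC and RPC work here. */
static int vnode_fsync(void *vp) { (void)vp; return (0); }
static int rename_rpc(void *fvp, void *tvp) { (void)fvp; (void)tvp; return (0); }

/* Push delayed writes before renaming; skip the rename if a flush fails. */
static int
rename_with_flush(void *fvp, void *tvp)
{
        int error;

        error = vnode_fsync(fvp);               /* source must be clean */
        if (error == 0 && tvp != NULL)
                error = vnode_fsync(tvp);       /* and the target, if any */
        if (error != 0)
                return (error);                 /* avoid silent data loss */
        return (rename_rpc(fvp, tvp));
}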
*/ error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread); NFSVOPUNLOCK(fvp, 0); if (!error && tvp) error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread); if (error) goto out; /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_thread); if (error == 0 && NFS_ISV4(tdvp)) { /* * For NFSv4, check to see if it is the same name and * replace the name, if it is different. */ newv4 = malloc( sizeof (struct nfsv4node) + tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK); mtx_lock(&tdnp->n_mtx); mtx_lock(&fnp->n_mtx); if (fnp->n_v4 != NULL && fvp->v_type == VREG && (fnp->n_v4->n4_namelen != tcnp->cn_namelen || NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen) || tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen || NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data, tdnp->n_fhp->nfh_len))) { #ifdef notdef { char nnn[100]; int nnnl; nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99; bcopy(tcnp->cn_nameptr, nnn, nnnl); nnn[nnnl] = '\0'; printf("ren replace=%s\n",nnn); } #endif free(fnp->n_v4, M_NFSV4NODE); fnp->n_v4 = newv4; newv4 = NULL; fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len; fnp->n_v4->n4_namelen = tcnp->cn_namelen; NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data, tdnp->n_fhp->nfh_len); NFSBCOPY(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen); } mtx_unlock(&tdnp->n_mtx); mtx_unlock(&fnp->n_mtx); if (newv4 != NULL) free(newv4, M_NFSV4NODE); } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs_remove() above */ static int nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, struct sillyrename *sp) { return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread)); } /* * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). 
*/ static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td) { struct nfsvattr fnfsva, tnfsva; struct nfsnode *fdnp = VTONFS(fdvp); struct nfsnode *tdnp = VTONFS(tdvp); int error = 0, fattrflag, tattrflag; error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp, tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag, &tattrflag, NULL, NULL); mtx_lock(&fdnp->n_mtx); fdnp->n_flag |= NMODIFIED; if (fattrflag != 0) { mtx_unlock(&fdnp->n_mtx); (void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1); } else { fdnp->n_attrstamp = 0; mtx_unlock(&fdnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp); } mtx_lock(&tdnp->n_mtx); tdnp->n_flag |= NMODIFIED; if (tattrflag != 0) { mtx_unlock(&tdnp->n_mtx); (void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; mtx_unlock(&tdnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (error && NFS_ISV4(fdvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs hard link create call */ static int nfs_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np, *tdnp; struct nfsvattr nfsva, dnfsva; int error = 0, attrflag, dattrflag; /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread); error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag, &dattrflag, NULL); tdnp = VTONFS(tdvp); mtx_lock(&tdnp->n_mtx); tdnp->n_flag |= NMODIFIED; if (dattrflag != 0) { mtx_unlock(&tdnp->n_mtx); (void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; mtx_unlock(&tdnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); else { np = VTONFS(vp); mtx_lock(&np->n_mtx); np->n_attrstamp = 0; mtx_unlock(&np->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... 
*/ if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) { cache_enter_time(tdvp, vp, cnp, &nfsva.na_ctime, NULL); } if (error && NFS_ISV4(vp)) error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsvattr nfsva, dnfsva; struct nfsfh *nfhp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; int error = 0, attrflag, dattrflag, ret; vap->va_type = VLNK; error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen, ap->a_target, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (nfhp) { ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread, &np, NULL, LK_EXCLUSIVE); if (!ret) newvp = NFSTOV(np); else if (!error) error = ret; } if (newvp != NULL) { if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } else if (!error) { /* * If we do not have an error and we could not extract the * newvp from the response due to the request being NFSv2, we * have to do a lookup in order to obtain a newvp to return. */ error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np); if (!error) newvp = NFSTOV(np); } if (error) { if (newvp) vput(newvp); if (NFS_ISV4(dvp)) error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid, vap->va_gid); } else { *ap->a_vpp = newvp; } dnp = VTONFS(dvp); mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { mtx_unlock(&dnp->n_mtx); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; mtx_unlock(&dnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... 
*/ if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) { cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL); } return (error); } /* * nfs make dir call */ static int nfs_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; struct vattr vattr; struct nfsfh *nfhp; struct nfsvattr nfsva, dnfsva; int error = 0, attrflag, dattrflag, ret; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0) return (error); vap->va_type = VDIR; error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); dnp = VTONFS(dvp); mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { mtx_unlock(&dnp->n_mtx); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; mtx_unlock(&dnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } if (nfhp) { ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread, &np, NULL, LK_EXCLUSIVE); if (!ret) { newvp = NFSTOV(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } else if (!error) error = ret; } if (!error && newvp == NULL) { error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vput(newvp); if (NFS_ISV4(dvp)) error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid, vap->va_gid); } else { /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... */ if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && dattrflag != 0) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, &dnfsva.na_ctime); *ap->a_vpp = newvp; } return (error); } /* * nfs remove directory call */ static int nfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *dnp; struct nfsvattr dnfsva; int error, dattrflag; if (dvp == vp) return (EINVAL); error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL); dnp = VTONFS(dvp); mtx_lock(&dnp->n_mtx); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { mtx_unlock(&dnp->n_mtx); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; mtx_unlock(&dnp->n_mtx); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } cache_purge(dvp); cache_purge(vp); if (error && NFS_ISV4(dvp)) error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0, (gid_t)0); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
*/ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct uio *uio = ap->a_uio; ssize_t tresid, left; int error = 0; struct vattr vattr; if (ap->a_eofflag != NULL) *ap->a_eofflag = 0; if (vp->v_type != VDIR) return(EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) { mtx_lock(&np->n_mtx); if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) || !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { mtx_unlock(&np->n_mtx); NFSINCRGLOBAL(nfsstatsv1.direofcache_hits); if (ap->a_eofflag != NULL) *ap->a_eofflag = 1; return (0); } else mtx_unlock(&np->n_mtx); } } /* * NFS always guarantees that directory entries don't straddle * DIRBLKSIZ boundaries. As such, we need to limit the size * to an exact multiple of DIRBLKSIZ, to avoid copying a partial * directory entry. */ left = uio->uio_resid % DIRBLKSIZ; if (left == uio->uio_resid) return (EINVAL); uio->uio_resid -= left; /* * Call ncl_bioread() to do the real work. */ tresid = uio->uio_resid; error = ncl_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) { NFSINCRGLOBAL(nfsstatsv1.direofcache_misses); if (ap->a_eofflag != NULL) *ap->a_eofflag = 1; } /* Add the partial DIRBLKSIZ (left) back in. */ uio->uio_resid += left; return (error); } /* * Readdir rpc call. * Called from below the buffer cache by ncl_doio(). */ int ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; nfsuint64 *cookiep, cookie; struct nfsnode *dnp = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, eof, attrflag; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirrpc bad uio")); /* * If there is no cookie, assume directory was stale. */ ncl_dircookie_lock(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; ncl_dircookie_unlock(dnp); } else { ncl_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) (void)ncl_fsinfo(nmp, vp, cred, td); error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva, &attrflag, &eof, NULL); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { /* * We are now either at the end of the directory or have filled * the block. */ if (eof) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirrpc resid > 0\n"); ncl_dircookie_lock(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; ncl_dircookie_unlock(dnp); } } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc(). */ int ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; nfsuint64 *cookiep, cookie; struct nfsnode *dnp = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, attrflag, eof; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirplusrpc bad uio")); /* * If there is no cookie, assume directory was stale. 
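The nfs_readdir() path above trims the caller's residual count down to an exact multiple of DIRBLKSIZ so that no directory entry is split across a block boundary, and adds the remainder back once the read completes. The following is a minimal userland sketch of that arithmetic only, not kernel code; the DIRBLKSIZ value and the helper name are placeholders chosen for the example.

/*
 * Illustrative sketch (not kernel code): trim a requested transfer
 * size to an exact multiple of a directory block size, as the
 * nfs_readdir() code above does with uio_resid and DIRBLKSIZ.
 * The real code also returns EINVAL when the request is smaller
 * than one block (left == resid).
 */
#include <stdio.h>
#include <stddef.h>

#define DIRBLKSIZ 512   /* assumed value, for the example only */

static size_t
trim_to_dirblk(size_t resid, size_t *leftp)
{
	size_t left = resid % DIRBLKSIZ;

	*leftp = left;          /* remainder added back after the read */
	return (resid - left);  /* what is actually handed to the read */
}

int
main(void)
{
	size_t left, usable;

	usable = trim_to_dirblk(1300, &left);
	printf("usable %zu, left %zu\n", usable, left); /* 1024, 276 */
	return (0);
}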
*/ ncl_dircookie_lock(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; ncl_dircookie_unlock(dnp); } else { ncl_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) (void)ncl_fsinfo(nmp, vp, cred, td); error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva, &attrflag, &eof, NULL); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { /* * We are now either at end of the directory or have filled the * the block. */ if (eof) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirplusrpc resid > 0\n"); ncl_dircookie_lock(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; ncl_dircookie_unlock(dnp); } } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... */ static int nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct sillyrename *sp; struct nfsnode *np; int error; short pid; unsigned int lticks; cache_purge(dvp); np = VTONFS(vp); KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir")); sp = malloc(sizeof (struct sillyrename), M_NEWNFSREQ, M_WAITOK); sp->s_cred = crhold(cnp->cn_cred); sp->s_dvp = dvp; VREF(dvp); /* * Fudge together a funny name. * Changing the format of the funny name to accommodate more * sillynames per directory. * The name is now changed to .nfs...4, where ticks is * CPU ticks since boot. */ pid = cnp->cn_thread->td_proc->p_pid; lticks = (unsigned int)ticks; for ( ; ; ) { sp->s_namlen = sprintf(sp->s_name, ".nfs.%08x.%04x4.4", lticks, pid); if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, NULL)) break; lticks++; } error = nfs_renameit(dvp, vp, cnp, sp); if (error) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_thread, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free(sp, M_NEWNFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred, struct thread *td, struct nfsnode **npp) { struct vnode *newvp = NULL, *vp; struct nfsnode *np, *dnp = VTONFS(dvp); struct nfsfh *nfhp, *onfhp; struct nfsvattr nfsva, dnfsva; struct componentname cn; int error = 0, attrflag, dattrflag; u_int hash; error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); if (npp && !error) { if (*npp != NULL) { np = *npp; vp = NFSTOV(np); /* * For NFSv4, check to see if it is the same name and * replace the name, if it is different. 
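nfs_sillyrename() above fabricates a client-unique ".nfs.%08x.%04x4.4"-style name from the tick count and pid, and keeps bumping the tick value until nfs_lookitup() reports that the candidate does not already exist. Below is a small hedged sketch of that collision-avoidance loop; name_exists() is a stand-in predicate invented for the example and does not correspond to any real NFS interface.

/*
 * Illustrative sketch of the sillyrename naming loop above: build
 * ".nfs.%08x.%04x4.4" candidates and perturb the tick value until a
 * free name is found.  name_exists() stands in for the nfs_lookitup()
 * probe in the real code.
 */
#include <stdio.h>
#include <stdbool.h>

static bool
name_exists(const char *name)
{
	(void)name;
	return (false);		/* pretend the first candidate is free */
}

static int
pick_silly_name(char *buf, size_t buflen, unsigned int ticks, short pid)
{
	int len;

	for (;;) {
		len = snprintf(buf, buflen, ".nfs.%08x.%04x4.4", ticks,
		    (unsigned int)pid);
		if (!name_exists(buf))
			return (len);
		ticks++;	/* collision: perturb and try again */
	}
}

int
main(void)
{
	char name[64];

	pick_silly_name(name, sizeof(name), 0x1234abcd, 789);
	printf("%s\n", name);
	return (0);
}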
*/ if (np->n_v4 != NULL && nfsva.na_type == VREG && (np->n_v4->n4_namelen != len || NFSBCMP(name, NFS4NODENAME(np->n_v4), len) || dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen || NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len))) { #ifdef notdef { char nnn[100]; int nnnl; nnnl = (len < 100) ? len : 99; bcopy(name, nnn, nnnl); nnn[nnnl] = '\0'; printf("replace=%s\n",nnn); } #endif free(np->n_v4, M_NFSV4NODE); np->n_v4 = malloc( sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len + len - 1, M_NFSV4NODE, M_WAITOK); np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len; np->n_v4->n4_namelen = len; NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len); NFSBCOPY(name, NFS4NODENAME(np->n_v4), len); } hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT); onfhp = np->n_fhp; /* * Rehash node for new file handle. */ vfs_hash_rehash(vp, hash); np->n_fhp = nfhp; if (onfhp != NULL) free(onfhp, M_NFSFH); newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); VREF(dvp); newvp = dvp; } else { cn.cn_nameptr = name; cn.cn_namelen = len; error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td, &np, NULL, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); } if (!attrflag && *npp == NULL) { if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 1); } if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } if (error && NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * Nfs Version 3 and 4 commit rpc */ int ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np; struct uio uio; int error, attrflag; np = VTONFS(vp); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp) && (np->n_flag & NDSCOMMIT) != 0) { uio.uio_offset = offset; uio.uio_resid = cnt; error = nfscl_doiods(vp, &uio, NULL, NULL, NFSV4OPEN_ACCESSWRITE, 1, cred, td); if (error != 0) { mtx_lock(&np->n_mtx); np->n_flag &= ~NDSCOMMIT; mtx_unlock(&np->n_mtx); } } if (error != 0) { mtx_lock(&nmp->nm_mtx); if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { mtx_unlock(&nmp->nm_mtx); return (0); } mtx_unlock(&nmp->nm_mtx); error = nfsrpc_commit(vp, offset, cnt, cred, td, &nfsva, &attrflag, NULL); } if (attrflag != 0) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (error != 0 && NFS_ISV4(vp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling ncl_asyncio(), otherwise just all ncl_doio() to do the * request. */ static int nfs_strategy(struct vop_strategy_args *ap) { struct buf *bp; struct vnode *vp; struct ucred *cr; bp = ap->a_bp; vp = ap->a_vp; KASSERT(bp->b_vp == vp, ("missing b_getvp")); KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); BUF_ASSERT_HELD(bp); if (vp->v_type == VREG && bp->b_blkno == bp->b_lblkno) bp->b_blkno = bp->b_lblkno * (vp->v_bufobj.bo_bsize / DEV_BSIZE); if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. 
*/ if ((bp->b_flags & B_ASYNC) == 0 || ncl_asyncio(VFSTONFS(vp->v_mount), bp, NOCRED, curthread)) (void) ncl_doio(vp, bp, cr, curthread, 1); return (0); } /* * fsync vnode op. Just call ncl_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(struct vop_fsync_args *ap) { if (ap->a_vp->v_type != VREG) { /* * For NFS, metadata is changed synchronously on the server, * so there is nothing to flush. Also, ncl_flush() clears * the NMODIFIED flag and that shouldn't be done here for * directories. */ return (0); } return (ncl_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1, 0)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. * If the called_from_renewthread argument is TRUE, it has been called * from the NFSv4 renew thread and, as such, cannot block indefinitely * waiting for a buffer write to complete. */ int ncl_flush(struct vnode *vp, int waitfor, struct thread *td, int commit, int called_from_renewthread) { struct nfsnode *np = VTONFS(vp); struct buf *bp; int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1, trycnt = 0; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; struct bufobj *bo; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; u_int bvecsize = 0, bveccount; if (called_from_renewthread != 0) slptimeo = hz; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; bo = &vp->v_bufobj; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but has not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, ncl_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV34(vp) && commit) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (!BUF_ISLOCKED(bp) && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { /* * Release the vnode interlock to avoid a lock * order reversal. */ BO_UNLOCK(bo); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); BO_LOCK(bo); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bvecpos >= bvecsize) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { nbp = TAILQ_NEXT(bp, b_bobufs); continue; } if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT)) { BUF_UNLOCK(bp); nbp = TAILQ_NEXT(bp, b_bobufs); continue; } BO_UNLOCK(bo); bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. 
* * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; vfs_busy_pages(bp, 1); BO_LOCK(bo); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_bobufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } BO_UNLOCK(bo); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. */ if (wcred != NOCRED) retv = ncl_commit(vp, off, (int)(endoff - off), wcred, td); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = ncl_commit(vp, off, (int)size, bp->b_wcred, td); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) ncl_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ bufobj_wref(bo); bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; bufdone(bp); } } } /* * Start/do any write(s) that are required. */ loop: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo); if (error == 0) { BUF_UNLOCK(bp); goto loop; } if (error == ENOLCK) { error = 0; goto loop; } if (called_from_renewthread != 0) { /* * Return EIO so the flush will be retried * later. */ error = EIO; goto done; } if (newnfs_sigintr(nmp, td)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= B_ASYNC; bwrite(bp); if (newnfs_sigintr(nmp, td)) { error = EINTR; goto done; } goto loop; } if (passone) { passone = 0; BO_UNLOCK(bo); goto again; } if (waitfor == MNT_WAIT) { while (bo->bo_numoutput) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); if (called_from_renewthread != 0) { /* * Return EIO so that the flush will be * retried later. 
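The commit pass of ncl_flush() above records each candidate buffer and accumulates the union of their dirty byte ranges into [off, endoff), so that when all buffers share one write credential a single commit RPC can cover them. The sketch below reproduces only that range arithmetic in standalone C; struct dirtybuf and the sample numbers are invented for illustration.

/*
 * Illustrative sketch of the byte-range computation in the commit pass
 * above: each buffer's dirty region starts at blkno * DEV_BSIZE +
 * dirtyoff, and the smallest covering range [off, endoff) is kept.
 */
#include <stdio.h>
#include <stdint.h>

#define DEV_BSIZE 512

struct dirtybuf {
	uint64_t blkno;
	int	 dirtyoff;
	int	 dirtyend;
};

static void
commit_range(const struct dirtybuf *bufs, int n, uint64_t *offp,
    uint64_t *endp)
{
	uint64_t off = UINT64_MAX, endoff = 0, toff;
	int i;

	for (i = 0; i < n; i++) {
		toff = bufs[i].blkno * DEV_BSIZE + bufs[i].dirtyoff;
		if (toff < off)
			off = toff;
		toff += (uint64_t)(bufs[i].dirtyend - bufs[i].dirtyoff);
		if (toff > endoff)
			endoff = toff;
	}
	*offp = off;
	*endp = endoff;
}

int
main(void)
{
	struct dirtybuf bufs[] = {
		{ 8, 100, 512 },
		{ 16, 0, 300 },
	};
	uint64_t off, end;

	commit_range(bufs, 2, &off, &end);
	printf("commit [%ju, %ju)\n", (uintmax_t)off, (uintmax_t)end);
	return (0);
}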
*/ error = EIO; goto done; } error = newnfs_sigintr(nmp, td); if (error) goto done; if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } BO_LOCK(bo); } } if (bo->bo_dirty.bv_cnt != 0 && commit) { BO_UNLOCK(bo); goto loop; } /* * Wait for all the async IO requests to drain */ BO_UNLOCK(bo); mtx_lock(&np->n_mtx); while (np->n_directio_asyncwr > 0) { np->n_flag |= NFSYNCWAIT; error = newnfs_msleep(td, &np->n_directio_asyncwr, &np->n_mtx, slpflag | (PRIBIO + 1), "nfsfsync", 0); if (error) { if (newnfs_sigintr(nmp, td)) { mtx_unlock(&np->n_mtx); error = EINTR; goto done; } } } mtx_unlock(&np->n_mtx); } else BO_UNLOCK(bo); if (NFSHASPNFS(nmp)) { nfscl_layoutcommit(vp, td); /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ mtx_lock(&np->n_mtx); np->n_attrstamp = 0; } else mtx_lock(&np->n_mtx); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } if (commit && bo->bo_dirty.bv_cnt == 0 && bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; mtx_unlock(&np->n_mtx); done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); if (error == 0 && commit != 0 && waitfor == MNT_WAIT && (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 || np->n_directio_asyncwr != 0)) { if (trycnt++ < 5) { /* try, try again... */ passone = 1; wcred = NULL; bvec = NULL; bvecsize = 0; goto again; } vn_printf(vp, "ncl_flush failed"); error = called_from_renewthread != 0 ? EIO : EBUSY; } return (error); } /* * NFS advisory byte-level locks. */ static int nfs_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred; struct nfsnode *np = VTONFS(ap->a_vp); struct proc *p = (struct proc *)ap->a_id; struct thread *td = curthread; /* XXX */ struct vattr va; int ret, error = EOPNOTSUPP; u_quad_t size; ret = NFSVOPLOCK(vp, LK_SHARED); if (ret != 0) return (EBADF); if (NFS_ISV4(vp) && (ap->a_flags & (F_POSIX | F_FLOCK)) != 0) { if (vp->v_type != VREG) { NFSVOPUNLOCK(vp, 0); return (EINVAL); } if ((ap->a_flags & F_POSIX) != 0) cred = p->p_ucred; else cred = td->td_ucred; NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { NFSVOPUNLOCK(vp, 0); return (EBADF); } /* * If this is unlocking a write locked region, flush and * commit them before unlocking. This is required by * RFC3530 Sec. 9.3.2. */ if (ap->a_op == F_UNLCK && nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id, ap->a_flags)) (void) ncl_flush(vp, MNT_WAIT, td, 1, 0); /* * Loop around doing the lock op, while a blocking lock * must wait for the lock op to succeed. */ do { ret = nfsrpc_advlock(vp, np->n_size, ap->a_op, ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags); if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) && ap->a_op == F_SETLK) { NFSVOPUNLOCK(vp, 0); error = nfs_catnap(PZERO | PCATCH, ret, "ncladvl"); if (error) return (EINTR); NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { NFSVOPUNLOCK(vp, 0); return (EBADF); } } } while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) && ap->a_op == F_SETLK); if (ret == NFSERR_DENIED) { NFSVOPUNLOCK(vp, 0); return (EAGAIN); } else if (ret == EINVAL || ret == EBADF || ret == EINTR) { NFSVOPUNLOCK(vp, 0); return (ret); } else if (ret != 0) { NFSVOPUNLOCK(vp, 0); return (EACCES); } /* * Now, if we just got a lock, invalidate data in the buffer * cache, as required, so that the coherency conforms with * RFC3530 Sec. 9.3.2. 
*/ if (ap->a_op == F_SETLK) { if ((np->n_flag & NMODIFIED) == 0) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); ret = VOP_GETATTR(vp, &va, cred); } if ((np->n_flag & NMODIFIED) || ret || np->n_change != va.va_filerev) { (void) ncl_vinvalbuf(vp, V_SAVE, td, 1); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); ret = VOP_GETATTR(vp, &va, cred); if (!ret) { np->n_mtime = va.va_mtime; np->n_change = va.va_filerev; } } /* Mark that a file lock has been acquired. */ mtx_lock(&np->n_mtx); np->n_flag |= NHASBEENLOCKED; mtx_unlock(&np->n_mtx); } NFSVOPUNLOCK(vp, 0); return (0); } else if (!NFS_ISV4(vp)) { if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { size = VTONFS(vp)->n_size; NFSVOPUNLOCK(vp, 0); error = lf_advlock(ap, &(vp->v_lockf), size); } else { if (nfs_advlock_p != NULL) error = nfs_advlock_p(ap); else { NFSVOPUNLOCK(vp, 0); error = ENOLCK; } } if (error == 0 && ap->a_op == F_SETLK) { error = NFSVOPLOCK(vp, LK_SHARED); if (error == 0) { /* Mark that a file lock has been acquired. */ mtx_lock(&np->n_mtx); np->n_flag |= NHASBEENLOCKED; mtx_unlock(&np->n_mtx); NFSVOPUNLOCK(vp, 0); } } } else NFSVOPUNLOCK(vp, 0); return (error); } /* * NFS advisory byte-level locks. */ static int nfs_advlockasync(struct vop_advlockasync_args *ap) { struct vnode *vp = ap->a_vp; u_quad_t size; int error; if (NFS_ISV4(vp)) return (EOPNOTSUPP); error = NFSVOPLOCK(vp, LK_SHARED); if (error) return (error); if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { size = VTONFS(vp)->n_size; NFSVOPUNLOCK(vp, 0); error = lf_advlockasync(ap, &(vp->v_lockf), size); } else { NFSVOPUNLOCK(vp, 0); error = EOPNOTSUPP; } return (error); } /* * Print out the contents of an nfsnode. */ static int nfs_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); printf("\tfileid %jd fsid 0x%jx", (uintmax_t)np->n_vattr.na_fileid, (uintmax_t)np->n_vattr.na_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * This is the "real" nfs::bwrite(struct buf*). * We set B_CACHE if this is a VMIO buffer. */ int ncl_writebp(struct buf *bp, int force __unused, struct thread *td) { int oldflags, rtval; BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); curthread->td_ru.ru_oublock++; /* * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ vfs_busy_pages(bp, 1); BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) != 0) return (0); rtval = bufwait(bp); if (oldflags & B_DELWRI) reassignbuf(bp); brelse(bp); return (rtval); } /* * nfs special file access vnode op. * Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(struct vop_access_args *ap) { struct vattr *vap; struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; struct vattr vattr; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. 
*/ if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } vap = &vattr; error = VOP_GETATTR(vp, vap, cred); if (error) goto out; error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid, accmode, cred, NULL); out: return error; } /* * Read wrapper for fifos. */ static int nfsfifo_read(struct vop_read_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); int error; /* * Set access flag. */ mtx_lock(&np->n_mtx); np->n_flag |= NACC; vfs_timestamp(&np->n_atim); mtx_unlock(&np->n_mtx); error = fifo_specops.vop_read(ap); return error; } /* * Write wrapper for fifos. */ static int nfsfifo_write(struct vop_write_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ mtx_lock(&np->n_mtx); np->n_flag |= NUPD; vfs_timestamp(&np->n_mtim); mtx_unlock(&np->n_mtx); return(fifo_specops.vop_write(ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. */ static int nfsfifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; mtx_lock(&np->n_mtx); if (np->n_flag & (NACC | NUPD)) { vfs_timestamp(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vrefcnt(vp) == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; mtx_unlock(&np->n_mtx); (void)VOP_SETATTR(vp, &vattr, ap->a_cred); goto out; } } mtx_unlock(&np->n_mtx); out: return (fifo_specops.vop_close(ap)); } /* * Just call ncl_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. */ static int nfs_bwrite(struct buf *bp) { return (ncl_writebp(bp, 1, curthread)); } struct buf_ops buf_ops_newnfs = { .bop_name = "buf_ops_nfs", .bop_write = nfs_bwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static int nfs_getacl(struct vop_getacl_args *ap) { int error; if (ap->a_type != ACL_TYPE_NFS4) return (EOPNOTSUPP); error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp, NULL); if (error > NFSERR_STALE) { (void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); error = EPERM; } return (error); } static int nfs_setacl(struct vop_setacl_args *ap) { int error; if (ap->a_type != ACL_TYPE_NFS4) return (EOPNOTSUPP); error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp, NULL); if (error > NFSERR_STALE) { (void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); error = EPERM; } return (error); } /* * Return POSIX pathconf information applicable to nfs filesystems. */ static int nfs_pathconf(struct vop_pathconf_args *ap) { struct nfsv3_pathconf pc; struct nfsvattr nfsva; struct vnode *vp = ap->a_vp; struct thread *td = curthread; int attrflag, error; if ((NFS_ISV34(vp) && (ap->a_name == _PC_LINK_MAX || ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED || ap->a_name == _PC_NO_TRUNC)) || (NFS_ISV4(vp) && ap->a_name == _PC_ACL_NFS4)) { /* * Since only the above 4 a_names are returned by the NFSv3 * Pathconf RPC, there is no point in doing it for others. * For NFSv4, the Pathconf RPC (actually a Getattr Op.) can * be used for _PC_NFS4_ACL as well. 
*/ error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva, &attrflag, NULL); if (attrflag != 0) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (error != 0) return (error); } else { /* * For NFSv2 (or NFSv3 when not one of the above 4 a_names), * just fake them. */ pc.pc_linkmax = NFS_LINK_MAX; pc.pc_namemax = NFS_MAXNAMLEN; pc.pc_notrunc = 1; pc.pc_chownrestricted = 1; pc.pc_caseinsensitive = 0; pc.pc_casepreserving = 1; error = 0; } switch (ap->a_name) { case _PC_LINK_MAX: #ifdef _LP64 *ap->a_retval = pc.pc_linkmax; #else *ap->a_retval = MIN(LONG_MAX, pc.pc_linkmax); #endif break; case _PC_NAME_MAX: *ap->a_retval = pc.pc_namemax; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = pc.pc_chownrestricted; break; case _PC_NO_TRUNC: *ap->a_retval = pc.pc_notrunc; break; case _PC_ACL_NFS4: if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 && NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (NFS_ISV4(vp)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: if (NFS_ISV34(vp)) *ap->a_retval = 64; else *ap->a_retval = 32; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = NFS_MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } Index: stable/12/sys/kern/vfs_subr.c =================================================================== --- stable/12/sys/kern/vfs_subr.c (revision 349304) +++ stable/12/sys/kern/vfs_subr.c (revision 349305) @@ -1,5562 +1,5562 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void v_incr_usecount(struct vnode *); static void v_incr_usecount_locked(struct vnode *); static void v_incr_devcount(struct vnode *); static void v_decr_devcount(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void vnlru_return_batches(struct vfsops *mnt_op); static void destroy_vpollinfo(struct vpollinfo *vi); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. */ static unsigned long numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); static u_long mnt_free_list_batch = 128; SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW, &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. 
*/ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static u_long wantfreevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); static u_long freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); static counter_u64_t free_owe_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, "Number of times free vnodes kept on active list due to VFS " "owing inactivation"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. 
Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ int desiredvnodes; static int gapvnodes; /* gap between wanted and desired */ static int vhiwat; /* enough extras after expansion */ static int vlowat; /* minimal extras before expansion */ static int vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static int sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) { int error, old_desiredvnodes; old_desiredvnodes = desiredvnodes; if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) return (error); if (old_desiredvnodes != desiredvnodes) { wantfreevnodes = desiredvnodes / 4; /* XXX locking seems to be incomplete. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); } return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. 
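The syncer comment above describes a ring of SYNCER_MAXDELAY work lists processed round-robin, one per second, with a requested delay of N seconds landing an item N slots ahead of the slot currently being drained. The sketch below shows just that index arithmetic in standalone C; the clamping of overlarge delays is an assumption about behaviour that is not shown in this hunk.

/*
 * Illustrative sketch of the syncer delay wheel described above: a
 * power-of-two ring of work lists, where a delay of "delay" seconds
 * files an item (delayno + delay) & mask slots ahead of the slot
 * currently being processed.
 */
#include <stdio.h>

#define SYNCER_MAXDELAY	32
#define SYNCER_MASK	(SYNCER_MAXDELAY - 1)

static int syncer_delayno;	/* next slot to be processed */

static int
syncer_slot_for_delay(int delay)
{
	if (delay > SYNCER_MAXDELAY - 2)
		delay = SYNCER_MAXDELAY - 2;	/* clamp (assumed behaviour) */
	return ((syncer_delayno + delay) & SYNCER_MASK);
}

int
main(void)
{
	syncer_delayno = 30;
	/* A 15 second delay wraps around the 32-entry ring to slot 13. */
	printf("slot %d\n", syncer_slot_for_delay(15));
	return (0);
}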
*/ static void * buf_trie_alloc(struct pctrie *ptree) { return uma_zalloc(buf_trie_zone, M_NOWAIT); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree(buf_trie_zone, node); } PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ #endif /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { u_int i; int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. 
*/ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); free_owe_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. 
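vntblinit() above sizes desiredvnodes as the smaller of a physical-memory based figure and a kernel-heap based figure. The standalone sketch below mirrors that min() of the two estimates with made-up memory numbers; the page size and the per-vnode overhead constant stand in for the pgtok() and sizeof() terms and are assumptions made only for the example.

/*
 * Illustrative sketch of the vnode cache sizing done by vntblinit()
 * above.  All inputs are invented example figures.
 */
#include <stdio.h>

#define PGTOK(p)	((p) * 4LL)	/* assume 4 KiB pages */

int
main(void)
{
	long long pagecount = 4LL * 1024 * 1024;	/* ~16 GiB of RAM */
	long long maxproc = 10000;
	long long kmemsize = 8LL * 1024 * 1024 * 1024;	/* kernel heap */
	long long pervnode = 1500;	/* assumed bytes of metadata per vnode */
	long long physvnodes, virtvnodes, desired;

	physvnodes = maxproc + PGTOK(pagecount) / 64 +
	    3 * (98304LL * 16 < PGTOK(pagecount) ?
	    98304LL * 16 : PGTOK(pagecount)) / 64;
	virtvnodes = kmemsize / (10 * pervnode);
	desired = physvnodes < virtvnodes ? physvnodes : virtvnodes;
	printf("desiredvnodes ~ %lld\n", desired);
	return (0);
}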
Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. 
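vfs_busyfs() above consults a small direct-mapped, lockless cache keyed by a fold of the two fsid words before falling back to the locked mountlist walk. The sketch below reproduces only that index computation; the validation and retry logic of the real function is omitted, and the sample fsid values are invented.

/*
 * Illustrative sketch of the direct-mapped fsid cache indexing used by
 * vfs_busyfs() above: fold the two fsid words into a small table slot.
 * The table size mirrors FSID_CACHE_SIZE in the code above.
 */
#include <stdio.h>
#include <stdint.h>

#define FSID_CACHE_SIZE 256

struct fsid {
	uint32_t val[2];
};

static uint32_t
fsid_cache_index(const struct fsid *fsid)
{
	uint32_t hash;

	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	return (hash);
}

int
main(void)
{
	struct fsid id = { { 0x12345678, 0x9abcdef0 } };

	printf("cache slot %u\n", fsid_cache_index(&id));
	return (0);
}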
*/ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. 
*/ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) { struct vnode *vp; int count, done, target; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize; target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; while (count != 0 && done < target) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; /* * XXX LRU is completely broken for non-free vnodes. First * by calling here in mountpoint order, then by moving * unselected vnodes to the end here, and most grossly by * removing the vlruvp() function that was supposed to * maintain the order. (This function was born broken * since syncer problems prevented it doing anything.) The * order is closer to LRC (C = Created). * * LRU reclaiming of vnodes seems to have last worked in * FreeBSD-3 where LRU wasn't mentioned under any spelling. * Then there was no hold count, and inactive vnodes were * simply put on the free list in LRU order. The separate * lists also break LRU. We prefer to reclaim from the * free list for technical reasons. This tends to thrash * the free list to keep very unrecently used held vnodes. * The problem is mitigated by keeping the free list large. */ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. 
*/ if (vp->v_usecount || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || ((vp->v_iflag & VI_FREE) != 0) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_iflag & VI_FREE) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); vdrop(vp); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if (!should_yield()) goto relock_mnt; goto yield; next_iter: if (!should_yield()) continue; MNT_IUNLOCK(mp); yield: kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static void vnlru_free_locked(int count, struct vfsops *mnt_op) { struct vnode *vp; struct mount *mp; bool tried_batches; tried_batches = false; mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (vp == NULL) { if (tried_batches) break; mtx_unlock(&vnode_free_list_mtx); vnlru_return_batches(mnt_op); tried_batches = true; mtx_lock(&vnode_free_list_mtx); continue; } VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); KASSERT((vp->v_iflag & VI_FREE) != 0, ("Removing vnode not on freelist")); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Mangling active vnode")); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. * Don't recycle if we can't get the interlock without * blocking. */ if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); continue; } VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, vp, ("vp inconsistent on freelist")); /* * The clear of VI_FREE prevents activation of the * vnode. There is no sense in putting the vnode on * the mount point active list, only to remove it * later during recycling. Inline the relevant part * of vholdl(), to avoid triggering assertions or * activating. 
*/ freevnodes--; vp->v_iflag &= ~VI_FREE; VNODE_REFCOUNT_FENCE_REL(); refcount_acquire(&vp->v_holdcnt); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } void vnlru_free(int count, struct vfsops *mnt_op) { mtx_lock(&vnode_free_list_mtx); vnlru_free_locked(count, mnt_op); mtx_unlock(&vnode_free_list_mtx); } /* XXX some names and initialization are bad for limits and watermarks. */ static int vspace(void) { int space; gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; if (numvnodes > desiredvnodes) return (0); space = desiredvnodes - numvnodes; if (freevnodes > wantfreevnodes) space += freevnodes - wantfreevnodes; return (space); } static void vnlru_return_batch_locked(struct mount *mp) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); if (mp->mnt_tmpfreevnodelistsize == 0) return; TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) { VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp, ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist")); vp->v_mflag &= ~VMP_TMPMNTFREELIST; } mtx_lock(&vnode_free_list_mtx); TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist); freevnodes += mp->mnt_tmpfreevnodelistsize; mtx_unlock(&vnode_free_list_mtx); mp->mnt_tmpfreevnodelistsize = 0; } static void vnlru_return_batch(struct mount *mp) { mtx_lock(&mp->mnt_listmtx); vnlru_return_batch_locked(mp); mtx_unlock(&mp->mnt_listmtx); } static void vnlru_return_batches(struct vfsops *mnt_op) { struct mount *mp, *nmp; bool need_unbusy; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { need_unbusy = false; if (mnt_op != NULL && mp->mnt_op != mnt_op) goto next; if (mp->mnt_tmpfreevnodelistsize == 0) goto next; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) { vnlru_return_batch(mp); need_unbusy = true; mtx_lock(&mountlist_mtx); } next: nmp = TAILQ_NEXT(mp, mnt_list); if (need_unbusy) vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; unsigned long onumvnodes; int done, force, reclaim_nc_src, trigger, usevnodes; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_free_list_mtx); /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (numvnodes > desiredvnodes) vnlru_free_locked(numvnodes - desiredvnodes, NULL); /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. 
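 *
 * To make the thresholds concrete (numbers purely illustrative): with
 * desiredvnodes = 500000 and wantfreevnodes = 125000, vspace() uses
 * gapvnodes = 375000, giving vhiwat = 375000 / 11 = 34090 and
 * vlowat = vhiwat / 2 = 17045.  The daemon sleeps while vspace() stays
 * at or above vlowat and, once woken, keeps reclaiming until vspace()
 * climbs back above vhiwat (see the assignment to "force" at the
 * bottom of the loop).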
*/ if (vstir && force == 0) { force = 1; vstir = 0; } if (vspace() >= vlowat && force == 0) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; onumvnodes = numvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (numvnodes <= desiredvnodes) usevnodes = numvnodes - freevnodes; else usevnodes = numvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } done += vlrureclaim(mp, reclaim_nc_src, trigger); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); /* * After becoming active to expand above low water, keep * active until above high water. */ force = vspace() < vhiwat; } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. 
*/ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) { counter_u64_add(recycles_count, 1); vgonel(vp); } VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (0); } static void vcheckspace(void) { if (vspace() < vlowat && vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } /* * Wait if necessary for space for a new vnode. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes >= desiredvnodes) { if (suspended) { /* * The file system is being suspended. We cannot * risk a deadlock here, so allow allocation of * another vnode even if this would give too many. */ return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } /* Post-adjust like the pre-adjust in getnewvnode(). */ if (numvnodes + 1 > desiredvnodes && freevnodes > 1) vnlru_free_locked(1, NULL); return (numvnodes >= desiredvnodes ? ENFILE : 0); } /* * This hack is fragile, and probably not needed any more now that the * watermark handling works. */ void getnewvnode_reserve(u_int count) { struct thread *td; /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ /* XXX no longer so quick, but this part is not racy. */ mtx_lock(&vnode_free_list_mtx); if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, freevnodes - wantfreevnodes), NULL); mtx_unlock(&vnode_free_list_mtx); td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; vcheckspace(); /* XXX no longer so quick, but more racy */ return; } else atomic_subtract_long(&numvnodes, count); mtx_lock(&vnode_free_list_mtx); while (count > 0) { if (getnewvnode_wait(0) == 0) { count--; td->td_vp_reserv++; atomic_add_long(&numvnodes, 1); } } vcheckspace(); mtx_unlock(&vnode_free_list_mtx); } /* * This hack is fragile, especially if desiredvnodes or wantvnodes are * misconfgured or changed significantly. Reducing desiredvnodes below * the reserved amount should cause bizarre behaviour like reducing it * below the number of active vnodes -- the system will try to reduce * numvnodes to match, but should fail, so the subtraction below should * not overflow. */ void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; atomic_subtract_long(&numvnodes, td->td_vp_reserv); td->td_vp_reserv = 0; } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; static int cyclecount; int error __unused; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); vp = NULL; td = curthread; if (td->td_vp_reserv > 0) { td->td_vp_reserv -= 1; goto alloc; } mtx_lock(&vnode_free_list_mtx); if (numvnodes < desiredvnodes) cyclecount = 0; else if (cyclecount++ >= freevnodes) { cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. 
In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (numvnodes + 1 <= desiredvnodes) ; else if (freevnodes > 0) vnlru_free_locked(1, NULL); else { error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (error != 0) { mtx_unlock(&vnode_free_list_mtx); return (error); } #endif } vcheckspace(); atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: counter_u64_add(vnodes_created, 1); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; if (lo->lo_name != tag) { lo->lo_name = tag; WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_init_counters(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. 
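 *
 * For context, the usual path that put the vnode on this list in the
 * first place is getnewvnode() followed by insmntque(); a minimal
 * sketch of that allocation side (the "myfs" names are hypothetical):
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);	(insmntque_stddtr() has already disposed of vp)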
*/ static void delmntque(struct vnode *vp) { struct mount *mp; int active; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, ("Active vnode list size %d > Vnode list size %d", mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; if (active) { mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mtx_lock(&mp->mnt_listmtx); TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&mp->mnt_listmtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. 
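 *
 * For reference, filesystems normally reach this loop through the
 * vinvalbuf() wrapper below; a typical invocation that discards a
 * vnode's cached buffers while first writing out dirty data would be
 * (illustrative):
 *
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *	if (error != 0)
 *		return (error);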
*/ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0) { BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_WUNLOCK(bo->bo_object); } BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? 
*/ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int -vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) +vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; daddr_t trunclbn; struct bufobj *bo; - CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, - vp, cred, blksize, (uintmax_t)length); + CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, + vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ trunclbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); goto restart; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); goto restart; } } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. 
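 *
 * (Aside on the interface change above: callers no longer pass a
 * credential, so a truncation path now reads, schematically,
 *
 *	error = vtruncbuf(vp, length, bsize);
 *
 * where bsize stands in for the filesystem block size.)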
*/ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. 
*/ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. 
Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. 
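 *
 * As an aside on the speed-up interface defined above (illustrative,
 * not part of this change): a consumer that notices dirty buffers
 * piling up can nudge the syncer with
 *
 *	(void)speedup_syncer();
 *
 * where the return value, 1 if rushjob was actually bumped, may be
 * checked or ignored as the caller prefers.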
*/ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } static void v_incr_usecount_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); if ((vp->v_iflag & VI_OWEINACT) != 0) { VNASSERT(vp->v_usecount == 0, vp, ("vnode with usecount and VI_OWEINACT set")); vp->v_iflag &= ~VI_OWEINACT; } refcount_acquire(&vp->v_usecount); v_incr_devcount(vp); } /* * Increment the use count on the vnode, taking care to reference * the driver's usecount if this is a chardev. 
*/ static void v_incr_usecount(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_type != VCHR && refcount_acquire_if_not_zero(&vp->v_usecount)) { VNODE_REFCOUNT_FENCE_ACQ(); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with usecount and VI_OWEINACT set")); } else { VI_LOCK(vp); v_incr_usecount_locked(vp); VI_UNLOCK(vp); } } /* * Increment si_usecount of the associated device, if any. */ static void v_incr_devcount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement si_usecount of the associated device, if any. */ static void v_decr_devcount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * Notes on lockless counter manipulation: * _vhold, vputx and other routines make various decisions based * on either holdcnt or usecount being 0. As long as either counter * is not transitioning 0->1 nor 1->0, the manipulation can be done * with atomic operations. Otherwise the interlock is taken covering * both the atomic and additional actions. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error, oweinact; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); if ((flags & LK_VNHELD) != 0) VNASSERT((vp->v_holdcnt > 0), vp, ("vget: LK_VNHELD passed but vnode not held")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_VNHELD) == 0) _vhold(vp, (flags & LK_INTERLOCK) != 0); if ((error = vn_lock(vp, flags)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. * * Upgrade our holdcnt to a usecount. */ if (vp->v_type == VCHR || !refcount_acquire_if_not_zero(&vp->v_usecount)) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { oweinact = 0; } else { oweinact = 1; vp->v_iflag &= ~VI_OWEINACT; VNODE_REFCOUNT_FENCE_REL(); } refcount_acquire(&vp->v_usecount); v_incr_devcount(vp); if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } return (0); } /* * Increase the reference (use) and hold count of a vnode. * This will also remove the vnode from the free list if it is presently free. 
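 *
 * A minimal usage sketch (do_blocking_work() is hypothetical): a caller
 * that found a vnode and wants to use it across a blocking operation,
 * without holding its lock, pairs vref() with vrele():
 *
 *	vref(vp);
 *	error = do_blocking_work(vp);
 *	vrele(vp);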
*/ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); _vhold(vp, false); v_incr_usecount(vp); } void vrefl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); _vhold(vp, true); v_incr_usecount_locked(vp); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (__predict_false(vp->v_type == VCHR)) { VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp, ("%s: wrong ref counts", __func__)); vref(vp); return; } #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count", __func__)); #else refcount_acquire(&vp->v_holdcnt); refcount_acquire(&vp->v_usecount); #endif } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism is used to * stop other processes from gaining references to the vnode. This may be the * case if the caller holds the only reference. This is also useful when stale * data is acceptable as race conditions may be accounted for by some other * means. */ int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 /* * Decrement the use and hold counts for a vnode. * * See an explanation near vget() as to why atomic operation is safe. */ static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_type != VCHR && refcount_release_if_not_last(&vp->v_usecount)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); vdrop(vp); return; } VI_LOCK(vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ if (!refcount_release(&vp->v_usecount) || (vp->v_iflag & VI_DOINGINACT)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_devcount(vp); vdropl(vp); return; } v_decr_devcount(vp); error = 0; if (vp->v_usecount != 0) { vn_printf(vp, "vputx: usecount not zero for vnode "); panic("vputx: usecount not zero"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with usecount and VI_OWEINACT set")); if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. 
This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Increase the hold count and activate if this is the first reference. */ void _vhold(struct vnode *vp, bool locked) { struct mount *mp; if (locked) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!locked) { if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { VNODE_REFCOUNT_FENCE_ACQ(); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("_vhold: vnode with holdcnt is free")); return; } VI_LOCK(vp); } if ((vp->v_iflag & VI_FREE) == 0) { refcount_acquire(&vp->v_holdcnt); if (!locked) VI_UNLOCK(vp); return; } VNASSERT(vp->v_holdcnt == 0, vp, ("%s: wrong hold count", __func__)); VNASSERT(vp->v_op != NULL, vp, ("%s: vnode already reclaimed.", __func__)); /* * Remove a vnode from the free list, mark it as in use, * and put it on the active list. */ VNASSERT(vp->v_mount != NULL, vp, ("_vhold: vnode not on per mount vnode list")); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); mp->mnt_tmpfreevnodelistsize--; vp->v_mflag &= ~VMP_TMPMNTFREELIST; } else { mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); } KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag &= ~VI_FREE; vp->v_iflag |= VI_ACTIVE; TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&mp->mnt_listmtx); refcount_acquire(&vp->v_holdcnt); if (!locked) VI_UNLOCK(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ void _vdrop(struct vnode *vp, bool locked) { struct bufobj *bo; struct mount *mp; int active; if (locked) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((int)vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); if (!locked) { if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); } if (refcount_release(&vp->v_holdcnt) == 0) { VI_UNLOCK(vp); return; } if ((vp->v_iflag & VI_DOOMED) == 0) { /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist. 
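 *
 * For reference, the hold count released here is usually acquired with
 * vhold()/vholdl(); a common pattern that keeps a vnode from being
 * freed while other locks are dropped looks like (illustrative):
 *
 *	vholdl(vp);		(vnode interlock already held)
 *	VI_UNLOCK(vp);
 *	(sleep, or drop and reacquire other locks; vp cannot go away)
 *	vdrop(vp);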
*/ VNASSERT(vp->v_op != NULL, vp, ("vdropl: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(vp->v_holdcnt == 0, vp, ("vdropl: freeing when we shouldn't")); active = vp->v_iflag & VI_ACTIVE; if ((vp->v_iflag & VI_OWEINACT) == 0) { vp->v_iflag &= ~VI_ACTIVE; mp = vp->v_mount; if (mp != NULL) { mtx_lock(&mp->mnt_listmtx); if (active) { TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; } TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); mp->mnt_tmpfreevnodelistsize++; vp->v_iflag |= VI_FREE; vp->v_mflag |= VMP_TMPMNTFREELIST; VI_UNLOCK(vp); if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch) vnlru_return_batch_locked(mp); mtx_unlock(&mp->mnt_listmtx); } else { VNASSERT(active == 0, vp, ("vdropl: active vnode not on per mount " "vnode list")); mtx_lock(&vnode_free_list_mtx); TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); freevnodes++; vp->v_iflag |= VI_FREE; VI_UNLOCK(vp); mtx_unlock(&vnode_free_list_mtx); } } else { VI_UNLOCK(vp); counter_u64_add(free_owe_inact, 1); } return; } /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; uma_zfree(vnode_zone, vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. 
*/ void vinactive(struct vnode *vp, struct thread *td) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. 
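 *
 * As background (illustrative, not part of this change): an unmount
 * path typically reaches this loop through a call such as
 *
 *	error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
 *
 * with rootrefs left at 0 unless the filesystem keeps its own
 * reference on the root vnode.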
*/ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. 
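 *
 * Stacked filesystems opt in to these notifications by linking
 * themselves into the lower mount's mnt_uppers list and by providing
 * the corresponding vfsops hooks; schematically (handler names
 * hypothetical):
 *
 *	static struct vfsops null_vfsops = {
 *		.vfs_reclaim_lowervp =	null_reclaim_lowervp,
 *		.vfs_unlink_lowervp =	null_unlink_lowervp,
 *		(remaining vfsops members omitted)
 *	};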
*/ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); if (TAILQ_EMPTY(&mp->mnt_uppers)) goto unlock; MNT_IUNLOCK(mp); mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } unlock: MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() * after the object's page queue is flushed. */ if (vp->v_bufobj.bo_object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. 
*/ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d", vp->v_usecount, vp->v_writecount, vp->v_holdcnt); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_ACTIVE) strlcat(buf, "|VI_ACTIVE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. 
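 * (Editorial usage note: from the DDB prompt these are invoked as
 * "show lockedvnods", "show vnode <addr>" and "show mount [addr]",
 * matching the DB_SHOW_COMMAND() definitions that follow.)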
* Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG 
if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. 
*/ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
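Editorial aside: the vfs.conflist sysctl registered above is what userland consumers such as lsvfs(1) and getvfsbyname(3) read. A minimal, hypothetical reader (error handling abbreviated, output format illustrative) looks roughly like this:

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct xvfsconf *xvfsp;
        size_t buflen;
        u_int i;

        /* First call sizes the buffer, second call fills it. */
        if (sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0) < 0)
                err(1, "sysctl vfs.conflist (size)");
        if ((xvfsp = malloc(buflen)) == NULL)
                err(1, "malloc");
        if (sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0) < 0)
                err(1, "sysctl vfs.conflist");
        for (i = 0; i < buflen / sizeof(*xvfsp); i++)
                printf("%s: typenum %d, refcount %d, flags 0x%x\n",
                    xvfsp[i].vfc_name, xvfsp[i].vfc_typenum,
                    xvfsp[i].vfc_refcount, xvfsp[i].vfc_flags);
        free(xvfsp);
        return (0);
}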
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); vnlru_return_batch(mp); MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } vput(vp); } } else VI_UNLOCK(vp); } } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. 
*/ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. 
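 * Editorial note on the scattering above: assuming the historical
 * default of syncer_maxdelay == 32, successive syncer vnodes get
 * "next" values of 16, 8, 24, 4, 12, 20, 28, 2, ... -- the interval
 * is repeatedly bisected, so per-mount syncer vnodes land on roughly
 * evenly spaced worklist slots no matter how many mounts are created
 * back to back.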
*/ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. 
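 * Worked example (editorial): for a regular file with mode 0644 whose
 * owner is the caller, the owner branch below yields dac_granted =
 * VADMIN | VREAD | VWRITE | VAPPEND, so a VREAD | VWRITE request is
 * granted without consulting priv_check_cred(); a request that also
 * asks for VEXEC falls through to the privcheck label instead.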
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_pre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_post(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_pre(void *ap) { struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_unlock_post(void *ap, int rc) { struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); } #endif void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ (a->a_vp->v_iflag 
& VI_DOOMED) == 0)) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
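Editorial aside: fs_knlist and fs_filtops above implement the EVFILT_FS kqueue filter, through which userland can observe the VQ_* events posted via vfs_event_signal() (e.g., around mount and unmount). A hypothetical minimal consumer; names and output are illustrative:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct kevent ev;
        int kq;

        if ((kq = kqueue()) < 0)
                err(1, "kqueue");
        /* The ident is unused for EVFILT_FS; fflags carries the events. */
        EV_SET(&ev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
        if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0)
                err(1, "kevent register");
        for (;;) {
                if (kevent(kq, NULL, 0, &ev, 1, NULL) < 0)
                        err(1, "kevent wait");
                printf("filesystem event, fflags 0x%x\n",
                    (unsigned)ev.fflags);
        }
}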
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
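Editorial aside: vfs_kqfilter() above is the kernel side of EVFILT_VNODE (and of EVFILT_READ/EVFILT_WRITE on vnodes). From userland the same machinery is reached by attaching a kevent to an open file descriptor; a hypothetical minimal watcher:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
        struct kevent ev;
        int fd, kq;

        if (argc != 2)
                errx(1, "usage: watch <file>");
        if ((fd = open(argv[1], O_RDONLY)) < 0)
                err(1, "open");
        if ((kq = kqueue()) < 0)
                err(1, "kqueue");
        EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
            NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, NULL);
        if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0)
                err(1, "kevent register");
        if (kevent(kq, NULL, 0, &ev, 1, NULL) < 0)
                err(1, "kevent wait");
        printf("vnode event, fflags 0x%x\n", (unsigned)ev.fflags);
        return (0);
}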
*/ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VI_DOOMED to save a lock acquisition. 
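 * Editorial usage note: consumers of these iterators follow the pattern
 * used by vflush() earlier in this file, e.g.
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		(vp is returned with its interlock held)
 *		if (the caller wants to skip vp) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		...
 *	}
 *
 * and must call MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) before leaving the
 * loop early, again as vflush() does.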
*/ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) continue; VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); (*mvp)->v_mount = mp; (*mvp)->v_type = VMARKER; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) continue; VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ static void mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_active iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { const struct vnode *tmp; bool held, ret; VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); ret = false; TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); /* * Use a hold to prevent vp from disappearing while the mount vnode * list lock is dropped and reacquired. Normally a hold would be * acquired with vhold(), but that might try to acquire the vnode * interlock, which would be a LOR with the mount vnode list lock. */ held = refcount_acquire_if_not_zero(&vp->v_holdcnt); mtx_unlock(&mp->mnt_listmtx); if (!held) goto abort; VI_LOCK(vp); if (!refcount_release_if_not_last(&vp->v_holdcnt)) { vdropl(vp); goto abort; } mtx_lock(&mp->mnt_listmtx); /* * Determine whether the vnode is still the next one after the marker, * excepting any other markers. 
If the vnode has not been doomed by * vgone() then the hold should have ensured that it remained on the * active list. If it has been doomed but is still on the active list, * don't abort, but rather skip over it (avoid spinning on doomed * vnodes). */ tmp = mvp; do { tmp = TAILQ_NEXT(tmp, v_actfreelist); } while (tmp != NULL && tmp->v_type == VMARKER); if (tmp != vp) { mtx_unlock(&mp->mnt_listmtx); VI_UNLOCK(vp); goto abort; } ret = true; goto out; abort: maybe_yield(); mtx_lock(&mp->mnt_listmtx); out: if (ret) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); return (ret); } static struct vnode * mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp, *nvp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } /* * Try-lock because this is the wrong lock order. If that does * not succeed, drop the mount vnode list lock and try to * reacquire it and the vnode interlock in the right order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_active_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the active list %p %p", vp, mp)); if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); vp = nvp; } TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_active(mvp, mp)); } struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; (*mvp)->v_mount = mp; mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); } Index: stable/12/sys/sys/vnode.h =================================================================== --- stable/12/sys/sys/vnode.h (revision 349304) +++ stable/12/sys/sys/vnode.h (revision 349305) @@ -1,919 +1,918 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. 
*/ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_actfreelist; /* l vnode active/free lists */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ u_int v_mflag; /* l mnt-specific vnode flags */ int v_writecount; /* I ref count of writers or (negative) text users */ u_int v_hash; enum vtype v_type; /* u vnode type */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VI_DOOMED is doubly protected by the interlock and vnode lock. 
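 * (Editorial cross-reference: vgonel() in vfs_subr.c is the writer: it
 * runs with both the vnode lock and the interlock held when it sets
 * VI_DOOMED, while readers such as the mnt_vnode iterators may test the
 * flag under either lock, or even racily as a hint before re-checking.)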
Both * are required for writing but the status may be checked with either. */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_ACTIVE 0x0200 /* This vnode is on the active list */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VMP_TMPMNTFREELIST 0x0001 /* Vnode is on mnt's tmp free list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. 
(high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. 
*/ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. 
*/ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. 
 */
void cache_changesize(int newhashsize);
#define cache_enter(dvp, vp, cnp)			\
	cache_enter_time(dvp, vp, cnp, NULL, NULL)
void cache_enter_time(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, struct timespec *tsp,
    struct timespec *dtsp);
int cache_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp);
void cache_purge(struct vnode *vp);
void cache_purge_negative(struct vnode *vp);
void cache_purgevfs(struct mount *mp, bool force);
int change_dir(struct vnode *vp, struct thread *td);
void cvtstat(struct stat *st, struct ostat *ost);
void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp);
void getnewvnode_reserve(u_int count);
void getnewvnode_drop_reserve(void);
int insmntque1(struct vnode *vp, struct mount *mp,
    void (*dtr)(struct vnode *, void *), void *dtr_arg);
int insmntque(struct vnode *vp, struct mount *mp);
u_quad_t init_va_filerev(void);
int speedup_syncer(void);
int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf,
    u_int *buflen);
int vn_fullpath(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf);
int vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf);
struct vnode *
	vn_dir_dd_ino(struct vnode *vp);
int vn_commname(struct vnode *vn, char *buf, u_int buflen);
int vn_path_to_global_path(struct thread *td, struct vnode *vp,
    char *path, u_int pathlen);
int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
    gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused);
int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
    struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused);
int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
    struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused);
void vattr_null(struct vattr *vap);
int vcount(struct vnode *vp);
#define vdrop(vp) _vdrop((vp), 0)
#define vdropl(vp) _vdrop((vp), 1)
void _vdrop(struct vnode *, bool);
int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
int vget(struct vnode *vp, int lockflag, struct thread *td);
void vgone(struct vnode *vp);
#define vhold(vp) _vhold((vp), 0)
#define vholdl(vp) _vhold((vp), 1)
void _vhold(struct vnode *, bool);
void vinactive(struct vnode *, struct thread *);
int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
-int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length,
-    int blksize);
+int vtruncbuf(struct vnode *vp, off_t length, int blksize);
void vunref(struct vnode *);
void vn_printf(struct vnode *vp, const char *fmt, ...)
__printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int 
vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_post(void *a, int rc); void vop_deleteextattr_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_setextattr_post(void *a, int rc); void vop_symlink_post(void *a, int rc); #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #else #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) #define vop_unlock_post(x, y) do { } while (0) #define vop_unlock_pre(x) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? 
NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) #ifdef INVARIANTS #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ do { \ int error_; \ \ error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ MPASS(error_ == 0); \ } while (0) #define VOP_SET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_SET_TEXT((vp)); \ MPASS(error_ == 0); \ } while (0) #define VOP_UNSET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_UNSET_TEXT((vp)); \ MPASS(error_ == 0); \ } while (0) #else #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) #define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) #define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) #endif void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefl(struct vnode *vp); void vrefact(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(int newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); void vn_fsid(struct vnode *vp, struct vattr *va); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: stable/12/sys/ufs/ffs/ffs_inode.c =================================================================== --- stable/12/sys/ufs/ffs/ffs_inode.c (revision 349304) +++ stable/12/sys/ufs/ffs/ffs_inode.c (revision 349305) @@ -1,778 +1,778 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs * is currently being suspended (or is suspended) and vnode has been accessed. * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to * reflect the presumably successful write, and if waitfor is set, then wait * for the write to complete. */ int ffs_update(vp, waitfor) struct vnode *vp; int waitfor; { struct fs *fs; struct buf *bp; struct inode *ip; int flags, error; ASSERT_VOP_ELOCKED(vp, "ffs_update"); ufs_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ITOFS(ip); if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) return (0); /* * If we are updating a snapshot and another process is currently * writing the buffer containing the inode for this snapshot then * a deadlock can occur when it tries to check the snapshot to see * if that block needs to be copied. Thus when updating a snapshot * we check to see if the buffer is already locked, and if it is * we drop the snapshot lock until the buffer has been written * and is available to us. We have to grab a reference to the * snapshot vnode to prevent it from being removed while we are * waiting for the buffer. 
*/ flags = 0; if (IS_SNAPSHOT(ip)) flags = GB_LOCK_NOWAIT; loop: error = bread_gb(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, NOCRED, flags, &bp); if (error != 0) { if (error != EBUSY) return (error); KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); /* * Wait for our inode block to become available. * * Hold a reference to the vnode to protect against * ffs_snapgone(). Since we hold a reference, it can only * get reclaimed (VI_DOOMED flag) in a forcible downgrade * or unmount. For an unmount, the entire filesystem will be * gone, so we cannot attempt to touch anything associated * with it while the vnode is unlocked; all we can do is * pause briefly and try again. If when we relock the vnode * we discover that it has been reclaimed, updating it is no * longer necessary and we can just return an error. */ vref(vp); VOP_UNLOCK(vp, 0); pause("ffsupd", 1); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrele(vp); if ((vp->v_iflag & VI_DOOMED) != 0) return (ENOENT); goto loop; } if (DOINGSOFTDEP(vp)) softdep_update_inodeblock(ip, bp, waitfor); else if (ip->i_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); if (I_IS_UFS1(ip)) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), RANDOM_FS_ATIME); } else { *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), RANDOM_FS_ATIME); } if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) { bawrite(bp); error = 0; } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); error = 0; } return (error); } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ffs_truncate(vp, length, flags, cred) struct vnode *vp; off_t length; int flags; struct ucred *cred; { struct inode *ip; ufs2_daddr_t bn, lbn, lastblock, lastiblock[UFS_NIADDR]; ufs2_daddr_t indir_lbn[UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t newblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; struct bufobj *bo; struct fs *fs; struct buf *bp; struct ufsmount *ump; int softdeptrunc, journaltrunc; int needextclean, extblocks; int offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; u_long key; off_t osize; ip = VTOI(vp); ump = VFSTOUFS(vp->v_mount); fs = ump->um_fs; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); if (length > fs->fs_maxfilesize) return (EFBIG); #ifdef QUOTA error = getinoquota(ip); if (error) return (error); #endif /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume * traditional behavior, e.g., just the normal data. */ if ((flags & (IO_EXT | IO_NORMAL)) == 0) flags |= IO_NORMAL; if (!DOINGSOFTDEP(vp) && !DOINGASYNC(vp)) flags |= IO_SYNC; waitforupdate = (flags & IO_SYNC) != 0 || !DOINGASYNC(vp); /* * If we are truncating the extended-attributes, and cannot * do it with soft updates, then do it slowly here. 
If we are * truncating both the extended attributes and the file contents * (e.g., the file is being unlinked), then pick it off with * soft updates below. */ allerror = 0; needextclean = 0; softdeptrunc = 0; journaltrunc = DOINGSUJ(vp); if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0) softdeptrunc = !softdep_slowdown(vp); extblocks = 0; datablocks = DIP(ip, i_blocks); if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) { extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks -= extblocks; } if ((flags & IO_EXT) && extblocks > 0) { if (length != 0) panic("ffs_truncate: partial trunc of extdata"); if (softdeptrunc || journaltrunc) { if ((flags & IO_NORMAL) == 0) goto extclean; needextclean = 1; } else { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); #ifdef QUOTA (void) chkdq(ip, -extblocks, NOCRED, 0); #endif vinvalbuf(vp, V_ALT, 0, 0); vn_pages_remove(vp, OFF_TO_IDX(lblktosize(fs, -extblocks)), 0); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; ip->i_din2->di_extsize = 0; for (i = 0; i < UFS_NXADDR; i++) { oldblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } ip->i_flag |= IN_CHANGE; if ((error = ffs_update(vp, waitforupdate))) return (error); for (i = 0; i < UFS_NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, vp->v_type, NULL, SINGLETON_KEY); } } } if ((flags & IO_NORMAL) == 0) return (0); if (vp->v_type == VLNK && (ip->i_size < vp->v_mount->mnt_maxsymlinklen || datablocks == 0)) { #ifdef INVARIANTS if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero(SHORTLINK(ip), (u_int)ip->i_size); ip->i_size = 0; DIP_SET(ip, i_size, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, waitforupdate)); } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, 0)); } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); if (IS_SNAPSHOT(ip)) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; osize = ip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { vnode_pager_setsize(vp, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) { vnode_pager_setsize(vp, osize); return (error); } ip->i_size = length; DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(vp, waitforupdate)); } /* * Lookup block number for a given offset. Zero length files * have no blocks, so return a blkno of -1. */ lbn = lblkno(fs, length - 1); if (length == 0) { blkno = -1; } else if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, cred, BA_METAONLY, &bp); if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; else blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; /* * If the block number is non-zero, then the indirect block * must have been previously allocated and need not be written. 
* If the block number is zero, then we may have allocated * the indirect block and hence need to write it out. */ if (blkno != 0) brelse(bp); else if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * If the block number at the new end of the file is zero, * then we must allocate it to ensure that the last block of * the file is allocated. Soft updates does not handle this * case, so here we have to clean up the soft updates data * structures describing the allocation past the truncation * point. Finding and deallocating those structures is a lot of * work. Since partial truncation with a hole at the end occurs * rarely, we solve the problem by syncing the file so that it * will have no soft updates data structures left. */ if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); if (blkno != 0 && DOINGSOFTDEP(vp)) { if (softdeptrunc == 0 && journaltrunc == 0) { /* * If soft updates cannot handle this truncation, * clean up soft dependency data structures and * fall through to the synchronous truncation. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); } else { flags = IO_NORMAL | (needextclean ? IO_EXT: 0); if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, flags); else softdep_setup_freeblocks(ip, length, flags); ASSERT_VOP_LOCKED(vp, "ffs_truncate1"); if (journaltrunc == 0) { ip->i_flag |= IN_CHANGE | IN_UPDATE; error = ffs_update(vp, 0); } return (error); } } /* * Shorten the size of the file. If the last block of the * shortened file is unallocated, we must allocate it. * Additionally, if the file is not being truncated to a * block boundary, the contents of the partial block * following the end of the file must be zero'ed in * case it ever becomes accessible again because of * subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. */ offset = blkoff(fs, length); if (blkno != 0 && offset == 0) { ip->i_size = length; DIP_SET(ip, i_size, length); } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized * block that will be truncated down to a fragment below, * we must flush out the block dependency with an FSYNC * so that we do not get a soft updates inconsistency * when we create the fragment below. */ if (DOINGSOFTDEP(vp) && lbn < UFS_NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); if (vp->v_type != VDIR && offset != 0) bzero((char *)bp->b_data + offset, (u_int)(size - offset)); /* Kirk's code has reallocbuf(bp, size, 1) here */ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - UFS_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. 
If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[UFS_NDADDR + level] = DIP(ip, i_ib[level]); if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); lastiblock[level] = -1; } } for (i = 0; i < UFS_NDADDR; i++) { oldblks[i] = DIP(ip, i_db[i]); if (i > lastblock) DIP_SET(ip, i_db[i], 0); } ip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ffs_update(vp, waitforupdate); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < UFS_NDADDR; i++) { newblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], oldblks[i]); } for (i = 0; i < UFS_NIADDR; i++) { newblks[UFS_NDADDR + i] = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], oldblks[UFS_NDADDR + i]); } ip->i_size = osize; DIP_SET(ip, i_size, osize); - error = vtruncbuf(vp, cred, length, fs->fs_bsize); + error = vtruncbuf(vp, length, fs->fs_bsize); if (error && (allerror == 0)) allerror = error; /* * Indirect blocks first. */ indir_lbn[SINGLE] = -UFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = DIP(ip, i_ib[level]); if (bn != 0) { error = ffs_indirtrunc(ip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, vp->v_type, NULL, SINGLETON_KEY); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number); for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; bn = DIP(ip, i_db[i]); if (bn == 0) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, vp->v_type, NULL, key); blocksreleased += btodb(bsize); } ffs_blkrelease_finish(ump, key); if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = DIP(ip, i_db[lastblock]); if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, ip, lastblock); ip->i_size = length; DIP_SET(ip, i_size, length); newspace = blksize(fs, ip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. 
*/ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, oldspace - newspace, ip->i_number, vp->v_type, NULL, SINGLETON_KEY); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[UFS_NDADDR + level] != DIP(ip, i_ib[level])) panic("ffs_truncate1"); for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2"); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("ffs_truncate3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ ip->i_size = length; DIP_SET(ip, i_size, length); if (DIP(ip, i_blocks) >= blocksreleased) DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - blocksreleased); else /* sanity */ DIP_SET(ip, i_blocks, 0); ip->i_flag |= IN_CHANGE; #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, 0); #endif return (allerror); extclean: if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, IO_EXT); else softdep_setup_freeblocks(ip, length, IO_EXT); return (ffs_update(vp, waitforupdate)); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) struct inode *ip; ufs2_daddr_t lbn, lastbn; ufs2_daddr_t dbn; int level; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct ufsmount *ump; struct vnode *vp; caddr_t copy = NULL; u_long key; int i, nblocks, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; ufs2_daddr_t *bap2 = NULL; #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) fs = ITOFS(ip); ump = ITOUMP(ip); /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = lbn_offset(fs, level); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. 
*/ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; /* pay for read */ bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)bp->b_data; else bap2 = (ufs2_daddr_t *)bp->b_data; if (lastbn != -1) { copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); for (i = last + 1; i < NINDIR(fs); i++) if (I_IS_UFS1(ip)) bap1[i] = 0; else bap2[i] = 0; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)copy; else bap2 = (ufs2_daddr_t *)copy; } /* * Recursively free totally unused blocks. */ key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number); for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); if (nb == 0) continue; if (level > SINGLE) { if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (ufs2_daddr_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize, ip->i_number, vp->v_type, NULL, key); blocksreleased += nblocks; } ffs_blkrelease_finish(ump, key); /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = BAP(ip, i); if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } if (copy != NULL) { free(copy, M_TEMP); } else { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } *countp = blocksreleased; return (allerror); } int ffs_rdonly(struct inode *ip) { return (ITOFS(ip)->fs_ronly != 0); } Index: stable/12 =================================================================== --- stable/12 (revision 349304) +++ stable/12 (revision 349305) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r348251
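
The substance of this merge is the vtruncbuf() interface change visible in the two hunks above: the struct ucred * argument is dropped from the prototype in sys/vnode.h, and ffs_truncate() stops passing cred at its call site. The fragment below is only a sketch of how a caller adapts; the wrapper name truncate_buffers() is hypothetical, and it assumes a kernel compilation environment with the headers shown in the diff.

/*
 * Sketch of the caller-visible effect of this revision.  The wrapper and
 * its argument plumbing are illustrative; only the old and new
 * vtruncbuf() prototypes come from the diff above.
 */
#include <sys/param.h>
#include <sys/vnode.h>

static int
truncate_buffers(struct vnode *vp, struct ucred *cred, off_t length,
    int blksize)
{
	/*
	 * Old interface, removed by this change:
	 *	error = vtruncbuf(vp, cred, length, blksize);
	 */
	(void)cred;		/* no longer consumed by vtruncbuf() */

	/* New interface, as now declared in sys/vnode.h. */
	return (vtruncbuf(vp, length, blksize));
}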
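The sys/vnode.h comment above also documents the general discipline for operating on a vnode found on a list: lock the list and find the vnode, take the vnode interlock, drop the list lock to avoid lock-order reversals, vget() with LK_INTERLOCK, do the work, then vput(). The sketch below follows those steps; the list mutex passed in is a hypothetical stand-in for whichever list lock applies, while VI_LOCK(), vget(), and vput() are the interfaces declared in the header.

/*
 * Sketch of the list-walking discipline described in the sys/vnode.h
 * comment above.  The list lock is hypothetical; only the vnode interlock
 * and vget()/vput() calls come from the header.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/vnode.h>

static int
visit_vnode(struct vnode *vp, struct mtx *list_mtx, struct thread *td)
{
	int error;

	mtx_lock(list_mtx);	/* 1) lock the list, find the vnode */
	VI_LOCK(vp);		/* 2) interlock keeps vp from going away */
	mtx_unlock(list_mtx);	/* 3) drop list lock, avoid a reversal */

	/* 4) vget() with LK_INTERLOCK; it consumes the interlock. */
	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
	if (error != 0)
		return (error);	/* e.g. ENOENT if vp was recycled */

	/* 6) ... operate on the locked, referenced vnode here ... */

	vput(vp);		/* drop the lock and the reference */
	return (0);
}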
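ffs_update() above spells out how it copes with an inode buffer that is busy because a snapshot is being written: take a reference, drop the vnode lock, pause briefly, relock, and bail out if the vnode was reclaimed in the meantime. The sketch below condenses that relock-and-revalidate pattern; do_the_work() is a placeholder for the operation that reported EBUSY.

/*
 * Condensed from the EBUSY handling in ffs_update() above.  do_the_work()
 * is a hypothetical operation that can fail with EBUSY while the vnode
 * lock is held; the back-off steps mirror the function above.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/vnode.h>

static int do_the_work(struct vnode *vp);	/* placeholder */

static int
retry_with_unlocked_pause(struct vnode *vp)
{
	for (;;) {
		if (do_the_work(vp) != EBUSY)
			return (0);
		vref(vp);			/* survive a reclaim attempt */
		VOP_UNLOCK(vp, 0);		/* drop the vnode lock */
		pause("ffsupd", 1);		/* let the other writer finish */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vrele(vp);
		if ((vp->v_iflag & VI_DOOMED) != 0)
			return (ENOENT);	/* reclaimed while unlocked */
	}
}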
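The accmode_t flags defined above (VREAD, VWRITE, VAPPEND, and the VADMIN_PERMS/VSTAT_PERMS/VMODIFY_PERMS groupings) are consumed by vaccess(), whose prototype also appears above. A small, purely illustrative check is sketched below; the helper name and its argument plumbing are assumptions, and only the vaccess() call and the flag names come from the header.

/*
 * Illustrative use of the accmode_t bits with vaccess().  may_append() is
 * a hypothetical helper; a real filesystem would pull mode, uid and gid
 * out of its own inode.
 */
#include <sys/param.h>
#include <sys/vnode.h>

static int
may_append(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid,
    struct ucred *cred)
{
	/* VWRITE | VAPPEND is a subset of VMODIFY_PERMS defined above. */
	return (vaccess(vp->v_type, mode, uid, gid, VWRITE | VAPPEND,
	    cred, NULL));
}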