Index: head/sys/fs/nfsclient/nfs_clbio.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clbio.c	(revision 358251)
+++ head/sys/fs/nfsclient/nfs_clbio.c	(revision 358252)
@@ -1,1874 +1,1893 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/rwlock.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 extern int newnfs_directio_allow_mmap;
 extern struct nfsstatsv1 nfsstatsv1;
 extern struct mtx ncl_iod_mutex;
 extern int ncl_numasync;
 extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
 extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
 extern int newnfs_directio_enable;
 extern int nfs_keep_dirty_on_error;
 
 uma_zone_t ncl_pbuf_zone;
 
 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
     struct thread *td);
 static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
     struct ucred *cred, int ioflag);
 
 /*
  * Vnode op for VM getpages.
  */
 SYSCTL_DECL(_vfs_nfs);
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of direct readrpc call");
 
 static daddr_t
 ncl_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 
 	return (off / vp->v_bufobj.bo_bsize);
 }
 
 static int
 ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	struct nfsnode *np;
 	u_quad_t nsize;
 	int biosize, bcount;
 
 	np = VTONFS(vp);
 	NFSLOCKNODE(np);
 	nsize = np->n_size;
 	NFSUNLOCKNODE(np);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	bcount = biosize;
 	if ((off_t)lbn * biosize >= nsize)
 		bcount = 0;
 	else if ((off_t)(lbn + 1) * biosize > nsize)
 		bcount = nsize - (off_t)lbn * biosize;
 	return (bcount);
 }
 
 int
 ncl_getpages(struct vop_getpages_args *ap)
 {
 	int i, error, nextoff, size, toff, count, npages;
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	vm_object_t object;
 	vm_page_t *pages;
 	struct nfsnode *np;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;
 	cred = curthread->td_ucred;
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	npages = ap->a_count;
 
 	if ((object = vp->v_object) == NULL) {
 		printf("ncl_getpages: called with non-merged cache vnode\n");
 		return (VM_PAGER_ERROR);
 	}
 
 	if (newnfs_directio_enable && !newnfs_directio_allow_mmap) {
 		NFSLOCKNODE(np);
 		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 			NFSUNLOCKNODE(np);
 			printf("ncl_getpages: called on non-cacheable vnode\n");
 			return (VM_PAGER_ERROR);
 		} else
 			NFSUNLOCKNODE(np);
 	}
 
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		/* We'll never get here for v4, because we always have fsinfo */
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	if (use_buf_pager)
 		return (vfs_bio_getpages(vp, pages, npages, ap->a_rbehind,
 		    ap->a_rahead, ncl_gbp_getblkno, ncl_gbp_getblksz));
 
 	/*
 	 * If the requested page is partially valid, just return it and
 	 * allow the pager to zero-out the blanks.  Partially valid pages
 	 * can only occur at the file EOF.
 	 *
 	 * XXXGL: is that true for NFS, where short read can occur???
 	 */
 	VM_OBJECT_WLOCK(object);
 	if (!vm_page_none_valid(pages[npages - 1]) && --npages == 0)
 		goto out;
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);
 
 	kva = (vm_offset_t) bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, npages);
 
 	count = npages << PAGE_SHIFT;
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = ncl_readrpc(vp, &uio, cred);
 	pmap_qremove(kva, npages);
 
 	uma_zfree(ncl_pbuf_zone, bp);
 
 	if (error && (uio.uio_resid == count)) {
 		printf("ncl_getpages: error %d\n", error);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * Calculate the number of bytes read and validate only that number
 	 * of bytes.  Note that due to pending writes, size may be 0.  This
 	 * does not mean that the remaining data is invalid!
 	 */
 
 	size = count - uio.uio_resid;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
 		vm_page_t m;
 		nextoff = toff + PAGE_SIZE;
 		m = pages[i];
 
 		if (nextoff <= size) {
 			/*
 			 * Read operation filled an entire page
 			 */
 			vm_page_valid(m);
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
 			 */
 			vm_page_invalid(m);
 			vm_page_set_valid_range(m, 0, size - toff);
 			KASSERT(m->dirty == 0,
 			    ("nfs_getpages: page %p is dirty", m));
 		} else {
 			/*
 			 * Read operation was short.  If no error
 			 * occurred we may have hit a zero-fill
 			 * section.  We leave valid set to 0, and page
 			 * is freed by vm_page_readahead_finish() if
 			 * its index is not equal to requested, or
 			 * page is zeroed and set valid by
 			 * vm_pager_get_pages() for requested page.
 			 */
 			;
 		}
 	}
 out:
 	VM_OBJECT_WUNLOCK(object);
 	if (ap->a_rbehind)
 		*ap->a_rbehind = 0;
 	if (ap->a_rahead)
 		*ap->a_rahead = 0;
 	return (VM_PAGER_OK);
 }
 
 /*
  * Vnode op for VM putpages.
  */
 int
 ncl_putpages(struct vop_putpages_args *ap)
 {
 	struct uio uio;
 	struct iovec iov;
 	int i, error, npages, count;
 	off_t offset;
 	int *rtvals;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	vm_page_t *pages;
 
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	/* Set the cred to n_writecred for the write rpcs. */
 	if (np->n_writecred != NULL)
 		cred = crhold(np->n_writecred);
 	else
 		cred = crhold(curthread->td_ucred);	/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 	offset = IDX_TO_OFF(pages[0]->pindex);
 
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 
 	NFSLOCKNODE(np);
 	if (newnfs_directio_enable && !newnfs_directio_allow_mmap &&
 	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
 		NFSUNLOCKNODE(np);
 		printf("ncl_putpages: called on noncache-able vnode\n");
 		NFSLOCKNODE(np);
 	}
 	/*
 	 * When putting pages, do not extend file past EOF.
 	 */
 	if (offset + count > np->n_size) {
 		count = np->n_size - offset;
 		if (count < 0)
 			count = 0;
 	}
 	NFSUNLOCKNODE(np);
 
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_ERROR;
 
 	VM_CNT_INC(v_vnodeout);
 	VM_CNT_ADD(v_vnodepgsout, count);
 
 	iov.iov_base = unmapped_buf;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = offset;
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_NOCOPY;
 	uio.uio_rw = UIO_WRITE;
 	uio.uio_td = td;
 
 	error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync),
 	    cred);
 	crfree(cred);
 
 	if (error == 0 || !nfs_keep_dirty_on_error) {
 		vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid,
 		    np->n_size - offset, npages * PAGE_SIZE);
 	}
 	return (rtvals[0]);
 }
 
 /*
  * For nfs, cache consistency can only be maintained approximately.
  * Although RFC1094 does not specify the criteria, the following is
  * believed to be compatible with the reference port.
  * For nfs:
  * If the file's modify time on the server has changed since the
  * last read rpc or you have written to the file,
  * you may have lost data cache consistency with the
  * server, so flush all of the file's data out of the cache.
  * Then force a getattr rpc to ensure that you have up to date
  * attributes.
  * NB: This implies that cache data can be read when up to
  * NFS_ATTRTIMEO seconds out of date. If you find that you need current
  * attributes this could be forced by setting n_attrstamp to 0 before
  * the VOP_GETATTR() call.
  */
 static inline int
 nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
 {
 	int error = 0;
 	struct vattr vattr;
 	struct nfsnode *np = VTONFS(vp);
 	bool old_lock;
 
 	/*
 	 * Ensure the exclusove access to the node before checking
 	 * whether the cache is consistent.
 	 */
 	old_lock = ncl_excl_start(vp);
 	NFSLOCKNODE(np);
 	if (np->n_flag & NMODIFIED) {
 		NFSUNLOCKNODE(np);
 		if (vp->v_type != VREG) {
 			if (vp->v_type != VDIR)
 				panic("nfs: bioread, not dir");
 			ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
 			if (error != 0)
 				goto out;
 		}
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			goto out;
 		NFSLOCKNODE(np);
 		np->n_mtime = vattr.va_mtime;
 		NFSUNLOCKNODE(np);
 	} else {
 		NFSUNLOCKNODE(np);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			goto out;
 		NFSLOCKNODE(np);
 		if ((np->n_flag & NSIZECHANGED)
 		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
 			NFSUNLOCKNODE(np);
 			if (vp->v_type == VDIR)
 				ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
 			if (error != 0)
 				goto out;
 			NFSLOCKNODE(np);
 			np->n_mtime = vattr.va_mtime;
 			np->n_flag &= ~NSIZECHANGED;
 		}
 		NFSUNLOCKNODE(np);
 	}
 out:
 	ncl_excl_finish(vp, old_lock);
 	return (error);
 }
 
 /*
  * Vnode op for read using bio
  */
 int
 ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
 {
 	struct nfsnode *np = VTONFS(vp);
-	int biosize, i;
 	struct buf *bp, *rabp;
 	struct thread *td;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn, rabn;
-	int bcount;
-	int seqcount;
-	int nra, error = 0, n = 0, on = 0;
+	int biosize, bcount, error, i, n, nra, on, save2, seqcount;
 	off_t tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode"));
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
 		return (EINVAL);
 	td = uio->uio_td;
 
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 		mtx_lock(&nmp->nm_mtx);
 	}
 	if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0)
 		(void) newnfs_iosize(nmp);
 
 	tmp_off = uio->uio_offset + uio->uio_resid;
 	if (vp->v_type != VDIR &&
 	    (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EFBIG);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
 		/* No caching/ no readaheads. Just read data into the user buffer */
 		return ncl_readrpc(vp, uio, cred);
 
+	n = 0;
+	on = 0;
 	biosize = vp->v_bufobj.bo_bsize;
 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 
 	error = nfs_bioread_check_cons(vp, td, cred);
 	if (error)
 		return error;
 
+	save2 = curthread_pflags2_set(TDP2_SBPAGES);
 	do {
 	    u_quad_t nsize;
 
 	    NFSLOCKNODE(np);
 	    nsize = np->n_size;
 	    NFSUNLOCKNODE(np);
 
 	    switch (vp->v_type) {
 	    case VREG:
 		NFSINCRGLOBAL(nfsstatsv1.biocache_reads);
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 
 		/*
 		 * Start the read ahead(s), as required.
 		 */
 		if (nmp->nm_readahead > 0) {
 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
 			rabn = lbn + 1 + nra;
 			if (incore(&vp->v_bufobj, rabn) == NULL) {
 			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
 			    if (!rabp) {
 				error = newnfs_sigintr(nmp, td);
-				return (error ? error : EINTR);
+				if (error == 0)
+					error = EINTR;
+				goto out;
 			    }
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				if (ncl_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				    break;
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		    }
 		}
 
 		/* Note that bcount is *not* DEV_BSIZE aligned. */
 		bcount = biosize;
 		if ((off_t)lbn * biosize >= nsize) {
 			bcount = 0;
 		} else if ((off_t)(lbn + 1) * biosize > nsize) {
 			bcount = nsize - (off_t)lbn * biosize;
 		}
 		bp = nfs_getcacheblk(vp, lbn, bcount, td);
 
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
-			return (error ? error : EINTR);
+			if (error == 0)
+				error = EINTR;
+			goto out;
 		}
 
 		/*
 		 * If B_CACHE is not set, we must issue the read.  If this
 		 * fails, we return an error.
 		 */
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			brelse(bp);
-			return (error);
+			goto out;
 		    }
 		}
 
 		/*
 		 * on is the offset into the current bp.  Figure out how many
 		 * bytes we can copy out of the bp.  Note that bcount is
 		 * NOT DEV_BSIZE aligned.
 		 *
 		 * Then figure out how many bytes we can copy into the uio.
 		 */
 
 		n = 0;
 		if (on < bcount)
 			n = MIN((unsigned)(bcount - on), uio->uio_resid);
 		break;
 	    case VLNK:
 		NFSINCRGLOBAL(nfsstatsv1.biocache_readlinks);
 		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
-			return (error ? error : EINTR);
+			if (error == 0)
+				error = EINTR;
+			goto out;
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
-			return (error);
+			goto out;
 		    }
 		}
 		n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
 		on = 0;
 		break;
 	    case VDIR:
 		NFSINCRGLOBAL(nfsstatsv1.biocache_readdirs);
 		if (np->n_direofoffset
 		    && uio->uio_offset >= np->n_direofoffset) {
-		    return (0);
+			error = 0;
+			goto out;
 		}
 		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
 		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
 		if (!bp) {
-		    error = newnfs_sigintr(nmp, td);
-		    return (error ? error : EINTR);
+			error = newnfs_sigintr(nmp, td);
+			if (error == 0)
+				error = EINTR;
+			goto out;
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 		    bp->b_iocmd = BIO_READ;
 		    vfs_busy_pages(bp, 0);
 		    error = ncl_doio(vp, bp, cred, td, 0);
 		    if (error) {
 			    brelse(bp);
 		    }
 		    while (error == NFSERR_BAD_COOKIE) {
 			ncl_invaldir(vp);
 			error = ncl_vinvalbuf(vp, 0, td, 1);
 
 			/*
 			 * Yuck! The directory has been modified on the
 			 * server. The only way to get the block is by
 			 * reading from the beginning to get all the
 			 * offset cookies.
 			 *
 			 * Leave the last bp intact unless there is an error.
 			 * Loop back up to the while if the error is another
 			 * NFSERR_BAD_COOKIE (double yuch!).
 			 */
 			for (i = 0; i <= lbn && !error; i++) {
 			    if (np->n_direofoffset
-				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
-				    return (0);
+				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
+				    error = 0;
+				    goto out;
+			    }
 			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
 			    if (!bp) {
 				error = newnfs_sigintr(nmp, td);
-				return (error ? error : EINTR);
+				if (error == 0)
+					error = EINTR;
+				goto out;
 			    }
 			    if ((bp->b_flags & B_CACHE) == 0) {
 				    bp->b_iocmd = BIO_READ;
 				    vfs_busy_pages(bp, 0);
 				    error = ncl_doio(vp, bp, cred, td, 0);
 				    /*
 				     * no error + B_INVAL == directory EOF,
 				     * use the block.
 				     */
 				    if (error == 0 && (bp->b_flags & B_INVAL))
 					    break;
 			    }
 			    /*
 			     * An error will throw away the block and the
 			     * for loop will break out.  If no error and this
 			     * is not the block we want, we throw away the
 			     * block and go for the next one via the for loop.
 			     */
 			    if (error || i < lbn)
 				    brelse(bp);
 			}
 		    }
 		    /*
 		     * The above while is repeated if we hit another cookie
 		     * error.  If we hit an error and it wasn't a cookie error,
 		     * we give up.
 		     */
 		    if (error)
-			    return (error);
+			    goto out;
 		}
 
 		/*
 		 * If not eof and read aheads are enabled, start one.
 		 * (You need the current block first, so that you have the
 		 *  directory offset cookie of the next block.)
 		 */
 		if (nmp->nm_readahead > 0 &&
 		    (bp->b_flags & B_INVAL) == 0 &&
 		    (np->n_direofoffset == 0 ||
 		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
 			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
 			if (rabp) {
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 				rabp->b_flags |= B_ASYNC;
 				rabp->b_iocmd = BIO_READ;
 				vfs_busy_pages(rabp, 0);
 				if (ncl_asyncio(nmp, rabp, cred, td)) {
 				    rabp->b_flags |= B_INVAL;
 				    rabp->b_ioflags |= BIO_ERROR;
 				    vfs_unbusy_pages(rabp);
 				    brelse(rabp);
 				}
 			    } else {
 				brelse(rabp);
 			    }
 			}
 		}
 		/*
 		 * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
 		 * chopped for the EOF condition, we cannot tell how large
 		 * NFS directories are going to be until we hit EOF.  So
 		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
 		 * it just so happens that b_resid will effectively chop it
 		 * to EOF.  *BUT* this information is lost if the buffer goes
 		 * away and is reconstituted into a B_CACHE state ( due to
 		 * being VMIO ) later.  So we keep track of the directory eof
 		 * in np->n_direofoffset and chop it off as an extra step
 		 * right here.
 		 */
 		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
 		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
 			n = np->n_direofoffset - uio->uio_offset;
 		break;
 	    default:
 		printf(" ncl_bioread: type %x unexpected\n", vp->v_type);
 		bp = NULL;
 		break;
 	    }
 
 	    if (n > 0) {
 		    error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
 	    }
 	    if (vp->v_type == VLNK)
 		n = 0;
 	    if (bp != NULL)
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
+out:
+	curthread_pflags2_restore(save2);
+	if ((curthread->td_pflags2 & TDP2_SBPAGES) == 0) {
+		NFSLOCKNODE(np);
+		ncl_pager_setsize(vp, NULL);
+	}
 	return (error);
 }
 
 /*
  * The NFS write path cannot handle iovecs with len > 1. So we need to
  * break up iovecs accordingly (restricting them to wsize).
  * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
  * For the ASYNC case, 2 copies are needed. The first a copy from the
  * user buffer to a staging buffer and then a second copy from the staging
  * buffer to mbufs. This can be optimized by copying from the user buffer
  * directly into mbufs and passing the chain down, but that requires a
  * fair amount of re-working of the relevant codepaths (and can be done
  * later).
  */
 static int
 nfs_directio_write(vp, uiop, cred, ioflag)
 	struct vnode *vp;
 	struct uio *uiop;
 	struct ucred *cred;
 	int ioflag;
 {
 	int error;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct thread *td = uiop->uio_td;
 	int size;
 	int wsize;
 
 	mtx_lock(&nmp->nm_mtx);
 	wsize = nmp->nm_wsize;
 	mtx_unlock(&nmp->nm_mtx);
 	if (ioflag & IO_SYNC) {
 		int iomode, must_commit;
 		struct uio uio;
 		struct iovec iov;
 do_sync:
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			iov.iov_base = uiop->uio_iov->iov_base;
 			iov.iov_len = size;
 			uio.uio_iov = &iov;
 			uio.uio_iovcnt = 1;
 			uio.uio_offset = uiop->uio_offset;
 			uio.uio_resid = size;
 			uio.uio_segflg = UIO_USERSPACE;
 			uio.uio_rw = UIO_WRITE;
 			uio.uio_td = td;
 			iomode = NFSWRITE_FILESYNC;
 			error = ncl_writerpc(vp, &uio, cred, &iomode,
 			    &must_commit, 0);
 			KASSERT((must_commit == 0),
 				("ncl_directio_write: Did not commit write"));
 			if (error)
 				return (error);
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base =
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	} else {
 		struct uio *t_uio;
 		struct iovec *t_iov;
 		struct buf *bp;
 
 		/*
 		 * Break up the write into blocksize chunks and hand these
 		 * over to nfsiod's for write back.
 		 * Unfortunately, this incurs a copy of the data. Since
 		 * the user could modify the buffer before the write is
 		 * initiated.
 		 *
 		 * The obvious optimization here is that one of the 2 copies
 		 * in the async write path can be eliminated by copying the
 		 * data here directly into mbufs and passing the mbuf chain
 		 * down. But that will require a fair amount of re-working
 		 * of the code and can be done if there's enough interest
 		 * in NFS directio access.
 		 */
 		while (uiop->uio_resid > 0) {
 			size = MIN(uiop->uio_resid, wsize);
 			size = MIN(uiop->uio_iov->iov_len, size);
 			bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);
 			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
 			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
 			t_iov->iov_len = size;
 			t_uio->uio_iov = t_iov;
 			t_uio->uio_iovcnt = 1;
 			t_uio->uio_offset = uiop->uio_offset;
 			t_uio->uio_resid = size;
 			t_uio->uio_segflg = UIO_SYSSPACE;
 			t_uio->uio_rw = UIO_WRITE;
 			t_uio->uio_td = td;
 			KASSERT(uiop->uio_segflg == UIO_USERSPACE ||
 			    uiop->uio_segflg == UIO_SYSSPACE,
 			    ("nfs_directio_write: Bad uio_segflg"));
 			if (uiop->uio_segflg == UIO_USERSPACE) {
 				error = copyin(uiop->uio_iov->iov_base,
 				    t_iov->iov_base, size);
 				if (error != 0)
 					goto err_free;
 			} else
 				/*
 				 * UIO_SYSSPACE may never happen, but handle
 				 * it just in case it does.
 				 */
 				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base,
 				    size);
 			bp->b_flags |= B_DIRECT;
 			bp->b_iocmd = BIO_WRITE;
 			if (cred != NOCRED) {
 				crhold(cred);
 				bp->b_wcred = cred;
 			} else
 				bp->b_wcred = NOCRED;
 			bp->b_caller1 = (void *)t_uio;
 			bp->b_vp = vp;
 			error = ncl_asyncio(nmp, bp, NOCRED, td);
 err_free:
 			if (error) {
 				free(t_iov->iov_base, M_NFSDIRECTIO);
 				free(t_iov, M_NFSDIRECTIO);
 				free(t_uio, M_NFSDIRECTIO);
 				bp->b_vp = NULL;
 				uma_zfree(ncl_pbuf_zone, bp);
 				if (error == EINTR)
 					return (error);
 				goto do_sync;
 			}
 			uiop->uio_offset += size;
 			uiop->uio_resid -= size;
 			if (uiop->uio_iov->iov_len <= size) {
 				uiop->uio_iovcnt--;
 				uiop->uio_iov++;
 			} else {
 				uiop->uio_iov->iov_base =
 					(char *)uiop->uio_iov->iov_base + size;
 				uiop->uio_iov->iov_len -= size;
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Vnode op for write using bio
  */
 int
 ncl_write(struct vop_write_args *ap)
 {
 	int biosize;
 	struct uio *uio = ap->a_uio;
 	struct thread *td = uio->uio_td;
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *cred = ap->a_cred;
 	int ioflag = ap->a_ioflag;
 	struct buf *bp;
 	struct vattr vattr;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
 	int bcount, noncontig_write, obcount;
 	int bp_cached, n, on, error = 0, error1, wouldcommit;
 	size_t orig_resid, local_resid;
 	off_t orig_size, tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
 	    ("ncl_write proc"));
 	if (vp->v_type != VREG)
 		return (EIO);
 	NFSLOCKNODE(np);
 	if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		NFSUNLOCKNODE(np);
 		return (np->n_error);
 	} else
 		NFSUNLOCKNODE(np);
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 		mtx_lock(&nmp->nm_mtx);
 	}
 	if (nmp->nm_wsize == 0)
 		(void) newnfs_iosize(nmp);
 	mtx_unlock(&nmp->nm_mtx);
 
 	/*
 	 * Synchronously flush pending buffers if we are in synchronous
 	 * mode or if we are appending.
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		NFSLOCKNODE(np);
 		if (np->n_flag & NMODIFIED) {
 			NFSUNLOCKNODE(np);
 #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
 			/*
 			 * Require non-blocking, synchronous writes to
 			 * dirty files to inform the program it needs
 			 * to fsync(2) explicitly.
 			 */
 			if (ioflag & IO_NDELAY)
 				return (EAGAIN);
 #endif
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
 			    IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
 			if (error != 0)
 				return (error);
 		} else
 			NFSUNLOCKNODE(np);
 	}
 
 	orig_resid = uio->uio_resid;
 	NFSLOCKNODE(np);
 	orig_size = np->n_size;
 	NFSUNLOCKNODE(np);
 
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
 	 */
 	if (ioflag & IO_APPEND) {
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		if (error)
 			return (error);
 		NFSLOCKNODE(np);
 		uio->uio_offset = np->n_size;
 		NFSUNLOCKNODE(np);
 	}
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	tmp_off = uio->uio_offset + uio->uio_resid;
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)
 		return (EFBIG);
 	if (uio->uio_resid == 0)
 		return (0);
 
 	if (newnfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
 		return nfs_directio_write(vp, uio, cred, ioflag);
 
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, i don't think it matters
 	 */
 	if (vn_rlimit_fsize(vp, uio, td))
 		return (EFBIG);
 
 	biosize = vp->v_bufobj.bo_bsize;
 	/*
 	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
 	 * would exceed the local maximum per-file write commit size when
 	 * combined with those, we must decide whether to flush,
 	 * go synchronous, or return error.  We don't bother checking
 	 * IO_UNIT -- we just make all writes atomic anyway, as there's
 	 * no point optimizing for something that really won't ever happen.
 	 */
 	wouldcommit = 0;
 	if (!(ioflag & IO_SYNC)) {
 		int nflag;
 
 		NFSLOCKNODE(np);
 		nflag = np->n_flag;
 		NFSUNLOCKNODE(np);
 		if (nflag & NMODIFIED) {
 			BO_LOCK(&vp->v_bufobj);
 			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
 				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
 				    b_bobufs) {
 					if (bp->b_flags & B_NEEDCOMMIT)
 						wouldcommit += bp->b_bcount;
 				}
 			}
 			BO_UNLOCK(&vp->v_bufobj);
 		}
 	}
 
 	do {
 		if (!(ioflag & IO_SYNC)) {
 			wouldcommit += biosize;
 			if (wouldcommit > nmp->nm_wcommitsize) {
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
 				    IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
 				if (error != 0)
 					return (error);
 				wouldcommit = biosize;
 			}
 		}
 
 		NFSINCRGLOBAL(nfsstatsv1.biocache_writes);
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset - (lbn * biosize);
 		n = MIN((unsigned)(biosize - on), uio->uio_resid);
 again:
 		/*
 		 * Handle direct append and file extension cases, calculate
 		 * unaligned buffer size.
 		 */
 		NFSLOCKNODE(np);
 		if ((np->n_flag & NHASBEENLOCKED) == 0 &&
 		    (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0)
 			noncontig_write = 1;
 		else
 			noncontig_write = 0;
 		if ((uio->uio_offset == np->n_size ||
 		    (noncontig_write != 0 &&
 		    lbn == (np->n_size / biosize) &&
 		    uio->uio_offset + n > np->n_size)) && n) {
 			NFSUNLOCKNODE(np);
 			/*
 			 * Get the buffer (in its pre-append state to maintain
 			 * B_CACHE if it was previously set).  Resize the
 			 * nfsnode after we have locked the buffer to prevent
 			 * readers from reading garbage.
 			 */
 			obcount = np->n_size - (lbn * biosize);
 			bp = nfs_getcacheblk(vp, lbn, obcount, td);
 
 			if (bp != NULL) {
 				long save;
 
 				NFSLOCKNODE(np);
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 				NFSUNLOCKNODE(np);
 
 				save = bp->b_flags & B_CACHE;
 				bcount = on + n;
 				allocbuf(bp, bcount);
 				bp->b_flags |= save;
 				if (noncontig_write != 0 && on > obcount)
 					vfs_bio_bzero_buf(bp, obcount, on -
 					    obcount);
 			}
 		} else {
 			/*
 			 * Obtain the locked cache block first, and then
 			 * adjust the file's size as appropriate.
 			 */
 			bcount = on + n;
 			if ((off_t)lbn * biosize + bcount < np->n_size) {
 				if ((off_t)(lbn + 1) * biosize < np->n_size)
 					bcount = biosize;
 				else
 					bcount = np->n_size - (off_t)lbn * biosize;
 			}
 			NFSUNLOCKNODE(np);
 			bp = nfs_getcacheblk(vp, lbn, bcount, td);
 			NFSLOCKNODE(np);
 			if (uio->uio_offset + n > np->n_size) {
 				np->n_size = uio->uio_offset + n;
 				np->n_flag |= NMODIFIED;
 				vnode_pager_setsize(vp, np->n_size);
 			}
 			NFSUNLOCKNODE(np);
 		}
 
 		if (!bp) {
 			error = newnfs_sigintr(nmp, td);
 			if (!error)
 				error = EINTR;
 			break;
 		}
 
 		/*
 		 * Issue a READ if B_CACHE is not set.  In special-append
 		 * mode, B_CACHE is based on the buffer prior to the write
 		 * op and is typically set, avoiding the read.  If a read
 		 * is required in special append mode, the server will
 		 * probably send us a short-read since we extended the file
 		 * on our end, resulting in b_resid == 0 and, thusly,
 		 * B_CACHE getting set.
 		 *
 		 * We can also avoid issuing the read if the write covers
 		 * the entire buffer.  We have to make sure the buffer state
 		 * is reasonable in this case since we will not be initiating
 		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
 		 * more information.
 		 *
 		 * B_CACHE may also be set due to the buffer being cached
 		 * normally.
 		 */
 
 		bp_cached = 1;
 		if (on == 0 && n == bcount) {
 			if ((bp->b_flags & B_CACHE) == 0)
 				bp_cached = 0;
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
 		}
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 			bp->b_iocmd = BIO_READ;
 			vfs_busy_pages(bp, 0);
 			error = ncl_doio(vp, bp, cred, td, 0);
 			if (error) {
 				brelse(bp);
 				break;
 			}
 		}
 		if (bp->b_wcred == NOCRED)
 			bp->b_wcred = crhold(cred);
 		NFSLOCKNODE(np);
 		np->n_flag |= NMODIFIED;
 		NFSUNLOCKNODE(np);
 
 		/*
 		 * If dirtyend exceeds file size, chop it down.  This should
 		 * not normally occur but there is an append race where it
 		 * might occur XXX, so we log it.
 		 *
 		 * If the chopping creates a reverse-indexed or degenerate
 		 * situation with dirtyoff/end, we 0 both of them.
 		 */
 
 		if (bp->b_dirtyend > bcount) {
 			printf("NFS append race @%lx:%d\n",
 			    (long)bp->b_blkno * DEV_BSIZE,
 			    bp->b_dirtyend - bcount);
 			bp->b_dirtyend = bcount;
 		}
 
 		if (bp->b_dirtyoff >= bp->b_dirtyend)
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 
 		/*
 		 * If the new write will leave a contiguous dirty
 		 * area, just update the b_dirtyoff and b_dirtyend,
 		 * otherwise force a write rpc of the old dirty area.
 		 *
 		 * If there has been a file lock applied to this file
 		 * or vfs.nfs.old_noncontig_writing is set, do the following:
 		 * While it is possible to merge discontiguous writes due to
 		 * our having a B_CACHE buffer ( and thus valid read data
 		 * for the hole), we don't because it could lead to
 		 * significant cache coherency problems with multiple clients,
 		 * especially if locking is implemented later on.
 		 *
 		 * If vfs.nfs.old_noncontig_writing is not set and there has
 		 * not been file locking done on this file:
 		 * Relax coherency a bit for the sake of performance and
 		 * expand the current dirty region to contain the new
 		 * write even if it means we mark some non-dirty data as
 		 * dirty.
 		 */
 
 		if (noncontig_write == 0 && bp->b_dirtyend > 0 &&
 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 			if (bwrite(bp) == EINTR) {
 				error = EINTR;
 				break;
 			}
 			goto again;
 		}
 
 		local_resid = uio->uio_resid;
 		error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
 
 		if (error != 0 && !bp_cached) {
 			/*
 			 * This block has no other content then what
 			 * possibly was written by the faulty uiomove.
 			 * Release it, forgetting the data pages, to
 			 * prevent the leak of uninitialized data to
 			 * usermode.
 			 */
 			bp->b_ioflags |= BIO_ERROR;
 			brelse(bp);
 			uio->uio_offset -= local_resid - uio->uio_resid;
 			uio->uio_resid = local_resid;
 			break;
 		}
 
 		/*
 		 * Since this block is being modified, it must be written
 		 * again and not just committed.  Since write clustering does
 		 * not work for the stage 1 data write, only the stage 2
 		 * commit rpc, we have to clear B_CLUSTEROK as well.
 		 */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
 		/*
 		 * Get the partial update on the progress made from
 		 * uiomove, if an error occurred.
 		 */
 		if (error != 0)
 			n = local_resid - uio->uio_resid;
 
 		/*
 		 * Only update dirtyoff/dirtyend if not a degenerate
 		 * condition.
 		 */
 		if (n > 0) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
 			} else {
 				bp->b_dirtyoff = on;
 				bp->b_dirtyend = on + n;
 			}
 			vfs_bio_set_valid(bp, on, n);
 		}
 
 		/*
 		 * If IO_SYNC do bwrite().
 		 *
 		 * IO_INVAL appears to be unused.  The idea appears to be
 		 * to turn off caching in this case.  Very odd.  XXX
 		 */
 		if ((ioflag & IO_SYNC)) {
 			if (ioflag & IO_INVAL)
 				bp->b_flags |= B_NOCACHE;
 			error1 = bwrite(bp);
 			if (error1 != 0) {
 				if (error == 0)
 					error = error1;
 				break;
 			}
 		} else if ((n + on) == biosize || (ioflag & IO_ASYNC) != 0) {
 			bp->b_flags |= B_ASYNC;
 			(void) ncl_writebp(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
 
 		if (error != 0)
 			break;
 	} while (uio->uio_resid > 0 && n > 0);
 
 	if (error != 0) {
 		if (ioflag & IO_UNIT) {
 			VATTR_NULL(&vattr);
 			vattr.va_size = orig_size;
 			/* IO_SYNC is handled implicitely */
 			(void)VOP_SETATTR(vp, &vattr, cred);
 			uio->uio_offset -= orig_resid - uio->uio_resid;
 			uio->uio_resid = orig_resid;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Get an nfs cache block.
  *
  * Allocate a new one if the block isn't currently in the cache
  * and return the block marked busy. If the calling process is
  * interrupted by a signal for an interruptible mount point, return
  * NULL.
  *
  * The caller must carefully deal with the possible B_INVAL state of
  * the buffer.  ncl_doio() clears B_INVAL (and ncl_asyncio() clears it
  * indirectly), so synchronous reads can be issued without worrying about
  * the B_INVAL state.  We have to be a little more careful when dealing
  * with writes (see comments in nfs_write()) when extending a file past
  * its EOF.
  */
 static struct buf *
 nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
 {
 	struct buf *bp;
 	struct mount *mp;
 	struct nfsmount *nmp;
 
 	mp = vp->v_mount;
 	nmp = VFSTONFS(mp);
 
 	if (nmp->nm_flag & NFSMNT_INT) {
 		sigset_t oldset;
 
 		newnfs_set_sigmask(td, &oldset);
 		bp = getblk(vp, bn, size, PCATCH, 0, 0);
 		newnfs_restore_sigmask(td, &oldset);
 		while (bp == NULL) {
 			if (newnfs_sigintr(nmp, td))
 				return (NULL);
 			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
 		}
 	} else {
 		bp = getblk(vp, bn, size, 0, 0, 0);
 	}
 
 	if (vp->v_type == VREG)
 		bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
 	return (bp);
 }
 
 /*
  * Flush and invalidate all dirty buffers. If another process is already
  * doing the flush, just wait for completion.
  */
 int
 ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slpflag, slptimeo;
 	bool old_lock;
 
 	ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf");
 
 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
 		intrflg = 0;
 	if (NFSCL_FORCEDISM(nmp->nm_mountp))
 		intrflg = 1;
 	if (intrflg) {
 		slpflag = PCATCH;
 		slptimeo = 2 * hz;
 	} else {
 		slpflag = 0;
 		slptimeo = 0;
 	}
 
 	old_lock = ncl_excl_start(vp);
 	if (old_lock)
 		flags |= V_ALLOWCLEAN;
 
 	/*
 	 * Now, flush as required.
 	 */
 	if ((flags & (V_SAVE | V_VMIO)) == V_SAVE &&
 	     vp->v_bufobj.bo_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
 		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
 		/*
 		 * If the page clean was interrupted, fail the invalidation.
 		 * Not doing so, we run the risk of losing dirty pages in the
 		 * vinvalbuf() call below.
 		 */
 		if (intrflg && (error = newnfs_sigintr(nmp, td)))
 			goto out;
 	}
 
 	error = vinvalbuf(vp, flags, slpflag, 0);
 	while (error) {
 		if (intrflg && (error = newnfs_sigintr(nmp, td)))
 			goto out;
 		error = vinvalbuf(vp, flags, 0, slptimeo);
 	}
 	if (NFSHASPNFS(nmp)) {
 		nfscl_layoutcommit(vp, td);
 		/*
 		 * Invalidate the attribute cache, since writes to a DS
 		 * won't update the size attribute.
 		 */
 		NFSLOCKNODE(np);
 		np->n_attrstamp = 0;
 	} else
 		NFSLOCKNODE(np);
 	if (np->n_directio_asyncwr == 0)
 		np->n_flag &= ~NMODIFIED;
 	NFSUNLOCKNODE(np);
 out:
 	ncl_excl_finish(vp, old_lock);
 	return error;
 }
 
 /*
  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
  * This is mainly to avoid queueing async I/O requests when the nfsiods
  * are all hung on a dead server.
  *
  * Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
  * is eventually dequeued by the async daemon, ncl_doio() *will*.
  */
 int
 ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
 {
 	int iod;
 	int gotiod;
 	int slpflag = 0;
 	int slptimeo = 0;
 	int error, error2;
 
 	/*
 	 * Commits are usually short and sweet so lets save some cpu and
 	 * leave the async daemons for more important rpc's (such as reads
 	 * and writes).
 	 *
 	 * Readdirplus RPCs do vget()s to acquire the vnodes for entries
 	 * in the directory in order to update attributes. This can deadlock
 	 * with another thread that is waiting for async I/O to be done by
 	 * an nfsiod thread while holding a lock on one of these vnodes.
 	 * To avoid this deadlock, don't allow the async nfsiod threads to
 	 * perform Readdirplus RPCs.
 	 */
 	NFSLOCKIOD();
 	if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
 	     (nmp->nm_bufqiods > ncl_numasync / 2)) ||
 	    (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
 		NFSUNLOCKIOD();
 		return(EIO);
 	}
 again:
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	gotiod = FALSE;
 
 	/*
 	 * Find a free iod to process this request.
 	 */
 	for (iod = 0; iod < ncl_numasync; iod++)
 		if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) {
 			gotiod = TRUE;
 			break;
 		}
 
 	/*
 	 * Try to create one if none are free.
 	 */
 	if (!gotiod)
 		ncl_nfsiodnew();
 	else {
 		/*
 		 * Found one, so wake it up and tell it which
 		 * mount to process.
 		 */
 		NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n",
 		    iod, nmp));
 		ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
 		ncl_iodmount[iod] = nmp;
 		nmp->nm_bufqiods++;
 		wakeup(&ncl_iodwant[iod]);
 	}
 
 	/*
 	 * If none are free, we may already have an iod working on this mount
 	 * point.  If so, it will process our request.
 	 */
 	if (!gotiod) {
 		if (nmp->nm_bufqiods > 0) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: %d iods are already processing mount %p\n",
 				 nmp->nm_bufqiods, nmp));
 			gotiod = TRUE;
 		}
 	}
 
 	/*
 	 * If we have an iod which can process the request, then queue
 	 * the buffer.
 	 */
 	if (gotiod) {
 		/*
 		 * Ensure that the queue never grows too large.  We still want
 		 * to asynchronize so we block rather then return EIO.
 		 */
 		while (nmp->nm_bufqlen >= 2*ncl_numasync) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: waiting for mount %p queue to drain\n", nmp));
 			nmp->nm_bufqwant = TRUE;
 			error = newnfs_msleep(td, &nmp->nm_bufq,
 			    &ncl_iod_mutex, slpflag | PRIBIO, "nfsaio",
 			   slptimeo);
 			if (error) {
 				error2 = newnfs_sigintr(nmp, td);
 				if (error2) {
 					NFSUNLOCKIOD();
 					return (error2);
 				}
 				if (slpflag == PCATCH) {
 					slpflag = 0;
 					slptimeo = 2 * hz;
 				}
 			}
 			/*
 			 * We might have lost our iod while sleeping,
 			 * so check and loop if necessary.
 			 */
 			goto again;
 		}
 
 		/* We might have lost our nfsiod */
 		if (nmp->nm_bufqiods == 0) {
 			NFS_DPF(ASYNCIO,
 				("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
 			goto again;
 		}
 
 		if (bp->b_iocmd == BIO_READ) {
 			if (bp->b_rcred == NOCRED && cred != NOCRED)
 				bp->b_rcred = crhold(cred);
 		} else {
 			if (bp->b_wcred == NOCRED && cred != NOCRED)
 				bp->b_wcred = crhold(cred);
 		}
 
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		BUF_KERNPROC(bp);
 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen++;
 		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
 			NFSLOCKNODE(VTONFS(bp->b_vp));
 			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
 			VTONFS(bp->b_vp)->n_directio_asyncwr++;
 			NFSUNLOCKNODE(VTONFS(bp->b_vp));
 		}
 		NFSUNLOCKIOD();
 		return (0);
 	}
 
 	NFSUNLOCKIOD();
 
 	/*
 	 * All the iods are busy on other mounts, so return EIO to
 	 * force the caller to process the i/o synchronously.
 	 */
 	NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n"));
 	return (EIO);
 }
 
 void
 ncl_doio_directwrite(struct buf *bp)
 {
 	int iomode, must_commit;
 	struct uio *uiop = (struct uio *)bp->b_caller1;
 	char *iov_base = uiop->uio_iov->iov_base;
 
 	iomode = NFSWRITE_FILESYNC;
 	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
 	ncl_writerpc(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit, 0);
 	KASSERT((must_commit == 0), ("ncl_doio_directwrite: Did not commit write"));
 	free(iov_base, M_NFSDIRECTIO);
 	free(uiop->uio_iov, M_NFSDIRECTIO);
 	free(uiop, M_NFSDIRECTIO);
 	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
 		struct nfsnode *np = VTONFS(bp->b_vp);
 		NFSLOCKNODE(np);
 		if (NFSHASPNFS(VFSTONFS(vnode_mount(bp->b_vp)))) {
 			/*
 			 * Invalidate the attribute cache, since writes to a DS
 			 * won't update the size attribute.
 			 */
 			np->n_attrstamp = 0;
 		}
 		np->n_directio_asyncwr--;
 		if (np->n_directio_asyncwr == 0) {
 			np->n_flag &= ~NMODIFIED;
 			if ((np->n_flag & NFSYNCWAIT)) {
 				np->n_flag &= ~NFSYNCWAIT;
 				wakeup((caddr_t)&np->n_directio_asyncwr);
 			}
 		}
 		NFSUNLOCKNODE(np);
 	}
 	bp->b_vp = NULL;
 	uma_zfree(ncl_pbuf_zone, bp);
 }
 
 /*
  * Do an I/O operation to/from a cache block. This may be called
  * synchronously or from an nfsiod.
  */
 int
 ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td,
     int called_from_strategy)
 {
 	struct uio *uiop;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	int error = 0, iomode, must_commit = 0;
 	struct uio uio;
 	struct iovec io;
 	struct proc *p = td ? td->td_proc : NULL;
 	uint8_t	iocmd;
 
 	np = VTONFS(vp);
 	nmp = VFSTONFS(vp->v_mount);
 	uiop = &uio;
 	uiop->uio_iov = &io;
 	uiop->uio_iovcnt = 1;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = td;
 
 	/*
 	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
 	 * do this here so we do not have to do it in all the code that
 	 * calls us.
 	 */
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp));
 	iocmd = bp->b_iocmd;
 	if (iocmd == BIO_READ) {
 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
 	    io.iov_base = bp->b_data;
 	    uiop->uio_rw = UIO_READ;
 
 	    switch (vp->v_type) {
 	    case VREG:
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
 		NFSINCRGLOBAL(nfsstatsv1.read_bios);
 		error = ncl_readrpc(vp, uiop, cr);
 
 		if (!error) {
 		    if (uiop->uio_resid) {
 			/*
 			 * If we had a short read with no error, we must have
 			 * hit a file hole.  We should zero-fill the remainder.
 			 * This can also occur if the server hits the file EOF.
 			 *
 			 * Holes used to be able to occur due to pending
 			 * writes, but that is not possible any longer.
 			 */
 			int nread = bp->b_bcount - uiop->uio_resid;
 			ssize_t left = uiop->uio_resid;
 
 			if (left > 0)
 				bzero((char *)bp->b_data + nread, left);
 			uiop->uio_resid = 0;
 		    }
 		}
 		/* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */
 		if (p && vp->v_writecount <= -1) {
 			NFSLOCKNODE(np);
 			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) {
 				NFSUNLOCKNODE(np);
 				PROC_LOCK(p);
 				killproc(p, "text file modification");
 				PROC_UNLOCK(p);
 			} else
 				NFSUNLOCKNODE(np);
 		}
 		break;
 	    case VLNK:
 		uiop->uio_offset = (off_t)0;
 		NFSINCRGLOBAL(nfsstatsv1.readlink_bios);
 		error = ncl_readlinkrpc(vp, uiop, cr);
 		break;
 	    case VDIR:
 		NFSINCRGLOBAL(nfsstatsv1.readdir_bios);
 		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
 			error = ncl_readdirplusrpc(vp, uiop, cr, td);
 			if (error == NFSERR_NOTSUPP)
 				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 		}
 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
 			error = ncl_readdirrpc(vp, uiop, cr, td);
 		/*
 		 * end-of-directory sets B_INVAL but does not generate an
 		 * error.
 		 */
 		if (error == 0 && uiop->uio_resid == bp->b_bcount)
 			bp->b_flags |= B_INVAL;
 		break;
 	    default:
 		printf("ncl_doio:  type %x unexpected\n", vp->v_type);
 		break;
 	    }
 	    if (error) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = error;
 	    }
 	} else {
 	    /*
 	     * If we only need to commit, try to commit
 	     */
 	    if (bp->b_flags & B_NEEDCOMMIT) {
 		    int retv;
 		    off_t off;
 
 		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		    retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff,
 			bp->b_wcred, td);
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			    bp->b_resid = 0;
 			    bufdone(bp);
 			    return (0);
 		    }
 		    if (retv == NFSERR_STALEWRITEVERF) {
 			    ncl_clearcommit(vp->v_mount);
 		    }
 	    }
 
 	    /*
 	     * Setup for actual write
 	     */
 	    NFSLOCKNODE(np);
 	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
 		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
 	    NFSUNLOCKNODE(np);
 
 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
 		    - bp->b_dirtyoff;
 		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
 		    + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
 		NFSINCRGLOBAL(nfsstatsv1.write_bios);
 
 		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
 		    iomode = NFSWRITE_UNSTABLE;
 		else
 		    iomode = NFSWRITE_FILESYNC;
 
 		error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit,
 		    called_from_strategy);
 
 		/*
 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
 		 * to cluster the buffers needing commit.  This will allow
 		 * the system to submit a single commit rpc for the whole
 		 * cluster.  We can do this even if the buffer is not 100%
 		 * dirty (relative to the NFS blocksize), so we optimize the
 		 * append-to-file-case.
 		 *
 		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
 		 * cleared because write clustering only works for commit
 		 * rpc's, not for the data portion of the write).
 		 */
 
 		if (!error && iomode == NFSWRITE_UNSTABLE) {
 		    bp->b_flags |= B_NEEDCOMMIT;
 		    if (bp->b_dirtyoff == 0
 			&& bp->b_dirtyend == bp->b_bcount)
 			bp->b_flags |= B_CLUSTEROK;
 		} else {
 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
 
 		/*
 		 * For an interrupted write, the buffer is still valid
 		 * and the write hasn't been pushed to the server yet,
 		 * so we can't set BIO_ERROR and report the interruption
 		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
 		 * is not relevant, so the rpc attempt is essentially
 		 * a noop.  For the case of a V3 write rpc not being
 		 * committed to stable storage, the block is still
 		 * dirty and requires either a commit rpc or another
 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
 		 * the block is reused. This is indicated by setting
 		 * the B_DELWRI and B_NEEDCOMMIT flags.
 		 *
 		 * EIO is returned by ncl_writerpc() to indicate a recoverable
 		 * write error and is handled as above, except that
 		 * B_EINTR isn't set. One cause of this is a stale stateid
 		 * error for the RPC that indicates recovery is required,
 		 * when called with called_from_strategy != 0.
 		 *
 		 * If the buffer is marked B_PAGING, it does not reside on
 		 * the vp's paging queues so we cannot call bdirty().  The
 		 * bp in this case is not an NFS cache block so we should
 		 * be safe. XXX
 		 *
 		 * The logic below breaks up errors into recoverable and
 		 * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
 		 * and keep the buffer around for potential write retries.
 		 * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
 		 * and save the error in the nfsnode. This is less than ideal
 		 * but necessary. Keeping such buffers around could potentially
 		 * cause buffer exhaustion eventually (they can never be written
 		 * out, so will get constantly be re-dirtied). It also causes
 		 * all sorts of vfs panics. For non-recoverable write errors,
 		 * also invalidate the attrcache, so we'll be forced to go over
 		 * the wire for this object, returning an error to user on next
 		 * call (most of the time).
 		 */
 		if (error == EINTR || error == EIO || error == ETIMEDOUT
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			if ((bp->b_flags & B_PAGING) == 0) {
 			    bdirty(bp);
 			    bp->b_flags &= ~B_DONE;
 			}
 			if ((error == EINTR || error == ETIMEDOUT) &&
 			    (bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 		} else {
 		    if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			bp->b_flags |= B_INVAL;
 			bp->b_error = np->n_error = error;
 			NFSLOCKNODE(np);
 			np->n_flag |= NWRITEERR;
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			NFSUNLOCKNODE(np);
 		    }
 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
 		}
 	    } else {
 		bp->b_resid = 0;
 		bufdone(bp);
 		return (0);
 	    }
 	}
 	bp->b_resid = uiop->uio_resid;
 	if (must_commit)
 	    ncl_clearcommit(vp->v_mount);
 	bufdone(bp);
 	return (error);
 }
 
 /*
  * Used to aid in handling ftruncate() operations on the NFS client side.
  * Truncation creates a number of special problems for NFS.  We have to
  * throw away VM pages and buffer cache buffers that are beyond EOF, and
  * we have to properly handle VM pages or (potentially dirty) buffers
  * that straddle the truncation point.
  */
 
 int
 ncl_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
 {
 	struct nfsnode *np = VTONFS(vp);
 	u_quad_t tsize;
 	int biosize = vp->v_bufobj.bo_bsize;
 	int error = 0;
 
 	NFSLOCKNODE(np);
 	tsize = np->n_size;
 	np->n_size = nsize;
 	NFSUNLOCKNODE(np);
 
 	if (nsize < tsize) {
 		struct buf *bp;
 		daddr_t lbn;
 		int bufsize;
 
 		/*
 		 * vtruncbuf() doesn't get the buffer overlapping the
 		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
 		 * buffer that now needs to be truncated.
 		 */
 		error = vtruncbuf(vp, nsize, biosize);
 		lbn = nsize / biosize;
 		bufsize = nsize - (lbn * biosize);
 		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
 		if (!bp)
 			return EINTR;
 		if (bp->b_dirtyoff > bp->b_bcount)
 			bp->b_dirtyoff = bp->b_bcount;
 		if (bp->b_dirtyend > bp->b_bcount)
 			bp->b_dirtyend = bp->b_bcount;
 		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
 		brelse(bp);
 	} else {
 		vnode_pager_setsize(vp, nsize);
 	}
 	return(error);
 }
 
Index: head/sys/fs/nfsclient/nfs_clport.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clport.c	(revision 358251)
+++ head/sys/fs/nfsclient/nfs_clport.c	(revision 358252)
@@ -1,1415 +1,1416 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/capsicum.h>
 
 /*
  * generally, I don't like #includes inside .h files, but it seems to
  * be the easiest way to handle the port.
  */
 #include <sys/fail.h>
 #include <sys/hash.h>
 #include <sys/sysctl.h>
 #include <fs/nfs/nfsport.h>
 #include <netinet/in_fib.h>
 #include <netinet/if_ether.h>
 #include <netinet6/ip6_var.h>
 #include <net/if_types.h>
 
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #ifdef KDTRACE_HOOKS
 dtrace_nfsclient_attrcache_flush_probe_func_t
 		dtrace_nfscl_attrcache_flush_done_probe;
 uint32_t	nfscl_attrcache_flush_done_id;
 
 dtrace_nfsclient_attrcache_get_hit_probe_func_t
 		dtrace_nfscl_attrcache_get_hit_probe;
 uint32_t	nfscl_attrcache_get_hit_id;
 
 dtrace_nfsclient_attrcache_get_miss_probe_func_t
 		dtrace_nfscl_attrcache_get_miss_probe;
 uint32_t	nfscl_attrcache_get_miss_id;
 
 dtrace_nfsclient_attrcache_load_probe_func_t
 		dtrace_nfscl_attrcache_load_done_probe;
 uint32_t	nfscl_attrcache_load_done_id;
 #endif /* !KDTRACE_HOOKS */
 
 extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
 extern struct vop_vector newnfs_vnodeops;
 extern struct vop_vector newnfs_fifoops;
 extern uma_zone_t newnfsnode_zone;
 extern struct buf_ops buf_ops_newnfs;
 extern uma_zone_t ncl_pbuf_zone;
 extern short nfsv4_cbport;
 extern int nfscl_enablecallb;
 extern int nfs_numnfscbd;
 extern int nfscl_inited;
 struct mtx ncl_iod_mutex;
 NFSDLOCKMUTEX;
 extern struct mtx nfsrv_dslock_mtx;
 
 extern void (*ncl_call_invalcaches)(struct vnode *);
 
 SYSCTL_DECL(_vfs_nfs);
 static int ncl_fileid_maxwarnings = 10;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, fileid_maxwarnings, CTLFLAG_RWTUN,
     &ncl_fileid_maxwarnings, 0,
     "Limit fileid corruption warnings; 0 is off; -1 is unlimited");
 static volatile int ncl_fileid_nwarnings;
 
 static void nfscl_warn_fileid(struct nfsmount *, struct nfsvattr *,
     struct nfsvattr *);
 
 /*
  * Comparison function for vfs_hash functions.
  */
 int
 newnfs_vncmpf(struct vnode *vp, void *arg)
 {
 	struct nfsfh *nfhp = (struct nfsfh *)arg;
 	struct nfsnode *np = VTONFS(vp);
 
 	if (np->n_fhp->nfh_len != nfhp->nfh_len ||
 	    NFSBCMP(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len))
 		return (1);
 	return (0);
 }
 
 /*
  * Look up a vnode/nfsnode by file handle.
  * Callers must check for mount points!!
  * In all cases, a pointer to a
  * nfsnode structure is returned.
  * This variant takes a "struct nfsfh *" as second argument and uses
  * that structure up, either by hanging off the nfsnode or FREEing it.
  */
 int
 nfscl_nget(struct mount *mntp, struct vnode *dvp, struct nfsfh *nfhp,
     struct componentname *cnp, struct thread *td, struct nfsnode **npp,
     void *stuff, int lkflags)
 {
 	struct nfsnode *np, *dnp;
 	struct vnode *vp, *nvp;
 	struct nfsv4node *newd, *oldd;
 	int error;
 	u_int hash;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(mntp);
 	dnp = VTONFS(dvp);
 	*npp = NULL;
 
 	hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT);
 
 	error = vfs_hash_get(mntp, hash, lkflags,
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		/*
 		 * I believe there is a slight chance that vgonel() could
 		 * get called on this vnode between when NFSVOPLOCK() drops
 		 * the VI_LOCK() and vget() acquires it again, so that it
 		 * hasn't yet had v_usecount incremented. If this were to
 		 * happen, the VIRF_DOOMED flag would be set, so check for
 		 * that here. Since we now have the v_usecount incremented,
 		 * we should be ok until we vrele() it, if the VIRF_DOOMED
 		 * flag isn't set now.
 		 */
 		VI_LOCK(nvp);
 		if (VN_IS_DOOMED(nvp)) {
 			VI_UNLOCK(nvp);
 			vrele(nvp);
 			error = ENOENT;
 		} else {
 			VI_UNLOCK(nvp);
 		}
 	}
 	if (error) {
 		free(nfhp, M_NFSFH);
 		return (error);
 	}
 	if (nvp != NULL) {
 		np = VTONFS(nvp);
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 */
 		oldd = newd = NULL;
 		if ((nmp->nm_flag & NFSMNT_NFSV4) && np->n_v4 != NULL &&
 		    nvp->v_type == VREG &&
 		    (np->n_v4->n4_namelen != cnp->cn_namelen ||
 		     NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		     cnp->cn_namelen) ||
 		     dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 		     NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		     dnp->n_fhp->nfh_len))) {
 		    newd = malloc(
 			sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len +
 			+ cnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK);
 		    NFSLOCKNODE(np);
 		    if (newd != NULL && np->n_v4 != NULL && nvp->v_type == VREG
 			&& (np->n_v4->n4_namelen != cnp->cn_namelen ||
 			 NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			 cnp->cn_namelen) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 			oldd = np->n_v4;
 			np->n_v4 = newd;
 			newd = NULL;
 			np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			np->n_v4->n4_namelen = cnp->cn_namelen;
 			NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			    dnp->n_fhp->nfh_len);
 			NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			    cnp->cn_namelen);
 		    }
 		    NFSUNLOCKNODE(np);
 		}
 		if (newd != NULL)
 			free(newd, M_NFSV4NODE);
 		if (oldd != NULL)
 			free(oldd, M_NFSV4NODE);
 		*npp = np;
 		free(nfhp, M_NFSFH);
 		return (0);
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
 	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		free(nfhp, M_NFSFH);
 		return (error);
 	}
 	vp = nvp;
 	KASSERT(vp->v_bufobj.bo_bsize != 0, ("nfscl_nget: bo_bsize == 0"));
 	vp->v_bufobj.bo_ops = &buf_ops_newnfs;
 	vp->v_data = np;
 	np->n_vnode = vp;
 	/* 
 	 * Initialize the mutex even if the vnode is going to be a loser.
 	 * This simplifies the logic in reclaim, which can then unconditionally
 	 * destroy the mutex (in the case of the loser, or if hash_insert
 	 * happened to return an error no special casing is needed).
 	 */
 	mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK);
 	lockinit(&np->n_excl, PVFS, "nfsupg", VLKTIMEOUT, LK_NOSHARE |
 	    LK_CANRECURSE);
 
 	/* 
 	 * Are we getting the root? If so, make sure the vnode flags
 	 * are correct 
 	 */
 	if ((nfhp->nfh_len == nmp->nm_fhsize) &&
 	    !bcmp(nfhp->nfh_fh, nmp->nm_fh, nfhp->nfh_len)) {
 		if (vp->v_type == VNON)
 			vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
 
 	vp->v_vflag |= VV_VMSIZEVNLOCK;
 	
 	np->n_fhp = nfhp;
 	/*
 	 * For NFSv4, we have to attach the directory file handle and
 	 * file name, so that Open Ops can be done later.
 	 */
 	if (nmp->nm_flag & NFSMNT_NFSV4) {
 		np->n_v4 = malloc(sizeof (struct nfsv4node)
 		    + dnp->n_fhp->nfh_len + cnp->cn_namelen - 1, M_NFSV4NODE,
 		    M_WAITOK);
 		np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 		np->n_v4->n4_namelen = cnp->cn_namelen;
 		NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		    dnp->n_fhp->nfh_len);
 		NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		    cnp->cn_namelen);
 	} else {
 		np->n_v4 = NULL;
 	}
 
 	/*
 	 * NFS supports recursive and shared locking.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL);
 	VN_LOCK_AREC(vp);
 	VN_LOCK_ASHARE(vp);
 	error = insmntque(vp, mntp);
 	if (error != 0) {
 		*npp = NULL;
 		mtx_destroy(&np->n_mtx);
 		lockdestroy(&np->n_excl);
 		free(nfhp, M_NFSFH);
 		if (np->n_v4 != NULL)
 			free(np->n_v4, M_NFSV4NODE);
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
 	}
 	error = vfs_hash_insert(vp, hash, lkflags, 
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		/* vfs_hash_insert() vput()'s the losing vnode */
 		return (0);
 	}
 	*npp = np;
 
 	return (0);
 }
 
 /*
  * Another variant of nfs_nget(). This one is only used by reopen. It
  * takes almost the same args as nfs_nget(), but only succeeds if an entry
  * exists in the cache. (Since files should already be "open" with a
  * vnode ref cnt on the node when reopen calls this, it should always
  * succeed.)
  * Also, don't get a vnode lock, since it may already be locked by some
  * other process that is handling it. This is ok, since all other threads
  * on the client are blocked by the nfsc_lock being exclusively held by the
  * caller of this function.
  */
 int
 nfscl_ngetreopen(struct mount *mntp, u_int8_t *fhp, int fhsize,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *nvp;
 	u_int hash;
 	struct nfsfh *nfhp;
 	int error;
 
 	*npp = NULL;
 	/* For forced dismounts, just return error. */
 	if (NFSCL_FORCEDISM(mntp))
 		return (EINTR);
 	nfhp = malloc(sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
 	bcopy(fhp, &nfhp->nfh_fh[0], fhsize);
 	nfhp->nfh_len = fhsize;
 
 	hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT);
 
 	/*
 	 * First, try to get the vnode locked, but don't block for the lock.
 	 */
 	error = vfs_hash_get(mntp, hash, (LK_EXCLUSIVE | LK_NOWAIT), td, &nvp,
 	    newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		NFSVOPUNLOCK(nvp);
 	} else if (error == EBUSY) {
 		/*
 		 * It is safe so long as a vflush() with
 		 * FORCECLOSE has not been done. Since the Renew thread is
 		 * stopped and the MNTK_UNMOUNTF flag is set before doing
 		 * a vflush() with FORCECLOSE, we should be ok here.
 		 */
 		if (NFSCL_FORCEDISM(mntp))
 			error = EINTR;
 		else {
 			vfs_hash_ref(mntp, hash, td, &nvp, newnfs_vncmpf, nfhp);
 			if (nvp == NULL) {
 				error = ENOENT;
 			} else if (VN_IS_DOOMED(nvp)) {
 				error = ENOENT;
 				vrele(nvp);
 			} else {
 				error = 0;
 			}
 		}
 	}
 	free(nfhp, M_NFSFH);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 static void
 nfscl_warn_fileid(struct nfsmount *nmp, struct nfsvattr *oldnap,
     struct nfsvattr *newnap)
 {
 	int off;
 
 	if (ncl_fileid_maxwarnings >= 0 &&
 	    ncl_fileid_nwarnings >= ncl_fileid_maxwarnings)
 		return;
 	off = 0;
 	if (ncl_fileid_maxwarnings >= 0) {
 		if (++ncl_fileid_nwarnings >= ncl_fileid_maxwarnings)
 			off = 1;
 	}
 
 	printf("newnfs: server '%s' error: fileid changed. "
 	    "fsid %jx:%jx: expected fileid %#jx, got %#jx. "
 	    "(BROKEN NFS SERVER OR MIDDLEWARE)\n",
 	    nmp->nm_com.nmcom_hostname,
 	    (uintmax_t)nmp->nm_fsid[0],
 	    (uintmax_t)nmp->nm_fsid[1],
 	    (uintmax_t)oldnap->na_fileid,
 	    (uintmax_t)newnap->na_fileid);
 
 	if (off)
 		printf("newnfs: Logged %d times about fileid corruption; "
 		    "going quiet to avoid spamming logs excessively. (Limit "
 		    "is: %d).\n", ncl_fileid_nwarnings,
 		    ncl_fileid_maxwarnings);
 }
 
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the attributes of the second argument and
  * Iff vaper not NULL
  *    copy the attributes to *vaper
  * Similar to nfs_loadattrcache(), except the attributes are passed in
  * instead of being parsed out of the mbuf list.
  */
 int
 nfscl_loadattrcache(struct vnode **vpp, struct nfsvattr *nap, void *nvaper,
     void *stuff, int writeattr, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap, *nvap = &nap->na_vattr, *vaper = nvaper;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	struct timespec mtime_save;
 	int error, force_fid_err;
 
 	error = 0;
 
 	/*
 	 * If v_type == VNON it is a new node, so fill in the v_type,
 	 * n_mtime fields. Check to see if it represents a special 
 	 * device, and if so, check for a possible alias. Once the
 	 * correct vnode has been obtained, fill in the rest of the
 	 * information.
 	 */
 	np = VTONFS(vp);
 	NFSLOCKNODE(np);
 	if (vp->v_type != nvap->va_type) {
 		vp->v_type = nvap->va_type;
 		if (vp->v_type == VFIFO)
 			vp->v_op = &newnfs_fifoops;
 		np->n_mtime = nvap->va_mtime;
 	}
 	nmp = VFSTONFS(vp->v_mount);
 	vap = &np->n_vattr.na_vattr;
 	mtime_save = vap->va_mtime;
 	if (writeattr) {
 		np->n_vattr.na_filerev = nap->na_filerev;
 		np->n_vattr.na_size = nap->na_size;
 		np->n_vattr.na_mtime = nap->na_mtime;
 		np->n_vattr.na_ctime = nap->na_ctime;
 		np->n_vattr.na_fsid = nap->na_fsid;
 		np->n_vattr.na_mode = nap->na_mode;
 	} else {
 		force_fid_err = 0;
 		KFAIL_POINT_ERROR(DEBUG_FP, nfscl_force_fileid_warning,
 		    force_fid_err);
 		/*
 		 * BROKEN NFS SERVER OR MIDDLEWARE
 		 *
 		 * Certain NFS servers (certain old proprietary filers ca.
 		 * 2006) or broken middleboxes (e.g. WAN accelerator products)
 		 * will respond to GETATTR requests with results for a
 		 * different fileid.
 		 *
 		 * The WAN accelerator we've observed not only serves stale
 		 * cache results for a given file, it also occasionally serves
 		 * results for wholly different files.  This causes surprising
 		 * problems; for example the cached size attribute of a file
 		 * may truncate down and then back up, resulting in zero
 		 * regions in file contents read by applications.  We observed
 		 * this reliably with Clang and .c files during parallel build.
 		 * A pcap revealed packet fragmentation and GETATTR RPC
 		 * responses with wholly wrong fileids.
 		 */
 		if ((np->n_vattr.na_fileid != 0 &&
 		     np->n_vattr.na_fileid != nap->na_fileid) ||
 		    force_fid_err) {
 			nfscl_warn_fileid(nmp, &np->n_vattr, nap);
 			error = EIDRM;
 			goto out;
 		}
 		NFSBCOPY((caddr_t)nap, (caddr_t)&np->n_vattr,
 		    sizeof (struct nfsvattr));
 	}
 
 	/*
 	 * For NFSv4, if the node's fsid is not equal to the mount point's
 	 * fsid, return the low order 32bits of the node's fsid. This
 	 * allows getcwd(3) to work. There is a chance that the fsid might
 	 * be the same as a local fs, but since this is in an NFS mount
 	 * point, I don't think that will cause any problems?
 	 */
 	if (NFSHASNFSV4(nmp) && NFSHASHASSETFSID(nmp) &&
 	    (nmp->nm_fsid[0] != np->n_vattr.na_filesid[0] ||
 	     nmp->nm_fsid[1] != np->n_vattr.na_filesid[1])) {
 		/*
 		 * va_fsid needs to be set to some value derived from
 		 * np->n_vattr.na_filesid that is not equal
 		 * vp->v_mount->mnt_stat.f_fsid[0], so that it changes
 		 * from the value used for the top level server volume
 		 * in the mounted subtree.
 		 */
 		vn_fsid(vp, vap);
 		if ((uint32_t)vap->va_fsid == np->n_vattr.na_filesid[0])
 			vap->va_fsid = hash32_buf(
 			    np->n_vattr.na_filesid, 2 * sizeof(uint64_t), 0);
 	} else
 		vn_fsid(vp, vap);
 	np->n_attrstamp = time_second;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
 				/*
 				 * We've been told not to shrink the file;
 				 * zero np->n_attrstamp to indicate that
 				 * the attributes are stale.
 				 */
 				vap->va_size = np->n_size;
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			} else if (np->n_flag & NMODIFIED) {
 				/*
 				 * We've modified the file: Use the larger
 				 * of our size, and the server's size.
 				 */
 				if (vap->va_size < np->n_size) {
 					vap->va_size = np->n_size;
 				} else {
 					np->n_size = vap->va_size;
 					np->n_flag |= NSIZECHANGED;
 				}
 			} else {
 				np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 			}
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	/*
 	 * The following checks are added to prevent a race between (say)
 	 * a READDIR+ and a WRITE. 
 	 * READDIR+, WRITE requests sent out.
 	 * READDIR+ resp, WRITE resp received on client.
 	 * However, the WRITE resp was handled before the READDIR+ resp
 	 * causing the post op attrs from the write to be loaded first
 	 * and the attrs from the READDIR+ to be loaded later. If this 
 	 * happens, we have stale attrs loaded into the attrcache.
 	 * We detect this by for the mtime moving back. We invalidate the 
 	 * attrcache when this happens.
 	 */
 	if (timespeccmp(&mtime_save, &vap->va_mtime, >)) {
 		/* Size changed or mtime went backwards */
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	if (vaper != NULL) {
 		NFSBCOPY((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		if (np->n_flag & NCHG) {
 			if (np->n_flag & NACC)
 				vaper->va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
 
 out:
 #ifdef KDTRACE_HOOKS
 	if (np->n_attrstamp != 0)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, vap, error);
 #endif
 	(void)ncl_pager_setsize(vp, NULL);
 	return (error);
 }
 
 /*
  * Call vnode_pager_setsize() if the size of the node changed, as
  * recorded in nfsnode vs. v_object, or delay the call if notifying
  * the pager is not possible at the moment.
  *
  * If nsizep is non-NULL, the call is delayed and the new node size is
  * provided.  Caller should itself call vnode_pager_setsize() if
  * function returned true.  If nsizep is NULL, function tries to call
  * vnode_pager_setsize() itself if needed and possible, and the nfs
  * node is unlocked unconditionally, the return value is not useful.
  */
 bool
 ncl_pager_setsize(struct vnode *vp, u_quad_t *nsizep)
 {
 	struct nfsnode *np;
 	vm_object_t object;
 	struct vattr *vap;
 	u_quad_t nsize;
 	bool setnsize;
 
 	np = VTONFS(vp);
 	NFSASSERTNODE(np);
 
 	vap = &np->n_vattr.na_vattr;
 	nsize = vap->va_size;
 	object = vp->v_object;
 	setnsize = false;
 
 	if (object != NULL && nsize != object->un_pager.vnp.vnp_size) {
-		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+		    (curthread->td_pflags2 & TDP2_SBPAGES) == 0)
 			setnsize = true;
 		else
 			np->n_flag |= NVNSETSZSKIP;
 	}
 	if (nsizep == NULL) {
 		NFSUNLOCKNODE(np);
 		if (setnsize)
 			vnode_pager_setsize(vp, nsize);
 		setnsize = false;
 	} else {
 		*nsizep = nsize;
 	}
 	return (setnsize);
 }
 
 /*
  * Fill in the client id name. For these bytes:
  * 1 - they must be unique
  * 2 - they should be persistent across client reboots
  * 1 is more critical than 2
  * Use the mount point's unique id plus either the uuid or, if that
  * isn't set, random junk.
  */
 void
 nfscl_fillclid(u_int64_t clval, char *uuid, u_int8_t *cp, u_int16_t idlen)
 {
 	int uuidlen;
 
 	/*
 	 * First, put in the 64bit mount point identifier.
 	 */
 	if (idlen >= sizeof (u_int64_t)) {
 		NFSBCOPY((caddr_t)&clval, cp, sizeof (u_int64_t));
 		cp += sizeof (u_int64_t);
 		idlen -= sizeof (u_int64_t);
 	}
 
 	/*
 	 * If uuid is non-zero length, use it.
 	 */
 	uuidlen = strlen(uuid);
 	if (uuidlen > 0 && idlen >= uuidlen) {
 		NFSBCOPY(uuid, cp, uuidlen);
 		cp += uuidlen;
 		idlen -= uuidlen;
 	}
 
 	/*
 	 * This only normally happens if the uuid isn't set.
 	 */
 	while (idlen > 0) {
 		*cp++ = (u_int8_t)(arc4random() % 256);
 		idlen--;
 	}
 }
 
 /*
  * Fill in a lock owner name. For now, pid + the process's creation time.
  */
 void
 nfscl_filllockowner(void *id, u_int8_t *cp, int flags)
 {
 	union {
 		u_int32_t	lval;
 		u_int8_t	cval[4];
 	} tl;
 	struct proc *p;
 
 	if (id == NULL) {
 		/* Return the single open_owner of all 0 bytes. */
 		bzero(cp, NFSV4CL_LOCKNAMELEN);
 		return;
 	}
 	if ((flags & F_POSIX) != 0) {
 		p = (struct proc *)id;
 		tl.lval = p->p_pid;
 		*cp++ = tl.cval[0];
 		*cp++ = tl.cval[1];
 		*cp++ = tl.cval[2];
 		*cp++ = tl.cval[3];
 		tl.lval = p->p_stats->p_start.tv_sec;
 		*cp++ = tl.cval[0];
 		*cp++ = tl.cval[1];
 		*cp++ = tl.cval[2];
 		*cp++ = tl.cval[3];
 		tl.lval = p->p_stats->p_start.tv_usec;
 		*cp++ = tl.cval[0];
 		*cp++ = tl.cval[1];
 		*cp++ = tl.cval[2];
 		*cp = tl.cval[3];
 	} else if ((flags & F_FLOCK) != 0) {
 		bcopy(&id, cp, sizeof(id));
 		bzero(&cp[sizeof(id)], NFSV4CL_LOCKNAMELEN - sizeof(id));
 	} else {
 		printf("nfscl_filllockowner: not F_POSIX or F_FLOCK\n");
 		bzero(cp, NFSV4CL_LOCKNAMELEN);
 	}
 }
 
 /*
  * Find the parent process for the thread passed in as an argument.
  * If none exists, return NULL, otherwise return a thread for the parent.
  * (Can be any of the threads, since it is only used for td->td_proc.)
  */
 NFSPROC_T *
 nfscl_getparent(struct thread *td)
 {
 	struct proc *p;
 	struct thread *ptd;
 
 	if (td == NULL)
 		return (NULL);
 	p = td->td_proc;
 	if (p->p_pid == 0)
 		return (NULL);
 	p = p->p_pptr;
 	if (p == NULL)
 		return (NULL);
 	ptd = TAILQ_FIRST(&p->p_threads);
 	return (ptd);
 }
 
 /*
  * Start up the renew kernel thread.
  */
 static void
 start_nfscl(void *arg)
 {
 	struct nfsclclient *clp;
 	struct thread *td;
 
 	clp = (struct nfsclclient *)arg;
 	td = TAILQ_FIRST(&clp->nfsc_renewthread->p_threads);
 	nfscl_renewthread(clp, td);
 	kproc_exit(0);
 }
 
 void
 nfscl_start_renewthread(struct nfsclclient *clp)
 {
 
 	kproc_create(start_nfscl, (void *)clp, &clp->nfsc_renewthread, 0, 0,
 	    "nfscl");
 }
 
 /*
  * Handle wcc_data.
  * For NFSv4, it assumes that nfsv4_wccattr() was used to set up the getattr
  * as the first Op after PutFH.
  * (For NFSv4, the postop attributes are after the Op, so they can't be
  *  parsed here. A separate call to nfscl_postop_attr() is required.)
  */
 int
 nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp,
     struct nfsvattr *nap, int *flagp, int *wccflagp, void *stuff)
 {
 	u_int32_t *tl;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	int error = 0;
 
 	if (wccflagp != NULL)
 		*wccflagp = 0;
 	if (nd->nd_flag & ND_NFSV3) {
 		*flagp = 0;
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		if (*tl == newnfs_true) {
 			NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 			if (wccflagp != NULL) {
 				NFSLOCKNODE(np);
 				*wccflagp = (np->n_mtime.tv_sec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 2)) &&
 				    np->n_mtime.tv_nsec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 3)));
 				NFSUNLOCKNODE(np);
 			}
 		}
 		error = nfscl_postop_attr(nd, nap, flagp, stuff);
 		if (wccflagp != NULL && *flagp == 0)
 			*wccflagp = 0;
 	} else if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR))
 	    == (ND_NFSV4 | ND_V4WCCATTR)) {
 		error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 		    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 		    NULL, NULL, NULL, NULL, NULL);
 		if (error)
 			return (error);
 		/*
 		 * Get rid of Op# and status for next op.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl)
 			nd->nd_flag |= ND_NOMOREDATA;
 		if (wccflagp != NULL &&
 		    nfsva.na_vattr.va_mtime.tv_sec != 0) {
 			NFSLOCKNODE(np);
 			*wccflagp = (np->n_mtime.tv_sec ==
 			    nfsva.na_vattr.va_mtime.tv_sec &&
 			    np->n_mtime.tv_nsec ==
 			    nfsva.na_vattr.va_mtime.tv_sec);
 			NFSUNLOCKNODE(np);
 		}
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * Get postop attributes.
  */
 int
 nfscl_postop_attr(struct nfsrv_descript *nd, struct nfsvattr *nap, int *retp,
     void *stuff)
 {
 	u_int32_t *tl;
 	int error = 0;
 
 	*retp = 0;
 	if (nd->nd_flag & ND_NOMOREDATA)
 		return (error);
 	if (nd->nd_flag & ND_NFSV3) {
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		*retp = fxdr_unsigned(int, *tl);
 	} else if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * For NFSv4, the postop attr are at the end, so no point
 		 * in looking if nd_repstat != 0.
 		 */
 		if (!nd->nd_repstat) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				/* should never happen since nd_repstat != 0 */
 				nd->nd_flag |= ND_NOMOREDATA;
 			else
 				*retp = 1;
 		}
 	} else if (!nd->nd_repstat) {
 		/* For NFSv2, the attributes are here iff nd_repstat == 0 */
 		*retp = 1;
 	}
 	if (*retp) {
 		error = nfsm_loadattr(nd, nap);
 		if (error)
 			*retp = 0;
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * nfscl_request() - mostly a wrapper for newnfs_request().
  */
 int
 nfscl_request(struct nfsrv_descript *nd, struct vnode *vp, NFSPROC_T *p,
     struct ucred *cred, void *stuff)
 {
 	int ret, vers;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vp->v_mount);
 	if (nd->nd_flag & ND_NFSV4)
 		vers = NFS_VER4;
 	else if (nd->nd_flag & ND_NFSV3)
 		vers = NFS_VER3;
 	else
 		vers = NFS_VER2;
 	ret = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 		NFS_PROG, vers, NULL, 1, NULL, NULL);
 	return (ret);
 }
 
 /*
  * fill in this bsden's variant of statfs using nfsstatfs.
  */
 void
 nfscl_loadsbinfo(struct nfsmount *nmp, struct nfsstatfs *sfp, void *statfs)
 {
 	struct statfs *sbp = (struct statfs *)statfs;
 
 	if (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) {
 		sbp->f_bsize = NFS_FABLKSIZE;
 		sbp->f_blocks = sfp->sf_tbytes / NFS_FABLKSIZE;
 		sbp->f_bfree = sfp->sf_fbytes / NFS_FABLKSIZE;
 		/*
 		 * Although sf_abytes is uint64_t and f_bavail is int64_t,
 		 * the value after dividing by NFS_FABLKSIZE is small
 		 * enough that it will fit in 63bits, so it is ok to
 		 * assign it to f_bavail without fear that it will become
 		 * negative.
 		 */
 		sbp->f_bavail = sfp->sf_abytes / NFS_FABLKSIZE;
 		sbp->f_files = sfp->sf_tfiles;
 		/* Since f_ffree is int64_t, clip it to 63bits. */
 		if (sfp->sf_ffiles > INT64_MAX)
 			sbp->f_ffree = INT64_MAX;
 		else
 			sbp->f_ffree = sfp->sf_ffiles;
 	} else if ((nmp->nm_flag & NFSMNT_NFSV4) == 0) {
 		/*
 		 * The type casts to (int32_t) ensure that this code is
 		 * compatible with the old NFS client, in that it will
 		 * propagate bit31 to the high order bits. This may or may
 		 * not be correct for NFSv2, but since it is a legacy
 		 * environment, I'd rather retain backwards compatibility.
 		 */
 		sbp->f_bsize = (int32_t)sfp->sf_bsize;
 		sbp->f_blocks = (int32_t)sfp->sf_blocks;
 		sbp->f_bfree = (int32_t)sfp->sf_bfree;
 		sbp->f_bavail = (int32_t)sfp->sf_bavail;
 		sbp->f_files = 0;
 		sbp->f_ffree = 0;
 	}
 }
 
 /*
  * Use the fsinfo stuff to update the mount point.
  */
 void
 nfscl_loadfsinfo(struct nfsmount *nmp, struct nfsfsinfo *fsp)
 {
 
 	if ((nmp->nm_wsize == 0 || fsp->fs_wtpref < nmp->nm_wsize) &&
 	    fsp->fs_wtpref >= NFS_FABLKSIZE)
 		nmp->nm_wsize = (fsp->fs_wtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	if (fsp->fs_wtmax < nmp->nm_wsize && fsp->fs_wtmax > 0) {
 		nmp->nm_wsize = fsp->fs_wtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_wsize == 0)
 			nmp->nm_wsize = fsp->fs_wtmax;
 	}
 	if (nmp->nm_wsize < NFS_FABLKSIZE)
 		nmp->nm_wsize = NFS_FABLKSIZE;
 	if ((nmp->nm_rsize == 0 || fsp->fs_rtpref < nmp->nm_rsize) &&
 	    fsp->fs_rtpref >= NFS_FABLKSIZE)
 		nmp->nm_rsize = (fsp->fs_rtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	if (fsp->fs_rtmax < nmp->nm_rsize && fsp->fs_rtmax > 0) {
 		nmp->nm_rsize = fsp->fs_rtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_rsize == 0)
 			nmp->nm_rsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_rsize < NFS_FABLKSIZE)
 		nmp->nm_rsize = NFS_FABLKSIZE;
 	if ((nmp->nm_readdirsize == 0 || fsp->fs_dtpref < nmp->nm_readdirsize)
 	    && fsp->fs_dtpref >= NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = (fsp->fs_dtpref + NFS_DIRBLKSIZ - 1) &
 		    ~(NFS_DIRBLKSIZ - 1);
 	if (fsp->fs_rtmax < nmp->nm_readdirsize && fsp->fs_rtmax > 0) {
 		nmp->nm_readdirsize = fsp->fs_rtmax & ~(NFS_DIRBLKSIZ - 1);
 		if (nmp->nm_readdirsize == 0)
 			nmp->nm_readdirsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_readdirsize < NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = NFS_DIRBLKSIZ;
 	if (fsp->fs_maxfilesize > 0 &&
 	    fsp->fs_maxfilesize < nmp->nm_maxfilesize)
 		nmp->nm_maxfilesize = fsp->fs_maxfilesize;
 	nmp->nm_mountp->mnt_stat.f_iosize = newnfs_iosize(nmp);
 	nmp->nm_state |= NFSSTA_GOTFSINFO;
 }
 
 /*
  * Lookups source address which should be used to communicate with
  * @nmp and stores it inside @pdst.
  *
  * Returns 0 on success.
  */
 u_int8_t *
 nfscl_getmyip(struct nfsmount *nmp, struct in6_addr *paddr, int *isinet6p)
 {
 #if defined(INET6) || defined(INET)
 	int error, fibnum;
 
 	fibnum = curthread->td_proc->p_fibnum;
 #endif
 #ifdef INET
 	if (nmp->nm_nam->sa_family == AF_INET) {
 		struct sockaddr_in *sin;
 		struct nhop4_extended nh_ext;
 
 		sin = (struct sockaddr_in *)nmp->nm_nam;
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		error = fib4_lookup_nh_ext(fibnum, sin->sin_addr, 0, 0,
 		    &nh_ext);
 		CURVNET_RESTORE();
 		if (error != 0)
 			return (NULL);
 
 		if (IN_LOOPBACK(ntohl(nh_ext.nh_src.s_addr))) {
 			/* Ignore loopback addresses */
 			return (NULL);
 		}
 
 		*isinet6p = 0;
 		*((struct in_addr *)paddr) = nh_ext.nh_src;
 
 		return (u_int8_t *)paddr;
 	}
 #endif
 #ifdef INET6
 	if (nmp->nm_nam->sa_family == AF_INET6) {
 		struct sockaddr_in6 *sin6;
 
 		sin6 = (struct sockaddr_in6 *)nmp->nm_nam;
 
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		error = in6_selectsrc_addr(fibnum, &sin6->sin6_addr,
 		    sin6->sin6_scope_id, NULL, paddr, NULL);
 		CURVNET_RESTORE();
 		if (error != 0)
 			return (NULL);
 
 		if (IN6_IS_ADDR_LOOPBACK(paddr))
 			return (NULL);
 
 		/* Scope is embedded in */
 		*isinet6p = 1;
 
 		return (u_int8_t *)paddr;
 	}
 #endif
 	return (NULL);
 }
 
 /*
  * Copy NFS uid, gids from the cred structure.
  */
 void
 newnfs_copyincred(struct ucred *cr, struct nfscred *nfscr)
 {
 	int i;
 
 	KASSERT(cr->cr_ngroups >= 0,
 	    ("newnfs_copyincred: negative cr_ngroups"));
 	nfscr->nfsc_uid = cr->cr_uid;
 	nfscr->nfsc_ngroups = MIN(cr->cr_ngroups, NFS_MAXGRPS + 1);
 	for (i = 0; i < nfscr->nfsc_ngroups; i++)
 		nfscr->nfsc_groups[i] = cr->cr_groups[i];
 }
 
 
 /*
  * Do any client specific initialization.
  */
 void
 nfscl_init(void)
 {
 	static int inited = 0;
 
 	if (inited)
 		return;
 	inited = 1;
 	nfscl_inited = 1;
 	ncl_pbuf_zone = pbuf_zsecond_create("nfspbuf", nswbuf / 2);
 }
 
 /*
  * Check each of the attributes to be set, to ensure they aren't already
  * the correct value. Disable setting ones already correct.
  */
 int
 nfscl_checksattr(struct vattr *vap, struct nfsvattr *nvap)
 {
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vap->va_mode == nvap->na_mode)
 			vap->va_mode = (mode_t)VNOVAL;
 	}
 	if (vap->va_uid != (uid_t)VNOVAL) {
 		if (vap->va_uid == nvap->na_uid)
 			vap->va_uid = (uid_t)VNOVAL;
 	}
 	if (vap->va_gid != (gid_t)VNOVAL) {
 		if (vap->va_gid == nvap->na_gid)
 			vap->va_gid = (gid_t)VNOVAL;
 	}
 	if (vap->va_size != VNOVAL) {
 		if (vap->va_size == nvap->na_size)
 			vap->va_size = VNOVAL;
 	}
 
 	/*
 	 * We are normally called with only a partially initialized
 	 * VAP.  Since the NFSv3 spec says that server may use the
 	 * file attributes to store the verifier, the spec requires
 	 * us to do a SETATTR RPC. FreeBSD servers store the verifier
 	 * in atime, but we can't really assume that all servers will
 	 * so we ensure that our SETATTR sets both atime and mtime.
 	 * Set the VA_UTIMES_NULL flag for this case, so that
 	 * the server's time will be used.  This is needed to
 	 * work around a bug in some Solaris servers, where
 	 * setting the time TOCLIENT causes the Setattr RPC
 	 * to return NFS_OK, but not set va_mode.
 	 */
 	if (vap->va_mtime.tv_sec == VNOVAL) {
 		vfs_timestamp(&vap->va_mtime);
 		vap->va_vaflags |= VA_UTIMES_NULL;
 	}
 	if (vap->va_atime.tv_sec == VNOVAL)
 		vap->va_atime = vap->va_mtime;
 	return (1);
 }
 
 /*
  * Map nfsv4 errors to errno.h errors.
  * The uid and gid arguments are only used for NFSERR_BADOWNER and that
  * error should only be returned for the Open, Create and Setattr Ops.
  * As such, most calls can just pass in 0 for those arguments.
  */
 APPLESTATIC int
 nfscl_maperr(struct thread *td, int error, uid_t uid, gid_t gid)
 {
 	struct proc *p;
 
 	if (error < 10000 || error >= NFSERR_STALEWRITEVERF)
 		return (error);
 	if (td != NULL)
 		p = td->td_proc;
 	else
 		p = NULL;
 	switch (error) {
 	case NFSERR_BADOWNER:
 		tprintf(p, LOG_INFO,
 		    "No name and/or group mapping for uid,gid:(%d,%d)\n",
 		    uid, gid);
 		return (EPERM);
 	case NFSERR_BADNAME:
 	case NFSERR_BADCHAR:
 		printf("nfsv4 char/name not handled by server\n");
 		return (ENOENT);
 	case NFSERR_STALECLIENTID:
 	case NFSERR_STALESTATEID:
 	case NFSERR_EXPIRED:
 	case NFSERR_BADSTATEID:
 	case NFSERR_BADSESSION:
 		printf("nfsv4 recover err returned %d\n", error);
 		return (EIO);
 	case NFSERR_BADHANDLE:
 	case NFSERR_SERVERFAULT:
 	case NFSERR_BADTYPE:
 	case NFSERR_FHEXPIRED:
 	case NFSERR_RESOURCE:
 	case NFSERR_MOVED:
 	case NFSERR_NOFILEHANDLE:
 	case NFSERR_MINORVERMISMATCH:
 	case NFSERR_OLDSTATEID:
 	case NFSERR_BADSEQID:
 	case NFSERR_LEASEMOVED:
 	case NFSERR_RECLAIMBAD:
 	case NFSERR_BADXDR:
 	case NFSERR_OPILLEGAL:
 		printf("nfsv4 client/server protocol prob err=%d\n",
 		    error);
 		return (EIO);
 	default:
 		tprintf(p, LOG_INFO, "nfsv4 err=%d\n", error);
 		return (EIO);
 	};
 }
 
 /*
  * Check to see if the process for this owner exists. Return 1 if it doesn't
  * and 0 otherwise.
  */
 int
 nfscl_procdoesntexist(u_int8_t *own)
 {
 	union {
 		u_int32_t	lval;
 		u_int8_t	cval[4];
 	} tl;
 	struct proc *p;
 	pid_t pid;
 	int i, ret = 0;
 
 	/* For the single open_owner of all 0 bytes, just return 0. */
 	for (i = 0; i < NFSV4CL_LOCKNAMELEN; i++)
 		if (own[i] != 0)
 			break;
 	if (i == NFSV4CL_LOCKNAMELEN)
 		return (0);
 
 	tl.cval[0] = *own++;
 	tl.cval[1] = *own++;
 	tl.cval[2] = *own++;
 	tl.cval[3] = *own++;
 	pid = tl.lval;
 	p = pfind_any_locked(pid);
 	if (p == NULL)
 		return (1);
 	if (p->p_stats == NULL) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	tl.cval[0] = *own++;
 	tl.cval[1] = *own++;
 	tl.cval[2] = *own++;
 	tl.cval[3] = *own++;
 	if (tl.lval != p->p_stats->p_start.tv_sec) {
 		ret = 1;
 	} else {
 		tl.cval[0] = *own++;
 		tl.cval[1] = *own++;
 		tl.cval[2] = *own++;
 		tl.cval[3] = *own;
 		if (tl.lval != p->p_stats->p_start.tv_usec)
 			ret = 1;
 	}
 	PROC_UNLOCK(p);
 	return (ret);
 }
 
 /*
  * - nfs pseudo system call for the client
  */
 /*
  * MPSAFE
  */
 static int
 nfssvc_nfscl(struct thread *td, struct nfssvc_args *uap)
 {
 	struct file *fp;
 	struct nfscbd_args nfscbdarg;
 	struct nfsd_nfscbd_args nfscbdarg2;
 	struct nameidata nd;
 	struct nfscl_dumpmntopts dumpmntopts;
 	cap_rights_t rights;
 	char *buf;
 	int error;
 	struct mount *mp;
 	struct nfsmount *nmp;
 
 	if (uap->flag & NFSSVC_CBADDSOCK) {
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg, sizeof(nfscbdarg));
 		if (error)
 			return (error);
 		/*
 		 * Since we don't know what rights might be required,
 		 * pretend that we need them all. It is better to be too
 		 * careful than too reckless.
 		 */
 		error = fget(td, nfscbdarg.sock,
 		    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
 		if (error)
 			return (error);
 		if (fp->f_type != DTYPE_SOCKET) {
 			fdrop(fp, td);
 			return (EPERM);
 		}
 		error = nfscbd_addsock(fp);
 		fdrop(fp, td);
 		if (!error && nfscl_enablecallb == 0) {
 			nfsv4_cbport = nfscbdarg.port;
 			nfscl_enablecallb = 1;
 		}
 	} else if (uap->flag & NFSSVC_NFSCBD) {
 		if (uap->argp == NULL) 
 			return (EINVAL);
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg2,
 		    sizeof(nfscbdarg2));
 		if (error)
 			return (error);
 		error = nfscbd_nfsd(td, &nfscbdarg2);
 	} else if (uap->flag & NFSSVC_DUMPMNTOPTS) {
 		error = copyin(uap->argp, &dumpmntopts, sizeof(dumpmntopts));
 		if (error == 0 && (dumpmntopts.ndmnt_blen < 256 ||
 		    dumpmntopts.ndmnt_blen > 1024))
 			error = EINVAL;
 		if (error == 0)
 			error = nfsrv_lookupfilename(&nd,
 			    dumpmntopts.ndmnt_fname, td);
 		if (error == 0 && strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name,
 		    "nfs") != 0) {
 			vput(nd.ni_vp);
 			error = EINVAL;
 		}
 		if (error == 0) {
 			buf = malloc(dumpmntopts.ndmnt_blen, M_TEMP, M_WAITOK);
 			nfscl_retopts(VFSTONFS(nd.ni_vp->v_mount), buf,
 			    dumpmntopts.ndmnt_blen);
 			vput(nd.ni_vp);
 			error = copyout(buf, dumpmntopts.ndmnt_buf,
 			    dumpmntopts.ndmnt_blen);
 			free(buf, M_TEMP);
 		}
 	} else if (uap->flag & NFSSVC_FORCEDISM) {
 		buf = malloc(MNAMELEN + 1, M_TEMP, M_WAITOK);
 		error = copyinstr(uap->argp, buf, MNAMELEN + 1, NULL);
 		if (error == 0) {
 			nmp = NULL;
 			mtx_lock(&mountlist_mtx);
 			TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 				if (strcmp(mp->mnt_stat.f_mntonname, buf) ==
 				    0 && strcmp(mp->mnt_stat.f_fstypename,
 				    "nfs") == 0 && mp->mnt_data != NULL) {
 					nmp = VFSTONFS(mp);
 					NFSDDSLOCK();
 					if (nfsv4_findmirror(nmp) != NULL) {
 						NFSDDSUNLOCK();
 						error = ENXIO;
 						nmp = NULL;
 						break;
 					}
 					mtx_lock(&nmp->nm_mtx);
 					if ((nmp->nm_privflag &
 					    NFSMNTP_FORCEDISM) == 0) {
 						nmp->nm_privflag |= 
 						   (NFSMNTP_FORCEDISM |
 						    NFSMNTP_CANCELRPCS);
 						mtx_unlock(&nmp->nm_mtx);
 					} else {
 						mtx_unlock(&nmp->nm_mtx);
 						nmp = NULL;
 					}
 					NFSDDSUNLOCK();
 					break;
 				}
 			}
 			mtx_unlock(&mountlist_mtx);
 
 			if (nmp != NULL) {
 				/*
 				 * Call newnfs_nmcancelreqs() to cause
 				 * any RPCs in progress on the mount point to
 				 * fail.
 				 * This will cause any process waiting for an
 				 * RPC to complete while holding a vnode lock
 				 * on the mounted-on vnode (such as "df" or
 				 * a non-forced "umount") to fail.
 				 * This will unlock the mounted-on vnode so
 				 * a forced dismount can succeed.
 				 * Then clear NFSMNTP_CANCELRPCS and wakeup(),
 				 * so that nfs_unmount() can complete.
 				 */
 				newnfs_nmcancelreqs(nmp);
 				mtx_lock(&nmp->nm_mtx);
 				nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
 				wakeup(nmp);
 				mtx_unlock(&nmp->nm_mtx);
 			} else if (error == 0)
 				error = EINVAL;
 		}
 		free(buf, M_TEMP);
 	} else {
 		error = EINVAL;
 	}
 	return (error);
 }
 
 extern int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *);
 
 /*
  * Called once to initialize data structures...
  */
 static int
 nfscl_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 	static int loaded = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		if (loaded)
 			return (0);
 		newnfs_portinit();
 		mtx_init(&ncl_iod_mutex, "ncl_iod_mutex", NULL, MTX_DEF);
 		nfscl_init();
 		NFSD_LOCK();
 		nfsrvd_cbinit(0);
 		NFSD_UNLOCK();
 		ncl_call_invalcaches = ncl_invalcaches;
 		nfsd_call_nfscl = nfssvc_nfscl;
 		loaded = 1;
 		break;
 
 	case MOD_UNLOAD:
 		if (nfs_numnfscbd != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * XXX: Unloading of nfscl module is unsupported.
 		 */
 #if 0
 		ncl_call_invalcaches = NULL;
 		nfsd_call_nfscl = NULL;
 		uma_zdestroy(ncl_pbuf_zone);
 		/* and get rid of the mutexes */
 		mtx_destroy(&ncl_iod_mutex);
 		loaded = 0;
 		break;
 #else
 		/* FALLTHROUGH */
 #endif
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return error;
 }
 static moduledata_t nfscl_mod = {
 	"nfscl",
 	nfscl_modevent,
 	NULL,
 };
 DECLARE_MODULE(nfscl, nfscl_mod, SI_SUB_VFS, SI_ORDER_FIRST);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfscl, 1);
 MODULE_DEPEND(nfscl, nfscommon, 1, 1, 1);
 MODULE_DEPEND(nfscl, krpc, 1, 1, 1);
 MODULE_DEPEND(nfscl, nfssvc, 1, 1, 1);
 MODULE_DEPEND(nfscl, nfslock, 1, 1, 1);
 
Index: head/sys/sys/proc.h
===================================================================
--- head/sys/sys/proc.h	(revision 358251)
+++ head/sys/sys/proc.h	(revision 358252)
@@ -1,1259 +1,1261 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)proc.h	8.15 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROC_H_
 #define	_SYS_PROC_H_
 
 #include <sys/callout.h>		/* For struct callout. */
 #include <sys/event.h>			/* For struct klist. */
 #ifdef _KERNEL
 #include <sys/_eventhandler.h>
 #endif
 #include <sys/condvar.h>
 #ifndef _KERNEL
 #include <sys/filedesc.h>
 #endif
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/lock_profile.h>
 #include <sys/_mutex.h>
 #include <sys/osd.h>
 #include <sys/priority.h>
 #include <sys/rtprio.h>			/* XXX. */
 #include <sys/runq.h>
 #include <sys/resource.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #ifndef _KERNEL
 #include <sys/time.h>			/* For structs itimerval, timeval. */
 #else
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #endif
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <sys/types.h>
 #include <sys/_domainset.h>
 
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 #ifdef _KERNEL
 #include <machine/cpu.h>
 #endif
 
 /*
  * One structure allocated per session.
  *
  * List of locks
  * (m)		locked by s_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct session {
 	u_int		s_count;	/* Ref cnt; pgrps in session - atomic. */
 	struct proc	*s_leader;	/* (m + e) Session leader. */
 	struct vnode	*s_ttyvp;	/* (m) Vnode of controlling tty. */
 	struct cdev_priv *s_ttydp;	/* (m) Device of controlling tty.  */
 	struct tty	*s_ttyp;	/* (e) Controlling tty. */
 	pid_t		s_sid;		/* (c) Session ID. */
 					/* (m) Setlogin() name: */
 	char		s_login[roundup(MAXLOGNAME, sizeof(long))];
 	struct mtx	s_mtx;		/* Mutex to protect members. */
 };
 
 /*
  * One structure allocated per process group.
  *
  * List of locks
  * (m)		locked by pg_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct pgrp {
 	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
 	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
 	struct session	*pg_session;	/* (c) Pointer to session. */
 	struct sigiolst	pg_sigiolst;	/* (m) List of sigio sources. */
 	pid_t		pg_id;		/* (c) Process group id. */
 	int		pg_jobc;	/* (m) Job control process count. */
 	struct mtx	pg_mtx;		/* Mutex to protect members */
 };
 
 /*
  * pargs, used to hold a copy of the command line, if it had a sane length.
  */
 struct pargs {
 	u_int	ar_ref;		/* Reference count. */
 	u_int	ar_length;	/* Length. */
 	u_char	ar_args[1];	/* Arguments. */
 };
 
 /*-
  * Description of a process.
  *
  * This structure contains the information needed to manage a thread of
  * control, known in UN*X as a process; it has references to substructures
  * containing descriptions of things that the process uses, but may share
  * with related processes.  The process structure and the substructures
  * are always addressable except for those marked "(CPU)" below,
  * which might be addressable only on a processor on which the process
  * is running.
  *
  * Below is a key of locks used to protect each member of struct proc.  The
  * lock is indicated by a reference to a specific character in parens in the
  * associated comment.
  *      * - not yet protected
  *      a - only touched by curproc or parent during fork/wait
  *      b - created at fork, never changes
  *		(exception aiods switch vmspaces, but they are also
  *		marked 'P_SYSTEM' so hopefully it will be left alone)
  *      c - locked by proc mtx
  *      d - locked by allproc_lock lock
  *      e - locked by proctree_lock lock
  *      f - session mtx
  *      g - process group mtx
  *      h - callout_lock mtx
  *      i - by curproc or the master session mtx
  *      j - locked by proc slock
  *      k - only accessed by curthread
  *	k*- only accessed by curthread and from an interrupt
  *	kx- only accessed by curthread and by debugger
  *      l - the attaching proc or attaching proc parent
  *      m - Giant
  *      n - not locked, lazy
  *      o - ktrace lock
  *      q - td_contested lock
  *      r - p_peers lock
  *      s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9)
  *      t - thread lock
  *	u - process stat lock
  *	w - process timer lock
  *      x - created at fork, only changes during single threading in exec
  *      y - created at first aio, doesn't change until exit or exec at which
  *          point we are single-threaded and only curthread changes it
  *      z - zombie threads lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
  * for write access.
  */
 struct cpuset;
 struct filecaps;
 struct filemon;
 struct kaioinfo;
 struct kaudit_record;
 struct kcov_info;
 struct kdtrace_proc;
 struct kdtrace_thread;
 struct mqueue_notifier;
 struct nlminfo;
 struct p_sched;
 struct proc;
 struct procdesc;
 struct racct;
 struct sbuf;
 struct sleepqueue;
 struct socket;
 struct syscall_args;
 struct td_sched;
 struct thread;
 struct trapframe;
 struct turnstile;
 struct vm_map;
 struct vm_map_entry;
 struct epoch_tracker;
 
 /*
  * XXX: Does this belong in resource.h or resourcevar.h instead?
  * Resource usage extension.  The times in rusage structs in the kernel are
  * never up to date.  The actual times are kept as runtimes and tick counts
  * (with control info in the "previous" times), and are converted when
  * userland asks for rusage info.  Backwards compatibility prevents putting
  * this directly in the user-visible rusage struct.
  *
  * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux.
  * Locking for td_rux: (t) for all fields.
  */
 struct rusage_ext {
 	uint64_t	rux_runtime;    /* (cu) Real time. */
 	uint64_t	rux_uticks;     /* (cu) Statclock hits in user mode. */
 	uint64_t	rux_sticks;     /* (cu) Statclock hits in sys mode. */
 	uint64_t	rux_iticks;     /* (cu) Statclock hits in intr mode. */
 	uint64_t	rux_uu;         /* (c) Previous user time in usec. */
 	uint64_t	rux_su;         /* (c) Previous sys time in usec. */
 	uint64_t	rux_tu;         /* (c) Previous total time in usec. */
 };
 
 /*
  * Kernel runnable context (thread).
  * This is what is put to sleep and reactivated.
  * Thread context.  Processes may have multiple threads.
  */
 struct thread {
 	struct mtx	*volatile td_lock; /* replaces sched lock */
 	struct proc	*td_proc;	/* (*) Associated process. */
 	TAILQ_ENTRY(thread) td_plist;	/* (*) All threads in this proc. */
 	TAILQ_ENTRY(thread) td_runq;	/* (t) Run queue. */
 	TAILQ_ENTRY(thread) td_slpq;	/* (t) Sleep queue. */
 	TAILQ_ENTRY(thread) td_lockq;	/* (t) Lock queue. */
 	LIST_ENTRY(thread) td_hash;	/* (d) Hash chain. */
 	struct cpuset	*td_cpuset;	/* (t) CPU affinity mask. */
 	struct domainset_ref td_domain;	/* (a) NUMA policy */
 	struct seltd	*td_sel;	/* Select queue/channel. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
 	struct rl_q_entry *td_rlqe;	/* (k) Associated range lock entry. */
 	struct umtx_q   *td_umtxq;	/* (c?) Link for when we're blocked. */
 	lwpid_t		td_tid;		/* (b) Thread ID. */
 	sigqueue_t	td_sigqueue;	/* (c) Sigs arrived, not delivered. */
 #define	td_siglist	td_sigqueue.sq_signals
 	u_char		td_lend_user_pri; /* (t) Lend user pri. */
 
 /* Cleared during fork1() */
 #define	td_startzero td_flags
 	int		td_flags;	/* (t) TDF_* flags. */
 	int		td_inhibitors;	/* (t) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
 	int		td_pflags2;	/* (k) Private thread (TDP2_*) flags. */
 	int		td_dupfd;	/* (k) Ret value from fdopen. XXX */
 	int		td_sqqueue;	/* (t) Sleepqueue queue blocked on. */
 	const void	*td_wchan;	/* (t) Sleep address. */
 	const char	*td_wmesg;	/* (t) Reason for sleep. */
 	volatile u_char td_owepreempt;  /* (k*) Preempt on last critical_exit */
 	u_char		td_tsqueue;	/* (t) Turnstile queue blocked on. */
 	short		td_locks;	/* (k) Debug: count of non-spin locks */
 	short		td_rw_rlocks;	/* (k) Count of rwlock read locks. */
 	short		td_sx_slocks;	/* (k) Count of sx shared locks. */
 	short		td_lk_slocks;	/* (k) Count of lockmgr shared locks. */
 	short		td_stopsched;	/* (k) Scheduler stopped. */
 	struct turnstile *td_blocked;	/* (t) Lock thread is blocked on. */
 	const char	*td_lockname;	/* (t) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
 	struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
 	int		td_intr_nesting_level; /* (k) Interrupt recursion. */
 	int		td_pinned;	/* (k) Temporary cpu pin count. */
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	struct plimit	*td_limit;	/* (k) Resource limits. */
 	int		td_slptick;	/* (t) Time at sleep. */
 	int		td_blktick;	/* (t) Time spent blocked. */
 	int		td_swvoltick;	/* (t) Time at last SW_VOL switch. */
 	int		td_swinvoltick;	/* (t) Time at last SW_INVOL switch. */
 	u_int		td_cow;		/* (*) Number of copy-on-write faults */
 	struct rusage	td_ru;		/* (t) rusage information. */
 	struct rusage_ext td_rux;	/* (t) Internal rusage information. */
 	uint64_t	td_incruntime;	/* (t) Cpu ticks to transfer to proc. */
 	uint64_t	td_runtime;	/* (t) How many cpu ticks we've run. */
 	u_int 		td_pticks;	/* (t) Statclock hits for profiling */
 	u_int		td_sticks;	/* (t) Statclock hits in system mode. */
 	u_int		td_iticks;	/* (t) Statclock hits in intr mode. */
 	u_int		td_uticks;	/* (t) Statclock hits in user mode. */
 	int		td_intrval;	/* (t) Return value for sleepq. */
 	sigset_t	td_oldsigmask;	/* (k) Saved mask from pre sigpause. */
 	volatile u_int	td_generation;	/* (k) For detection of preemption */
 	stack_t		td_sigstk;	/* (k) Stack ptr and on-stack flag. */
 	int		td_xsig;	/* (c) Signal for ptrace */
 	u_long		td_profil_addr;	/* (k) Temporary addr until AST. */
 	u_int		td_profil_ticks; /* (k) Temporary ticks until AST. */
 	char		td_name[MAXCOMLEN + 1];	/* (*) Thread name. */
 	struct file	*td_fpop;	/* (k) file referencing cdev under op */
 	int		td_dbgflags;	/* (c) Userland debugger flags */
 	siginfo_t	td_si;		/* (c) For debugger or core file */
 	int		td_ng_outbound;	/* (k) Thread entered ng from above. */
 	struct osd	td_osd;		/* (k) Object specific data. */
 	struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
 	struct vnode	*td_vp_reserved;/* (k) Prealloated vnode. */
 	u_int		td_no_sleeping;	/* (k) Sleeping disabled count. */
 	void		*td_su;		/* (k) FFS SU private */
 	sbintime_t	td_sleeptimo;	/* (t) Sleep timeout. */
 	int		td_rtcgen;	/* (s) rtc_generation of abs. sleep */
 	int		td_errno;	/* (k) Error from last syscall. */
 	size_t		td_vslock_sz;	/* (k) amount of vslock-ed space */
 	struct kcov_info *td_kcov_info;	/* (*) Kernel code coverage data */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */
 #define	td_startcopy td_endzero
 	sigset_t	td_sigmask;	/* (c) Current signal mask. */
 	u_char		td_rqindex;	/* (t) Run queue index. */
 	u_char		td_base_pri;	/* (t) Thread base kernel priority. */
 	u_char		td_priority;	/* (t) Thread active priority. */
 	u_char		td_pri_class;	/* (t) Scheduling class. */
 	u_char		td_user_pri;	/* (t) User pri from estcpu and nice. */
 	u_char		td_base_user_pri; /* (t) Base user pri */
 	u_char		td_pre_epoch_prio; /* (k) User pri on entry to epoch */
 	uintptr_t	td_rb_list;	/* (k) Robust list head. */
 	uintptr_t	td_rbp_list;	/* (k) Robust priv list head. */
 	uintptr_t	td_rb_inact;	/* (k) Current in-action mutex loc. */
 	struct syscall_args td_sa;	/* (kx) Syscall parameters. Copied on
 					   fork for child tracing. */
 	void		*td_sigblock_ptr; /* (k) uptr for fast sigblock. */
 	uint32_t	td_sigblock_val;  /* (k) fast sigblock value read at
 					     td_sigblock_ptr on kern entry */
 #define	td_endcopy td_pcb
 
 /*
  * Fields that must be manually set in fork1() or create_thread()
  * or already have been set in the allocator, constructor, etc.
  */
 	struct pcb	*td_pcb;	/* (k) Kernel VA of pcb and kstack. */
 	enum td_states {
 		TDS_INACTIVE = 0x0,
 		TDS_INHIBITED,
 		TDS_CAN_RUN,
 		TDS_RUNQ,
 		TDS_RUNNING
 	} td_state;			/* (t) thread state */
 	union {
 		register_t	tdu_retval[2];
 		off_t		tdu_off;
 	} td_uretoff;			/* (k) Syscall aux returns. */
 #define td_retval	td_uretoff.tdu_retval
 	u_int		td_cowgen;	/* (k) Generation of COW pointers. */
 	/* LP64 hole */
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
 	struct trapframe *td_frame;	/* (k) */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
 	int		td_kstack_pages; /* (a) Size of the kstack. */
 	volatile u_int	td_critnest;	/* (k*) Critical section nest level. */
 	struct mdthread td_md;		/* (k) Any machine-dependent fields. */
 	struct kaudit_record	*td_ar;	/* (k) Active audit record, if any. */
 	struct lpohead	td_lprof[2];	/* (a) lock profiling objects. */
 	struct kdtrace_thread	*td_dtrace; /* (*) DTrace-specific data. */
 	struct vnet	*td_vnet;	/* (k) Effective vnet. */
 	const char	*td_vnet_lpush;	/* (k) Debugging vnet push / pop. */
 	struct trapframe *td_intr_frame;/* (k) Frame of the current irq */
 	struct proc	*td_rfppwait_p;	/* (k) The vforked child */
 	struct vm_page	**td_ma;	/* (k) uio pages held */
 	int		td_ma_cnt;	/* (k) size of *td_ma */
 	/* LP64 hole */
 	void		*td_emuldata;	/* Emulator state data */
 	int		td_lastcpu;	/* (t) Last cpu we were on. */
 	int		td_oncpu;	/* (t) Which cpu we are on. */
 	void		*td_lkpi_task;	/* LinuxKPI task struct pointer */
 	int		td_pmcpend;
 #ifdef EPOCH_TRACE
 	SLIST_HEAD(, epoch_tracker) td_epochs;
 #endif
 };
 
 struct thread0_storage {
 	struct thread t0st_thread;
 	uint64_t t0st_sched[10];
 };
 
 struct mtx *thread_lock_block(struct thread *);
 void thread_lock_block_wait(struct thread *);
 void thread_lock_set(struct thread *, struct mtx *);
 void thread_lock_unblock(struct thread *, struct mtx *);
 #define	THREAD_LOCK_ASSERT(td, type)					\
 	mtx_assert((td)->td_lock, (type))
 
 #define	THREAD_LOCK_BLOCKED_ASSERT(td, type)				\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	if (__m != &blocked_lock)					\
 		mtx_assert(__m, (type));				\
 } while (0)
 
 #ifdef INVARIANTS
 #define	THREAD_LOCKPTR_ASSERT(td, lock)					\
 do {									\
 	struct mtx *__m;						\
 	__m = (td)->td_lock;						\
 	KASSERT(__m == (lock),						\
 	    ("Thread %p lock %p does not match %p", td, __m, (lock)));	\
 } while (0)
 
 #define	THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock)				\
 do {									\
 	struct mtx *__m;						\
 	__m = (td)->td_lock;						\
 	KASSERT(__m == (lock) || __m == &blocked_lock,			\
 	    ("Thread %p lock %p does not match %p", td, __m, (lock)));	\
 } while (0)
 
 #define	TD_LOCKS_INC(td)	((td)->td_locks++)
 #define	TD_LOCKS_DEC(td) do {						\
 	KASSERT(SCHEDULER_STOPPED_TD(td) || (td)->td_locks > 0,		\
 	    ("thread %p owns no locks", (td)));				\
 	(td)->td_locks--;						\
 } while (0)
 #else
 #define	THREAD_LOCKPTR_ASSERT(td, lock)
 #define	THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock)
 
 #define	TD_LOCKS_INC(td)
 #define	TD_LOCKS_DEC(td)
 #endif
 
 /*
  * Flags kept in td_flags:
  * To change these you MUST have the scheduler lock.
  */
 #define	TDF_BORROWING	0x00000001 /* Thread is borrowing pri from another. */
 #define	TDF_INPANIC	0x00000002 /* Caused a panic, let it drive crashdump. */
 #define	TDF_INMEM	0x00000004 /* Thread's stack is in memory. */
 #define	TDF_SINTR	0x00000008 /* Sleep is interruptible. */
 #define	TDF_TIMEOUT	0x00000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x00000020 /* This is a per-CPU idle thread. */
 #define	TDF_CANSWAP	0x00000040 /* Thread can be swapped. */
 #define	TDF_UNUSED80	0x00000080 /* unused. */
 #define	TDF_KTH_SUSP	0x00000100 /* kthread is suspended */
 #define	TDF_ALLPROCSUSP	0x00000200 /* suspended by SINGLE_ALLPROC */
 #define	TDF_BOUNDARY	0x00000400 /* Thread suspended at user boundary */
 #define	TDF_ASTPENDING	0x00000800 /* Thread has some asynchronous events. */
 #define	TDF_UNUSED12	0x00001000 /* --available-- */
 #define	TDF_SBDRY	0x00002000 /* Stop only on usermode boundary. */
 #define	TDF_UPIBLOCKED	0x00004000 /* Thread blocked on user PI mutex. */
 #define	TDF_NEEDSUSPCHK	0x00008000 /* Thread may need to suspend. */
 #define	TDF_NEEDRESCHED	0x00010000 /* Thread needs to yield. */
 #define	TDF_NEEDSIGCHK	0x00020000 /* Thread may need signal delivery. */
 #define	TDF_NOLOAD	0x00040000 /* Ignore during load avg calculations. */
 #define	TDF_SERESTART	0x00080000 /* ERESTART on stop attempts. */
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_SEINTR	0x00200000 /* EINTR on stop attempts. */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
 #define	TDF_UNUSED23	0x00800000 /* --available-- */
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED3	0x08000000 /* Reserved for scheduler private use */
 #define	TDF_ALRMPEND	0x10000000 /* Pending SIGVTALRM needs to be posted. */
 #define	TDF_PROFPEND	0x20000000 /* Pending SIGPROF needs to be posted. */
 #define	TDF_MACPEND	0x40000000 /* AST-based MAC event pending. */
 
 /* Userland debug flags */
 #define	TDB_SUSPEND	0x00000001 /* Thread is suspended by debugger */
 #define	TDB_XSIG	0x00000002 /* Thread is exchanging signal under trace */
 #define	TDB_USERWR	0x00000004 /* Debugger modified memory or registers */
 #define	TDB_SCE		0x00000008 /* Thread performs syscall enter */
 #define	TDB_SCX		0x00000010 /* Thread performs syscall exit */
 #define	TDB_EXEC	0x00000020 /* TDB_SCX from exec(2) family */
 #define	TDB_FORK	0x00000040 /* TDB_SCX from fork(2) that created new
 				      process */
 #define	TDB_STOPATFORK	0x00000080 /* Stop at the return from fork (child
 				      only) */
 #define	TDB_CHILD	0x00000100 /* New child indicator for ptrace() */
 #define	TDB_BORN	0x00000200 /* New LWP indicator for ptrace() */
 #define	TDB_EXIT	0x00000400 /* Exiting LWP indicator for ptrace() */
 #define	TDB_VFORK	0x00000800 /* vfork indicator for ptrace() */
 #define	TDB_FSTP	0x00001000 /* The thread is PT_ATTACH leader */
 #define	TDB_STEP	0x00002000 /* (x86) PSL_T set for PT_STEP */
 
 /*
  * "Private" flags kept in td_pflags:
  * These are only written by curthread and thus need no locking.
  */
 #define	TDP_OLDMASK	0x00000001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x00000002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x00000004 /* Thread is currently in KTRACE code. */
 #define	TDP_BUFNEED	0x00000008 /* Do not recurse into the buf flush */
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock acquisition - deadlock treatment. */
 #define	TDP_NOFAULTING	0x00000080 /* Do not handle page faults. */
 #define	TDP_SIGFASTBLOCK 0x00000100 /* Fast sigblock active */
 #define	TDP_OWEUPC	0x00000200 /* Call addupc() at next AST. */
 #define	TDP_ITHREAD	0x00000400 /* Thread is an interrupt thread. */
 #define	TDP_SYNCIO	0x00000800 /* Local override, disable async i/o. */
 #define	TDP_SCHED1	0x00001000 /* Reserved for scheduler private use */
 #define	TDP_SCHED2	0x00002000 /* Reserved for scheduler private use */
 #define	TDP_SCHED3	0x00004000 /* Reserved for scheduler private use */
 #define	TDP_SCHED4	0x00008000 /* Reserved for scheduler private use */
 #define	TDP_GEOM	0x00010000 /* Settle GEOM before finishing syscall */
 #define	TDP_SOFTDEP	0x00020000 /* Stuck processing softdep worklist */
 #define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
 #define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
 #define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse */
 #define	TDP_KTHREAD	0x00200000 /* This is an official kernel thread */
 #define	TDP_CALLCHAIN	0x00400000 /* Capture thread's callchain */
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
 #define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
 #define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
 #define	TDP_NERRNO	0x08000000 /* Last errno is already in td_errno */
 #define	TDP_UIOHELD	0x10000000 /* Current uio has pages held in td_ma */
 #define	TDP_FORKING	0x20000000 /* Thread is being created through fork() */
 #define	TDP_EXECVMSPC	0x40000000 /* Execve destroyed old vmspace */
 #define	TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */
 
+#define	TDP2_SBPAGES	0x00000001 /* Owns sbusy on some pages */
+
 /*
  * Reasons that the current thread can not be run yet.
  * More than one may apply.
  */
 #define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
 #define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
 #define	TDI_SWAPPED	0x0004	/* Stack not in mem.  Bad juju if run. */
 #define	TDI_LOCK	0x0008	/* Stopped on a lock. */
 #define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */
 
 #define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
 #define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
 #define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
 #define	TD_IS_SWAPPED(td)	((td)->td_inhibitors & TDI_SWAPPED)
 #define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
 #define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
 #define	TD_IS_RUNNING(td)	((td)->td_state == TDS_RUNNING)
 #define	TD_ON_RUNQ(td)		((td)->td_state == TDS_RUNQ)
 #define	TD_CAN_RUN(td)		((td)->td_state == TDS_CAN_RUN)
 #define	TD_IS_INHIBITED(td)	((td)->td_state == TDS_INHIBITED)
 #define	TD_ON_UPILOCK(td)	((td)->td_flags & TDF_UPIBLOCKED)
 #define TD_IS_IDLETHREAD(td)	((td)->td_flags & TDF_IDLETD)
 
 #define	TD_CAN_ABORT(td)	(TD_ON_SLEEPQ((td)) &&			\
 				    ((td)->td_flags & TDF_SINTR) != 0)
 
 #define	KTDSTATE(td)							\
 	(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep"  :		\
 	((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" :	\
 	((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" :		\
 	((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" :		\
 	((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
 
 #define	TD_SET_INHIB(td, inhib) do {			\
 	(td)->td_state = TDS_INHIBITED;			\
 	(td)->td_inhibitors |= (inhib);			\
 } while (0)
 
 #define	TD_CLR_INHIB(td, inhib) do {			\
 	if (((td)->td_inhibitors & (inhib)) &&		\
 	    (((td)->td_inhibitors &= ~(inhib)) == 0))	\
 		(td)->td_state = TDS_CAN_RUN;		\
 } while (0)
 
 #define	TD_SET_SLEEPING(td)	TD_SET_INHIB((td), TDI_SLEEPING)
 #define	TD_SET_SWAPPED(td)	TD_SET_INHIB((td), TDI_SWAPPED)
 #define	TD_SET_LOCK(td)		TD_SET_INHIB((td), TDI_LOCK)
 #define	TD_SET_SUSPENDED(td)	TD_SET_INHIB((td), TDI_SUSPENDED)
 #define	TD_SET_IWAIT(td)	TD_SET_INHIB((td), TDI_IWAIT)
 #define	TD_SET_EXITING(td)	TD_SET_INHIB((td), TDI_EXITING)
 
 #define	TD_CLR_SLEEPING(td)	TD_CLR_INHIB((td), TDI_SLEEPING)
 #define	TD_CLR_SWAPPED(td)	TD_CLR_INHIB((td), TDI_SWAPPED)
 #define	TD_CLR_LOCK(td)		TD_CLR_INHIB((td), TDI_LOCK)
 #define	TD_CLR_SUSPENDED(td)	TD_CLR_INHIB((td), TDI_SUSPENDED)
 #define	TD_CLR_IWAIT(td)	TD_CLR_INHIB((td), TDI_IWAIT)
 
 #define	TD_SET_RUNNING(td)	(td)->td_state = TDS_RUNNING
 #define	TD_SET_RUNQ(td)		(td)->td_state = TDS_RUNQ
 #define	TD_SET_CAN_RUN(td)	(td)->td_state = TDS_CAN_RUN
 
 #define	TD_SBDRY_INTR(td) \
     (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0)
 #define	TD_SBDRY_ERRNO(td) \
     (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART)
 
 /*
  * Process structure.
  */
 struct proc {
 	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
 	TAILQ_HEAD(, thread) p_threads;	/* (c) all threads. */
 	struct mtx	p_slock;	/* process spin lock */
 	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
 	struct filedesc	*p_fd;		/* (b) Open files. */
 	struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
 	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
 	struct plimit	*p_limit;	/* (c) Resource limits. */
 	struct callout	p_limco;	/* (c) Limit callout handle */
 	struct sigacts	*p_sigacts;	/* (x) Signal actions, state (CPU). */
 
 	int		p_flag;		/* (c) P_* flags. */
 	int		p_flag2;	/* (c) P2_* flags. */
 	enum p_states {
 		PRS_NEW = 0,		/* In creation */
 		PRS_NORMAL,		/* threads can be run. */
 		PRS_ZOMBIE
 	} p_state;			/* (j/c) Process status. */
 	pid_t		p_pid;		/* (b) Process identifier. */
 	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
 	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
 	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
 	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
 	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
 	struct proc	*p_reaper;	/* (e) My reaper. */
 	LIST_HEAD(, proc) p_reaplist;	/* (e) List of my descendants
 					       (if I am reaper). */
 	LIST_ENTRY(proc) p_reapsibling;	/* (e) List of siblings - descendants of
 					       the same reaper. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 	struct mtx	p_statmtx;	/* Lock for the stats */
 	struct mtx	p_itimmtx;	/* Lock for the virt/prof timers */
 	struct mtx	p_profmtx;	/* Lock for the profiling */
 	struct ksiginfo *p_ksi;	/* Locked by parent proc lock */
 	sigqueue_t	p_sigqueue;	/* (c) Sigs not delivered to a td. */
 #define p_siglist	p_sigqueue.sq_signals
 	pid_t		p_oppid;	/* (c + e) Real parent pid. */
 
 /* The following fields are all zeroed upon creation in fork. */
 #define	p_startzero	p_vmspace
 	struct vmspace	*p_vmspace;	/* (b) Address space. */
 	u_int		p_swtick;	/* (c) Tick when swapped in or out. */
 	u_int		p_cowgen;	/* (c) Generation of COW pointers. */
 	struct itimerval p_realtimer;	/* (c) Alarm timer. */
 	struct rusage	p_ru;		/* (a) Exit information. */
 	struct rusage_ext p_rux;	/* (cu) Internal resource usage. */
 	struct rusage_ext p_crux;	/* (c) Internal child resource usage. */
 	int		p_profthreads;	/* (c) Num threads in addupc_task. */
 	volatile int	p_exitthreads;	/* (j) Number of threads exiting */
 	int		p_traceflag;	/* (o) Kernel trace points. */
 	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
 	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
 	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
 	u_int		p_lock;		/* (c) Proclock (prevent swap) count. */
 	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
 	int		p_sigparent;	/* (c) Signal to parent on exit. */
 	int		p_sig;		/* (n) For core dump/debugger XXX. */
 	u_int		p_stops;	/* (c) Stop event bitmask. */
 	u_int		p_stype;	/* (c) Stop event type. */
 	char		p_step;		/* (c) Process is stopped. */
 	u_char		p_pfsflags;	/* (c) Procfs flags. */
 	u_int		p_ptevents;	/* (c + e) ptrace() event mask. */
 	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
 	struct kaioinfo	*p_aioinfo;	/* (y) ASYNC I/O info. */
 	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
 	int		p_suspcount;	/* (j) Num threads in suspended mode. */
 	struct thread	*p_xthread;	/* (c) Trap thread */
 	int		p_boundary_count;/* (j) Num threads at user boundary */
 	int		p_pendingcnt;	/* how many signals are pending */
 	struct itimers	*p_itimers;	/* (c) POSIX interval timers. */
 	struct procdesc	*p_procdesc;	/* (e) Process descriptor, if any. */
 	u_int		p_treeflag;	/* (e) P_TREE flags */
 	int		p_pendingexits; /* (c) Count of pending thread exits. */
 	struct filemon	*p_filemon;	/* (c) filemon-specific data. */
 	int		p_pdeathsig;	/* (c) Signal from parent on exit. */
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
 /* The following fields are all copied upon creation in fork. */
 #define	p_startcopy	p_endzero
 	u_int		p_magic;	/* (b) Magic number. */
 	int		p_osrel;	/* (x) osreldate for the
 					       binary (from ELF note, if any) */
 	uint32_t	p_fctl0;	/* (x) ABI feature control, ELF note */
 	char		p_comm[MAXCOMLEN + 1];	/* (x) Process name. */
 	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
 	struct pargs	*p_args;	/* (c) Process arguments. */
 	rlim_t		p_cpulimit;	/* (c) Current CPU limit in seconds. */
 	signed char	p_nice;		/* (c) Process "nice" value. */
 	int		p_fibnum;	/* in this routing domain XXX MRT */
 	pid_t		p_reapsubtree;	/* (e) Pid of the direct child of the
 					       reaper which spawned
 					       our subtree. */
 	uint16_t	p_elf_machine;	/* (x) ELF machine type */
 	uint64_t	p_elf_flags;	/* (x) ELF flags */
 /* End area that is copied on creation. */
 #define	p_endcopy	p_xexit
 
 	u_int		p_xexit;	/* (c) Exit code. */
 	u_int		p_xsig;		/* (c) Stop/kill sig. */
 	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
 	struct knlist	*p_klist;	/* (c) Knotes attached to this proc. */
 	int		p_numthreads;	/* (c) Number of threads. */
 	struct mdproc	p_md;		/* Any machine-dependent fields. */
 	struct callout	p_itcallout;	/* (h + c) Interval timer callout. */
 	u_short		p_acflag;	/* (c) Accounting flags. */
 	struct proc	*p_peers;	/* (r) */
 	struct proc	*p_leader;	/* (b) */
 	void		*p_emuldata;	/* (c) Emulator state data. */
 	struct label	*p_label;	/* (*) Proc (not subject) MAC label. */
 	STAILQ_HEAD(, ktr_request)	p_ktr;	/* (o) KTR event queue. */
 	LIST_HEAD(, mqueue_notifier)	p_mqnotifier; /* (c) mqueue notifiers.*/
 	struct kdtrace_proc	*p_dtrace; /* (*) DTrace-specific data. */
 	struct cv	p_pwait;	/* (*) wait cv for exit/exec. */
 	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
 	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	/*
 	 * An orphan is the child that has been re-parented to the
 	 * debugger as a result of attaching to it.  Need to keep
 	 * track of them for parent to be able to collect the exit
 	 * status of what used to be children.
 	 */
 	LIST_ENTRY(proc) p_orphan;	/* (e) List of orphan processes. */
 	LIST_HEAD(, proc) p_orphans;	/* (e) Pointer to list of orphans. */
 };
 
 #define	p_session	p_pgrp->pg_session
 #define	p_pgid		p_pgrp->pg_id
 
 #define	NOCPU		(-1)	/* For when we aren't on a CPU. */
 #define	NOCPU_OLD	(255)
 #define	MAXCPU_OLD	(254)
 
 #define	PROC_SLOCK(p)	mtx_lock_spin(&(p)->p_slock)
 #define	PROC_SUNLOCK(p)	mtx_unlock_spin(&(p)->p_slock)
 #define	PROC_SLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_slock, (type))
 
 #define	PROC_STATLOCK(p)	mtx_lock_spin(&(p)->p_statmtx)
 #define	PROC_STATUNLOCK(p)	mtx_unlock_spin(&(p)->p_statmtx)
 #define	PROC_STATLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_statmtx, (type))
 
 #define	PROC_ITIMLOCK(p)	mtx_lock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMUNLOCK(p)	mtx_unlock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_itimmtx, (type))
 
 #define	PROC_PROFLOCK(p)	mtx_lock_spin(&(p)->p_profmtx)
 #define	PROC_PROFUNLOCK(p)	mtx_unlock_spin(&(p)->p_profmtx)
 #define	PROC_PROFLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_profmtx, (type))
 
 /* These flags are kept in p_flag. */
 #define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
 #define	P_KPROC		0x00004	/* Kernel process. */
 #define	P_UNUSED3	0x00008	/* --available-- */
 #define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
 #define	P_PROFIL	0x00020	/* Has started profiling. */
 #define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
 #define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
 #define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
 #define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
 #define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
 #define	P_TRACED	0x00800	/* Debugged process being traced. */
 #define	P_WAITED	0x01000	/* Someone is waiting for us. */
 #define	P_WEXIT		0x02000	/* Working on exiting. */
 #define	P_EXEC		0x04000	/* Process called exec. */
 #define	P_WKILLED	0x08000	/* Killed, go to kernel/user boundary ASAP. */
 #define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
 #define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
 #define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
 #define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
 #define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
 #define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
 #define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
 #define	P_HWPMC		0x800000 /* Process is using HWPMCs */
 #define	P_JAILED	0x1000000 /* Process is in jail. */
 #define	P_TOTAL_STOP	0x2000000 /* Stopped in stop_all_proc. */
 #define	P_INEXEC	0x4000000 /* Process is in execve(). */
 #define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
 #define	P_INMEM		0x10000000 /* Loaded into memory. */
 #define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
 #define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
 #define	P_PPTRACE	0x80000000 /* PT_TRACEME by vforked child. */
 
 #define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
 #define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
 #define	P_KILLED(p)	((p)->p_flag & P_WKILLED)
 
 /* These flags are kept in p_flag2. */
 #define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
 #define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
 #define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOPTRACE on exec(2). */
 #define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
 #define	P2_PTRACE_FSTP	0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */
 #define	P2_TRAPCAP	0x00000020	/* SIGTRAP on ENOTCAPABLE */
 #define	P2_ASLR_ENABLE	0x00000040	/* Force enable ASLR. */
 #define	P2_ASLR_DISABLE	0x00000080	/* Force disable ASLR. */
 #define	P2_ASLR_IGNSTART 0x00000100	/* Enable ASLR to consume sbrk area. */
 #define	P2_PROTMAX_ENABLE 0x00000200	/* Force enable implied PROT_MAX. */
 #define	P2_PROTMAX_DISABLE 0x00000400	/* Force disable implied PROT_MAX. */
 #define	P2_STKGAP_DISABLE 0x00000800	/* Disable stack gap for MAP_STACK */
 #define	P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */
 
 /* Flags protected by proctree_lock, kept in p_treeflags. */
 #define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */
 #define	P_TREE_FIRST_ORPHAN	0x00000002	/* First element of orphan
 						   list */
 #define	P_TREE_REAPER		0x00000004	/* Reaper of subtree */
 
 /*
  * These were process status values (p_stat), now they are only used in
  * legacy conversion code.
  */
 #define	SIDL	1		/* Process being created by fork. */
 #define	SRUN	2		/* Currently runnable. */
 #define	SSLEEP	3		/* Sleeping on an address. */
 #define	SSTOP	4		/* Process debugging or suspension. */
 #define	SZOMB	5		/* Awaiting collection by parent. */
 #define	SWAIT	6		/* Waiting for interrupt. */
 #define	SLOCK	7		/* Blocked on a lock. */
 
 #define	P_MAGIC		0xbeefface
 
 #ifdef _KERNEL
 
 /* Types and flags for mi_switch(). */
 #define	SW_TYPE_MASK		0xff	/* First 8 bits are switch type */
 #define	SWT_NONE		0	/* Unspecified switch. */
 #define	SWT_PREEMPT		1	/* Switching due to preemption. */
 #define	SWT_OWEPREEMPT		2	/* Switching due to owepreempt. */
 #define	SWT_TURNSTILE		3	/* Turnstile contention. */
 #define	SWT_SLEEPQ		4	/* Sleepq wait. */
 #define	SWT_SLEEPQTIMO		5	/* Sleepq timeout wait. */
 #define	SWT_RELINQUISH		6	/* yield call. */
 #define	SWT_NEEDRESCHED		7	/* NEEDRESCHED was set. */
 #define	SWT_IDLE		8	/* Switching from the idle thread. */
 #define	SWT_IWAIT		9	/* Waiting for interrupts. */
 #define	SWT_SUSPEND		10	/* Thread suspended. */
 #define	SWT_REMOTEPREEMPT	11	/* Remote processor preempted. */
 #define	SWT_REMOTEWAKEIDLE	12	/* Remote processor preempted idle. */
 #define	SWT_COUNT		13	/* Number of switch types. */
 /* Flags */
 #define	SW_VOL		0x0100		/* Voluntary switch. */
 #define	SW_INVOL	0x0200		/* Involuntary switch. */
 #define SW_PREEMPT	0x0400		/* The invol switch is a preemption */
 
 /* How values for thread_single(). */
 #define	SINGLE_NO_EXIT	0
 #define	SINGLE_EXIT	1
 #define	SINGLE_BOUNDARY	2
 #define	SINGLE_ALLPROC	3
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PARGS);
 MALLOC_DECLARE(M_PGRP);
 MALLOC_DECLARE(M_SESSION);
 MALLOC_DECLARE(M_SUBPROC);
 #endif
 
 #define	FOREACH_PROC_IN_SYSTEM(p)					\
 	LIST_FOREACH((p), &allproc, p_list)
 #define	FOREACH_THREAD_IN_PROC(p, td)					\
 	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
 
 #define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)
 
 /*
  * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit
  * in a pid_t, as it is used to represent "no process group".
  */
 #define	PID_MAX		99999
 #define	NO_PID		100000
 extern pid_t pid_max;
 
 #define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
 
 
 #define	STOPEVENT(p, e, v) do {						\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,			\
  	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))	{					\
 		PROC_LOCK(p);						\
 		stopevent((p), (e), (v));				\
 		PROC_UNLOCK(p);						\
 	}								\
 } while (0)
 #define	_STOPEVENT(p, e, v) do {					\
 	PROC_LOCK_ASSERT(p, MA_OWNED);					\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \
  	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))						\
 		stopevent((p), (e), (v));				\
 } while (0)
 
 /* Lock and unlock a process. */
 #define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
 #define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
 #define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
 #define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
 #define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))
 
 /* Lock and unlock a process group. */
 #define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
 #define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
 #define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
 #define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))
 
 #define	PGRP_LOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_LOCK(pg);						\
 } while (0)
 #define	PGRP_UNLOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_UNLOCK(pg);					\
 } while (0)
 
 /* Lock and unlock a session. */
 #define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
 #define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
 #define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
 #define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
 
 /*
  * Non-zero p_lock ensures that:
  * - exit1() is not performed until p_lock reaches zero;
  * - the process' threads stack are not swapped out if they are currently
  *   not (P_INMEM).
  *
  * PHOLD() asserts that the process (except the current process) is
  * not exiting, increments p_lock and swaps threads stacks into memory,
  * if needed.
  * _PHOLD() is same as PHOLD(), it takes the process locked.
  * _PHOLD_LITE() also takes the process locked, but comparing with
  * _PHOLD(), it only guarantees that exit1() is not executed,
  * faultin() is not called.
  */
 #define	PHOLD(p) do {							\
 	PROC_LOCK(p);							\
 	_PHOLD(p);							\
 	PROC_UNLOCK(p);							\
 } while (0)
 #define	_PHOLD(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process %p", p));			\
 	(p)->p_lock++;							\
 	if (((p)->p_flag & P_INMEM) == 0)				\
 		faultin((p));						\
 } while (0)
 #define	_PHOLD_LITE(p) do {						\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process %p", p));			\
 	(p)->p_lock++;							\
 } while (0)
 #define	PROC_ASSERT_HELD(p) do {					\
 	KASSERT((p)->p_lock > 0, ("process %p not held", p));		\
 } while (0)
 
 #define	PRELE(p) do {							\
 	PROC_LOCK((p));							\
 	_PRELE((p));							\
 	PROC_UNLOCK((p));						\
 } while (0)
 #define	_PRELE(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	PROC_ASSERT_HELD(p);						\
 	(--(p)->p_lock);						\
 	if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0)		\
 		wakeup(&(p)->p_lock);					\
 } while (0)
 #define	PROC_ASSERT_NOT_HELD(p) do {					\
 	KASSERT((p)->p_lock == 0, ("process %p held", p));		\
 } while (0)
 
 #define	PROC_UPDATE_COW(p) do {						\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(p)->p_cowgen++;						\
 } while (0)
 
 /* Check whether a thread is safe to be swapped out. */
 #define	thread_safetoswapout(td)	((td)->td_flags & TDF_CANSWAP)
 
 /* Control whether or not it is safe for curthread to sleep. */
 #define	THREAD_NO_SLEEPING()		do {				\
 	curthread->td_no_sleeping++;					\
 	MPASS(curthread->td_no_sleeping > 0);				\
 } while (0)
 
 #define	THREAD_SLEEPING_OK()		do {				\
 	MPASS(curthread->td_no_sleeping > 0);				\
 	curthread->td_no_sleeping--;					\
 } while (0)
 
 #define	THREAD_CAN_SLEEP()		((curthread)->td_no_sleeping == 0)
 
 #define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
 #define	PIDHASHLOCK(pid) (&pidhashtbl_lock[((pid) & pidhashlock)])
 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
 extern struct sx *pidhashtbl_lock;
 extern u_long pidhash;
 extern u_long pidhashlock;
 #define	TIDHASH(tid)	(&tidhashtbl[(tid) & tidhash])
 extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
 extern u_long tidhash;
 extern struct rwlock tidhash_lock;
 
 #define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
 extern u_long pgrphash;
 
 extern struct sx allproc_lock;
 extern int allproc_gen;
 extern struct sx proctree_lock;
 extern struct mtx ppeers_lock;
 extern struct mtx procid_lock;
 extern struct proc proc0;		/* Process slot for swapper. */
 extern struct thread0_storage thread0_st;	/* Primary thread in proc0. */
 #define	thread0 (thread0_st.t0st_thread)
 extern struct vmspace vmspace0;		/* VM space for proc0. */
 extern int hogticks;			/* Limit on kernel cpu hogs. */
 extern int lastpid;
 extern int nprocs, maxproc;		/* Current and max number of procs. */
 extern int maxprocperuid;		/* Max procs per uid. */
 extern u_long ps_arg_cache_limit;
 
 LIST_HEAD(proclist, proc);
 TAILQ_HEAD(procqueue, proc);
 TAILQ_HEAD(threadqueue, thread);
 extern struct proclist allproc;		/* List of all processes. */
 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 
 extern struct uma_zone *proc_zone;
 
 struct	proc *pfind(pid_t);		/* Find process by id. */
 struct	proc *pfind_any(pid_t);		/* Find (zombie) process by id. */
 struct	proc *pfind_any_locked(pid_t pid); /* Find process by id, locked. */
 struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
 void	pidhash_slockall(void);		/* Shared lock all pid hash lists. */
 void	pidhash_sunlockall(void);	/* Shared unlock all pid hash lists. */
 
 struct	fork_req {
 	int		fr_flags;
 	int		fr_pages;
 	int 		*fr_pidp;
 	struct proc 	**fr_procp;
 	int 		*fr_pd_fd;
 	int 		fr_pd_flags;
 	struct filecaps	*fr_pd_fcaps;
 	int 		fr_flags2;
 #define	FR2_DROPSIG_CAUGHT	0x00001	/* Drop caught non-DFL signals */
 };
 
 /*
  * pget() flags.
  */
 #define	PGET_HOLD	0x00001	/* Hold the process. */
 #define	PGET_CANSEE	0x00002	/* Check against p_cansee(). */
 #define	PGET_CANDEBUG	0x00004	/* Check against p_candebug(). */
 #define	PGET_ISCURRENT	0x00008	/* Check that the found process is current. */
 #define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
 #define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
 #define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */
 
 #define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)
 
 int	pget(pid_t pid, int flags, struct proc **pp);
 
 void	ast(struct trapframe *framep);
 struct	thread *choosethread(void);
 int	cr_cansee(struct ucred *u1, struct ucred *u2);
 int	cr_canseesocket(struct ucred *cred, struct socket *so);
 int	cr_canseeothergids(struct ucred *u1, struct ucred *u2);
 int	cr_canseeotheruids(struct ucred *u1, struct ucred *u2);
 int	cr_canseejailproc(struct ucred *u1, struct ucred *u2);
 int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
 int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
 	    struct session *sess);
 int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
 void	faultin(struct proc *p);
 void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 int	fork1(struct thread *, struct fork_req *);
 void	fork_rfppwait(struct thread *);
 void	fork_exit(void (*)(void *, struct trapframe *), void *,
 	    struct trapframe *);
 void	fork_return(struct thread *, struct trapframe *);
 int	inferior(struct proc *p);
 void	kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry,
 	    int *resident_count, bool *super);
 void	kern_yield(int);
 void 	kick_proc0(void);
 void	killjobc(void);
 int	leavepgrp(struct proc *p);
 int	maybe_preempt(struct thread *td);
 void	maybe_yield(void);
 void	mi_switch(int flags);
 int	p_candebug(struct thread *td, struct proc *p);
 int	p_cansee(struct thread *td, struct proc *p);
 int	p_cansched(struct thread *td, struct proc *p);
 int	p_cansignal(struct thread *td, struct proc *p, int signum);
 int	p_canwait(struct thread *td, struct proc *p);
 struct	pargs *pargs_alloc(int len);
 void	pargs_drop(struct pargs *pa);
 void	pargs_hold(struct pargs *pa);
 int	proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
 void	procinit(void);
 int	proc_iterate(int (*cb)(struct proc *, void *), void *cbarg);
 void	proc_linkup0(struct proc *p, struct thread *td);
 void	proc_linkup(struct proc *p, struct thread *td);
 struct proc *proc_realparent(struct proc *child);
 void	proc_reap(struct thread *td, struct proc *p, int *status, int options);
 void	proc_reparent(struct proc *child, struct proc *newparent, bool set_oppid);
 void	proc_add_orphan(struct proc *child, struct proc *parent);
 void	proc_set_traced(struct proc *p, bool stop);
 void	proc_wkilled(struct proc *p);
 struct	pstats *pstats_alloc(void);
 void	pstats_fork(struct pstats *src, struct pstats *dst);
 void	pstats_free(struct pstats *ps);
 void	proc_clear_orphan(struct proc *p);
 void	reaper_abandon_children(struct proc *p, bool exiting);
 int	securelevel_ge(struct ucred *cr, int level);
 int	securelevel_gt(struct ucred *cr, int level);
 void	sess_hold(struct session *);
 void	sess_release(struct session *);
 int	setrunnable(struct thread *, int);
 void	setsugid(struct proc *p);
 int	should_yield(void);
 int	sigonstack(size_t sp);
 void	stopevent(struct proc *, u_int, u_int);
 struct	thread *tdfind(lwpid_t, pid_t);
 void	threadinit(void);
 void	tidhash_add(struct thread *);
 void	tidhash_remove(struct thread *);
 void	cpu_idle(int);
 int	cpu_idle_wakeup(int);
 extern	void (*cpu_idle_hook)(sbintime_t);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
 void	userret(struct thread *, struct trapframe *);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int, int) __dead2;
 void	cpu_copy_thread(struct thread *td, struct thread *td0);
 bool	cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map);
 int	cpu_fetch_syscall_args(struct thread *td);
 void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
 void	cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *);
 int	cpu_procctl(struct thread *td, int idtype, id_t id, int com,
 	    void *data);
 void	cpu_set_syscall_retval(struct thread *, int);
 void	cpu_set_upcall(struct thread *, void (*)(void *), void *,
 	    stack_t *);
 int	cpu_set_user_tls(struct thread *, void *tls_base);
 void	cpu_thread_alloc(struct thread *);
 void	cpu_thread_clean(struct thread *);
 void	cpu_thread_exit(struct thread *);
 void	cpu_thread_free(struct thread *);
 void	cpu_thread_swapin(struct thread *);
 void	cpu_thread_swapout(struct thread *);
 struct	thread *thread_alloc(int pages);
 int	thread_alloc_stack(struct thread *, int pages);
 int	thread_check_susp(struct thread *td, bool sleep);
 void	thread_cow_get_proc(struct thread *newtd, struct proc *p);
 void	thread_cow_get(struct thread *newtd, struct thread *td);
 void	thread_cow_free(struct thread *td);
 void	thread_cow_update(struct thread *td);
 int	thread_create(struct thread *td, struct rtprio *rtp,
 	    int (*initialize_thread)(struct thread *, void *), void *thunk);
 void	thread_exit(void) __dead2;
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct proc *p);
 void	thread_reap(void);
 int	thread_single(struct proc *p, int how);
 void	thread_single_end(struct proc *p, int how);
 void	thread_stash(struct thread *td);
 void	thread_stopped(struct proc *p);
 void	childproc_stopped(struct proc *child, int reason);
 void	childproc_continued(struct proc *child);
 void	childproc_exited(struct proc *child);
 int	thread_suspend_check(int how);
 bool	thread_suspend_check_needed(void);
 void	thread_suspend_switch(struct thread *, struct proc *p);
 void	thread_suspend_one(struct thread *td);
 void	thread_unlink(struct thread *td);
 void	thread_unsuspend(struct proc *p);
 void	thread_wait(struct proc *p);
 struct thread	*thread_find(struct proc *p, lwpid_t tid);
 
 void	stop_all_proc(void);
 void	resume_all_proc(void);
 
 static __inline int
 curthread_pflags_set(int flags)
 {
 	struct thread *td;
 	int save;
 
 	td = curthread;
 	save = ~flags | (td->td_pflags & flags);
 	td->td_pflags |= flags;
 	return (save);
 }
 
 static __inline void
 curthread_pflags_restore(int save)
 {
 
 	curthread->td_pflags &= save;
 }
 
 static __inline int
 curthread_pflags2_set(int flags)
 {
 	struct thread *td;
 	int save;
 
 	td = curthread;
 	save = ~flags | (td->td_pflags2 & flags);
 	td->td_pflags2 |= flags;
 	return (save);
 }
 
 static __inline void
 curthread_pflags2_restore(int save)
 {
 
 	curthread->td_pflags2 &= save;
 }
 
 static __inline __pure2 struct td_sched *
 td_get_sched(struct thread *td)
 {
 
 	return ((struct td_sched *)&td[1]);
 }
 
 extern void (*softdep_ast_cleanup)(struct thread *);
 static __inline void
 td_softdep_cleanup(struct thread *td)
 {
 
 	if (td->td_su != NULL && softdep_ast_cleanup != NULL)
 		softdep_ast_cleanup(td);
 }
 
 #define	PROC_ID_PID	0
 #define	PROC_ID_GROUP	1
 #define	PROC_ID_SESSION	2
 #define	PROC_ID_REAP	3
 
 void	proc_id_set(int type, pid_t id);
 void	proc_id_set_cond(int type, pid_t id);
 void	proc_id_clear(int type, pid_t id);
 
 EVENTHANDLER_LIST_DECLARE(process_ctor);
 EVENTHANDLER_LIST_DECLARE(process_dtor);
 EVENTHANDLER_LIST_DECLARE(process_init);
 EVENTHANDLER_LIST_DECLARE(process_fini);
 EVENTHANDLER_LIST_DECLARE(process_exit);
 EVENTHANDLER_LIST_DECLARE(process_fork);
 EVENTHANDLER_LIST_DECLARE(process_exec);
 
 EVENTHANDLER_LIST_DECLARE(thread_ctor);
 EVENTHANDLER_LIST_DECLARE(thread_dtor);
 EVENTHANDLER_LIST_DECLARE(thread_init);
 
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_PROC_H_ */