Index: head/sys/fs/nfs/nfs_commonkrpc.c
===================================================================
--- head/sys/fs/nfs/nfs_commonkrpc.c	(revision 320061)
+++ head/sys/fs/nfs/nfs_commonkrpc.c	(revision 320062)
@@ -1,1314 +1,1323 @@
 /*-
  * Copyright (c) 1989, 1991, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Socket operations for use by nfs
  */
 
 #include "opt_kgssapi.h"
 #include "opt_nfs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 
 #include <rpc/rpc.h>
 #include <rpc/krpc.h>
 
 #include <kgssapi/krb5/kcrypto.h>
 
 #include <fs/nfs/nfsport.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_nfs23_start_probe_func_t
 		dtrace_nfscl_nfs234_start_probe;
 
 dtrace_nfsclient_nfs23_done_probe_func_t
 		dtrace_nfscl_nfs234_done_probe;
 
 /*
  * Registered probes by RPC type.
  */
 uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];
 
 uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];
 
 uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
 uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
 #endif
 
 NFSSTATESPINLOCK;
 NFSREQSPINLOCK;
 NFSDLOCKMUTEX;
 NFSCLSTATEMUTEX;
 extern struct nfsstatsv1 nfsstatsv1;
 extern struct nfsreqhead nfsd_reqq;
 extern int nfscl_ticks;
 extern void (*ncl_call_invalcaches)(struct vnode *);
 extern int nfs_numnfscbd;
 extern int nfscl_debuglevel;
 
 SVCPOOL		*nfscbd_pool;
 static int	nfsrv_gsscallbackson = 0;
 static int	nfs_bufpackets = 4;
 static int	nfs_reconnects;
 static int	nfs3_jukebox_delay = 10;
 static int	nfs_skip_wcc_data_onerr = 1;
 
 SYSCTL_DECL(_vfs_nfs);
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
     "Buffer reservation size 2 < x < 64");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
     "Number of times the nfs client has had to reconnect");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
     "Number of seconds to delay a retry after receiving EJUKEBOX");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
     "Disable weak cache consistency checking when server returns an error");
 
 static void	nfs_down(struct nfsmount *, struct thread *, const char *,
     int, int);
 static void	nfs_up(struct nfsmount *, struct thread *, const char *,
     int, int);
 static int	nfs_msg(struct thread *, const char *, const char *, int);
 
 struct nfs_cached_auth {
 	int		ca_refs; /* refcount, including 1 from the cache */
 	uid_t		ca_uid;	 /* uid that corresponds to this auth */
 	AUTH		*ca_auth; /* RPC auth handle */
 };
 
 static int nfsv2_procid[NFS_V3NPROCS] = {
 	NFSV2PROC_NULL,
 	NFSV2PROC_GETATTR,
 	NFSV2PROC_SETATTR,
 	NFSV2PROC_LOOKUP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_READLINK,
 	NFSV2PROC_READ,
 	NFSV2PROC_WRITE,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_MKDIR,
 	NFSV2PROC_SYMLINK,
 	NFSV2PROC_CREATE,
 	NFSV2PROC_REMOVE,
 	NFSV2PROC_RMDIR,
 	NFSV2PROC_RENAME,
 	NFSV2PROC_LINK,
 	NFSV2PROC_READDIR,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_STATFS,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 	NFSV2PROC_NOOP,
 };
 
 /*
  * Initialize sockets and congestion for a new NFS connection.
  * We do not free the sockaddr if error.
  */
 int
 newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
     struct ucred *cred, NFSPROC_T *p, int callback_retry_mult)
 {
 	int rcvreserve, sndreserve;
-	int pktscale;
+	int pktscale, pktscalesav;
 	struct sockaddr *saddr;
 	struct ucred *origcred;
 	CLIENT *client;
 	struct netconfig *nconf;
 	struct socket *so;
 	int one = 1, retries, error = 0;
 	struct thread *td = curthread;
 	SVCXPRT *xprt;
 	struct timeval timo;
 
 	/*
 	 * We need to establish the socket using the credentials of
 	 * the mountpoint.  Some parts of this process (such as
 	 * sobind() and soconnect()) will use the curent thread's
 	 * credential instead of the socket credential.  To work
 	 * around this, temporarily change the current thread's
 	 * credential to that of the mountpoint.
 	 *
 	 * XXX: It would be better to explicitly pass the correct
 	 * credential to sobind() and soconnect().
 	 */
 	origcred = td->td_ucred;
 
 	/*
 	 * Use the credential in nr_cred, if not NULL.
 	 */
 	if (nrp->nr_cred != NULL)
 		td->td_ucred = nrp->nr_cred;
 	else
 		td->td_ucred = cred;
 	saddr = nrp->nr_nam;
 
 	if (saddr->sa_family == AF_INET)
 		if (nrp->nr_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp");
 		else
 			nconf = getnetconfigent("tcp");
 	else
 		if (nrp->nr_sotype == SOCK_DGRAM)
 			nconf = getnetconfigent("udp6");
 		else
 			nconf = getnetconfigent("tcp6");
 			
 	pktscale = nfs_bufpackets;
 	if (pktscale < 2)
 		pktscale = 2;
 	if (pktscale > 64)
 		pktscale = 64;
+	pktscalesav = pktscale;
 	/*
 	 * soreserve() can fail if sb_max is too small, so shrink pktscale
 	 * and try again if there is an error.
 	 * Print a log message suggesting increasing sb_max.
 	 * Creating a socket and doing this is necessary since, if the
 	 * reservation sizes are too large and will make soreserve() fail,
 	 * the connection will work until a large send is attempted and
 	 * then it will loop in the krpc code.
 	 */
 	so = NULL;
 	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
 	error = socreate(saddr->sa_family, &so, nrp->nr_sotype, 
 	    nrp->nr_soproto, td->td_ucred, td);
 	if (error) {
 		td->td_ucred = origcred;
 		goto out;
 	}
 	do {
-	    if (error != 0 && pktscale > 2)
+	    if (error != 0 && pktscale > 2) {
+		if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
+		    pktscale == pktscalesav)
+		    printf("Consider increasing kern.ipc.maxsockbuf\n");
 		pktscale--;
+	    }
 	    if (nrp->nr_sotype == SOCK_DGRAM) {
 		if (nmp != NULL) {
 			sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
 			    pktscale;
 			rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
 			    pktscale;
 		} else {
 			sndreserve = rcvreserve = 1024 * pktscale;
 		}
 	    } else {
 		if (nrp->nr_sotype != SOCK_STREAM)
 			panic("nfscon sotype");
 		if (nmp != NULL) {
-			sndreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR +
+			sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
 			    sizeof (u_int32_t)) * pktscale;
-			rcvreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR +
+			rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR +
 			    sizeof (u_int32_t)) * pktscale;
 		} else {
 			sndreserve = rcvreserve = 1024 * pktscale;
 		}
 	    }
 	    error = soreserve(so, sndreserve, rcvreserve);
+	    if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
+		pktscale <= 2)
+		printf("Must increase kern.ipc.maxsockbuf or reduce"
+		    " rsize, wsize\n");
 	} while (error != 0 && pktscale > 2);
 	soclose(so);
 	if (error) {
 		td->td_ucred = origcred;
 		goto out;
 	}
 
 	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
 	    nrp->nr_vers, sndreserve, rcvreserve);
 	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
 	if (nmp != NULL) {
 		if ((nmp->nm_flag & NFSMNT_INT))
 			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
 		if ((nmp->nm_flag & NFSMNT_RESVPORT))
 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
 		if (NFSHASSOFT(nmp)) {
 			if (nmp->nm_sotype == SOCK_DGRAM)
 				/*
 				 * For UDP, the large timeout for a reconnect
 				 * will be set to "nm_retry * nm_timeo / 2", so
 				 * we only want to do 2 reconnect timeout
 				 * retries.
 				 */
 				retries = 2;
 			else
 				retries = nmp->nm_retry;
 		} else
 			retries = INT_MAX;
 		/* cred == NULL for DS connects. */
 		if (NFSHASNFSV4N(nmp) && cred != NULL) {
 			/*
 			 * Make sure the nfscbd_pool doesn't get destroyed
 			 * while doing this.
 			 */
 			NFSD_LOCK();
 			if (nfs_numnfscbd > 0) {
 				nfs_numnfscbd++;
 				NFSD_UNLOCK();
 				xprt = svc_vc_create_backchannel(nfscbd_pool);
 				CLNT_CONTROL(client, CLSET_BACKCHANNEL, xprt);
 				NFSD_LOCK();
 				nfs_numnfscbd--;
 				if (nfs_numnfscbd == 0)
 					wakeup(&nfs_numnfscbd);
 			}
 			NFSD_UNLOCK();
 		}
 	} else {
 		/*
 		 * Three cases:
 		 * - Null RPC callback to client
 		 * - Non-Null RPC callback to client, wait a little longer
 		 * - upcalls to nfsuserd and gssd (clp == NULL)
 		 */
 		if (callback_retry_mult == 0) {
 			retries = NFSV4_UPCALLRETRY;
 			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
 		} else {
 			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
 		}
 	}
 	CLNT_CONTROL(client, CLSET_RETRIES, &retries);
 
 	if (nmp != NULL) {
 		/*
 		 * For UDP, there are 2 timeouts:
 		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
 		 *   that does a retransmit of an RPC request using the same 
 		 *   socket and xid. This is what you normally want to do,
 		 *   since NFS servers depend on "same xid" for their
 		 *   Duplicate Request Cache.
 		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
 		 *   retransmits on the same socket should fail and a fresh
 		 *   socket created. Each of these timeouts counts as one
 		 *   CLSET_RETRIES as set above.
 		 * Set the initial retransmit timeout for UDP. This timeout
 		 * doesn't exist for TCP and the following call just fails,
 		 * which is ok.
 		 */
 		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
 		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
 		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
 	}
 
 	mtx_lock(&nrp->nr_mtx);
 	if (nrp->nr_client != NULL) {
 		mtx_unlock(&nrp->nr_mtx);
 		/*
 		 * Someone else already connected.
 		 */
 		CLNT_RELEASE(client);
 	} else {
 		nrp->nr_client = client;
 		/*
 		 * Protocols that do not require connections may be optionally
 		 * left unconnected for servers that reply from a port other
 		 * than NFS_PORT.
 		 */
 		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
 			mtx_unlock(&nrp->nr_mtx);
 			CLNT_CONTROL(client, CLSET_CONNECT, &one);
 		} else
 			mtx_unlock(&nrp->nr_mtx);
 	}
 
 
 	/* Restore current thread's credentials. */
 	td->td_ucred = origcred;
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * NFS disconnect. Clean up and unlink.
  */
 void
 newnfs_disconnect(struct nfssockreq *nrp)
 {
 	CLIENT *client;
 
 	mtx_lock(&nrp->nr_mtx);
 	if (nrp->nr_client != NULL) {
 		client = nrp->nr_client;
 		nrp->nr_client = NULL;
 		mtx_unlock(&nrp->nr_mtx);
 		rpc_gss_secpurge_call(client);
 		CLNT_CLOSE(client);
 		CLNT_RELEASE(client);
 	} else {
 		mtx_unlock(&nrp->nr_mtx);
 	}
 }
 
 static AUTH *
 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
     char *srv_principal, gss_OID mech_oid, struct ucred *cred)
 {
 	rpc_gss_service_t svc;
 	AUTH *auth;
 
 	switch (secflavour) {
 	case RPCSEC_GSS_KRB5:
 	case RPCSEC_GSS_KRB5I:
 	case RPCSEC_GSS_KRB5P:
 		if (!mech_oid) {
 			if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
 				return (NULL);
 		}
 		if (secflavour == RPCSEC_GSS_KRB5)
 			svc = rpc_gss_svc_none;
 		else if (secflavour == RPCSEC_GSS_KRB5I)
 			svc = rpc_gss_svc_integrity;
 		else
 			svc = rpc_gss_svc_privacy;
 
 		if (clnt_principal == NULL)
 			auth = rpc_gss_secfind_call(nrp->nr_client, cred,
 			    srv_principal, mech_oid, svc);
 		else {
 			auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
 			    clnt_principal, srv_principal, "kerberosv5",
 			    svc, NULL, NULL, NULL);
 			return (auth);
 		}
 		if (auth != NULL)
 			return (auth);
 		/* fallthrough */
 	case AUTH_SYS:
 	default:
 		return (authunix_create(cred));
 
 	}
 }
 
 /*
  * Callback from the RPC code to generate up/down notifications.
  */
 
 struct nfs_feedback_arg {
 	struct nfsmount *nf_mount;
 	int		nf_lastmsg;	/* last tprintf */
 	int		nf_tprintfmsg;
 	struct thread	*nf_td;
 };
 
 static void
 nfs_feedback(int type, int proc, void *arg)
 {
 	struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
 	struct nfsmount *nmp = nf->nf_mount;
 	time_t now;
 
 	switch (type) {
 	case FEEDBACK_REXMIT2:
 	case FEEDBACK_RECONNECT:
 		now = NFSD_MONOSEC;
 		if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
 			nfs_down(nmp, nf->nf_td,
 			    "not responding", 0, NFSSTA_TIMEO);
 			nf->nf_tprintfmsg = TRUE;
 			nf->nf_lastmsg = now;
 		}
 		break;
 
 	case FEEDBACK_OK:
 		nfs_up(nf->nf_mount, nf->nf_td,
 		    "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
 		break;
 	}
 }
 
 /*
  * newnfs_request - goes something like this
  *	- does the rpc by calling the krpc layer
  *	- break down rpc header and return with nfs reply
  * nb: always frees up nd_mreq mbuf list
  */
 int
 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
     struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
     u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
 {
 	uint32_t retseq, retval, slotseq, *tl;
 	time_t waituntil;
 	int i = 0, j = 0, opcnt, set_sigset = 0, slot;
 	int trycnt, error = 0, usegssname = 0, secflavour = AUTH_SYS;
 	int freeslot, maxslot, reterr, slotpos, timeo;
 	u_int16_t procnum;
 	u_int trylater_delay = 1;
 	struct nfs_feedback_arg nf;
 	struct timeval timo;
 	AUTH *auth;
 	struct rpc_callextra ext;
 	enum clnt_stat stat;
 	struct nfsreq *rep = NULL;
 	char *srv_principal = NULL, *clnt_principal = NULL;
 	sigset_t oldset;
 	struct ucred *authcred;
 	struct nfsclsession *sep;
 	uint8_t sessionid[NFSX_V4SESSIONID];
 
 	sep = dssep;
 	if (xidp != NULL)
 		*xidp = 0;
 	/* Reject requests while attempting a forced unmount. */
 	if (nmp != NULL && (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)) {
 		m_freem(nd->nd_mreq);
 		return (ESTALE);
 	}
 
 	/*
 	 * Set authcred, which is used to acquire RPC credentials to
 	 * the cred argument, by default. The crhold() should not be
 	 * necessary, but will ensure that some future code change
 	 * doesn't result in the credential being free'd prematurely.
 	 */
 	authcred = crhold(cred);
 
 	/* For client side interruptible mounts, mask off the signals. */
 	if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
 		newnfs_set_sigmask(td, &oldset);
 		set_sigset = 1;
 	}
 
 	/*
 	 * XXX if not already connected call nfs_connect now. Longer
 	 * term, change nfs_mount to call nfs_connect unconditionally
 	 * and let clnt_reconnect_create handle reconnects.
 	 */
 	if (nrp->nr_client == NULL)
 		newnfs_connect(nmp, nrp, cred, td, 0);
 
 	/*
 	 * For a client side mount, nmp is != NULL and clp == NULL. For
 	 * server calls (callbacks or upcalls), nmp == NULL.
 	 */
 	if (clp != NULL) {
 		NFSLOCKSTATE();
 		if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
 			secflavour = RPCSEC_GSS_KRB5;
 			if (nd->nd_procnum != NFSPROC_NULL) {
 				if (clp->lc_flags & LCL_GSSINTEGRITY)
 					secflavour = RPCSEC_GSS_KRB5I;
 				else if (clp->lc_flags & LCL_GSSPRIVACY)
 					secflavour = RPCSEC_GSS_KRB5P;
 			}
 		}
 		NFSUNLOCKSTATE();
 	} else if (nmp != NULL && NFSHASKERB(nmp) &&
 	     nd->nd_procnum != NFSPROC_NULL) {
 		if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
 			nd->nd_flag |= ND_USEGSSNAME;
 		if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
 			/*
 			 * If there is a client side host based credential,
 			 * use that, otherwise use the system uid, if set.
 			 * The system uid is in the nmp->nm_sockreq.nr_cred
 			 * credentials.
 			 */
 			if (nmp->nm_krbnamelen > 0) {
 				usegssname = 1;
 				clnt_principal = nmp->nm_krbname;
 			} else if (nmp->nm_uid != (uid_t)-1) {
 				KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 				    ("newnfs_request: NULL nr_cred"));
 				crfree(authcred);
 				authcred = crhold(nmp->nm_sockreq.nr_cred);
 			}
 		} else if (nmp->nm_krbnamelen == 0 &&
 		    nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
 			/*
 			 * If there is no host based principal name and
 			 * the system uid is set and this is root, use the
 			 * system uid, since root won't have user
 			 * credentials in a credentials cache file.
 			 * The system uid is in the nmp->nm_sockreq.nr_cred
 			 * credentials.
 			 */
 			KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 			    ("newnfs_request: NULL nr_cred"));
 			crfree(authcred);
 			authcred = crhold(nmp->nm_sockreq.nr_cred);
 		}
 		if (NFSHASINTEGRITY(nmp))
 			secflavour = RPCSEC_GSS_KRB5I;
 		else if (NFSHASPRIVACY(nmp))
 			secflavour = RPCSEC_GSS_KRB5P;
 		else
 			secflavour = RPCSEC_GSS_KRB5;
 		srv_principal = NFSMNT_SRVKRBNAME(nmp);
 	} else if (nmp != NULL && !NFSHASKERB(nmp) &&
 	    nd->nd_procnum != NFSPROC_NULL &&
 	    (nd->nd_flag & ND_USEGSSNAME) != 0) {
 		/*
 		 * Use the uid that did the mount when the RPC is doing
 		 * NFSv4 system operations, as indicated by the
 		 * ND_USEGSSNAME flag, for the AUTH_SYS case.
 		 * The credentials in nm_sockreq.nr_cred were used for the
 		 * mount.
 		 */
 		KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 		    ("newnfs_request: NULL nr_cred"));
 		crfree(authcred);
 		authcred = crhold(nmp->nm_sockreq.nr_cred);
 	}
 
 	if (nmp != NULL) {
 		bzero(&nf, sizeof(struct nfs_feedback_arg));
 		nf.nf_mount = nmp;
 		nf.nf_td = td;
 		nf.nf_lastmsg = NFSD_MONOSEC -
 		    ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
 	}
 
 	if (nd->nd_procnum == NFSPROC_NULL)
 		auth = authnone_create();
 	else if (usegssname) {
 		/*
 		 * For this case, the authenticator is held in the
 		 * nfssockreq structure, so don't release the reference count
 		 * held on it. --> Don't AUTH_DESTROY() it in this function.
 		 */
 		if (nrp->nr_auth == NULL)
 			nrp->nr_auth = nfs_getauth(nrp, secflavour,
 			    clnt_principal, srv_principal, NULL, authcred);
 		else
 			rpc_gss_refresh_auth_call(nrp->nr_auth);
 		auth = nrp->nr_auth;
 	} else
 		auth = nfs_getauth(nrp, secflavour, NULL,
 		    srv_principal, NULL, authcred);
 	crfree(authcred);
 	if (auth == NULL) {
 		m_freem(nd->nd_mreq);
 		if (set_sigset)
 			newnfs_restore_sigmask(td, &oldset);
 		return (EACCES);
 	}
 	bzero(&ext, sizeof(ext));
 	ext.rc_auth = auth;
 	if (nmp != NULL) {
 		ext.rc_feedback = nfs_feedback;
 		ext.rc_feedback_arg = &nf;
 	}
 
 	procnum = nd->nd_procnum;
 	if ((nd->nd_flag & ND_NFSV4) &&
 	    nd->nd_procnum != NFSPROC_NULL &&
 	    nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
 		procnum = NFSV4PROC_COMPOUND;
 
 	if (nmp != NULL) {
 		NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
 
 		/* Map the procnum to the old NFSv2 one, as required. */
 		if ((nd->nd_flag & ND_NFSV2) != 0) {
 			if (nd->nd_procnum < NFS_V3NPROCS)
 				procnum = nfsv2_procid[nd->nd_procnum];
 			else
 				procnum = NFSV2PROC_NOOP;
 		}
 
 		/*
 		 * Now only used for the R_DONTRECOVER case, but until that is
 		 * supported within the krpc code, I need to keep a queue of
 		 * outstanding RPCs for nfsv4 client requests.
 		 */
 		if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
 			MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq),
 			    M_NFSDREQ, M_WAITOK);
 #ifdef KDTRACE_HOOKS
 		if (dtrace_nfscl_nfs234_start_probe != NULL) {
 			uint32_t probe_id;
 			int probe_procnum;
 	
 			if (nd->nd_flag & ND_NFSV4) {
 				probe_id =
 				    nfscl_nfs4_start_probes[nd->nd_procnum];
 				probe_procnum = nd->nd_procnum;
 			} else if (nd->nd_flag & ND_NFSV3) {
 				probe_id = nfscl_nfs3_start_probes[procnum];
 				probe_procnum = procnum;
 			} else {
 				probe_id =
 				    nfscl_nfs2_start_probes[nd->nd_procnum];
 				probe_procnum = procnum;
 			}
 			if (probe_id != 0)
 				(dtrace_nfscl_nfs234_start_probe)
 				    (probe_id, vp, nd->nd_mreq, cred,
 				     probe_procnum);
 		}
 #endif
 	}
 	trycnt = 0;
 	freeslot = -1;		/* Set to slot that needs to be free'd */
 tryagain:
 	slot = -1;		/* Slot that needs a sequence# increment. */
 	/*
 	 * This timeout specifies when a new socket should be created,
 	 * along with new xid values. For UDP, this should be done
 	 * infrequently, since retransmits of RPC requests should normally
 	 * use the same xid.
 	 */
 	if (nmp == NULL) {
 		timo.tv_usec = 0;
 		if (clp == NULL)
 			timo.tv_sec = NFSV4_UPCALLTIMEO;
 		else
 			timo.tv_sec = NFSV4_CALLBACKTIMEO;
 	} else {
 		if (nrp->nr_sotype != SOCK_DGRAM) {
 			timo.tv_usec = 0;
 			if ((nmp->nm_flag & NFSMNT_NFSV4))
 				timo.tv_sec = INT_MAX;
 			else
 				timo.tv_sec = NFS_TCPTIMEO;
 		} else {
 			if (NFSHASSOFT(nmp)) {
 				/*
 				 * CLSET_RETRIES is set to 2, so this should be
 				 * half of the total timeout required.
 				 */
 				timeo = nmp->nm_retry * nmp->nm_timeo / 2;
 				if (timeo < 1)
 					timeo = 1;
 				timo.tv_sec = timeo / NFS_HZ;
 				timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
 				    NFS_HZ;
 			} else {
 				/* For UDP hard mounts, use a large value. */
 				timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
 				timo.tv_usec = 0;
 			}
 		}
 
 		if (rep != NULL) {
 			rep->r_flags = 0;
 			rep->r_nmp = nmp;
 			/*
 			 * Chain request into list of outstanding requests.
 			 */
 			NFSLOCKREQ();
 			TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
 			NFSUNLOCKREQ();
 		}
 	}
 
 	nd->nd_mrep = NULL;
 	if (clp != NULL && sep != NULL)
 		stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
 		    nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
 	else
 		stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
 		    nd->nd_mreq, &nd->nd_mrep, timo);
 
 	if (rep != NULL) {
 		/*
 		 * RPC done, unlink the request.
 		 */
 		NFSLOCKREQ();
 		TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
 		NFSUNLOCKREQ();
 	}
 
 	/*
 	 * If there was a successful reply and a tprintf msg.
 	 * tprintf a response.
 	 */
 	if (stat == RPC_SUCCESS) {
 		error = 0;
 	} else if (stat == RPC_TIMEDOUT) {
 		NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
 		error = ETIMEDOUT;
 	} else if (stat == RPC_VERSMISMATCH) {
 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
 		error = EOPNOTSUPP;
 	} else if (stat == RPC_PROGVERSMISMATCH) {
 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
 		error = EPROTONOSUPPORT;
 	} else if (stat == RPC_INTR) {
 		error = EINTR;
 	} else {
 		NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
 		error = EACCES;
 	}
 	if (error) {
 		m_freem(nd->nd_mreq);
 		if (usegssname == 0)
 			AUTH_DESTROY(auth);
 		if (rep != NULL)
 			FREE((caddr_t)rep, M_NFSDREQ);
 		if (set_sigset)
 			newnfs_restore_sigmask(td, &oldset);
 		return (error);
 	}
 
 	KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
 
 	/*
 	 * Search for any mbufs that are not a multiple of 4 bytes long
 	 * or with m_data not longword aligned.
 	 * These could cause pointer alignment problems, so copy them to
 	 * well aligned mbufs.
 	 */
 	newnfs_realign(&nd->nd_mrep, M_WAITOK);
 	nd->nd_md = nd->nd_mrep;
 	nd->nd_dpos = NFSMTOD(nd->nd_md, caddr_t);
 	nd->nd_repstat = 0;
 	if (nd->nd_procnum != NFSPROC_NULL &&
 	    nd->nd_procnum != NFSV4PROC_CBNULL) {
 		/* If sep == NULL, set it to the default in nmp. */
 		if (sep == NULL && nmp != NULL)
 			sep = nfsmnt_mdssession(nmp);
 		/*
 		 * and now the actual NFS xdr.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
 		if (nd->nd_repstat >= 10000)
 			NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
 			    (int)nd->nd_repstat);
 
 		/*
 		 * Get rid of the tag, return count and SEQUENCE result for
 		 * NFSv4.
 		 */
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			i = fxdr_unsigned(int, *tl);
 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
 			if (error)
 				goto nfsmout;
 			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			opcnt = fxdr_unsigned(int, *tl++);
 			i = fxdr_unsigned(int, *tl++);
 			j = fxdr_unsigned(int, *tl);
 			if (j >= 10000)
 				NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
 			/*
 			 * If the first op is Sequence, free up the slot.
 			 */
 			if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
 			    (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0))
 				NFSCL_DEBUG(1, "failed seq=%d\n", j);
 			if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
 			    (clp != NULL && i == NFSV4OP_CBSEQUENCE && j == 0)
 			    ) {
 				if (i == NFSV4OP_SEQUENCE)
 					NFSM_DISSECT(tl, uint32_t *,
 					    NFSX_V4SESSIONID +
 					    5 * NFSX_UNSIGNED);
 				else
 					NFSM_DISSECT(tl, uint32_t *,
 					    NFSX_V4SESSIONID +
 					    4 * NFSX_UNSIGNED);
 				mtx_lock(&sep->nfsess_mtx);
 				if (bcmp(tl, sep->nfsess_sessionid,
 				    NFSX_V4SESSIONID) == 0) {
 					tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 					retseq = fxdr_unsigned(uint32_t, *tl++);
 					slot = fxdr_unsigned(int, *tl++);
 					freeslot = slot;
 					if (retseq != sep->nfsess_slotseq[slot])
 						printf("retseq diff 0x%x\n",
 						    retseq);
 					retval = fxdr_unsigned(uint32_t, *++tl);
 					if ((retval + 1) < sep->nfsess_foreslots
 					    )
 						sep->nfsess_foreslots = (retval
 						    + 1);
 					else if ((retval + 1) >
 					    sep->nfsess_foreslots)
 						sep->nfsess_foreslots = (retval
 						    < 64) ? (retval + 1) : 64;
 				}
 				mtx_unlock(&sep->nfsess_mtx);
 
 				/* Grab the op and status for the next one. */
 				if (opcnt > 1) {
 					NFSM_DISSECT(tl, uint32_t *,
 					    2 * NFSX_UNSIGNED);
 					i = fxdr_unsigned(int, *tl++);
 					j = fxdr_unsigned(int, *tl);
 				}
 			}
 		}
 		if (nd->nd_repstat != 0) {
 			if (nd->nd_repstat == NFSERR_BADSESSION &&
 			    nmp != NULL && dssep == NULL) {
 				/*
 				 * If this is a client side MDS RPC, mark
 				 * the MDS session defunct and initiate
 				 * recovery, as required.
 				 * The nfsess_defunct field is protected by
 				 * the NFSLOCKMNT()/nm_mtx lock and not the
 				 * nfsess_mtx lock to simplify its handling,
 				 * for the MDS session. This lock is also
 				 * sufficient for nfsess_sessionid, since it
 				 * never changes in the structure.
 				 */
 				NFSCL_DEBUG(1, "Got badsession\n");
 				NFSLOCKCLSTATE();
 				NFSLOCKMNT(nmp);
 				sep = NFSMNT_MDSSESSION(nmp);
 				if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
 				    NFSX_V4SESSIONID) == 0) {
 					/* Initiate recovery. */
 					sep->nfsess_defunct = 1;
 					NFSCL_DEBUG(1, "Marked defunct\n");
 					if (nmp->nm_clp != NULL) {
 						nmp->nm_clp->nfsc_flags |=
 						    NFSCLFLAGS_RECOVER;
 						wakeup(nmp->nm_clp);
 					}
 				}
 				NFSUNLOCKCLSTATE();
 				/*
 				 * Sleep for up to 1sec waiting for a new
 				 * session.
 				 */
 				mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
 				    "nfsbadsess", hz);
 				/*
 				 * Get the session again, in case a new one
 				 * has been created during the sleep.
 				 */
 				sep = NFSMNT_MDSSESSION(nmp);
 				NFSUNLOCKMNT(nmp);
 				if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
 					reterr = nfsv4_sequencelookup(nmp, sep,
 					    &slotpos, &maxslot, &slotseq,
 					    sessionid);
 					if (reterr == 0) {
 						/* Fill in new session info. */
 						NFSCL_DEBUG(1,
 						  "Filling in new sequence\n");
 						tl = nd->nd_sequence;
 						bcopy(sessionid, tl,
 						    NFSX_V4SESSIONID);
 						tl += NFSX_V4SESSIONID /
 						    NFSX_UNSIGNED;
 						*tl++ = txdr_unsigned(slotseq);
 						*tl++ = txdr_unsigned(slotpos);
 						*tl = txdr_unsigned(maxslot);
 					}
 					if (reterr == NFSERR_BADSESSION ||
 					    reterr == 0) {
 						NFSCL_DEBUG(1,
 						    "Badsession looping\n");
 						m_freem(nd->nd_mrep);
 						nd->nd_mrep = NULL;
 						goto tryagain;
 					}
 					nd->nd_repstat = reterr;
 					NFSCL_DEBUG(1, "Got err=%d\n", reterr);
 				}
 			}
 			if (((nd->nd_repstat == NFSERR_DELAY ||
 			      nd->nd_repstat == NFSERR_GRACE) &&
 			     (nd->nd_flag & ND_NFSV4) &&
 			     nd->nd_procnum != NFSPROC_DELEGRETURN &&
 			     nd->nd_procnum != NFSPROC_SETATTR &&
 			     nd->nd_procnum != NFSPROC_READ &&
 			     nd->nd_procnum != NFSPROC_READDS &&
 			     nd->nd_procnum != NFSPROC_WRITE &&
 			     nd->nd_procnum != NFSPROC_WRITEDS &&
 			     nd->nd_procnum != NFSPROC_OPEN &&
 			     nd->nd_procnum != NFSPROC_CREATE &&
 			     nd->nd_procnum != NFSPROC_OPENCONFIRM &&
 			     nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
 			     nd->nd_procnum != NFSPROC_CLOSE &&
 			     nd->nd_procnum != NFSPROC_LOCK &&
 			     nd->nd_procnum != NFSPROC_LOCKU) ||
 			    (nd->nd_repstat == NFSERR_DELAY &&
 			     (nd->nd_flag & ND_NFSV4) == 0) ||
 			    nd->nd_repstat == NFSERR_RESOURCE) {
 				if (trylater_delay > NFS_TRYLATERDEL)
 					trylater_delay = NFS_TRYLATERDEL;
 				waituntil = NFSD_MONOSEC + trylater_delay;
 				while (NFSD_MONOSEC < waituntil)
 					(void) nfs_catnap(PZERO, 0, "nfstry");
 				trylater_delay *= 2;
 				if (slot != -1) {
 					mtx_lock(&sep->nfsess_mtx);
 					sep->nfsess_slotseq[slot]++;
 					*nd->nd_slotseq = txdr_unsigned(
 					    sep->nfsess_slotseq[slot]);
 					mtx_unlock(&sep->nfsess_mtx);
 				}
 				m_freem(nd->nd_mrep);
 				nd->nd_mrep = NULL;
 				goto tryagain;
 			}
 
 			/*
 			 * If the File Handle was stale, invalidate the
 			 * lookup cache, just in case.
 			 * (vp != NULL implies a client side call)
 			 */
 			if (nd->nd_repstat == ESTALE && vp != NULL) {
 				cache_purge(vp);
 				if (ncl_call_invalcaches != NULL)
 					(*ncl_call_invalcaches)(vp);
 			}
 		}
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			/* Free the slot, as required. */
 			if (freeslot != -1)
 				nfsv4_freeslot(sep, freeslot);
 			/*
 			 * If this op is Putfh, throw its results away.
 			 */
 			if (j >= 10000)
 				NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
 			if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
 				NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
 				i = fxdr_unsigned(int, *tl++);
 				j = fxdr_unsigned(int, *tl);
 				if (j >= 10000)
 					NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
 					    j);
 				/*
 				 * All Compounds that do an Op that must
 				 * be in sequence consist of NFSV4OP_PUTFH
 				 * followed by one of these. As such, we
 				 * can determine if the seqid# should be
 				 * incremented, here.
 				 */
 				if ((i == NFSV4OP_OPEN ||
 				     i == NFSV4OP_OPENCONFIRM ||
 				     i == NFSV4OP_OPENDOWNGRADE ||
 				     i == NFSV4OP_CLOSE ||
 				     i == NFSV4OP_LOCK ||
 				     i == NFSV4OP_LOCKU) &&
 				    (j == 0 ||
 				     (j != NFSERR_STALECLIENTID &&
 				      j != NFSERR_STALESTATEID &&
 				      j != NFSERR_BADSTATEID &&
 				      j != NFSERR_BADSEQID &&
 				      j != NFSERR_BADXDR &&	 
 				      j != NFSERR_RESOURCE &&
 				      j != NFSERR_NOFILEHANDLE)))		 
 					nd->nd_flag |= ND_INCRSEQID;
 			}
 			/*
 			 * If this op's status is non-zero, mark
 			 * that there is no more data to process.
 			 * The exception is Setattr, which always has xdr
 			 * when it has failed.
 			 */
 			if (j != 0 && i != NFSV4OP_SETATTR)
 				nd->nd_flag |= ND_NOMOREDATA;
 
 			/*
 			 * If R_DONTRECOVER is set, replace the stale error
 			 * reply, so that recovery isn't initiated.
 			 */
 			if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
 			     nd->nd_repstat == NFSERR_BADSESSION ||
 			     nd->nd_repstat == NFSERR_STALESTATEID) &&
 			    rep != NULL && (rep->r_flags & R_DONTRECOVER))
 				nd->nd_repstat = NFSERR_STALEDONTRECOVER;
 		}
 	}
 
 #ifdef KDTRACE_HOOKS
 	if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
 		uint32_t probe_id;
 		int probe_procnum;
 
 		if (nd->nd_flag & ND_NFSV4) {
 			probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
 			probe_procnum = nd->nd_procnum;
 		} else if (nd->nd_flag & ND_NFSV3) {
 			probe_id = nfscl_nfs3_done_probes[procnum];
 			probe_procnum = procnum;
 		} else {
 			probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
 			probe_procnum = procnum;
 		}
 		if (probe_id != 0)
 			(dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
 			    nd->nd_mreq, cred, probe_procnum, 0);
 	}
 #endif
 
 	m_freem(nd->nd_mreq);
 	if (usegssname == 0)
 		AUTH_DESTROY(auth);
 	if (rep != NULL)
 		FREE((caddr_t)rep, M_NFSDREQ);
 	if (set_sigset)
 		newnfs_restore_sigmask(td, &oldset);
 	return (0);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	mbuf_freem(nd->nd_mreq);
 	if (usegssname == 0)
 		AUTH_DESTROY(auth);
 	if (rep != NULL)
 		FREE((caddr_t)rep, M_NFSDREQ);
 	if (set_sigset)
 		newnfs_restore_sigmask(td, &oldset);
 	return (error);
 }
 
 /*
  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
  * wait for all requests to complete. This is used by forced unmounts
  * to terminate any outstanding RPCs.
  */
 int
 newnfs_nmcancelreqs(struct nfsmount *nmp)
 {
 
 	/* Nothing to close when the mount has no RPC client handle. */
 	if (nmp->nm_sockreq.nr_client == NULL)
 		return (0);
 	/* Close the mount's RPC client handle; this always reports 0. */
 	CLNT_CLOSE(nmp->nm_sockreq.nr_client);
 	return (0);
 }
 
 /*
  * Any signal that can interrupt an NFS operation in an intr mount
  * should be added to this set. SIGSTOP and SIGKILL cannot be masked.
  */
 int newnfs_sig_set[] = {
 	/* Walked with nitems() by nfs_sig_pending() and newnfs_set_sigmask(). */
 	SIGINT,
 	SIGTERM,
 	SIGHUP,
 	SIGKILL,
 	SIGQUIT
 };
 
 /*
  * Check to see if one of the signals in our subset is pending on
  * the process (in an intr mount).
  */
 static int
 nfs_sig_pending(sigset_t set)
 {
 	int idx;
 
 	/*
 	 * Return 1 as soon as any signal listed in newnfs_sig_set is a
 	 * member of the supplied set, 0 when none of them is.
 	 */
 	idx = 0;
 	while (idx < nitems(newnfs_sig_set)) {
 		if (SIGISMEMBER(set, newnfs_sig_set[idx]))
 			return (1);
 		idx++;
 	}
 	return (0);
 }
  
 /*
  * The set/restore sigmask functions are used to (temporarily) overwrite
  * the thread td_sigmask during an RPC call (for example). These are also
  * used in other places in the NFS client that might tsleep().
  */
 void
 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
 {
 	sigset_t newset;
 	int i;
 	struct proc *p;
 	
 	/* Start from a mask that blocks every signal. */
 	SIGFILLSET(newset);
 	if (td == NULL)
 		td = curthread; /* XXX */
 	p = td->td_proc;
 	/* Remove the NFS set of signals from newset */
 	PROC_LOCK(p);
 	/* ps_mtx is held while ps_sigignore is examined below. */
 	mtx_lock(&p->p_sigacts->ps_mtx);
 	for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
 		/*
 		 * But make sure we leave the ones already masked
 		 * by the thread or ignored by the process, ie. remove
 		 * the signal from the temporary signalmask only if it
 		 * wasn't already in td_sigmask and isn't in ps_sigignore.
 		 */
 		if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
 		    !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
 			SIGDELSET(newset, newnfs_sig_set[i]);
 	}
 	mtx_unlock(&p->p_sigacts->ps_mtx);
 	/*
 	 * Install newset and hand the previous mask back through oldset.
 	 * The proc lock taken above is still held across this call, which
 	 * is what SIGPROCMASK_PROC_LOCKED tells kern_sigprocmask().
 	 */
 	kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
 	    SIGPROCMASK_PROC_LOCKED);
 	PROC_UNLOCK(p);
 }
 
 void
 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
 {
 	struct thread *target;
 
 	/* Fall back to the calling thread when none was supplied. */
 	target = (td != NULL) ? td : curthread; /* XXX */
 	/* Reinstall the saved mask; the mask being replaced is discarded. */
 	kern_sigprocmask(target, SIG_SETMASK, set, NULL, 0);
 }
 
 /*
  * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
  * old one after msleep() returns.
  */
 int
 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
 {
 	sigset_t oldset;
 	int error;
 
 	/*
 	 * ident, mtx, priority, wmesg and timo are handed to msleep()
 	 * unchanged and its return value is returned to the caller.
 	 * Without PCATCH the signal mask is left alone.
 	 */
 	if ((priority & PCATCH) == 0)
 		return (msleep(ident, mtx, priority, wmesg, timo));
 	if (td == NULL)
 		td = curthread; /* XXX */
 	/* Sleep with the temporary NFS signal mask, then put the old one back. */
 	newnfs_set_sigmask(td, &oldset);
 	error = msleep(ident, mtx, priority, wmesg, timo);
 	newnfs_restore_sigmask(td, &oldset);
 	return (error);
 }
 
 /*
  * Test for a termination condition pending on the process.
  * This is used for NFSMNT_INT mounts.
  */
 int
 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
 {
 	struct proc *p;
 	sigset_t pending;
 	int interrupted;
 
 	/* Terminate all requests while attempting a forced unmount. */
 	if ((nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) != 0)
 		return (EIO);
 	/* Only NFSMNT_INT mounts with a known thread can be interrupted. */
 	if ((nmp->nm_flag & NFSMNT_INT) == 0 || td == NULL)
 		return (0);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	/*
 	 * Build the set of signals queued on the process or the thread,
 	 * then drop the ones the thread has masked and the ones the
 	 * process ignores.
 	 */
 	pending = p->p_siglist;
 	SIGSETOR(pending, td->td_siglist);
 	SIGSETNAND(pending, td->td_sigmask);
 	mtx_lock(&p->p_sigacts->ps_mtx);
 	SIGSETNAND(pending, p->p_sigacts->ps_sigignore);
 	mtx_unlock(&p->p_sigacts->ps_mtx);
 	interrupted = (SIGNOTEMPTY(p->p_siglist) ||
 	    SIGNOTEMPTY(td->td_siglist)) && nfs_sig_pending(pending);
 	PROC_UNLOCK(p);
 	return (interrupted ? EINTR : 0);
 }
 
 static int
 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
 {
 	struct proc *p;
 
 	/* tprintf() is handed a NULL process when no thread was given. */
 	if (td != NULL)
 		p = td->td_proc;
 	else
 		p = NULL;
 	/* The error number is appended only when it is non-zero. */
 	if (error == 0) {
 		tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
 		return (0);
 	}
 	tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
 	    server, msg, error);
 	return (0);
 }
 
 static void
 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
     int error, int flags)
 {
 	int newly_set;
 
 	if (nmp == NULL)
 		return;
 
 	/*
 	 * For each requested state bit that is not yet set: set it under
 	 * nm_mtx, then post the matching VQ event after dropping the lock.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	newly_set = (flags & NFSSTA_TIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_TIMEO) == 0;
 	if (newly_set)
 		nmp->nm_state |= NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (newly_set)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 0);
 
 	mtx_lock(&nmp->nm_mtx);
 	newly_set = (flags & NFSSTA_LOCKTIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_LOCKTIMEO) == 0;
 	if (newly_set)
 		nmp->nm_state |= NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (newly_set)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 0);
 
 	/* The message is always logged, whether or not any state changed. */
 	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 }
 
 static void
 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
     int flags, int tprintfmsg)
 {
 	int newly_cleared;
 
 	if (nmp == NULL)
 		return;
 	/* The message is logged first, and only when the caller asks for it. */
 	if (tprintfmsg != 0)
 		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
 
 	/*
 	 * For each requested state bit that is currently set: clear it
 	 * under nm_mtx, then post the matching VQ event after dropping
 	 * the lock.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	newly_cleared = (flags & NFSSTA_TIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_TIMEO) != 0;
 	if (newly_cleared)
 		nmp->nm_state &= ~NFSSTA_TIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (newly_cleared)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESP, 1);
 
 	mtx_lock(&nmp->nm_mtx);
 	newly_cleared = (flags & NFSSTA_LOCKTIMEO) != 0 &&
 	    (nmp->nm_state & NFSSTA_LOCKTIMEO) != 0;
 	if (newly_cleared)
 		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 	mtx_unlock(&nmp->nm_mtx);
 	if (newly_cleared)
 		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 		    VQ_NOTRESPLOCK, 1);
 }
 
Index: head/sys/fs/nfs/nfsport.h
===================================================================
--- head/sys/fs/nfs/nfsport.h	(revision 320061)
+++ head/sys/fs/nfs/nfsport.h	(revision 320062)
@@ -1,1042 +1,1042 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NFS_NFSPORT_H_
 #define	_NFS_NFSPORT_H_
 
 /*
  * In general, I'm not fond of #includes in .h files, but this seems
  * to be the cleanest way to handle #include files for the ports.
  */
 #ifdef _KERNEL
 #include <sys/unistd.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/domain.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/time.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/acl.h>
 #include <sys/module.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
 #include <sys/priv.h>
 #include <sys/kthread.h>
 #include <sys/syscallsubr.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/if_dl.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <machine/in_cksum.h>
 #include <crypto/des/des.h>
 #include <sys/md5.h>
 #include <rpc/rpc.h>
 #include <rpc/rpcsec_gss.h>
 
 /*
  * For Darwin, these functions should be "static" when built in a kext.
  * (This is always defined as nil otherwise.)
  */
 #define	APPLESTATIC
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/ufsmount.h>
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <nfs/nfssvc.h>
 #include "opt_nfs.h"
 #include "opt_ufs.h"
 
 /*
  * These types must be defined before the nfs includes.
  */
 #define	NFSSOCKADDR_T	struct sockaddr *
 #define	NFSPROC_T	struct thread
 #define	NFSDEV_T	dev_t
 #define	NFSSVCARGS	nfssvc_args
 #define	NFSACL_T	struct acl
 
 /*
  * These should be defined as the types used for the corresponding VOP's
  * argument type.
  */
 #define	NFS_ACCESS_ARGS		struct vop_access_args
 #define	NFS_OPEN_ARGS		struct vop_open_args
 #define	NFS_GETATTR_ARGS	struct vop_getattr_args
 #define	NFS_LOOKUP_ARGS		struct vop_lookup_args
 #define	NFS_READDIR_ARGS	struct vop_readdir_args
 
 /*
  * Allocate mbufs. Must succeed and never set the mbuf ptr to NULL.
  * Each macro allocates with M_WAITOK and, should the pointer still be
  * NULL, naps via nfs_catnap() and retries until it is not.
  *
  * NFSMGET(m)		- plain mbuf via MGET().
  * NFSMGETHDR(m)	- packet header mbuf via MGETHDR().
  * NFSMCLGET(m, w)	- as NFSMGET(), followed by MCLGET((m), (w)).
  * NFSMCLGETHDR(m, w)	- as NFSMGETHDR().
  *
  * NOTE(review): NFSMCLGETHDR() never references "w" and does not call
  * MCLGET(), unlike NFSMCLGET() - confirm whether that is intentional.
  */
 #define	NFSMGET(m)	do { 					\
 		MGET((m), M_WAITOK, MT_DATA); 			\
 		while ((m) == NULL ) { 				\
 			(void) nfs_catnap(PZERO, 0, "nfsmget");	\
 			MGET((m), M_WAITOK, MT_DATA); 		\
 		} 						\
 	} while (0)
 #define	NFSMGETHDR(m)	do { 					\
 		MGETHDR((m), M_WAITOK, MT_DATA);		\
 		while ((m) == NULL ) { 				\
 			(void) nfs_catnap(PZERO, 0, "nfsmget");	\
 			MGETHDR((m), M_WAITOK, MT_DATA); 	\
 		} 						\
 	} while (0)
 #define	NFSMCLGET(m, w)	do { 					\
 		MGET((m), M_WAITOK, MT_DATA); 			\
 		while ((m) == NULL ) { 				\
 			(void) nfs_catnap(PZERO, 0, "nfsmget");	\
 			MGET((m), M_WAITOK, MT_DATA); 		\
 		} 						\
 		MCLGET((m), (w));				\
 	} while (0)
 #define	NFSMCLGETHDR(m, w) do { 				\
 		MGETHDR((m), M_WAITOK, MT_DATA);		\
 		while ((m) == NULL ) { 				\
 			(void) nfs_catnap(PZERO, 0, "nfsmget");	\
 			MGETHDR((m), M_WAITOK, MT_DATA); 	\
 		} 						\
 	} while (0)
 #define	NFSMTOD	mtod
 
 /*
  * Client side constant for size of a lockowner name.
  */
 #define	NFSV4CL_LOCKNAMELEN	12
 
 /*
  * Type for a mutex lock.
  */
 #define	NFSMUTEX_T		struct mtx
 
 #endif	/* _KERNEL */
 
 /*
  * NFSv4 Operation numbers.
  */
 #define	NFSV4OP_ACCESS		3
 #define	NFSV4OP_CLOSE		4
 #define	NFSV4OP_COMMIT		5
 #define	NFSV4OP_CREATE		6
 #define	NFSV4OP_DELEGPURGE	7
 #define	NFSV4OP_DELEGRETURN	8
 #define	NFSV4OP_GETATTR		9
 #define	NFSV4OP_GETFH		10
 #define	NFSV4OP_LINK		11
 #define	NFSV4OP_LOCK		12
 #define	NFSV4OP_LOCKT		13
 #define	NFSV4OP_LOCKU		14
 #define	NFSV4OP_LOOKUP		15
 #define	NFSV4OP_LOOKUPP		16
 #define	NFSV4OP_NVERIFY		17
 #define	NFSV4OP_OPEN		18
 #define	NFSV4OP_OPENATTR	19
 #define	NFSV4OP_OPENCONFIRM	20
 #define	NFSV4OP_OPENDOWNGRADE	21
 #define	NFSV4OP_PUTFH		22
 #define	NFSV4OP_PUTPUBFH	23
 #define	NFSV4OP_PUTROOTFH	24
 #define	NFSV4OP_READ		25
 #define	NFSV4OP_READDIR		26
 #define	NFSV4OP_READLINK	27
 #define	NFSV4OP_REMOVE		28
 #define	NFSV4OP_RENAME		29
 #define	NFSV4OP_RENEW		30
 #define	NFSV4OP_RESTOREFH	31
 #define	NFSV4OP_SAVEFH		32
 #define	NFSV4OP_SECINFO		33
 #define	NFSV4OP_SETATTR		34
 #define	NFSV4OP_SETCLIENTID	35
 #define	NFSV4OP_SETCLIENTIDCFRM	36
 #define	NFSV4OP_VERIFY		37
 #define	NFSV4OP_WRITE		38
 #define	NFSV4OP_RELEASELCKOWN	39
 
 /*
  * Must be one greater than the last Operation#.
  */
 #define	NFSV4OP_NOPS		40
 
 /*
  * Additional Ops for NFSv4.1.
  */
 #define	NFSV4OP_BACKCHANNELCTL	40
 #define	NFSV4OP_BINDCONNTOSESS	41
 #define	NFSV4OP_EXCHANGEID	42
 #define	NFSV4OP_CREATESESSION	43
 #define	NFSV4OP_DESTROYSESSION	44
 #define	NFSV4OP_FREESTATEID	45
 #define	NFSV4OP_GETDIRDELEG	46
 #define	NFSV4OP_GETDEVINFO	47
 #define	NFSV4OP_GETDEVLIST	48
 #define	NFSV4OP_LAYOUTCOMMIT	49
 #define	NFSV4OP_LAYOUTGET	50
 #define	NFSV4OP_LAYOUTRETURN	51
 #define	NFSV4OP_SECINFONONAME	52
 #define	NFSV4OP_SEQUENCE	53
 #define	NFSV4OP_SETSSV		54
 #define	NFSV4OP_TESTSTATEID	55
 #define	NFSV4OP_WANTDELEG	56
 #define	NFSV4OP_DESTROYCLIENTID	57
 #define	NFSV4OP_RECLAIMCOMPL	58
 
 /*
  * Must be one more than last op#.
  * NFSv4.2 isn't implemented yet, but define the op# limit for it.
  */
 #define	NFSV41_NOPS		59
 #define	NFSV42_NOPS		72
 
 /* Quirky case of the illegal op code */
 #define	NFSV4OP_OPILLEGAL	10044
 
 /*
  * Fake NFSV4OP_xxx used for nfsstat. Start at NFSV42_NOPS.
  */
 #define	NFSV4OP_SYMLINK		(NFSV42_NOPS)
 #define	NFSV4OP_MKDIR		(NFSV42_NOPS + 1)
 #define	NFSV4OP_RMDIR		(NFSV42_NOPS + 2)
 #define	NFSV4OP_READDIRPLUS	(NFSV42_NOPS + 3)
 #define	NFSV4OP_MKNOD		(NFSV42_NOPS + 4)
 #define	NFSV4OP_FSSTAT		(NFSV42_NOPS + 5)
 #define	NFSV4OP_FSINFO		(NFSV42_NOPS + 6)
 #define	NFSV4OP_PATHCONF	(NFSV42_NOPS + 7)
 #define	NFSV4OP_V3CREATE	(NFSV42_NOPS + 8)
 
 /*
  * This is the count of the fake operations listed above.
  */
 #define	NFSV4OP_FAKENOPS	9
 
 /*
  * and the Callback OPs
  */
 #define	NFSV4OP_CBGETATTR	3
 #define	NFSV4OP_CBRECALL	4
 
 /*
  * Must be one greater than the last Callback Operation# for NFSv4.0.
  */
 #define	NFSV4OP_CBNOPS		5
 
 /*
  * Additional Callback Ops for NFSv4.1 only.
  */
 #define	NFSV4OP_CBLAYOUTRECALL	5
 #define	NFSV4OP_CBNOTIFY	6
 #define	NFSV4OP_CBPUSHDELEG	7
 #define	NFSV4OP_CBRECALLANY	8
 #define	NFSV4OP_CBRECALLOBJAVAIL 9
 #define	NFSV4OP_CBRECALLSLOT	10
 #define	NFSV4OP_CBSEQUENCE	11
 #define	NFSV4OP_CBWANTCANCELLED	12
 #define	NFSV4OP_CBNOTIFYLOCK	13
 #define	NFSV4OP_CBNOTIFYDEVID	14
 
 #define	NFSV41_CBNOPS		15
 #define	NFSV42_CBNOPS		16
 
 /*
  * The lower numbers -> 21 are used by NFSv2 and v3. These define higher
  * numbers used by NFSv4.
  * NFS_V3NPROCS is one greater than the last V3 op and NFS_NPROCS is
  * one greater than the last number.
  */
 #ifndef	NFS_V3NPROCS
 #define	NFS_V3NPROCS		22
 
 #define	NFSPROC_LOOKUPP		22
 #define	NFSPROC_SETCLIENTID	23
 #define	NFSPROC_SETCLIENTIDCFRM	24
 #define	NFSPROC_LOCK		25
 #define	NFSPROC_LOCKU		26
 #define	NFSPROC_OPEN		27
 #define	NFSPROC_CLOSE		28
 #define	NFSPROC_OPENCONFIRM	29
 #define	NFSPROC_LOCKT		30
 #define	NFSPROC_OPENDOWNGRADE	31
 #define	NFSPROC_RENEW		32
 #define	NFSPROC_PUTROOTFH	33
 #define	NFSPROC_RELEASELCKOWN	34
 #define	NFSPROC_DELEGRETURN	35
 #define	NFSPROC_RETDELEGREMOVE	36
 #define	NFSPROC_RETDELEGRENAME1	37
 #define	NFSPROC_RETDELEGRENAME2	38
 #define	NFSPROC_GETACL		39
 #define	NFSPROC_SETACL		40
 
 /*
  * Must be defined as one higher than the last Proc# above.
  */
 #define	NFSV4_NPROCS		41
 
 /* Additional procedures for NFSv4.1. */
 #define	NFSPROC_EXCHANGEID	41
 #define	NFSPROC_CREATESESSION	42
 #define	NFSPROC_DESTROYSESSION	43
 #define	NFSPROC_DESTROYCLIENT	44
 #define	NFSPROC_FREESTATEID	45
 #define	NFSPROC_LAYOUTGET	46
 #define	NFSPROC_GETDEVICEINFO	47
 #define	NFSPROC_LAYOUTCOMMIT	48
 #define	NFSPROC_LAYOUTRETURN	49
 #define	NFSPROC_RECLAIMCOMPL	50
 #define	NFSPROC_WRITEDS		51
 #define	NFSPROC_READDS		52
 #define	NFSPROC_COMMITDS	53
 
 /*
  * Must be defined as one higher than the last NFSv4.1 Proc# above.
  */
 #define	NFSV41_NPROCS		54
 
 #endif	/* NFS_V3NPROCS */
 
 /*
  * New stats structure.
  * The vers field will be set to NFSSTATS_V1 by the caller.
  */
 #define	NFSSTATS_V1	1
 struct nfsstatsv1 {
 	int		vers;	/* Set to version requested by caller. */
 	uint64_t	attrcache_hits;
 	uint64_t	attrcache_misses;
 	uint64_t	lookupcache_hits;
 	uint64_t	lookupcache_misses;
 	uint64_t	direofcache_hits;
 	uint64_t	direofcache_misses;
 	uint64_t	accesscache_hits;
 	uint64_t	accesscache_misses;
 	uint64_t	biocache_reads;
 	uint64_t	read_bios;
 	uint64_t	read_physios;
 	uint64_t	biocache_writes;
 	uint64_t	write_bios;
 	uint64_t	write_physios;
 	uint64_t	biocache_readlinks;
 	uint64_t	readlink_bios;
 	uint64_t	biocache_readdirs;
 	uint64_t	readdir_bios;
 	uint64_t	rpccnt[NFSV41_NPROCS + 15];
 	uint64_t	rpcretries;
 	uint64_t	srvrpccnt[NFSV42_NOPS + NFSV4OP_FAKENOPS];
 	uint64_t	srvrpc_errs;
 	uint64_t	srv_errs;
 	uint64_t	rpcrequests;
 	uint64_t	rpctimeouts;
 	uint64_t	rpcunexpected;
 	uint64_t	rpcinvalid;
 	uint64_t	srvcache_inproghits;
 	uint64_t	srvcache_idemdonehits;
 	uint64_t	srvcache_nonidemdonehits;
 	uint64_t	srvcache_misses;
 	uint64_t	srvcache_tcppeak;
 	int		srvcache_size;	/* Updated by atomic_xx_int(). */
 	uint64_t	srvclients;
 	uint64_t	srvopenowners;
 	uint64_t	srvopens;
 	uint64_t	srvlockowners;
 	uint64_t	srvlocks;
 	uint64_t	srvdelegates;
 	uint64_t	cbrpccnt[NFSV42_CBNOPS];
 	uint64_t	clopenowners;
 	uint64_t	clopens;
 	uint64_t	cllockowners;
 	uint64_t	cllocks;
 	uint64_t	cldelegates;
 	uint64_t	cllocalopenowners;
 	uint64_t	cllocalopens;
 	uint64_t	cllocallockowners;
 	uint64_t	cllocallocks;
 	uint64_t	srvstartcnt;
 	uint64_t	srvdonecnt;
 	uint64_t	srvbytes[NFSV42_NOPS + NFSV4OP_FAKENOPS];
 	uint64_t	srvops[NFSV42_NOPS + NFSV4OP_FAKENOPS];
 	struct bintime	srvduration[NFSV42_NOPS + NFSV4OP_FAKENOPS];
 	struct bintime	busyfrom;
 	struct bintime	busytime;
 };
 
 /*
  * Old stats structure.
  */
 struct ext_nfsstats {
 	int	attrcache_hits;
 	int	attrcache_misses;
 	int	lookupcache_hits;
 	int	lookupcache_misses;
 	int	direofcache_hits;
 	int	direofcache_misses;
 	int	accesscache_hits;
 	int	accesscache_misses;
 	int	biocache_reads;
 	int	read_bios;
 	int	read_physios;
 	int	biocache_writes;
 	int	write_bios;
 	int	write_physios;
 	int	biocache_readlinks;
 	int	readlink_bios;
 	int	biocache_readdirs;
 	int	readdir_bios;
 	int	rpccnt[NFSV4_NPROCS];
 	int	rpcretries;
 	int	srvrpccnt[NFSV4OP_NOPS + NFSV4OP_FAKENOPS];
 	int	srvrpc_errs;
 	int	srv_errs;
 	int	rpcrequests;
 	int	rpctimeouts;
 	int	rpcunexpected;
 	int	rpcinvalid;
 	int	srvcache_inproghits;
 	int	srvcache_idemdonehits;
 	int	srvcache_nonidemdonehits;
 	int	srvcache_misses;
 	int	srvcache_tcppeak;
 	int	srvcache_size;
 	int	srvclients;
 	int	srvopenowners;
 	int	srvopens;
 	int	srvlockowners;
 	int	srvlocks;
 	int	srvdelegates;
 	int	cbrpccnt[NFSV4OP_CBNOPS];
 	int	clopenowners;
 	int	clopens;
 	int	cllockowners;
 	int	cllocks;
 	int	cldelegates;
 	int	cllocalopenowners;
 	int	cllocalopens;
 	int	cllocallockowners;
 	int	cllocallocks;
 };
 
 #ifdef _KERNEL
 /*
  * Define NFS_NPROCS as NFSV4_NPROCS for the experimental kernel code.
  */
 #ifndef	NFS_NPROCS
 #define	NFS_NPROCS		NFSV4_NPROCS
 #endif
 
 #include <fs/nfs/nfskpiport.h>
 #include <fs/nfs/nfsdport.h>
 #include <fs/nfs/rpcv2.h>
 #include <fs/nfs/nfsproto.h>
 #include <fs/nfs/nfs.h>
 #include <fs/nfs/nfsclstate.h>
 #include <fs/nfs/nfs_var.h>
 #include <fs/nfs/nfsm_subs.h>
 #include <fs/nfs/nfsrvcache.h>
 #include <fs/nfs/nfsrvstate.h>
 #include <fs/nfs/xdr_subs.h>
 #include <fs/nfs/nfscl.h>
 #include <nfsclient/nfsargs.h>
 #include <fs/nfsclient/nfsmount.h>
 
 /*
  * Just to keep nfs_var.h happy.
  */
 struct nfs_vattr {
 	int	junk;
 };
 
 struct nfsvattr {
 	struct vattr	na_vattr;
 	nfsattrbit_t	na_suppattr;
 	u_int32_t	na_mntonfileno;
 	u_int64_t	na_filesid[2];
 };
 
 #define	na_type		na_vattr.va_type
 #define	na_mode		na_vattr.va_mode
 #define	na_nlink	na_vattr.va_nlink
 #define	na_uid		na_vattr.va_uid
 #define	na_gid		na_vattr.va_gid
 #define	na_fsid		na_vattr.va_fsid
 #define	na_fileid	na_vattr.va_fileid
 #define	na_size		na_vattr.va_size
 #define	na_blocksize	na_vattr.va_blocksize
 #define	na_atime	na_vattr.va_atime
 #define	na_mtime	na_vattr.va_mtime
 #define	na_ctime	na_vattr.va_ctime
 #define	na_gen		na_vattr.va_gen
 #define	na_flags	na_vattr.va_flags
 #define	na_rdev		na_vattr.va_rdev
 #define	na_bytes	na_vattr.va_bytes
 #define	na_filerev	na_vattr.va_filerev
 #define	na_vaflags	na_vattr.va_vaflags
 
 #include <fs/nfsclient/nfsnode.h>
 
 /*
  * This is the header structure used for the lists, etc. (It has the
  * above record in it.)
  */
 struct nfsrv_stablefirst {
 	LIST_HEAD(, nfsrv_stable) nsf_head;	/* Head of nfsrv_stable list */
 	time_t		nsf_eograce;	/* Time grace period ends */
 	time_t		*nsf_bootvals;	/* Previous boottime values */
 	struct file	*nsf_fp;	/* File table pointer */
 	u_char		nsf_flags;	/* NFSNSF_ flags */
 	struct nfsf_rec	nsf_rec;	/* and above first record */
 };
 #define	nsf_lease	nsf_rec.lease
 #define	nsf_numboots	nsf_rec.numboots
 
 /* NFSNSF_xxx flags */
 #define	NFSNSF_UPDATEDONE	0x01
 #define	NFSNSF_GRACEOVER	0x02
 #define	NFSNSF_NEEDLOCK		0x04
 #define	NFSNSF_EXPIREDCLIENT	0x08
 #define	NFSNSF_NOOPENS		0x10
 #define	NFSNSF_OK		0x20
 
 /*
  * Maximum number of boot times allowed in record. Although there is
  * really no need for a fixed upper bound, this serves as a sanity check
  * for a corrupted file.
  */
 #define	NFSNSF_MAXNUMBOOTS	10000
 
 /*
  * This structure defines the other records in the file. The
  * nst_client array is actually the size of the client string name.
  */
 struct nfst_rec {
 	u_int16_t	len;		/* presumably the length of client[] - TODO confirm */
 	u_char		flag;		/* NFSNST_xxx flag values */
 	u_char		client[1];	/* Client string name; really variable length */
 };
 /* and the values for flag */
 #define	NFSNST_NEWSTATE	0x1
 #define	NFSNST_REVOKE		0x2
 #define	NFSNST_GOTSTATE		0x4
 
 /*
  * This structure is linked onto nfsrv_stablefirst for the duration of
  * reclaim.
  */
 struct nfsrv_stable {
 	LIST_ENTRY(nfsrv_stable) nst_list;	/* Linkage for a list of nfsrv_stable */
 	struct nfsclient	*nst_clp;	/* presumably the matching client - TODO confirm */
 	struct nfst_rec		nst_rec;	/* Embedded file record */
 };
 /*
  * Shorthand for members of the embedded nst_rec.
  * NOTE(review): struct nfst_rec as declared in this header has only
  * len, flag and client[], so nst_timestamp names a member that does not
  * exist and would fail to compile if used - confirm and remove or fix.
  */
 #define	nst_timestamp	nst_rec.timestamp
 #define	nst_len		nst_rec.len
 #define	nst_flag	nst_rec.flag
 #define	nst_client	nst_rec.client
 
 /*
  * At some point the server will run out of kernel storage for
  * state structures. For FreeBSD5.2, this results in a panic
  * kmem_map is full. It happens at well over 1000000 opens plus
  * locks on a PIII-800 with 256Mbytes, so that is where I've set
  * the limit. If your server panics due to too many opens/locks,
  * decrease the size of NFSRV_V4STATELIMIT. If you find the server
  * returning NFS4ERR_RESOURCE a lot and have lots of memory, try
  * increasing it.
  */
 #define	NFSRV_V4STATELIMIT	500000	/* Max # of Opens + Locks */
 
 /*
  * The type required differs with BSDen (just the second arg).
  */
 void nfsrvd_rcv(struct socket *, void *, int);
 
 /*
  * Macros for handling socket addresses. (Hopefully this makes the code
  * more portable, since I've noticed some 'BSD don't have sockaddrs in
  * mbufs any more.)
  */
 #define	NFSSOCKADDR(a, t)	((t)(a))
 #define	NFSSOCKADDRALLOC(a) 					\
     do {							\
 	MALLOC((a), struct sockaddr *, sizeof (struct sockaddr), \
 	    M_SONAME, M_WAITOK); 				\
 	NFSBZERO((a), sizeof (struct sockaddr)); 		\
     } while (0)
 #define	NFSSOCKADDRSIZE(a, s)		((a)->sa_len = (s))
 #define	NFSSOCKADDRFREE(a) 					\
 	do { 							\
 		if (a) 						\
 			FREE((caddr_t)(a), M_SONAME); 		\
 	} while (0)
 
 /*
  * These should be defined as a process or thread structure, as required
  * for signal handling, etc.
  */
 #define	NFSNEWCRED(c)		(crdup(c))
 #define	NFSPROCCRED(p)		((p)->td_ucred)
 #define	NFSFREECRED(c)		(crfree(c))
 #define	NFSUIOPROC(u, p)	((u)->uio_td = NULL)
 #define	NFSPROCP(p)		((p)->td_proc)
 
 /*
  * Define these so that cn_hash and its length is ignored.
  */
 #define	NFSCNHASHZERO(c)
 #define	NFSCNHASH(c, v)
 #define	NCHNAMLEN	9999999
 
 /*
  * These macros are defined to initialize and set the timer routine.
  */
 #define	NFS_TIMERINIT \
 	newnfs_timer(NULL)
 
 /*
  * Handle SMP stuff:
  */
 #define	NFSSTATESPINLOCK	extern struct mtx nfs_state_mutex
 #define	NFSLOCKSTATE()		mtx_lock(&nfs_state_mutex)
 #define	NFSUNLOCKSTATE()	mtx_unlock(&nfs_state_mutex)
 #define	NFSSTATEMUTEXPTR	(&nfs_state_mutex)
 #define	NFSREQSPINLOCK		extern struct mtx nfs_req_mutex
 #define	NFSLOCKREQ()		mtx_lock(&nfs_req_mutex)
 #define	NFSUNLOCKREQ()		mtx_unlock(&nfs_req_mutex)
 #define	NFSSOCKMUTEX		extern struct mtx nfs_slock_mutex
 #define	NFSSOCKMUTEXPTR		(&nfs_slock_mutex)
 #define	NFSLOCKSOCK()		mtx_lock(&nfs_slock_mutex)
 #define	NFSUNLOCKSOCK()		mtx_unlock(&nfs_slock_mutex)
 #define	NFSNAMEIDMUTEX		extern struct mtx nfs_nameid_mutex
 #define	NFSLOCKNAMEID()		mtx_lock(&nfs_nameid_mutex)
 #define	NFSUNLOCKNAMEID()	mtx_unlock(&nfs_nameid_mutex)
 #define	NFSNAMEIDREQUIRED()	mtx_assert(&nfs_nameid_mutex, MA_OWNED)
 #define	NFSCLSTATEMUTEX		extern struct mtx nfs_clstate_mutex
 #define	NFSCLSTATEMUTEXPTR	(&nfs_clstate_mutex)
 #define	NFSLOCKCLSTATE()	mtx_lock(&nfs_clstate_mutex)
 #define	NFSUNLOCKCLSTATE()	mtx_unlock(&nfs_clstate_mutex)
 #define	NFSDLOCKMUTEX		extern struct mtx newnfsd_mtx
 #define	NFSDLOCKMUTEXPTR	(&newnfsd_mtx)
 #define	NFSD_LOCK()		mtx_lock(&newnfsd_mtx)
 #define	NFSD_UNLOCK()		mtx_unlock(&newnfsd_mtx)
 #define	NFSD_LOCK_ASSERT()	mtx_assert(&newnfsd_mtx, MA_OWNED)
 #define	NFSD_UNLOCK_ASSERT()	mtx_assert(&newnfsd_mtx, MA_NOTOWNED)
 #define	NFSV4ROOTLOCKMUTEX	extern struct mtx nfs_v4root_mutex
 #define	NFSV4ROOTLOCKMUTEXPTR	(&nfs_v4root_mutex)
 #define	NFSLOCKV4ROOTMUTEX()	mtx_lock(&nfs_v4root_mutex)
 #define	NFSUNLOCKV4ROOTMUTEX()	mtx_unlock(&nfs_v4root_mutex)
 #define	NFSLOCKNODE(n)		mtx_lock(&((n)->n_mtx))
 #define	NFSUNLOCKNODE(n)	mtx_unlock(&((n)->n_mtx))
 #define	NFSLOCKMNT(m)		mtx_lock(&((m)->nm_mtx))
 #define	NFSUNLOCKMNT(m)		mtx_unlock(&((m)->nm_mtx))
 #define	NFSLOCKREQUEST(r)	mtx_lock(&((r)->r_mtx))
 #define	NFSUNLOCKREQUEST(r)	mtx_unlock(&((r)->r_mtx))
 #define	NFSPROCLISTLOCK()	sx_slock(&allproc_lock)
 #define	NFSPROCLISTUNLOCK()	sx_sunlock(&allproc_lock)
 #define	NFSLOCKSOCKREQ(r)	mtx_lock(&((r)->nr_mtx))
 #define	NFSUNLOCKSOCKREQ(r)	mtx_unlock(&((r)->nr_mtx))
 #define	NFSLOCKDS(d)		mtx_lock(&((d)->nfsclds_mtx))
 #define	NFSUNLOCKDS(d)		mtx_unlock(&((d)->nfsclds_mtx))
 #define	NFSSESSIONMUTEXPTR(s)	(&((s)->mtx))
 #define	NFSLOCKSESSION(s)	mtx_lock(&((s)->mtx))
 #define	NFSUNLOCKSESSION(s)	mtx_unlock(&((s)->mtx))
 
 /*
  * Use these macros to initialize/free a mutex.
  */
 #define	NFSINITSOCKMUTEX(m)	mtx_init((m), "nfssock", NULL, MTX_DEF)
 #define	NFSFREEMUTEX(m)		mtx_destroy((m))
 
 int nfsmsleep(void *, void *, int, const char *, struct timespec *);
 
 /*
  * And weird vm stuff in the nfs server.
  */
 #define	PDIRUNLOCK	0x0
 #define	MAX_COMMIT_COUNT	(1024 * 1024)
 
 /*
  * Define these to handle the type of va_rdev.
  */
 #define	NFSMAKEDEV(m, n)	makedev((m), (n))
 #define	NFSMAJOR(d)		major(d)
 #define	NFSMINOR(d)		minor(d)
 
 /*
  * The vnode tag for nfsv4root.
  */
 #define	VT_NFSV4ROOT		"nfsv4root"
 
 /*
  * Define whatever it takes to do a vn_rdwr().
  */
 #define	NFSD_RDWR(r, v, b, l, o, s, i, c, a, p) \
 	vn_rdwr((r), (v), (b), (l), (o), (s), (i), (c), NULL, (a), (p))
 
 /*
  * Macros for handling memory for different BSDen.
  * NFSBCOPY(src, dst, len) - copies len bytes, non-overlapping
  * NFSOVBCOPY(src, dst, len) - ditto, but data areas might overlap
  * NFSBCMP(cp1, cp2, len) - compare len bytes, return 0 if same
  * NFSBZERO(cp, len) - set len bytes to 0x0
  */
 #define	NFSBCOPY(s, d, l)	bcopy((s), (d), (l))
 #define	NFSOVBCOPY(s, d, l)	ovbcopy((s), (d), (l))
 #define	NFSBCMP(s, d, l)	bcmp((s), (d), (l))
 #define	NFSBZERO(s, l)		bzero((s), (l))
 
 /*
  * Some queue.h files don't have these defined in them.
  */
 #define	LIST_END(head)		NULL
 #define	SLIST_END(head)		NULL
 #define	TAILQ_END(head)		NULL
 
 /*
  * This must be defined to be a global variable that increments once
  * per second, but never stops or goes backwards, even when a "date"
  * command changes the TOD clock. It is used for delta times for
  * leases, etc.
  */
 #define	NFSD_MONOSEC		time_uptime
 
 /*
  * Declare the malloc types.
  */
 MALLOC_DECLARE(M_NEWNFSRVCACHE);
 MALLOC_DECLARE(M_NEWNFSDCLIENT);
 MALLOC_DECLARE(M_NEWNFSDSTATE);
 MALLOC_DECLARE(M_NEWNFSDLOCK);
 MALLOC_DECLARE(M_NEWNFSDLOCKFILE);
 MALLOC_DECLARE(M_NEWNFSSTRING);
 MALLOC_DECLARE(M_NEWNFSUSERGROUP);
 MALLOC_DECLARE(M_NEWNFSDREQ);
 MALLOC_DECLARE(M_NEWNFSFH);
 MALLOC_DECLARE(M_NEWNFSCLOWNER);
 MALLOC_DECLARE(M_NEWNFSCLOPEN);
 MALLOC_DECLARE(M_NEWNFSCLDELEG);
 MALLOC_DECLARE(M_NEWNFSCLCLIENT);
 MALLOC_DECLARE(M_NEWNFSCLLOCKOWNER);
 MALLOC_DECLARE(M_NEWNFSCLLOCK);
 MALLOC_DECLARE(M_NEWNFSDIROFF);
 MALLOC_DECLARE(M_NEWNFSV4NODE);
 MALLOC_DECLARE(M_NEWNFSDIRECTIO);
 MALLOC_DECLARE(M_NEWNFSMNT);
 MALLOC_DECLARE(M_NEWNFSDROLLBACK);
 MALLOC_DECLARE(M_NEWNFSLAYOUT);
 MALLOC_DECLARE(M_NEWNFSFLAYOUT);
 MALLOC_DECLARE(M_NEWNFSDEVINFO);
 MALLOC_DECLARE(M_NEWNFSSOCKREQ);
 MALLOC_DECLARE(M_NEWNFSCLDS);
 MALLOC_DECLARE(M_NEWNFSLAYRECALL);
 MALLOC_DECLARE(M_NEWNFSDSESSION);
 #define	M_NFSRVCACHE	M_NEWNFSRVCACHE
 #define	M_NFSDCLIENT	M_NEWNFSDCLIENT
 #define	M_NFSDSTATE	M_NEWNFSDSTATE
 #define	M_NFSDLOCK	M_NEWNFSDLOCK
 #define	M_NFSDLOCKFILE	M_NEWNFSDLOCKFILE
 #define	M_NFSSTRING	M_NEWNFSSTRING
 #define	M_NFSUSERGROUP	M_NEWNFSUSERGROUP
 #define	M_NFSDREQ	M_NEWNFSDREQ
 #define	M_NFSFH		M_NEWNFSFH
 #define	M_NFSCLOWNER	M_NEWNFSCLOWNER
 #define	M_NFSCLOPEN	M_NEWNFSCLOPEN
 #define	M_NFSCLDELEG	M_NEWNFSCLDELEG
 #define	M_NFSCLCLIENT	M_NEWNFSCLCLIENT
 #define	M_NFSCLLOCKOWNER M_NEWNFSCLLOCKOWNER
 #define	M_NFSCLLOCK	M_NEWNFSCLLOCK
 #define	M_NFSDIROFF	M_NEWNFSDIROFF
 #define	M_NFSV4NODE	M_NEWNFSV4NODE
 #define	M_NFSDIRECTIO	M_NEWNFSDIRECTIO
 #define	M_NFSDROLLBACK	M_NEWNFSDROLLBACK
 #define	M_NFSLAYOUT	M_NEWNFSLAYOUT
 #define	M_NFSFLAYOUT	M_NEWNFSFLAYOUT
 #define	M_NFSDEVINFO	M_NEWNFSDEVINFO
 #define	M_NFSSOCKREQ	M_NEWNFSSOCKREQ
 #define	M_NFSCLDS	M_NEWNFSCLDS
 #define	M_NFSLAYRECALL	M_NEWNFSLAYRECALL
 #define	M_NFSDSESSION	M_NEWNFSDSESSION
 
 #define	NFSINT_SIGMASK(set) 						\
 	(SIGISMEMBER(set, SIGINT) || SIGISMEMBER(set, SIGTERM) ||	\
 	 SIGISMEMBER(set, SIGHUP) || SIGISMEMBER(set, SIGKILL) ||	\
 	 SIGISMEMBER(set, SIGQUIT))
 
 /*
  * Convert a quota block count to byte count.
  * Multiplies q in place by the block size b; q must be a modifiable
  * lvalue. The whole expansion is parenthesized so that the macro behaves
  * as a single expression wherever it is used.
  */
 #define	NFSQUOTABLKTOBYTE(q, b)	((q) *= (b))
 
 /*
  * Define this as the largest file size supported. (It should probably
  * be available via a VFS_xxx Op, but it isn't.)
  */
 #define	NFSRV_MAXFILESIZE	((u_int64_t)0x800000000000)
 
 /*
  * Set this macro to index() or strchr(), whichever is supported.
  */
 #define	STRCHR(s, c)		strchr((s), (c))
 
 /*
  * Set the n_mtime in the client write rpc, as required.
  * When w is non-zero, na_mtime from the attributes (a) is copied into the
  * node's (n) n_mtime and, if v4 is non-zero as well, na_filerev is copied
  * into n_change. Both stores are done with the node's n_mtx held.
  */
 #define	NFSWRITERPC_SETTIME(w, n, a, v4)				\
 	do {								\
 		if (w) {						\
 			mtx_lock(&((n)->n_mtx));			\
 			(n)->n_mtime = (a)->na_mtime;			\
 			if (v4)						\
 				(n)->n_change = (a)->na_filerev;	\
 			mtx_unlock(&((n)->n_mtx));			\
 		}							\
 	} while (0)
 
 /*
  * Fake value, just to make the client work.
  */
 #define	NFS_LATTR_NOSHRINK	1
 
 /*
  * Prototypes for functions where the arguments vary for different ports.
  */
 int nfscl_loadattrcache(struct vnode **, struct nfsvattr *, void *, void *,
     int, int);
 int newnfs_realign(struct mbuf **, int);
 
 /*
  * If the port runs on an SMP box that can enforce Atomic ops with low
  * overheads, define these as atomic increments/decrements. If not,
  * don't worry about it, since these are used for stats that can be
  * "out by one" without disastrous consequences.
  */
 #define	NFSINCRGLOBAL(a)	((a)++)
 
 /*
  * Assorted funky stuff to make things work under Darwin8.
  */
 /*
  * These macros check whether a field in vattr is set.
  */
 #define	NFSATTRISSET(t, v, a)	((v)->a != (t)VNOVAL)
 #define	NFSATTRISSETTIME(v, a)	((v)->a.tv_sec != VNOVAL)
 
 /*
  * Manipulate mount flags.
  */
 #define	NFSSTA_HASWRITEVERF	0x00040000  /* Has write verifier */
 #define	NFSSTA_GOTFSINFO	0x00100000  /* Got the fsinfo */
 #define	NFSSTA_OPENMODE		0x00200000  /* Must use correct open mode */
 #define	NFSSTA_NOLAYOUTCOMMIT	0x04000000  /* Don't do LayoutCommit */
 #define	NFSSTA_SESSPERSIST	0x08000000  /* Has a persistent session */
 #define	NFSSTA_TIMEO		0x10000000  /* Experiencing a timeout */
 #define	NFSSTA_LOCKTIMEO	0x20000000  /* Experiencing a lockd timeout */
 #define	NFSSTA_HASSETFSID	0x40000000  /* Has set the fsid */
 #define	NFSSTA_PNFS		0x80000000  /* pNFS is enabled */
 
 #define	NFSHASNFSV3(n)		((n)->nm_flag & NFSMNT_NFSV3)
 #define	NFSHASNFSV4(n)		((n)->nm_flag & NFSMNT_NFSV4)
 #define	NFSHASNFSV4N(n)		((n)->nm_minorvers > 0)
 #define	NFSHASNFSV3OR4(n)	((n)->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4))
 #define	NFSHASGOTFSINFO(n)	((n)->nm_state & NFSSTA_GOTFSINFO)
 #define	NFSHASHASSETFSID(n)	((n)->nm_state & NFSSTA_HASSETFSID)
 #define	NFSHASSTRICT3530(n)	((n)->nm_flag & NFSMNT_STRICT3530)
 #define	NFSHASWRITEVERF(n)	((n)->nm_state & NFSSTA_HASWRITEVERF)
 #define	NFSHASINT(n)		((n)->nm_flag & NFSMNT_INT)
 #define	NFSHASSOFT(n)		((n)->nm_flag & NFSMNT_SOFT)
 #define	NFSHASINTORSOFT(n)	((n)->nm_flag & (NFSMNT_INT | NFSMNT_SOFT))
 #define	NFSHASDUMBTIMR(n)	((n)->nm_flag & NFSMNT_DUMBTIMR)
 #define	NFSHASNOCONN(n)		((n)->nm_flag & NFSMNT_MNTD)
 #define	NFSHASKERB(n)		((n)->nm_flag & NFSMNT_KERB)
 #define	NFSHASALLGSSNAME(n)	((n)->nm_flag & NFSMNT_ALLGSSNAME)
 #define	NFSHASINTEGRITY(n)	((n)->nm_flag & NFSMNT_INTEGRITY)
 #define	NFSHASPRIVACY(n)	((n)->nm_flag & NFSMNT_PRIVACY)
 #define	NFSSETWRITEVERF(n)	((n)->nm_state |= NFSSTA_HASWRITEVERF)
 #define	NFSSETHASSETFSID(n)	((n)->nm_state |= NFSSTA_HASSETFSID)
 #define	NFSHASPNFSOPT(n)	((n)->nm_flag & NFSMNT_PNFS)
 #define	NFSHASNOLAYOUTCOMMIT(n)	((n)->nm_state & NFSSTA_NOLAYOUTCOMMIT)
 #define	NFSHASSESSPERSIST(n)	((n)->nm_state & NFSSTA_SESSPERSIST)
 #define	NFSHASPNFS(n)		((n)->nm_state & NFSSTA_PNFS)
 #define	NFSHASOPENMODE(n)	((n)->nm_state & NFSSTA_OPENMODE)
 #define	NFSHASONEOPENOWN(n)	(((n)->nm_flag & NFSMNT_ONEOPENOWN) != 0 &&	\
 				    (n)->nm_minorvers > 0)
 
 /*
  * Gets the stats field out of the mount structure.
  */
 #define	vfs_statfs(m)	(&((m)->mnt_stat))
 
 /*
  * Set boottime.
  */
 #define	NFSSETBOOTTIME(b)	(getboottime(&b))
 
 /*
  * The size of directory blocks in the buffer cache.
  * MUST BE in the range of PAGE_SIZE <= NFS_DIRBLKSIZ <= MAXBSIZE!!
  */
 #define	NFS_DIRBLKSIZ	(16 * DIRBLKSIZ) /* Must be a multiple of DIRBLKSIZ */
 
 /*
  * Define these macros to access mnt_flag fields.
  */
 #define	NFSMNT_RDONLY(m)	((m)->mnt_flag & MNT_RDONLY)
 #endif	/* _KERNEL */
 
 /*
  * Define a structure similar to ufs_args for use in exporting the V4 root.
  */
 struct nfsex_args {
 	char	*fspec;		/* presumably the path being exported - TODO confirm */
 	struct export_args	export;	/* the export_args for that export */
 };
 
 /*
  * These export flags should be defined, but there are no bits left.
  * Maybe a separate mnt_exflag field could be added or the mnt_flag
  * field increased to 64 bits?
  */
 #ifndef	MNT_EXSTRICTACCESS
 #define	MNT_EXSTRICTACCESS	0x0
 #endif
 #ifndef MNT_EXV4ONLY
 #define	MNT_EXV4ONLY		0x0
 #endif
 
 #ifdef _KERNEL
 /*
  * Define this to invalidate the attribute cache for the nfs node.
  */
 #define	NFSINVALATTRCACHE(n)	((n)->n_attrstamp = 0)
 
 /* Used for FreeBSD only */
 void nfsd_mntinit(void);
 
 /*
  * Define these for vnode lock/unlock ops.
  *
  * These are good abstractions to macro out, so that they can be added to
  * later, for debugging or stats, etc.
  */
 #define	NFSVOPLOCK(v, f)	vn_lock((v), (f))
 #define	NFSVOPUNLOCK(v, f)	VOP_UNLOCK((v), (f))
 #define	NFSVOPISLOCKED(v)	VOP_ISLOCKED((v))
 
 /*
  * Define ncl_hash().
  */
 #define	ncl_hash(f, l)	(fnv_32_buf((f), (l), FNV1_32_INIT))
 
 int newnfs_iosize(struct nfsmount *);
 
 int newnfs_vncmpf(struct vnode *, void *);
 
 #ifndef NFS_MINDIRATTRTIMO
 #define	NFS_MINDIRATTRTIMO 3		/* VDIR attrib cache timeout in sec */
 #endif
 #ifndef NFS_MAXDIRATTRTIMO
 #define	NFS_MAXDIRATTRTIMO 60
 #endif
 
 /*
  * Nfs outstanding request list element
  */
 struct nfsreq {
 	TAILQ_ENTRY(nfsreq) r_chain;	/* linkage on the request list */
 	u_int32_t	r_flags;	/* flags on request, see below */
 	struct nfsmount *r_nmp;		/* Client mnt ptr */
 	struct mtx	r_mtx;		/* Mutex lock for this structure */
 };
 
 #ifndef NFS_MAXBSIZE
-#define	NFS_MAXBSIZE	MAXBCACHEBUF
+#define	NFS_MAXBSIZE	(maxbcachebuf)
 #endif
 
 /*
  * This macro checks to see if issuing of delegations is allowed for this
  * vnode.
  */
 #ifdef VV_DISABLEDELEG
 #define	NFSVNO_DELEGOK(v)						\
 	((v) == NULL || ((v)->v_vflag & VV_DISABLEDELEG) == 0)
 #else
 #define	NFSVNO_DELEGOK(v)	(1)
 #endif
 
 /*
  * Name used by getnewvnode() to describe filesystem, "nfs".
  * For performance reasons it is useful to have the same string
  * used in both places that call getnewvnode().
  */
 extern const char nfs_vnode_tag[];
 
 #endif	/* _KERNEL */
 
 #endif	/* _NFS_NFSPORT_H */
Index: head/sys/fs/nfsclient/nfs_clrpcops.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clrpcops.c	(revision 320061)
+++ head/sys/fs/nfsclient/nfs_clrpcops.c	(revision 320062)
@@ -1,6015 +1,6034 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Rpc op calls, generally called from the vnode op calls or through the
  * buffer cache, for NFS v2, 3 and 4.
  * These do not normally make any changes to vnode arguments or use
  * structures that might change between the VFS variants. The returned
  * arguments are all at the end, after the NFSPROC_T *p one.
  */
 
 #ifndef APPLEKEXT
 #include "opt_inet6.h"
 
 #include <fs/nfs/nfsport.h>
 #include <sys/sysctl.h>
 
 SYSCTL_DECL(_vfs_nfs);
 
 static int	nfsignore_eexist = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, ignore_eexist, CTLFLAG_RW,
     &nfsignore_eexist, 0, "NFS ignore EEXIST replies for mkdir/symlink");
 
 /*
  * Global variables
  */
 extern int nfs_numnfscbd;
 extern struct timeval nfsboottime;
 extern u_int32_t newnfs_false, newnfs_true;
 extern nfstype nfsv34_type[9];
 extern int nfsrv_useacl;
 extern char nfsv4_callbackaddr[INET6_ADDRSTRLEN];
 extern int nfscl_debuglevel;
 NFSCLSTATEMUTEX;
 int nfstest_outofseq = 0;
 int nfscl_assumeposixlocks = 1;
 int nfscl_enablecallb = 0;
 short nfsv4_cbport = NFSV4_CBPORT;
 int nfstest_openallsetattr = 0;
 #endif	/* !APPLEKEXT */
 
 #define	DIRHDSIZ	offsetof(struct dirent, d_name)
 
 /*
  * nfscl_getsameserver() can return one of three values:
  * NFSDSP_USETHISSESSION - Use this session for the DS.
  * NFSDSP_SEQTHISSESSION - Use the nfsclds_sequence field of this dsp for new
  *     session.
  * NFSDSP_NOTFOUND - No matching server was found.
  */
 enum nfsclds_state {
 	NFSDSP_USETHISSESSION = 0,	/* use this session for the DS */
 	NFSDSP_SEQTHISSESSION = 1,	/* use this dsp's nfsclds_sequence */
 	NFSDSP_NOTFOUND = 2,		/* no matching server was found */
 };
 
 static int nfsrpc_setattrrpc(vnode_t , struct vattr *, nfsv4stateid_t *,
     struct ucred *, NFSPROC_T *, struct nfsvattr *, int *, void *);
 static int nfsrpc_readrpc(vnode_t , struct uio *, struct ucred *,
     nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *, void *);
 static int nfsrpc_writerpc(vnode_t , struct uio *, int *, int *,
     struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *,
     void *);
 static int nfsrpc_createv23(vnode_t , char *, int, struct vattr *,
     nfsquad_t, int, struct ucred *, NFSPROC_T *, struct nfsvattr *,
     struct nfsvattr *, struct nfsfh **, int *, int *, void *);
 static int nfsrpc_createv4(vnode_t , char *, int, struct vattr *,
     nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *,
     NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *,
     int *, void *, int *);
 static int nfsrpc_locku(struct nfsrv_descript *, struct nfsmount *,
     struct nfscllockowner *, u_int64_t, u_int64_t,
     u_int32_t, struct ucred *, NFSPROC_T *, int);
 static int nfsrpc_setaclrpc(vnode_t, struct ucred *, NFSPROC_T *,
     struct acl *, nfsv4stateid_t *, void *);
 static int nfsrpc_getlayout(struct nfsmount *, vnode_t, struct nfsfh *, int,
     uint32_t *, nfsv4stateid_t *, uint64_t, struct nfscllayout **,
     struct ucred *, NFSPROC_T *);
 static int nfsrpc_fillsa(struct nfsmount *, struct sockaddr_storage *,
     struct nfsclds **, NFSPROC_T *);
 static void nfscl_initsessionslots(struct nfsclsession *);
 static int nfscl_doflayoutio(vnode_t, struct uio *, int *, int *, int *,
     nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *,
     struct nfsclflayout *, uint64_t, uint64_t, struct ucred *, NFSPROC_T *);
 static int nfsrpc_readds(vnode_t, struct uio *, nfsv4stateid_t *, int *,
     struct nfsclds *, uint64_t, int, struct nfsfh *, struct ucred *,
     NFSPROC_T *);
 static int nfsrpc_writeds(vnode_t, struct uio *, int *, int *,
     nfsv4stateid_t *, struct nfsclds *, uint64_t, int,
     struct nfsfh *, int, struct ucred *, NFSPROC_T *);
 static enum nfsclds_state nfscl_getsameserver(struct nfsmount *,
     struct nfsclds *, struct nfsclds **);
 #ifdef notyet
 static int nfsrpc_commitds(vnode_t, uint64_t, int, struct nfsclds *,
     struct nfsfh *, struct ucred *, NFSPROC_T *, void *);
 #endif
 
 /*
  * Do the NFSPROC_NULL rpc for vnode vp.
  * The result is the error from nfscl_request() or, when that is zero,
  * the status carried in the request descriptor (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_null(vnode_t vp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqdesc, *nd;
 	int error;
 
 	nd = &reqdesc;
 	NFSCL_REQSTART(nd, NFSPROC_NULL, vp);
 	error = nfscl_request(nd, vp, p, cred, NULL);
 	/* The reply status only counts when the request itself worked. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs access rpc op.
  * Translate the VREAD/VWRITE/VEXEC bits of acmode into NFSACCESS_xxx bits
  * (directories get DELETE and LOOKUP where regular objects get nothing
  * extra and EXECUTE) and ask the server via nfsrpc_accessrpc().
  * For nfs version 3 and 4 the access rpc checks accessibility; if file
  * modes are changed on the server, accesses might still fail later.
  * Returns EACCES when the server did not grant every requested bit.
  */
 APPLESTATIC int
 nfsrpc_access(vnode_t vp, int acmode, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	u_int32_t mode, rmode;
 	int error, isdir;
 
 	mode = (acmode & VREAD) ? NFSACCESS_READ : 0;
 	isdir = (vnode_vtype(vp) == VDIR);
 	if (acmode & VWRITE) {
 		mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 		if (isdir)
 			mode |= NFSACCESS_DELETE;
 	}
 	if (acmode & VEXEC)
 		mode |= isdir ? NFSACCESS_LOOKUP : NFSACCESS_EXECUTE;
 
 	/* The RPC proper lives in nfsrpc_accessrpc(). */
 	error = nfsrpc_accessrpc(vp, mode, cred, p, nap, attrflagp, &rmode,
 	    NULL);
 
 	/*
 	 * The NFS V3 spec leaves open whether the returned access bits
 	 * may be a superset of the requested ones, so only insist that
 	 * every requested bit came back.
 	 */
 	if (error == 0 && (rmode & mode) != mode)
 		error = EACCES;
 	return (error);
 }
 
 /*
  * The actual rpc, separated out for Darwin.
  * Builds an Access request carrying the NFSACCESS_xxx bits in mode (for
  * NFSv4 a Getattr op is appended) and sends it via nfscl_request().
  * *attrflagp is cleared on entry; nap/attrflagp are filled in by
  * nfscl_postop_attr(). When the reply status is zero, *rmodep is set to
  * the granted access bits, restricted to the supported ones.
  * stuff is passed through unchanged to nfscl_request() and
  * nfscl_postop_attr().
  * Returns the nfscl_request() error, a reply parsing error or nd_repstat.
  */
 APPLESTATIC int
 nfsrpc_accessrpc(vnode_t vp, u_int32_t mode, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, u_int32_t *rmodep,
     void *stuff)
 {
 	u_int32_t *tl;
 	u_int32_t supported, rmode;
 	int error;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*attrflagp = 0;
 	/* Unless the NFSv4 reply says otherwise, only what was asked for. */
 	supported = mode;
 	NFSCL_REQSTART(nd, NFSPROC_ACCESS, vp);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(mode);
 	if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * And do a Getattr op.
 		 */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error)
 		return (error);
 	/* For NFSv3 the post-op attributes precede the access bits. */
 	if (nd->nd_flag & ND_NFSV3) {
 		error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 		if (error)
 			goto nfsmout;
 	}
 	if (!nd->nd_repstat) {
 		if (nd->nd_flag & ND_NFSV4) {
 			/* NFSv4 replies with two words: supported, access. */
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			supported = fxdr_unsigned(u_int32_t, *tl++);
 		} else {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		}
 		rmode = fxdr_unsigned(u_int32_t, *tl);
 		if (nd->nd_flag & ND_NFSV4)
 			error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 
 		/*
 		 * It's not obvious what should be done about
 		 * unsupported access modes. For now, be paranoid
 		 * and clear the unsupported ones.
 		 */
 		rmode &= supported;
 		*rmodep = rmode;
 	} else
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs open rpc
  * Returns 0 without doing anything unless vp is a regular file (VREG).
  * Otherwise FREAD/FWRITE in amode are mapped to the NFSv4 open access
  * bits and nfscl_open() is called. When it answers NFSCLOPEN_DOOPEN the
  * Open is done via nfsrpc_openrpc(); when it answers NFSCLOPEN_SETCRED
  * only the credentials are recorded in the open.
  * The sequence is repeated after NFSERR_GRACE, NFSERR_STALECLIENTID,
  * NFSERR_STALEDONTRECOVER, NFSERR_DELAY and NFSERR_BADSESSION, and at most
  * four times after NFSERR_EXPIRED/NFSERR_BADSTATEID (while
  * nfscl_hasexpired() returns 0). Once those four tries are used up, a
  * remaining error is returned as EIO.
  */
 APPLESTATIC int
 nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsclopen *op;
 	struct nfscldeleg *dp;
 	struct nfsfh *nfhp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	u_int32_t mode, clidrev;
 	int ret, newone, error, expireret = 0, retrycnt;
 
 	/*
 	 * For NFSv4, Open Ops are only done on Regular Files.
 	 */
 	if (vnode_vtype(vp) != VREG)
 		return (0);
 	mode = 0;
 	if (amode & FREAD)
 		mode |= NFSV4OPEN_ACCESSREAD;
 	if (amode & FWRITE)
 		mode |= NFSV4OPEN_ACCESSWRITE;
 	nfhp = np->n_fhp;
 
 	retrycnt = 0;
 #ifdef notdef
 { char name[100]; int namel;
 namel = (np->n_v4->n4_namelen < 100) ? np->n_v4->n4_namelen : 99;
 bcopy(NFS4NODENAME(np->n_v4), name, namel);
 name[namel] = '\0';
 printf("rpcopen p=0x%x name=%s",p->p_pid,name);
 if (nfhp->nfh_len > 0) printf(" fh=0x%x\n",nfhp->nfh_fh[12]);
 else printf(" fhl=0\n");
 }
 #endif
 	do {
 	    dp = NULL;
 	    error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 1,
 		cred, p, NULL, &op, &newone, &ret, 1);
 	    if (error) {
 		return (error);
 	    }
 	    /* Remember the clientid revision for the expiry check below. */
 	    if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	    else
 		clidrev = 0;
 	    if (ret == NFSCLOPEN_DOOPEN) {
 		/* The Open RPC needs the data kept in n_v4. */
 		if (np->n_v4 != NULL) {
 			error = nfsrpc_openrpc(nmp, vp, np->n_v4->n4_data,
 			   np->n_v4->n4_fhlen, np->n_fhp->nfh_fh,
 			   np->n_fhp->nfh_len, mode, op,
 			   NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &dp,
 			   0, 0x0, cred, p, 0, 0);
 			if (dp != NULL) {
 #ifdef APPLE
 				OSBitAndAtomic((int32_t)~NDELEGMOD, (UInt32 *)&np->n_flag);
 #else
 				NFSLOCKNODE(np);
 				np->n_flag &= ~NDELEGMOD;
 				/*
 				 * Invalidate the attribute cache, so that
 				 * attributes that pre-date the issue of a
 				 * delegation are not cached, since the
 				 * cached attributes will remain valid while
 				 * the delegation is held.
 				 */
 				NFSINVALATTRCACHE(np);
 				NFSUNLOCKNODE(np);
 #endif
 				(void) nfscl_deleg(nmp->nm_mountp,
 				    op->nfso_own->nfsow_clp,
 				    nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp);
 			}
 		} else {
 			error = EIO;
 		}
 		newnfs_copyincred(cred, &op->nfso_cred);
 	    } else if (ret == NFSCLOPEN_SETCRED)
 		/*
 		 * This is a new local open on a delegation. It needs
 		 * to have credentials so that an open can be done
 		 * against the server during recovery.
 		 */
 		newnfs_copyincred(cred, &op->nfso_cred);
 
 	    /*
 	     * nfso_opencnt is the count of how many VOP_OPEN()s have
 	     * been done on this Open successfully and a VOP_CLOSE()
 	     * is expected for each of these.
 	     * If error is non-zero, don't increment it, since the Open
 	     * hasn't succeeded yet.
 	     */
 	    if (!error)
 		op->nfso_opencnt++;
 	    nfscl_openrelease(nmp, op, error, newone);
 	    /* Nap before retrying these; expiry is checked separately. */
 	    if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID ||
 		error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		error == NFSERR_BADSESSION) {
 		(void) nfs_catnap(PZERO, error, "nfs_open");
 	    } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID)
 		&& clidrev != 0) {
 		expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		retrycnt++;
 	    }
 	} while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/* Out of expiry retries: report a generic I/O error. */
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * the actual open rpc
  * nfhp/fhlen is the file handle the request is started with and
  * name/namelen is the name put in the request for the non-reclaim claim
  * types. newfhp/newfhlen is the file handle copied into a delegation
  * structure allocated here and handed to nfsrpc_openconfirm().
  * On entry *dpp may point to a delegation to be claimed
  * (NFSV4OPEN_CLAIMDELEGATECUR). It is always reset to NULL and, when 0 is
  * returned, set to the newly allocated delegation if the reply had one.
  * reclaim selects NFSV4OPEN_CLAIMPREVIOUS with delegtype, syscred sets
  * ND_USEGSSNAME on the request and recursed is non-zero for the one extra
  * Open done below to try and get a delegation after an OpenConfirm.
  * Returns 0 or an error. A delegation allocated here is freed on error.
  */
 APPLESTATIC int
 nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen,
     u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op,
     u_int8_t *name, int namelen, struct nfscldeleg **dpp,
     int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p,
     int syscred, int recursed)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfscldeleg *dp, *ndp = NULL;
 	struct nfsvattr nfsva;
 	u_int32_t rflags, deleg;
 	nfsattrbit_t attrbits;
 	int error, ret, acesize, limitby;
 	struct nfsclsession *tsep;
 
 	/* dp is the delegation passed in; ndp is the one allocated here. */
 	dp = *dpp;
 	*dpp = NULL;
 	nfscl_reqstart(nd, NFSPROC_OPEN, nmp, nfhp, fhlen, NULL, NULL);
 	/* Open owner seqid, share access, share deny and the clientid. */
 	NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	*tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH);
 	*tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH);
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	(void) nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE);
 	/* Pick the claim type: previous, delegate-current or null. */
 	if (reclaim) {
 		*tl = txdr_unsigned(NFSV4OPEN_CLAIMPREVIOUS);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(delegtype);
 	} else {
 		if (dp != NULL) {
 			*tl = txdr_unsigned(NFSV4OPEN_CLAIMDELEGATECUR);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			/* A minor version > 0 mount sends a zero seqid. */
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = dp->nfsdl_stateid.seqid;
 			*tl++ = dp->nfsdl_stateid.other[0];
 			*tl++ = dp->nfsdl_stateid.other[1];
 			*tl = dp->nfsdl_stateid.other[2];
 		} else {
 			*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 		}
 		(void) nfsm_strtom(nd, name, namelen);
 	}
 	/* Follow the Open with a Getattr of Change and TimeModify. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	if (syscred)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (!nd->nd_repstat) {
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		op->nfso_stateid.seqid = *tl++;
 		op->nfso_stateid.other[0] = *tl++;
 		op->nfso_stateid.other[1] = *tl++;
 		op->nfso_stateid.other[2] = *tl;
 		/*
 		 * rflags is the last of the words dissected above; the
 		 * words between the stateid and it are skipped (presumably
 		 * the change info - confirm against the Open reply XDR).
 		 */
 		rflags = fxdr_unsigned(u_int32_t, *(tl + 6));
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error)
 			goto nfsmout;
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(u_int32_t, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			if (!(op->nfso_own->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				op->nfso_own->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/* The file handle is stored after the structure. */
 			MALLOC(ndp, struct nfscldeleg *,
 			    sizeof (struct nfscldeleg) + newfhlen,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&ndp->nfsdl_owner);
 			LIST_INIT(&ndp->nfsdl_lock);
 			ndp->nfsdl_clp = op->nfso_own->nfsow_clp;
 			ndp->nfsdl_fhlen = newfhlen;
 			NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen);
 			newnfs_copyincred(cred, &ndp->nfsdl_cred);
 			nfscl_lockinit(&ndp->nfsdl_rwlock);
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			ndp->nfsdl_stateid.seqid = *tl++;
 			ndp->nfsdl_stateid.other[0] = *tl++;
 			ndp->nfsdl_stateid.other[1] = *tl++;
 			ndp->nfsdl_stateid.other[2] = *tl++;
 			/* Non-zero ret sets NFSCLDL_RECALL further down. */
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				ndp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					ndp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					/* Limit is the product of two words. */
 					ndp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					ndp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 			} else {
 				ndp->nfsdl_flags = NFSCLDL_READ;
 			}
 			if (ret)
 				ndp->nfsdl_flags |= NFSCLDL_RECALL;
 			error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, &ret,
 			    &acesize, p);
 			if (error)
 				goto nfsmout;
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/* Step over two words, then load the Getattr reply. */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 		    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 		    NULL, NULL, NULL, p, cred);
 		if (error)
 			goto nfsmout;
 		if (ndp != NULL) {
 			ndp->nfsdl_change = nfsva.na_filerev;
 			ndp->nfsdl_modtime = nfsva.na_mtime;
 			ndp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 		}
 		/* Confirm the open if asked to, retrying on NFSERR_DELAY. */
 		if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM)) {
 		    do {
 			ret = nfsrpc_openconfirm(vp, newfhp, newfhlen, op,
 			    cred, p);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_open");
 		    } while (ret == NFSERR_DELAY);
 		    error = ret;
 		}
 		if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) ||
 		    nfscl_assumeposixlocks)
 		    op->nfso_posixlock = 1;
 		else
 		    op->nfso_posixlock = 0;
 
 		/*
 		 * If the server is handing out delegations, but we didn't
 		 * get one because an OpenConfirm was required, try the
 		 * Open again, to get a delegation. This is a harmless no-op,
 		 * from a server's point of view.
 		 */
 		if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM) &&
 		    (op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG)
 		    && !error && dp == NULL && ndp == NULL && !recursed) {
 		    do {
 			ret = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp,
 			    newfhlen, mode, op, name, namelen, &ndp, 0, 0x0,
 			    cred, p, syscred, 1);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_open2");
 		    } while (ret == NFSERR_DELAY);
 		    if (ret) {
 			if (ndp != NULL) {
 				FREE((caddr_t)ndp, M_NFSCLDELEG);
 				ndp = NULL;
 			}
 			/* Only these failures of the extra Open are returned. */
 			if (ret == NFSERR_STALECLIENTID ||
 			    ret == NFSERR_STALEDONTRECOVER ||
 			    ret == NFSERR_BADSESSION)
 				error = ret;
 		    }
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	/* Hand the new delegation back only on success. */
 	if (!error)
 		*dpp = ndp;
 	else if (ndp != NULL)
 		FREE((caddr_t)ndp, M_NFSCLDELEG);
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * open downgrade rpc
  * Sends the open's stateid, the open owner's seqid and the access/deny
  * bits taken from mode. When the reply status is zero, the stateid from
  * the reply is stored back into the open.
  * Returns the nfscl_request() error or the status from the reply;
  * NFSERR_STALESTATEID also initiates recovery for the client.
  */
 APPLESTATIC int
 nfsrpc_opendowngrade(vnode_t vp, u_int32_t mode, struct nfsclopen *op,
     struct ucred *cred, NFSPROC_T *p)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error;
 
 	NFSCL_REQSTART(nd, NFSPROC_OPENDOWNGRADE, vp);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 3 * NFSX_UNSIGNED);
 	/* A minor version > 0 mount sends a zero stateid seqid. */
 	if (NFSHASNFSV4N(VFSTONFS(vnode_mount(vp))))
 		*tl++ = 0;
 	else
 		*tl++ = op->nfso_stateid.seqid;
 	*tl++ = op->nfso_stateid.other[0];
 	*tl++ = op->nfso_stateid.other[1];
 	*tl++ = op->nfso_stateid.other[2];
 	*tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	*tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH);
 	*tl = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH);
 	error = nfscl_request(nd, vp, p, cred, NULL);
 	if (error)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (!nd->nd_repstat) {
 		/* Save the stateid returned in the reply. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		op->nfso_stateid.seqid = *tl++;
 		op->nfso_stateid.other[0] = *tl++;
 		op->nfso_stateid.other[1] = *tl++;
 		op->nfso_stateid.other[2] = *tl;
 	}
 	if (nd->nd_repstat && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * V4 Close operation.
  * Nothing is done (and 0 is returned) unless vp is a regular file.
  * doclose selects nfscl_doclose() over nfscl_getclose(); the client
  * handed back by either one is released again before returning.
  */
 APPLESTATIC int
 nfsrpc_close(vnode_t vp, int doclose, NFSPROC_T *p)
 {
 	struct nfsclclient *clp;
 	int error;
 
 	if (vnode_vtype(vp) != VREG)
 		return (0);
 	error = doclose ? nfscl_doclose(vp, &clp, p) :
 	    nfscl_getclose(vp, &clp);
 	/* clp is only released when the call above succeeded. */
 	if (error == 0)
 		nfscl_clientrelease(clp);
 	return (error);
 }
 
 /*
  * Close the open.
  * Using a copy of the credentials saved in the open, this unlocks (via
  * nfsrpc_locku()) and frees the byte range locks of every lock owner on
  * the open, does a ReleaseLockOwner for each of them, then closes the
  * open via nfscl_tryclose() and finally frees the lock owners and the
  * open itself.
  */
 APPLESTATIC void
 nfsrpc_doclose(struct nfsmount *nmp, struct nfsclopen *op, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfscllockowner *lp, *nlp;
 	struct nfscllock *lop, *nlop;
 	struct ucred *tcred;
 	u_int64_t off = 0, len = 0;
 	u_int32_t type = NFSV4LOCKT_READ;
 	int error, do_unlock, trycnt;
 
 	tcred = newnfs_getcred();
 	newnfs_copycred(&op->nfso_cred, tcred);
 	/*
 	 * (Theoretically this could be done in the same
 	 *  compound as the close, but having multiple
 	 *  sequenced Ops in the same compound might be
 	 *  too scary for some servers.)
 	 */
 	/* For POSIX style locking, one unlock covers the whole range. */
 	if (op->nfso_posixlock) {
 		off = 0;
 		len = NFS64BITSSET;
 		type = NFSV4LOCKT_READ;
 	}
 
 	/*
 	 * Since this function is only called from VOP_INACTIVE(), no
 	 * other thread will be manipulating this Open. As such, the
 	 * lock lists are not being changed by other threads, so it should
 	 * be safe to do this without locking.
 	 */
 	LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
 		do_unlock = 1;
 		LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) {
 			/* Otherwise each lock's own range is unlocked. */
 			if (op->nfso_posixlock == 0) {
 				off = lop->nfslo_first;
 				len = lop->nfslo_end - lop->nfslo_first;
 				if (lop->nfslo_type == F_WRLCK)
 					type = NFSV4LOCKT_WRITE;
 				else
 					type = NFSV4LOCKT_READ;
 			}
 			if (do_unlock) {
 				/*
 				 * Retry while the reply status is
 				 * NFSERR_GRACE or NFSERR_DELAY; trycnt
 				 * bounds the number of retries.
 				 */
 				trycnt = 0;
 				do {
 					error = nfsrpc_locku(nd, nmp, lp, off,
 					    len, type, tcred, p, 0);
 					if ((nd->nd_repstat == NFSERR_GRACE ||
 					    nd->nd_repstat == NFSERR_DELAY) &&
 					    error == 0)
 						(void) nfs_catnap(PZERO,
 						    (int)nd->nd_repstat,
 						    "nfs_close");
 				} while ((nd->nd_repstat == NFSERR_GRACE ||
 				    nd->nd_repstat == NFSERR_DELAY) &&
 				    error == 0 && trycnt++ < 5);
 				/* POSIX: one unlock per lock owner. */
 				if (op->nfso_posixlock)
 					do_unlock = 0;
 			}
 			nfscl_freelock(lop, 0);
 		}
 		/*
 		 * Do a ReleaseLockOwner.
 		 * The lock owner name nfsl_owner may be used by other opens for
 		 * other files but the lock_owner4 name that nfsrpc_rellockown()
 		 * puts on the wire has the file handle for this file appended
 		 * to it, so it can be done now.
 		 */
 		(void)nfsrpc_rellockown(nmp, lp, lp->nfsl_open->nfso_fh,
 		    lp->nfsl_open->nfso_fhlen, tcred, p);
 	}
 
 	/*
 	 * There could be other Opens for different files on the same
 	 * OpenOwner, so locking is required.
 	 */
 	NFSLOCKCLSTATE();
 	nfscl_lockexcl(&op->nfso_own->nfsow_rwlock, NFSCLSTATEMUTEXPTR);
 	NFSUNLOCKCLSTATE();
 	/* Keep trying the close for as long as NFSERR_GRACE comes back. */
 	do {
 		error = nfscl_tryclose(op, tcred, nmp, p);
 		if (error == NFSERR_GRACE)
 			(void) nfs_catnap(PZERO, error, "nfs_close");
 	} while (error == NFSERR_GRACE);
 	NFSLOCKCLSTATE();
 	nfscl_lockunlock(&op->nfso_own->nfsow_rwlock);
 
 	/* Free the lock owners and the open; any other error is dropped. */
 	LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp)
 		nfscl_freelockowner(lp, 0);
 	nfscl_freeopen(op, 0);
 	NFSUNLOCKCLSTATE();
 	NFSFREECRED(tcred);
 }
 
 /*
  * The actual Close RPC.
  * Sends the open owner's seqid and the open's stateid for the file handle
  * stored in the open. For a mount with a minor version > 0 (NFSHASNFSV4N)
  * the stateid seqid is sent as 0. syscred sets ND_USEGSSNAME on the
  * request.
  * Returns the newnfs_request() error or the status from the reply;
  * NFSERR_STALESTATEID also initiates recovery for the client.
  */
 APPLESTATIC int
 nfsrpc_closerpc(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p,
     int syscred)
 {
 	u_int32_t *tl;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_CLOSE, nmp, op->nfso_fh,
 	    op->nfso_fhlen, NULL, NULL);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
 	*tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	if (NFSHASNFSV4N(nmp))
 		*tl++ = 0;
 	else
 		*tl++ = op->nfso_stateid.seqid;
 	*tl++ = op->nfso_stateid.other[0];
 	*tl++ = op->nfso_stateid.other[1];
 	*tl = op->nfso_stateid.other[2];
 	if (syscred)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	/* The stateid in the reply is stepped over, but not saved. */
 	if (nd->nd_repstat == 0)
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 	error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * V4 Open Confirm RPC.
  * Sends the open's current stateid and the open owner's seqid for the
  * file handle nfhp/fhlen and, when the server replies without error,
  * stores the stateid from the reply back into "op".
  * Nothing is sent when NFSHASNFSV4N() is true for the mount; 0 is
  * returned in that case.
  * Returns the error from nfscl_request() if the request fails, otherwise
  * the reply status.  A reply of NFSERR_STALESTATEID also initiates
  * recovery for the owner's client.
  */
 APPLESTATIC int
 nfsrpc_openconfirm(vnode_t vp, u_int8_t *nfhp, int fhlen,
     struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	u_int32_t *tl;
 	int error;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (NFSHASNFSV4N(nmp)) {
 		/* No confirmation for NFSv4.1. */
 		return (0);
 	}
 
 	nfscl_reqstart(nd, NFSPROC_OPENCONFIRM, nmp, nfhp, fhlen, NULL, NULL);
 	/* Four words of stateid, then one word of owner seqid. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
 	tl[0] = op->nfso_stateid.seqid;
 	tl[1] = op->nfso_stateid.other[0];
 	tl[2] = op->nfso_stateid.other[1];
 	tl[3] = op->nfso_stateid.other[2];
 	tl[4] = txdr_unsigned(op->nfso_own->nfsow_seqid);
 
 	error = nfscl_request(nd, vp, p, cred, NULL);
 	if (error != 0)
 		return (error);
 
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		/* Save the stateid returned by the server. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		op->nfso_stateid.seqid = tl[0];
 		op->nfso_stateid.other[0] = tl[1];
 		op->nfso_stateid.other[1] = tl[2];
 		op->nfso_stateid.other[2] = tl[3];
 	}
 	error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the setclientid and setclientid confirm RPCs. Called from nfs_statfs()
  * when a mount has just occurred and when the server replies NFSERR_EXPIRED.
  *
  * When NFSHASNFSV4N() is true for the mount, an ExchangeID followed by a
  * CreateSession is done instead and the new session is inserted at the
  * head of nmp->nm_sess.  If "reclaim" is 0, a ReclaimComplete is then done;
  * NFSERR_COMPLETEALREADY and NFSERR_NOTSUPP replies to it are ignored.
  *
  * Otherwise a zeroed nfsclds structure is allocated and inserted at the
  * head of nmp->nm_sess, then SetClientID, SetClientIDConfirm and a Getattr
  * of the lease time are done.  On success clp's renew/expire times are set
  * from the lease and nfsc_clientidrev is incremented, skipping 0.
  *
  * Returns 0 on success, otherwise an RPC layer error or the reply status
  * of the operation that failed.
  */
 APPLESTATIC int
 nfsrpc_setclient(struct nfsmount *nmp, struct nfsclclient *clp, int reclaim,
     struct ucred *cred, NFSPROC_T *p)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	u_int8_t *cp = NULL, *cp2, addr[INET6_ADDRSTRLEN + 9];
 	u_short port;
 	int error, isinet6 = 0, callblen;
 	nfsquad_t confirm;
 	u_int32_t lease;
 	static u_int32_t rev = 0;
 	struct nfsclds *dsp;
 	struct in6_addr a6;
 	struct nfsclsession *tsep;
 
 	if (nfsboottime.tv_sec == 0)
 		NFSSETBOOTTIME(nfsboottime);
 	clp->nfsc_rev = rev++;
 	if (NFSHASNFSV4N(nmp)) {
 		/*
 		 * Either there was no previous session or the
 		 * previous session has failed, so...
 		 * do an ExchangeID followed by the CreateSession.
 		 */
 		/*
 		 * Start with dsp NULL, so that the failure path below
 		 * never frees an uninitialized pointer when the ExchangeID
 		 * fails without handing back a structure.
 		 */
 		dsp = NULL;
 		error = nfsrpc_exchangeid(nmp, clp, &nmp->nm_sockreq,
 		    NFSV4EXCH_USEPNFSMDS | NFSV4EXCH_USENONPNFS, &dsp, cred, p);
 		NFSCL_DEBUG(1, "aft exch=%d\n", error);
 		if (error == 0)
 			error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess,
 			    &nmp->nm_sockreq,
 			    dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p);
 		if (error == 0) {
 			NFSLOCKMNT(nmp);
 			/*
 			 * The old sessions cannot be safely free'd
 			 * here, since they may still be used by
 			 * in-progress RPCs.
 			 */
 			tsep = NULL;
 			if (TAILQ_FIRST(&nmp->nm_sess) != NULL)
 				tsep = NFSMNT_MDSSESSION(nmp);
 			TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp,
 			    nfsclds_list);
 			/*
 			 * Wake up RPCs waiting for a slot on the
 			 * old session. These will then fail with
 			 * NFSERR_BADSESSION and be retried with the
 			 * new session by nfsv4_setsequence().
 			 * Also wakeup() processes waiting for the
 			 * new session.
 			 */
 			if (tsep != NULL)
 				wakeup(&tsep->nfsess_slots);
 			wakeup(&nmp->nm_sess);
 			NFSUNLOCKMNT(nmp);
 		} else if (dsp != NULL)
 			nfscl_freenfsclds(dsp);
 		NFSCL_DEBUG(1, "aft createsess=%d\n", error);
 		if (error == 0 && reclaim == 0) {
 			error = nfsrpc_reclaimcomplete(nmp, cred, p);
 			NFSCL_DEBUG(1, "aft reclaimcomp=%d\n", error);
 			if (error == NFSERR_COMPLETEALREADY ||
 			    error == NFSERR_NOTSUPP)
 				/* Ignore this error. */
 				error = 0;
 		}
 		return (error);
 	}
 
 	/*
 	 * Allocate a single session structure for NFSv4.0, because some of
 	 * the fields are used by NFSv4.0 although it doesn't do a session.
 	 */
 	dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO);
 	mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 	mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF);
 	NFSLOCKMNT(nmp);
 	TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list);
 	tsep = NFSMNT_MDSSESSION(nmp);
 	NFSUNLOCKMNT(nmp);
 
 	/* The client verifier is the boot time plus a revision counter. */
 	nfscl_reqstart(nd, NFSPROC_SETCLIENTID, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec);
 	*tl = txdr_unsigned(clp->nfsc_rev);
 	(void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen);
 
 	/*
 	 * set up the callback address
 	 */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFS_CALLBCKPROG);
 	callblen = strlen(nfsv4_callbackaddr);
 	if (callblen == 0)
 		cp = nfscl_getmyip(nmp, &a6, &isinet6);
 	if (nfscl_enablecallb && nfs_numnfscbd > 0 &&
 	    (callblen > 0 || cp != NULL)) {
 		/*
 		 * The two bytes of the port, in network byte order, are
 		 * appended to the address as ".<byte0>.<byte1>".
 		 */
 		port = htons(nfsv4_cbport);
 		cp2 = (u_int8_t *)&port;
 #ifdef INET6
 		if ((callblen > 0 &&
 		     strchr(nfsv4_callbackaddr, ':')) || isinet6) {
 			char ip6buf[INET6_ADDRSTRLEN], *ip6add;
 
 			(void) nfsm_strtom(nd, "tcp6", 4);
 			if (callblen == 0) {
 				ip6_sprintf(ip6buf, (struct in6_addr *)cp);
 				ip6add = ip6buf;
 			} else {
 				ip6add = nfsv4_callbackaddr;
 			}
 			snprintf(addr, sizeof(addr), "%s.%d.%d",
 			    ip6add, cp2[0], cp2[1]);
 		} else
 #endif
 		{
 			(void) nfsm_strtom(nd, "tcp", 3);
 			if (callblen == 0)
 				snprintf(addr, sizeof(addr),
 				    "%d.%d.%d.%d.%d.%d", cp[0], cp[1],
 				    cp[2], cp[3], cp2[0], cp2[1]);
 			else
 				snprintf(addr, sizeof(addr),
 				    "%s.%d.%d", nfsv4_callbackaddr,
 				    cp2[0], cp2[1]);
 		}
 		(void) nfsm_strtom(nd, addr, strlen(addr));
 	} else {
 		/* No usable callback path, so send a null address. */
 		(void) nfsm_strtom(nd, "tcp", 3);
 		(void) nfsm_strtom(nd, "0.0.0.0.0.0", 11);
 	}
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(clp->nfsc_cbident);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 		NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 	    /* The reply holds the clientid followed by the confirm value. */
 	    NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	    tsep->nfsess_clientid.lval[0] = *tl++;
 	    tsep->nfsess_clientid.lval[1] = *tl++;
 	    confirm.lval[0] = *tl++;
 	    confirm.lval[1] = *tl;
 	    mbuf_freem(nd->nd_mrep);
 	    nd->nd_mrep = NULL;
 
 	    /*
 	     * and confirm it.
 	     */
 	    nfscl_reqstart(nd, NFSPROC_SETCLIENTIDCFRM, nmp, NULL, 0, NULL,
 		NULL);
 	    NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	    *tl++ = tsep->nfsess_clientid.lval[0];
 	    *tl++ = tsep->nfsess_clientid.lval[1];
 	    *tl++ = confirm.lval[0];
 	    *tl = confirm.lval[1];
 	    nd->nd_flag |= ND_USEGSSNAME;
 	    error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
 		cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	    if (error)
 		return (error);
 	    mbuf_freem(nd->nd_mrep);
 	    nd->nd_mrep = NULL;
 	    if (nd->nd_repstat == 0) {
 		/* Get the lease time, to set the renew and expiry times. */
 		nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, nmp->nm_fh,
 		    nmp->nm_fhsize, NULL, NULL);
 		NFSZERO_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_LEASETIME);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_USEGSSNAME;
 		error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
 		    cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 		if (error)
 		    return (error);
 		if (nd->nd_repstat == 0) {
 		    error = nfsv4_loadattr(nd, NULL, NULL, NULL, NULL, 0, NULL,
 			NULL, NULL, NULL, NULL, 0, NULL, &lease, NULL, p, cred);
 		    if (error)
 			goto nfsmout;
 		    clp->nfsc_renew = NFSCL_RENEW(lease);
 		    clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew;
 		    /* Bump the clientid revision, never leaving it at 0. */
 		    clp->nfsc_clientidrev++;
 		    if (clp->nfsc_clientidrev == 0)
 			clp->nfsc_clientidrev++;
 		}
 	    }
 	}
 	error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs getattr call.
  * Does a Getattr RPC for the vnode and, when the server replies without
  * error, loads the returned attributes into *nap.
  * Returns the error from nfscl_request() if the request fails, otherwise
  * the reply status or the result of loading the attributes.
  */
 APPLESTATIC int
 nfsrpc_getattr(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *nap, void *stuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	int error;
 
 	NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* NFSv4 needs the set of attributes being asked for. */
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	/* Only parse the attributes when the server reported success. */
 	error = nd->nd_repstat;
 	if (error == 0)
 		error = nfsm_loadattr(nd, nap);
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs getattr call with non-vnode arguments.
  * The file is identified by the file handle fhp/fhlen on mount nmp.
  * For NFSv4 the lease time attribute is requested as well and handed to
  * nfsv4_loadattr() via leasep.
  * A non-zero "syscred" sets ND_USEGSSNAME on the request.
  * Returns the error from newnfs_request() if the request fails, otherwise
  * the reply status or the result of loading the attributes into *nap.
  */
 APPLESTATIC int
 nfsrpc_getattrnovp(struct nfsmount *nmp, u_int8_t *fhp, int fhlen, int syscred,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, u_int64_t *xidp,
     uint32_t *leasep)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	int error, vers;
 
 	nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, fhp, fhlen, NULL, NULL);
 
 	/* Pick the RPC version that matches the request's protocol flag. */
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		vers = NFS_VER4;
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_LEASETIME);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	} else if ((nd->nd_flag & ND_NFSV3) != 0)
 		vers = NFS_VER3;
 	else
 		vers = NFS_VER2;
 
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, vers, NULL, 1, xidp, NULL);
 	if (error != 0)
 		return (error);
 
 	/* Only parse the attributes when the server reported success. */
 	error = nd->nd_repstat;
 	if (error == 0) {
 		if ((nd->nd_flag & ND_NFSV4) != 0)
 			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 			    NULL, NULL, NULL, NULL, NULL, 0, NULL, leasep, NULL,
 			    NULL, NULL);
 		else
 			error = nfsm_loadattr(nd, nap);
 	}
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do an nfs setattr operation.
  * When vap is non-NULL the attributes in vap are set via
  * nfsrpc_setattrrpc(), otherwise the ACL aclp is set via
  * nfsrpc_setaclrpc().
  * For NFSv4 mounts a stateid is acquired first and, if none is available
  * for a regular file, the file may be opened for the duration of the RPC.
  * The RPC is retried for the errors listed in the loop condition below.
  * Returns 0 on success, the error from the last attempt, or EIO if it
  * still failed after 4 or more attempts.
  */
 APPLESTATIC int
 nfsrpc_setattr(vnode_t vp, struct vattr *vap, NFSACL_T *aclp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp,
     void *stuff)
 {
 	int error, expireret = 0, openerr, retrycnt;
 	u_int32_t clidrev = 0, mode;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsfh *nfhp;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	/* Setting the size asks for a write access stateid, else read. */
 	if (vap != NULL && NFSATTRISSET(u_quad_t, vap, va_size))
 		mode = NFSV4OPEN_ACCESSWRITE;
 	else
 		mode = NFSV4OPEN_ACCESSREAD;
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		/* openerr stays non-zero unless an Open is done below. */
 		openerr = 1;
 		if (NFSHASNFSV4(nmp)) {
 			nfhp = VTONFS(vp)->n_fhp;
 			error = nfscl_getstateid(vp, nfhp->nfh_fh,
 			    nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp);
 			if (error && vnode_vtype(vp) == VREG &&
 			    (mode == NFSV4OPEN_ACCESSWRITE ||
 			     nfstest_openallsetattr)) {
 				/*
 				 * No Open stateid, so try and open the file
 				 * now.
 				 */
 				if (mode == NFSV4OPEN_ACCESSWRITE)
 					openerr = nfsrpc_open(vp, FWRITE, cred,
 					    p);
 				else
 					openerr = nfsrpc_open(vp, FREAD, cred,
 					    p);
 				if (!openerr)
 					(void) nfscl_getstateid(vp,
 					    nfhp->nfh_fh, nfhp->nfh_len,
 					    mode, 0, cred, p, &stateid, &lckp);
 			}
 		}
 		if (vap != NULL)
 			error = nfsrpc_setattrrpc(vp, vap, &stateid, cred, p,
 			    rnap, attrflagp, stuff);
 		else
 			error = nfsrpc_setaclrpc(vp, cred, p, aclp, &stateid,
 			    stuff);
 		/* Record on the mount that a read mode stateid was refused. */
 		if (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD) {
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_OPENMODE;
 			NFSUNLOCKMNT(nmp);
 		}
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		/* Close the file again if it was opened just for this RPC. */
 		if (!openerr)
 			(void) nfsrpc_close(vp, 0, p);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_setattr");
 		} else if ((error == NFSERR_EXPIRED ||
 		    error == NFSERR_BADSTATEID) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4) ||
 	    (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD &&
 	     retrycnt < 4));
 	/* Any error left after 4 or more attempts is reported as EIO. */
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The actual Setattr RPC.
  * Sends the attributes in vap for the vnode, using *stateidp for NFSv4,
  * and parses the attributes in the reply into *rnap, with *attrflagp
  * reporting whether they were returned.
  * Note that vap->va_type is overwritten with the vnode's type.
  * Returns the error from nfscl_request() if the request fails, otherwise
  * a reply parsing error or the reply status.
  */
 static int
 nfsrpc_setattrrpc(vnode_t vp, struct vattr *vap,
     nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *rnap, int *attrflagp, void *stuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int error;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_SETATTR, vp);
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	vap->va_type = vnode_vtype(vp);
 	nfscl_fillsattr(nd, vap, vp, NFSSATTR_FULL, 0);
 
 	if ((nd->nd_flag & ND_NFSV3) != 0) {
 		/* NFSv3 takes one more argument word, sent as false. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = newnfs_false;
 	} else if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* NFSv4 appends a Getattr operation to the compound. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		error = nfscl_wcc_data(nd, vp, rnap, attrflagp, NULL, stuff);
 	if (error == 0 &&
 	    (nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4)
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 	if (error == 0 && nd->nd_repstat == 0 &&
 	    (nd->nd_flag & ND_NFSV3) == 0)
 		error = nfscl_postop_attr(nd, rnap, attrflagp, stuff);
 	mbuf_freem(nd->nd_mrep);
 
 	/* A reply parsing error takes precedence over the reply status. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * nfs lookup rpc
  * Looks up name/len in the directory dvp.
  * On success a file handle is returned in *nfhpp.  The attributes of the
  * looked up object go in *nap with *attrflagp and those of the directory
  * in *dnap with *dattrflagp.
  * For NFSv4, "." is answered locally with a copy of the directory's file
  * handle and ".." is done as a Lookupp operation.
  * Returns ENOTDIR if dvp is not a directory, ENAMETOOLONG if len exceeds
  * NFS_MAXNAMLEN, otherwise 0, an RPC or reply parsing error, or the reply
  * status.
  */
 APPLESTATIC int
 nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap,
     struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *stuff)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	struct nfsfh *nfhp;
 	nfsattrbit_t attrbits;
 	int error = 0, lookupp = 0;
 
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (vnode_vtype(dvp) != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(vnode_mount(dvp));
 	if (len > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	if (NFSHASNFSV4(nmp) && len == 1 &&
 		name[0] == '.') {
 		/*
 		 * Just return the current dir's fh.
 		 */
 		np = VTONFS(dvp);
 		MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) +
 			np->n_fhp->nfh_len, M_NFSFH, M_WAITOK);
 		nfhp->nfh_len = np->n_fhp->nfh_len;
 		NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len);
 		*nfhpp = nfhp;
 		return (0);
 	}
 	if (NFSHASNFSV4(nmp) && len == 2 &&
 		name[0] == '.' && name[1] == '.') {
 		lookupp = 1;
 		NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, dvp);
 	} else {
 		NFSCL_REQSTART(nd, NFSPROC_LOOKUP, dvp);
 		(void) nfsm_strtom(nd, name, len);
 	}
 	if (nd->nd_flag & ND_NFSV4) {
 		/* Follow the lookup with GetFH and Getattr operations. */
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, dvp, p, cred, stuff);
 	if (error)
 		return (error);
 	if (nd->nd_repstat) {
 		/*
 		 * When an NFSv4 Lookupp returns ENOENT, it means that
 		 * the lookup is at the root of an fs, so return this dir.
 		 */
 		if (nd->nd_repstat == NFSERR_NOENT && lookupp) {
 		    np = VTONFS(dvp);
 		    MALLOC(nfhp, struct nfsfh *, sizeof (struct nfsfh) +
 			np->n_fhp->nfh_len, M_NFSFH, M_WAITOK);
 		    nfhp->nfh_len = np->n_fhp->nfh_len;
 		    NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len);
 		    *nfhpp = nfhp;
 		    mbuf_freem(nd->nd_mrep);
 		    return (0);
 		}
 		/* The lookup failed, but try to get the dir's attributes. */
 		if (nd->nd_flag & ND_NFSV3)
 		    error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff);
 		else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 		    ND_NFSV4) {
 			/* Load the directory attributes. */
 			error = nfsm_loadattr(nd, dnap);
 			if (error == 0)
 				*dattrflagp = 1;
 		}
 		goto nfsmout;
 	}
 	if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 		/* Load the directory attributes. */
 		error = nfsm_loadattr(nd, dnap);
 		if (error != 0)
 			goto nfsmout;
 		*dattrflagp = 1;
 		/* Skip over the Lookup and GetFH operation status values. */
 		NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	}
 	error = nfsm_getfh(nd, nfhpp);
 	if (error)
 		goto nfsmout;
 
 	error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 	if ((nd->nd_flag & ND_NFSV3) && !error)
 		error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	/* A reply parsing error takes precedence over the reply status. */
 	if (!error && nd->nd_repstat)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * Do a readlink rpc.
  * The link data from the reply is copied into uiop.  The attributes in
  * the reply, if any were parsed, go in *nap with *attrflagp.
  * Returns the error from nfscl_request() if the request fails, otherwise
  * a reply parsing error or the reply status.
  */
 APPLESTATIC int
 nfsrpc_readlink(vnode_t vp, struct uio *uiop, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsnode *np = VTONFS(vp);
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int cangetattr = 1, error, len;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_READLINK, vp);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/*
 		 * And do a Getattr op.
 		 */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	/* For NFSv3 the attributes precede the link data in the reply. */
 	if ((nd->nd_flag & ND_NFSV3) != 0)
 		error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 	if (error == 0 && nd->nd_repstat == 0) {
 		NFSM_STRSIZ(len, NFS_MAXPATHLEN);
 		/*
 		 * This seems weird to me, but must have been added to
 		 * FreeBSD for some reason. The only thing I can think of
 		 * is that there was/is some server that replies with
 		 * more link data than it should?
 		 */
 		if (len == NFS_MAXPATHLEN) {
 			NFSLOCKNODE(np);
 			if (np->n_size > 0 && np->n_size < NFS_MAXPATHLEN) {
 				/*
 				 * Only part of the data is consumed, so the
 				 * attributes after it are not parsed.
 				 */
 				cangetattr = 0;
 				len = np->n_size;
 			}
 			NFSUNLOCKNODE(np);
 		}
 		error = nfsm_mbufuio(nd, uiop, len);
 		if (error == 0 && cangetattr != 0 &&
 		    (nd->nd_flag & ND_NFSV4) != 0)
 			error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 	}
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Read operation.
  * Wrapper around nfsrpc_readrpc() that, for NFSv4 mounts, acquires a
  * read access stateid before each attempt and uses a new credential
  * obtained via NFSNEWCRED().
  * The RPC is retried for the errors listed in the loop condition below.
  * Returns 0 on success, the error from the last attempt, or EIO if it
  * still failed after 4 or more attempts.
  */
 APPLESTATIC int
 nfsrpc_read(vnode_t vp, struct uio *uiop, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff)
 {
 	int error, expireret = 0, retrycnt;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *newcred;
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	newcred = cred;
 	if (NFSHASNFSV4(nmp)) {
 		nfhp = np->n_fhp;
 		newcred = NFSNEWCRED(cred);
 	}
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		if (NFSHASNFSV4(nmp))
 			(void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 			    NFSV4OPEN_ACCESSREAD, 0, newcred, p, &stateid,
 			    &lckp);
 		error = nfsrpc_readrpc(vp, uiop, newcred, &stateid, p, nap,
 		    attrflagp, stuff);
 		/* Record on the mount that the open mode was refused. */
 		if (error == NFSERR_OPENMODE) {
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_OPENMODE;
 			NFSUNLOCKMNT(nmp);
 		}
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_read");
 		} else if ((error == NFSERR_EXPIRED ||
 		    error == NFSERR_BADSTATEID) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4) ||
 	    (error == NFSERR_OPENMODE && retrycnt < 4));
 	/* Any error left after 4 or more attempts is reported as EIO. */
 	if (error && retrycnt >= 4)
 		error = EIO;
 	/* newcred is only a separate credential for NFSv4 mounts. */
 	if (NFSHASNFSV4(nmp))
 		NFSFREECRED(newcred);
 	return (error);
 }
 
 /*
  * The actual read RPC.
  * Reads uio_uio_resid(uiop) bytes starting at uiop->uio_offset, using as
  * many Read RPCs of at most nm_rsize bytes as are needed.  The loop stops
  * early on end of file or a short read.
  * Returns EFBIG when the end offset exceeds nm_maxfilesize or wraps,
  * otherwise 0, an RPC or reply parsing error, or the reply status.
  */
 static int
 nfsrpc_readrpc(vnode_t vp, struct uio *uiop, struct ucred *cred,
     nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap,
     int *attrflagp, void *stuff)
 {
 	u_int32_t *tl;
 	int error = 0, len, retlen, tsiz, eof = 0;
 	struct nfsrv_descript nfsd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsrv_descript *nd = &nfsd;
 	int rsize;
 	off_t tmp_off;
 
 	*attrflagp = 0;
 	tsiz = uio_uio_resid(uiop);
 	tmp_off = uiop->uio_offset + tsiz;
 	NFSLOCKMNT(nmp);
 	/* The second test catches the end offset wrapping around. */
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) {
 		NFSUNLOCKMNT(nmp);
 		return (EFBIG);
 	}
 	rsize = nmp->nm_rsize;
 	NFSUNLOCKMNT(nmp);
 	nd->nd_mrep = NULL;
 	while (tsiz > 0) {
 		*attrflagp = 0;
 		len = (tsiz > rsize) ? rsize : tsiz;
 		NFSCL_REQSTART(nd, NFSPROC_READ, vp);
 		if (nd->nd_flag & ND_NFSV4)
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED * 3);
 		if (nd->nd_flag & ND_NFSV2) {
 			/* NFSv2: 32bit offset, count and a zero word. */
 			*tl++ = txdr_unsigned(uiop->uio_offset);
 			*tl++ = txdr_unsigned(len);
 			*tl = 0;
 		} else {
 			/* NFSv3 and NFSv4: 64bit offset followed by count. */
 			txdr_hyper(uiop->uio_offset, tl);
 			*(tl + 2) = txdr_unsigned(len);
 		}
 		/*
 		 * Since I can't do a Getattr for NFSv4 for Write, there
 		 * doesn't seem any point in doing one here, either.
 		 * (See the comment in nfsrpc_writerpc() for more info.)
 		 */
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (nd->nd_flag & ND_NFSV3) {
 			error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 		} else if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV2)) {
 			error = nfsm_loadattr(nd, nap);
 			if (!error)
 				*attrflagp = 1;
 		}
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		/* Pick up the eof flag, which NFSv2 does not return. */
 		if (nd->nd_flag & ND_NFSV3) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *(tl + 1));
 		} else if (nd->nd_flag & ND_NFSV4) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 		}
 		NFSM_STRSIZ(retlen, len);
 		error = nfsm_mbufuio(nd, uiop, retlen);
 		if (error)
 			goto nfsmout;
 		mbuf_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 		tsiz -= retlen;
 		/* Stop at eof or an empty read; for NFSv2, at a short read. */
 		if (!(nd->nd_flag & ND_NFSV2)) {
 			if (eof || retlen == 0)
 				tsiz = 0;
 		} else if (retlen < len)
 			tsiz = 0;
 	}
 	return (0);
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs write operation
  * When called_from_strategy != 0, it should return EIO for an error that
  * indicates recovery is in progress, so that the buffer will be left
  * dirty and be written back to the server later. If it loops around,
  * the recovery thread could get stuck waiting for the buffer and recovery
  * will then deadlock.
  * For NFSv4 mounts a write access stateid is acquired before each
  * attempt and a new credential obtained via NFSNEWCRED() is used.
  * *must_commit is cleared on entry; nfsrpc_writerpc() may set it.
  * Returns 0 on success, the error from the last attempt, or EIO as
  * described above or if it still failed after 4 or more attempts.
  */
 APPLESTATIC int
 nfsrpc_write(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     void *stuff, int called_from_strategy)
 {
 	int error, expireret = 0, retrycnt, nostateid;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *newcred;
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	*must_commit = 0;
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	newcred = cred;
 	if (NFSHASNFSV4(nmp)) {
 		newcred = NFSNEWCRED(cred);
 		nfhp = np->n_fhp;
 	}
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		nostateid = 0;
 		if (NFSHASNFSV4(nmp)) {
 			(void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 			    NFSV4OPEN_ACCESSWRITE, 0, newcred, p, &stateid,
 			    &lckp);
 			/* All three "other" words zero means no stateid. */
 			if (stateid.other[0] == 0 && stateid.other[1] == 0 &&
 			    stateid.other[2] == 0) {
 				nostateid = 1;
 				NFSCL_DEBUG(1, "stateid0 in write\n");
 			}
 		}
 
 		/*
 		 * If there is no stateid for NFSv4, it means this is an
 		 * extraneous write after close. Basically a poorly
 		 * implemented buffer cache. Just don't do the write.
 		 */
 		if (nostateid)
 			error = 0;
 		else
 			error = nfsrpc_writerpc(vp, uiop, iomode, must_commit,
 			    newcred, &stateid, p, nap, attrflagp, stuff);
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_write");
 		} else if ((error == NFSERR_EXPIRED ||
 		    error == NFSERR_BADSTATEID) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		}
 		retrycnt++;
 	/*
 	 * The recovery related errors are only retried here when not
 	 * called from the strategy routine.
 	 */
 	} while (error == NFSERR_GRACE || error == NFSERR_DELAY ||
 	    ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER) && called_from_strategy == 0) ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/*
 	 * Map to EIO after 4 or more attempts, or for a recovery related
 	 * error when called from the strategy routine.
 	 */
 	if (error != 0 && (retrycnt >= 4 ||
 	    ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER) && called_from_strategy != 0)))
 		error = EIO;
 	/* newcred is only a separate credential for NFSv4 mounts. */
 	if (NFSHASNFSV4(nmp))
 		NFSFREECRED(newcred);
 	return (error);
 }
 
 /*
  * The actual write RPC.
  * Writes uio_uio_resid(uiop) bytes starting at uiop->uio_offset, using as
  * many Write RPCs of at most nm_wsize bytes as are needed.  The uio must
  * have a single iovec.
  * *iomode is the stability requested on entry.  Except when returning
  * early, it is replaced by the lowest commitment level any of the RPCs
  * returned.
  * *must_commit is set to 1 when the write verifier in a reply differs
  * from the one saved in the mount point.
  * Returns EFBIG when the end offset exceeds nm_maxfilesize or wraps,
  * otherwise 0, an RPC or reply parsing error, or the reply status.
  */
 static int
 nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode,
     int *must_commit, struct ucred *cred, nfsv4stateid_t *stateidp,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff)
 {
 	u_int32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0, len, tsiz, rlen, commit, committed = NFSWRITE_FILESYNC;
 	int wccflag = 0, wsize;
 	int32_t backup;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	off_t tmp_off;
 
 	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
 	*attrflagp = 0;
 	tsiz = uio_uio_resid(uiop);
 	tmp_off = uiop->uio_offset + tsiz;
 	NFSLOCKMNT(nmp);
 	/* The second test catches the end offset wrapping around. */
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) {
 		NFSUNLOCKMNT(nmp);
 		return (EFBIG);
 	}
 	wsize = nmp->nm_wsize;
 	NFSUNLOCKMNT(nmp);
 	nd->nd_mrep = NULL;	/* NFSv2 sometimes does a write with */
 	nd->nd_repstat = 0;	/* uio_resid == 0, so the while is not done */
 	while (tsiz > 0) {
 		*attrflagp = 0;
 		len = (tsiz > wsize) ? wsize : tsiz;
 		NFSCL_REQSTART(nd, NFSPROC_WRITE, vp);
 		if (nd->nd_flag & ND_NFSV4) {
 			/* NFSv4: stateid, offset, stable and data length. */
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+2*NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else if (nd->nd_flag & ND_NFSV3) {
 			/* NFSv3: offset, count, stable and data length. */
 			NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+3*NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(len);
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else {
 			u_int32_t x;
 
 			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			/*
 			 * Not sure why someone changed this, since the
 			 * RFC clearly states that "beginoffset" and
 			 * "totalcount" are ignored, but it wouldn't
 			 * surprise me if there's a busted server out there.
 			 */
 			/* Set both "begin" and "current" to non-garbage. */
 			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
 			*tl++ = x;      /* "begin offset" */
 			*tl++ = x;      /* "current offset" */
 			x = txdr_unsigned(len);
 			*tl++ = x;      /* total to this offset */
 			*tl = x;        /* size of this write */
 
 		}
 		nfsm_uiombuf(nd, uiop, len);
 		/*
 		 * Although it is tempting to do a normal Getattr Op in the
 		 * NFSv4 compound, the result can be a nearly hung client
 		 * system if the Getattr asks for Owner and/or OwnerGroup.
 		 * It occurs when the client can't map either the Owner or
 		 * Owner_group name in the Getattr reply to a uid/gid. When
 		 * there is a cache miss, the kernel does an upcall to the
 		 * nfsuserd. Then, it can try and read the local /etc/passwd
 		 * or /etc/group file. It can then block in getnewbuf(),
 		 * waiting for dirty writes to be pushed to the NFS server.
 		 * The only reason this doesn't result in a complete
 		 * deadlock, is that the upcall times out and allows
 		 * the write to complete. However, progress is so slow
 		 * that it might just as well be deadlocked.
 		 * As such, we get the rest of the attributes, but not
 		 * Owner or Owner_group.
 		 * nb: nfscl_loadattrcache() needs to be told that these
 		 *     partial attributes from a write rpc are being
 		 *     passed in, via a argument flag.
 		 */
 		if (nd->nd_flag & ND_NFSV4) {
 			NFSWRITEGETATTR_ATTRBIT(&attrbits);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 		}
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (nd->nd_repstat) {
 			/*
 			 * In case the rpc gets retried, roll
 			 * the uio fields changed by nfsm_uiombuf()
 			 * back.
 			 */
 			uiop->uio_offset -= len;
 			uio_uio_resid_add(uiop, len);
 			uio_iov_base_add(uiop, -len);
 			uio_iov_len_add(uiop, len);
 		}
 		if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 			error = nfscl_wcc_data(nd, vp, nap, attrflagp,
 			    &wccflag, stuff);
 			if (error)
 				goto nfsmout;
 		}
 		if (!nd->nd_repstat) {
 			if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 				/* Reply: count, committed and the verifier. */
 				NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED
 					+ NFSX_VERF);
 				rlen = fxdr_unsigned(int, *tl++);
 				if (rlen == 0) {
 					error = NFSERR_IO;
 					goto nfsmout;
 				} else if (rlen < len) {
 					/*
 					 * Short write, so back the uio up
 					 * by the amount not written.
 					 */
 					backup = len - rlen;
 					uio_iov_base_add(uiop, -(backup));
 					uio_iov_len_add(uiop, backup);
 					uiop->uio_offset -= backup;
 					uio_uio_resid_add(uiop, backup);
 					len = rlen;
 				}
 				commit = fxdr_unsigned(int, *tl++);
 
 				/*
 				 * Return the lowest commitment level
 				 * obtained by any of the RPCs.
 				 */
 				if (committed == NFSWRITE_FILESYNC)
 					committed = commit;
 				else if (committed == NFSWRITE_DATASYNC &&
 					commit == NFSWRITE_UNSTABLE)
 					committed = commit;
 				/*
 				 * Save the first write verifier seen for the
 				 * mount.  A later verifier that differs sets
 				 * *must_commit and replaces the saved one.
 				 */
 				NFSLOCKMNT(nmp);
 				if (!NFSHASWRITEVERF(nmp)) {
 					NFSBCOPY((caddr_t)tl,
 					    (caddr_t)&nmp->nm_verf[0],
 					    NFSX_VERF);
 					NFSSETWRITEVERF(nmp);
 	    			} else if (NFSBCMP(tl, nmp->nm_verf,
 				    NFSX_VERF)) {
 					*must_commit = 1;
 					NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 				}
 				NFSUNLOCKMNT(nmp);
 			}
 			/* Skip two words ahead of the NFSv4 attributes. */
 			if (nd->nd_flag & ND_NFSV4)
 				NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (nd->nd_flag & (ND_NFSV2 | ND_NFSV4)) {
 				error = nfsm_loadattr(nd, nap);
 				if (!error)
 					*attrflagp = NFS_LATTR_NOSHRINK;
 			}
 		} else {
 			error = nd->nd_repstat;
 		}
 		if (error)
 			goto nfsmout;
 		NFSWRITERPC_SETTIME(wccflag, np, nap, (nd->nd_flag & ND_NFSV4));
 		mbuf_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 		tsiz -= len;
 	}
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	*iomode = committed;
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  *
  * *nfhpp, *attrflagp and *dattrflagp are cleared on entry and are only
  * filled in from the reply.
  * Returns ENAMETOOLONG when namelen exceeds NFS_MAXNAMLEN, the error from
  * nfscl_request() when the RPC could not be done, and otherwise either a
  * reply parsing error or the status the server returned (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     u_int32_t rdev, enum vtype vtyp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp, void *dstuff)
 {
 	u_int32_t *tl;
 	int error = 0;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*nfhpp = NULL;
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	NFSCL_REQSTART(nd, NFSPROC_MKNOD, dvp);
 	/*
 	 * NFSv4 puts the file type, plus the device numbers for block and
 	 * character devices, in front of the name.
 	 */
 	if (nd->nd_flag & ND_NFSV4) {
 		if (vtyp == VBLK || vtyp == VCHR) {
 			NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			*tl++ = vtonfsv34_type(vtyp);
 			*tl++ = txdr_unsigned(NFSMAJOR(rdev));
 			*tl = txdr_unsigned(NFSMINOR(rdev));
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = vtonfsv34_type(vtyp);
 		}
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	/* NFSv3 puts the file type after the name. */
 	if (nd->nd_flag & ND_NFSV3) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = vtonfsv34_type(vtyp);
 	}
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4))
 		nfscl_fillsattr(nd, vap, dvp, 0, 0);
 	/* For NFSv3 the device numbers follow the attributes. */
 	if ((nd->nd_flag & ND_NFSV3) &&
 	    (vtyp == VCHR || vtyp == VBLK)) {
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSMAJOR(rdev));
 		*tl = txdr_unsigned(NFSMINOR(rdev));
 	}
 	/* For NFSv4, also ask for the new object's handle and attributes. */
 	if (nd->nd_flag & ND_NFSV4) {
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	/* The NFSv2 kludge: rdev is passed via NFSSATTR_SIZERDEV. */
 	if (nd->nd_flag & ND_NFSV2)
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZERDEV, rdev);
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error)
 		return (error);
 	if (nd->nd_flag & ND_NFSV4) {
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 		/*
 		 * Stop here on failure, so that the parsing below cannot
 		 * overwrite the error. nfsrpc_mkdir() guards the same way.
 		 */
 		if (error)
 			goto nfsmout;
 	}
 	if (!nd->nd_repstat) {
 		if (nd->nd_flag & ND_NFSV4) {
 			/* Skip five words ahead of the attribute bitmap. */
 			NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 			if (error)
 				goto nfsmout;
 		}
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error)
 			goto nfsmout;
 	}
 	if (nd->nd_flag & ND_NFSV3)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	if (!error && nd->nd_repstat)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs file create call
  * Mostly just calls the appropriate routine. NFSv4 is kept separate from
  * v2/v3, since for it the create is retried while the server reports a
  * transient or recoverable state error.
  * At most 4 expired/bad stateid recoveries are attempted, after which a
  * remaining error is reported as EIO.
  */
 APPLESTATIC int
 nfsrpc_create(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp, void *dstuff)
 {
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(dvp));
 	struct nfsclowner *owp;
 	struct nfscldeleg *dp;
 	u_int32_t clidrev;
 	int error = 0, expireret = 0, newone, retrycnt, unlocked;
 
 	/* NFSv2 and NFSv3 go straight to their own create rpc. */
 	if (!NFSHASNFSV4(nmp))
 		return (nfsrpc_createv23(dvp, name, namelen, vap, cverf,
 		    fmode, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp,
 		    dstuff));
 
 	retrycnt = 0;
 	for (;;) {
 		dp = NULL;
 		error = nfscl_open(dvp, NULL, 0, (NFSV4OPEN_ACCESSWRITE |
 		    NFSV4OPEN_ACCESSREAD), 0, cred, p, &owp, NULL, &newone,
 		    NULL, 1);
 		if (error != 0)
 			return (error);
 		clidrev = (nmp->nm_clp != NULL) ?
 		    nmp->nm_clp->nfsc_clientidrev : 0;
 		error = nfsrpc_createv4(dvp, name, namelen, vap, cverf, fmode,
 		    owp, &dp, cred, p, dnap, nnap, nfhpp, attrflagp,
 		    dattrflagp, dstuff, &unlocked);
 		/*
 		 * Cached attributes need not be invalidated at this point:
 		 * nfsrpc_createv4() always returns new post-delegation issue
 		 * attributes and those will update the attribute cache.
 		 */
 		if (dp != NULL)
 			(void) nfscl_deleg(nmp->nm_mountp, owp->nfsow_clp,
 			    (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, &dp);
 		nfscl_ownerrelease(nmp, owp, error, newone, unlocked);
 
 		/* Transient server conditions: nap and go around again. */
 		if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID ||
 		    error == NFSERR_STALEDONTRECOVER ||
 		    error == NFSERR_DELAY || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_open");
 			continue;
 		}
 
 		/*
 		 * Expired or bad state: retry only when nfscl_hasexpired()
 		 * returned 0 and the retry budget is not used up.
 		 */
 		if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 		    clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 			retrycnt++;
 			if (expireret == 0 && retrycnt < 4)
 				continue;
 		}
 		break;
 	}
 	if (error != 0 && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The create rpc for v2 and 3.
  * NFSv3 sends a create mode word followed by either the exclusive create
  * verifier (when O_EXCL is set in fmode) or the initial attributes.
  * NFSv2 just sends the attributes, filled in with NFSSATTR_SIZE0.
  */
 static int
 nfsrpc_createv23(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp, void *dstuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	/* Build the request. */
 	NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp);
 	(void) nfsm_strtom(nd, name, namelen);
 	if ((nd->nd_flag & ND_NFSV3) == 0) {
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZE0, 0);
 	} else {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		if ((fmode & O_EXCL) == 0) {
 			*tl = txdr_unsigned(NFSCREATE_UNCHECKED);
 			nfscl_fillsattr(nd, vap, dvp, 0, 0);
 		} else {
 			*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 			tl[0] = cverf.lval[0];
 			tl[1] = cverf.lval[1];
 		}
 	}
 
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error != 0)
 		return (error);
 
 	/* Parse the reply; a file handle is only there on success. */
 	if (nd->nd_repstat == 0)
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 	if (error == 0) {
 		if ((nd->nd_flag & ND_NFSV3) != 0)
 			error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp,
 			    NULL, dstuff);
 		if (error == 0 && nd->nd_repstat != 0)
 			error = nd->nd_repstat;
 	}
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The create rpc for NFSv4, which is done as an Open with NFSV4OPEN_CREATE.
  * owp is the open owner, whose seqid and owner name go in the request.
  * On success, *dpp is set to the delegation structure allocated here (or
  * NULL when none was issued), *nfhpp, nnap and *attrflagp are filled in by
  * nfscl_mtofh() and dnap is loaded with the directory attributes, with
  * *dattrflagp set to 1.
  * *unlockedp is set to 1 once nfscl_openrelease() has been called.
  * On error, *dpp stays NULL and any delegation structure is freed.
  * NOTE(review): NFSM_DISSECT() presumably jumps to nfsmout with error set
  * when the reply is too short; the macro is not visible here - confirm.
  */
 static int
 nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, void *dstuff, int *unlockedp)
 {
 	u_int32_t *tl;
 	int error = 0, deleg, newone, ret, acesize, limitby;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclopen *op;
 	struct nfscldeleg *dp = NULL;
 	struct nfsnode *np;
 	struct nfsfh *nfhp;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t stateid;
 	u_int32_t rflags;
 	struct nfsmount *nmp;
 	struct nfsclsession *tsep;
 
 	nmp = VFSTONFS(dvp->v_mount);
 	np = VTONFS(dvp);
 	/* Clear every output argument before the first possible return. */
 	*unlockedp = 0;
 	*nfhpp = NULL;
 	*dpp = NULL;
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp);
 	/*
 	 * For V4, this is actually an Open op.
 	 */
 	NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(owp->nfsow_seqid);
 	*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 	    NFSV4OPEN_ACCESSREAD);
 	*tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE);
 	/* The client id is taken from the session for this mount. */
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	(void) nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_CREATE);
 	/* Pick the create mode and what follows it from O_EXCL. */
 	if (fmode & O_EXCL) {
 		if (NFSHASNFSV4N(nmp)) {
 			if (NFSHASSESSPERSIST(nmp)) {
 				/* Use GUARDED for persistent sessions. */
 				*tl = txdr_unsigned(NFSCREATE_GUARDED);
 				nfscl_fillsattr(nd, vap, dvp, 0, 0);
 			} else {
 				/* Otherwise, use EXCLUSIVE4_1. */
 				*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 				*tl++ = cverf.lval[0];
 				*tl = cverf.lval[1];
 				nfscl_fillsattr(nd, vap, dvp, 0, 0);
 			}
 		} else {
 			/* NFSv4.0 */
 			*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 			*tl++ = cverf.lval[0];
 			*tl = cverf.lval[1];
 		}
 	} else {
 		*tl = txdr_unsigned(NFSCREATE_UNCHECKED);
 		nfscl_fillsattr(nd, vap, dvp, 0, 0);
 	}
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 	(void) nfsm_strtom(nd, name, namelen);
 	/* Get the new file's handle and attributes. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	/* Get the directory's post-op attributes. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	(void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error)
 		return (error);
 	NFSCL_INCRSEQID(owp->nfsow_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		/* Save the open stateid; it is stored in the open below. */
 		stateid.seqid = *tl++;
 		stateid.other[0] = *tl++;
 		stateid.other[1] = *tl++;
 		stateid.other[2] = *tl;
 		/*
 		 * tl was left on the fourth word, so tl + 6 is the tenth
 		 * word dissected (the last one, assuming NFSX_STATEID covers
 		 * the four stateid words just read).
 		 */
 		rflags = fxdr_unsigned(u_int32_t, *(tl + 6));
 		/* The result of parsing these attribute bits is ignored. */
 		(void) nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(int, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			/* Note in the client that a delegation was issued. */
 			if (!(owp->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				owp->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/*
 			 * Allocated with NFSX_V4FHMAX extra bytes; the file
 			 * handle is copied into nfsdl_fh further down.
 			 */
 			MALLOC(dp, struct nfscldeleg *,
 			    sizeof (struct nfscldeleg) + NFSX_V4FHMAX,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&dp->nfsdl_owner);
 			LIST_INIT(&dp->nfsdl_lock);
 			dp->nfsdl_clp = owp->nfsow_clp;
 			newnfs_copyincred(cred, &dp->nfsdl_cred);
 			nfscl_lockinit(&dp->nfsdl_rwlock);
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			dp->nfsdl_stateid.seqid = *tl++;
 			dp->nfsdl_stateid.other[0] = *tl++;
 			dp->nfsdl_stateid.other[1] = *tl++;
 			dp->nfsdl_stateid.other[2] = *tl++;
 			/* Non-zero here sets NFSCLDL_RECALL below. */
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				dp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					dp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					/* The product of the two words. */
 					dp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					dp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 			} else {
 				dp->nfsdl_flags = NFSCLDL_READ;
 			}
 			if (ret)
 				dp->nfsdl_flags |= NFSCLDL_RECALL;
 			/* ret is reused here as an output argument. */
 			error = nfsrv_dissectace(nd, &dp->nfsdl_ace, &ret,
 			    &acesize, p);
 			if (error)
 				goto nfsmout;
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error)
 			goto nfsmout;
 		/* Get rid of the PutFH and Getattr status values. */
 		NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 		/* Load the directory attributes. */
 		error = nfsm_loadattr(nd, dnap);
 		if (error)
 			goto nfsmout;
 		*dattrflagp = 1;
 		/* Record the new file's change and mtime in the delegation. */
 		if (dp != NULL && *attrflagp) {
 			dp->nfsdl_change = nnap->na_filerev;
 			dp->nfsdl_modtime = nnap->na_mtime;
 			dp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 		}
 		/*
 		 * We can now complete the Open state.
 		 */
 		nfhp = *nfhpp;
 		if (dp != NULL) {
 			dp->nfsdl_fhlen = nfhp->nfh_len;
 			NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len);
 		}
 		/*
 		 * Get an Open structure that will be
 		 * attached to the OpenOwner, acquired already.
 		 */
 		error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, 
 		    (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0,
 		    cred, p, NULL, &op, &newone, NULL, 0);
 		if (error)
 			goto nfsmout;
 		op->nfso_stateid = stateid;
 		newnfs_copyincred(cred, &op->nfso_cred);
 		/* Confirm the open if asked to, retrying on NFSERR_DELAY. */
 		if ((rflags & NFSV4OPEN_RESULTCONFIRM)) {
 		    do {
 			ret = nfsrpc_openconfirm(dvp, nfhp->nfh_fh,
 			    nfhp->nfh_len, op, cred, p);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_create");
 		    } while (ret == NFSERR_DELAY);
 		    error = ret;
 		}
 
 		/*
 		 * If the server is handing out delegations, but we didn't
 		 * get one because an OpenConfirm was required, try the
 		 * Open again, to get a delegation. This is a harmless no-op,
 		 * from a server's point of view.
 		 */
 		if ((rflags & NFSV4OPEN_RESULTCONFIRM) &&
 		    (owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) &&
 		    !error && dp == NULL) {
 		    do {
 			ret = nfsrpc_openrpc(VFSTONFS(vnode_mount(dvp)), dvp,
 			    np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
 			    nfhp->nfh_fh, nfhp->nfh_len,
 			    (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), op,
 			    name, namelen, &dp, 0, 0x0, cred, p, 0, 1);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_crt2");
 		    } while (ret == NFSERR_DELAY);
 		    /*
 		     * A failure of this extra Open only drops the
 		     * delegation, except for the three errors below, which
 		     * are passed back to the caller.
 		     */
 		    if (ret) {
 			if (dp != NULL) {
 				FREE((caddr_t)dp, M_NFSCLDELEG);
 				dp = NULL;
 			}
 			if (ret == NFSERR_STALECLIENTID ||
 			    ret == NFSERR_STALEDONTRECOVER ||
 			    ret == NFSERR_BADSESSION)
 				error = ret;
 		    }
 		}
 		nfscl_openrelease(nmp, op, error, newone);
 		*unlockedp = 1;
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(owp->nfsow_clp);
 nfsmout:
 	/* The delegation goes to the caller on success only. */
 	if (!error)
 		*dpp = dp;
 	else if (dp != NULL)
 		FREE((caddr_t)dp, M_NFSCLDELEG);
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Nfs remove rpc
  * For NFSv4, nfscl_removedeleg() is asked first. When it returns 1, the
  * request is started as NFSPROC_RETDELEGREMOVE on vp, carrying the stateid
  * it supplied, followed by PutFH of the directory and the Remove op.
  * Otherwise a plain NFSPROC_REMOVE on dvp is sent.
  * If the combined request fails with ND_NOMOREDATA set, the remove is
  * redone once as a plain NFSPROC_REMOVE.
  * Returns ENAMETOOLONG for an over-long name, the nfscl_request() error,
  * a reply parsing error or the server's status (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_remove(vnode_t dvp, char *name, int namelen, vnode_t vp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp,
     void *dstuff)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	nfsv4stateid_t dstateid;
 	int error, ret = 0, i;
 
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	nmp = VFSTONFS(vnode_mount(dvp));
 tryagain:
 	/*
 	 * ret == 0 only on the first pass. On a retry ret is still non-zero,
 	 * so the else arm resets it and a plain remove is built.
 	 */
 	if (NFSHASNFSV4(nmp) && ret == 0) {
 		ret = nfscl_removedeleg(vp, p, &dstateid);
 		if (ret == 1) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGREMOVE, vp);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			/* When NFSHASNFSV4N() is true the seqid is sent as 0. */
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = dstateid.seqid;
 			*tl++ = dstateid.other[0];
 			*tl++ = dstateid.other[1];
 			*tl++ = dstateid.other[2];
 			*tl = txdr_unsigned(NFSV4OP_PUTFH);
 			np = VTONFS(dvp);
 			(void) nfsm_fhtom(nd, np->n_fhp->nfh_fh,
 			    np->n_fhp->nfh_len, 0);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_REMOVE);
 		}
 	} else {
 		ret = 0;
 	}
 	if (ret == 0)
 		NFSCL_REQSTART(nd, NFSPROC_REMOVE, dvp);
 	(void) nfsm_strtom(nd, name, namelen);
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error)
 		return (error);
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 		/* For NFSv4, parse out any Delegreturn replies. */
 		if (ret > 0 && nd->nd_repstat != 0 &&
 		    (nd->nd_flag & ND_NOMOREDATA)) {
 			/*
 			 * If the Delegreturn failed, try again without
 			 * it. The server will Recall, as required.
 			 */
 			mbuf_freem(nd->nd_mrep);
 			goto tryagain;
 		}
 		/*
 		 * Skip two 2-word results per delegation returned (none when
 		 * ret == 0). A non-zero second word stops further parsing.
 		 */
 		for (i = 0; i < (ret * 2); i++) {
 			if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 			    ND_NFSV4) {
 			    NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			    if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 			}
 		}
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	}
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do an nfs rename rpc.
  * fdvp/fnameptr name the source and tdvp/tnameptr the destination.
  * For NFSv4, nfscl_renamedeleg() reports via gotfd and gottd whether a
  * delegation stateid was supplied for fvp and/or tvp; those are put in
  * front of the rename within the same request. If that part of the request
  * fails, the rename is redone as a plain NFSPROC_RENAME.
  * Directory attributes are returned through fnap/*fattrflagp for fdvp and
  * tnap/*tattrflagp for tdvp.
  * Returns ENAMETOOLONG when either name is too long, the nfscl_request()
  * error, a reply parsing error or the server's status (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen,
     vnode_t tdvp, vnode_t tvp, char *tnameptr, int tnamelen, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *fnap, struct nfsvattr *tnap,
     int *fattrflagp, int *tattrflagp, void *fstuff, void *tstuff)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t fdstateid, tdstateid;
 	int error = 0, ret = 0, gottd = 0, gotfd = 0, i;
 	
 	*fattrflagp = 0;
 	*tattrflagp = 0;
 	nmp = VFSTONFS(vnode_mount(fdvp));
 	if (fnamelen > NFS_MAXNAMLEN || tnamelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 tryagain:
 	/*
 	 * ret == 0 only on the first pass. On a retry ret is still non-zero,
 	 * so the else arm resets it and a plain rename is built.
 	 */
 	if (NFSHASNFSV4(nmp) && ret == 0) {
 		ret = nfscl_renamedeleg(fvp, &fdstateid, &gotfd, tvp,
 		    &tdstateid, &gottd, p);
 		/* Start the request on whichever vnode has a delegation. */
 		if (gotfd && gottd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME2, fvp);
 		} else if (gotfd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, fvp);
 		} else if (gottd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, tvp);
 		}
 		if (gotfd) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			/* When NFSHASNFSV4N() is true the seqid is sent as 0. */
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = fdstateid.seqid;
 			*tl++ = fdstateid.other[0];
 			*tl++ = fdstateid.other[1];
 			*tl = fdstateid.other[2];
 			/* With both, switch to tvp for a second DelegReturn. */
 			if (gottd) {
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_PUTFH);
 				np = VTONFS(tvp);
 				(void) nfsm_fhtom(nd, np->n_fhp->nfh_fh,
 				    np->n_fhp->nfh_len, 0);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_DELEGRETURN);
 			}
 		}
 		if (gottd) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = tdstateid.seqid;
 			*tl++ = tdstateid.other[0];
 			*tl++ = tdstateid.other[1];
 			*tl = tdstateid.other[2];
 		}
 		/* Then make fdvp the current handle and save it. */
 		if (ret > 0) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_PUTFH);
 			np = VTONFS(fdvp);
 			(void) nfsm_fhtom(nd, np->n_fhp->nfh_fh,
 			    np->n_fhp->nfh_len, 0);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_SAVEFH);
 		}
 	} else {
 		ret = 0;
 	}
 	if (ret == 0)
 		NFSCL_REQSTART(nd, NFSPROC_RENAME, fdvp);
 	/*
 	 * For NFSv4: Getattr on the current handle, PutFH of tdvp, Getattr
 	 * on it and then the Rename op itself.
 	 */
 	if (nd->nd_flag & ND_NFSV4) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSWCCATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 		(void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh,
 		    VTONFS(tdvp)->n_fhp->nfh_len, 0);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_V4WCCATTR;
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_RENAME);
 	}
 	(void) nfsm_strtom(nd, fnameptr, fnamelen);
 	/* Before NFSv4, the target directory handle goes between the names. */
 	if (!(nd->nd_flag & ND_NFSV4))
 		(void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh,
 			VTONFS(tdvp)->n_fhp->nfh_len, 0);
 	(void) nfsm_strtom(nd, tnameptr, tnamelen);
 	error = nfscl_request(nd, fdvp, p, cred, fstuff);
 	if (error)
 		return (error);
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 		/* For NFSv4, parse out any Delegreturn replies. */
 		if (ret > 0 && nd->nd_repstat != 0 &&
 		    (nd->nd_flag & ND_NOMOREDATA)) {
 			/*
 			 * If the Delegreturn failed, try again without
 			 * it. The server will Recall, as required.
 			 */
 			mbuf_freem(nd->nd_mrep);
 			goto tryagain;
 		}
 		/* Two 2-word results are skipped per delegation returned. */
 		for (i = 0; i < (ret * 2); i++) {
 			if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 			    ND_NFSV4) {
 			    NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			    if (*(tl + 1)) {
 				if (i == 0 && ret > 1) {
 				    /*
 				     * If the Delegreturn failed, try again
 				     * without it. The server will Recall, as
 				     * required.
 				     * If ret > 1, the first iteration of this
 				     * loop is the second DelegReturn result.
 				     */
 				    mbuf_freem(nd->nd_mrep);
 				    goto tryagain;
 				} else {
 				    nd->nd_flag |= ND_NOMOREDATA;
 				}
 			    }
 			}
 		}
 		/* Now, the first wcc attribute reply. */
 		if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 		}
 		error = nfscl_wcc_data(nd, fdvp, fnap, fattrflagp, NULL,
 		    fstuff);
 		/* and the second wcc attribute reply. */
 		if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 &&
 		    !error) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 		}
 		if (!error)
 			error = nfscl_wcc_data(nd, tdvp, tnap, tattrflagp,
 			    NULL, tstuff);
 	}
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs hard link create rpc
  * Makes "name" in directory dvp a link to vp. The request is started on
  * vp and carries the file handle of dvp followed by the name.
  * For NFSv3, nap and *attrflagp get the post-op attributes of vp. For v3
  * and v4, dnap and *dattrflagp get the directory attributes.
  */
 APPLESTATIC int
 nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nap, int *attrflagp, int *dattrflagp, void *dstuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsfh *dfhp;
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	NFSCL_REQSTART(nd, NFSPROC_LINK, vp);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	}
 	dfhp = VTONFS(dvp)->n_fhp;
 	(void) nfsm_fhtom(nd, dfhp->nfh_fh, dfhp->nfh_len, 0);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* Getattr for the directory, then the Link op. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSWCCATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_V4WCCATTR;
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_LINK);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 
 	error = nfscl_request(nd, vp, p, cred, dstuff);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & ND_NFSV3) != 0) {
 		error = nfscl_postop_attr(nd, nap, attrflagp, dstuff);
 		if (error == 0)
 			error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp,
 			    NULL, dstuff);
 	} else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 		/*
 		 * Step over the PutFH result and, if its status was zero,
 		 * over the Getattr result as well. A non-zero status in the
 		 * last one read means there is nothing more to parse.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (tl[1] == 0) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		}
 		if (tl[1] != 0)
 			nd->nd_flag |= ND_NOMOREDATA;
 		/* Now the directory's pre-op attributes. */
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	}
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs symbolic link create rpc
  * Creates "name" in dvp as a symbolic link to the string "target".
  * The order of the arguments in the request depends on the NFS version.
  * Only the NFSv3 reply is parsed for a file handle (*nfhpp), so for the
  * other versions *nfhpp is left NULL.
  */
 APPLESTATIC int
 nfsrpc_symlink(vnode_t dvp, char *name, int namelen, char *target,
     struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, void *dstuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	u_int32_t *tl;
 	int error = 0, slen;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	nmp = VFSTONFS(vnode_mount(dvp));
 	slen = strlen(target);
 	if (namelen > NFS_MAXNAMLEN || slen > NFS_MAXPATHLEN)
 		return (ENAMETOOLONG);
 
 	NFSCL_REQSTART(nd, NFSPROC_SYMLINK, dvp);
 	/* NFSv4 puts the type and the link target ahead of the name. */
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFLNK);
 		(void) nfsm_strtom(nd, target, slen);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		nfscl_fillsattr(nd, vap, dvp, 0, 0);
 	/* Before NFSv4, the target comes after the name. */
 	if ((nd->nd_flag & ND_NFSV4) == 0)
 		(void) nfsm_strtom(nd, target, slen);
 	if ((nd->nd_flag & ND_NFSV2) != 0)
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0);
 
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	if (error == 0 && (nd->nd_flag & ND_NFSV3) != 0) {
 		if (nd->nd_repstat == 0)
 			error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error == 0)
 			error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp,
 			    NULL, dstuff);
 	}
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
 	 * This is only done when vfs.nfs.ignore_eexist is set and never
 	 * for NFSv4.1 or later minor versions, since sessions should
 	 * guarantee "exactly once" RPC semantics.
 	 */
 	if (error != EEXIST || nfsignore_eexist == 0)
 		return (error);
 	if (NFSHASNFSV4(nmp) && nmp->nm_minorvers != 0)
 		return (error);
 	return (0);
 }
 
 /*
  * nfs make dir rpc
  * Creates directory "name" in dvp. On success *nfhpp, nnap and *attrflagp
  * are filled in by nfscl_mtofh(), while dnap and *dattrflagp return the
  * attributes of dvp.
  * For NFSv4 the request also asks for the new handle and attributes and
  * then for the attributes of dvp via PutFH + Getattr.
  */
 APPLESTATIC int
 nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, void *dstuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(dvp));
 	struct nfsfh *fhp = VTONFS(dvp)->n_fhp;
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	NFSCL_REQSTART(nd, NFSPROC_MKDIR, dvp);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFDIR);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* Handle and attributes of the new directory... */
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		tl[0] = txdr_unsigned(NFSV4OP_GETFH);
 		tl[1] = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		/* ...followed by the attributes of the parent. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 		(void) nfsm_fhtom(nd, fhp->nfh_fh, fhp->nfh_len, 0);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	if (error == 0 && nd->nd_repstat == 0) {
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		}
 		if (error == 0)
 			error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if ((nd->nd_flag & ND_NFSV4) != 0 && error == 0) {
 			/* Step over the PutFH and Getattr status values. */
 			NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			/* What follows are the directory attributes. */
 			error = nfsm_loadattr(nd, dnap);
 			if (error == 0)
 				*dattrflagp = 1;
 		}
 	}
 	if (error == 0 && (nd->nd_flag & ND_NFSV3) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
 	 * This is only done when vfs.nfs.ignore_eexist is set and never
 	 * for NFSv4.1 or later minor versions, since sessions should
 	 * guarantee "exactly once" RPC semantics.
 	 */
 	if (error != EEXIST || nfsignore_eexist == 0)
 		return (error);
 	if (NFSHASNFSV4(nmp) && nmp->nm_minorvers != 0)
 		return (error);
 	return (0);
 }
 
 /*
  * nfs remove directory call
  * Removes directory "name" from dvp. For NFSv3 and NFSv4, dnap and
  * *dattrflagp are filled in by nfscl_wcc_data().
  * A final error of ENOENT is reported as success.
  */
 APPLESTATIC int
 nfsrpc_rmdir(vnode_t dvp, char *name, int namelen, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp, void *dstuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error = 0;
 
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	NFSCL_REQSTART(nd, NFSPROC_RMDIR, dvp);
 	(void) nfsm_strtom(nd, name, namelen);
 	error = nfscl_request(nd, dvp, p, cred, dstuff);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff);
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	return (error == ENOENT ? 0 : error);
 }
 
 /*
  * Readdir rpc.
  * Always returns with either uio_resid unchanged, if you are at the
  * end of the directory, or uio_resid == 0, with all DIRBLKSIZ chunks
  * filled in.
  * I felt this would allow caching of directory blocks more easily
  * than returning a partially filled block.
  * Directory offset cookies:
  * Oh my, what to do with them...
  * I can think of three ways to deal with them:
  * 1 - have the layer above these RPCs maintain a map between logical
  *     directory byte offsets and the NFS directory offset cookies
  * 2 - pass the opaque directory offset cookies up into userland
  *     and let the libc functions deal with them, via the system call
  * 3 - return them to userland in the "struct dirent", so future versions
  *     of libc can use them and do whatever is necessary to make things work
  *     above these rpc calls, in the meantime
  * For now, I do #3 by "hiding" the directory offset cookies after the
  * d_name field in struct dirent. This is space inside d_reclen that
  * will be ignored by anything that doesn't know about them.
  * The directory offset cookies are filled in as the last 8 bytes of
  * each directory entry, after d_name. Someday, the userland libc
  * functions may be able to use these. In the meantime, it satisfies
  * OpenBSD's requirements for cookies being returned.
  * It expects the directory offset cookie for the read to be in uio_offset
  * and returns the one for the next entry after this directory block in
  * there, as well.
  */
 APPLESTATIC int
 nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     int *eofp, void *stuff)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	nfsquad_t cookie, ncookie;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int reqsize, tryformoredirs = 1, readsize, eof = 0, gotmnton = 0;
 	/*
 	 * Fileids are 64 bits wide (the NFSv3 branch below decodes them with
 	 * fxdr_hyper()), so the "." and ".." fileids are held in uint64_t.
 	 * A plain long would truncate them on platforms where long is 32 bits.
 	 */
 	uint64_t dotfileid, dotdotfileid = 0;
 	u_int32_t fakefileno = 0xffffffff, rderr;
 	char *cp;
 	nfsattrbit_t attrbits, dattrbits;
 	u_int32_t *tl2 = NULL;
 	size_t tresid;
 
 	/*
 	 * Overview:
 	 *  - uiop must describe a single iovec whose residual is a multiple
 	 *    of DIRBLKSIZ (asserted below).
 	 *  - On entry *cookiep is the directory offset cookie to start from;
 	 *    it is updated to the cookie of the last entry copied out.
 	 *  - Each entry is written as a struct dirent header, the name padded
 	 *    to a multiple of 8 bytes (always with at least one NUL), followed
 	 *    by NFSX_HYPER bytes holding that entry's cookie.
 	 *  - *attrflagp/nap are filled in by nfscl_postop_attr() and *eofp
 	 *    (when eofp is non-NULL) reports whether the end of the directory
 	 *    was reached.
 	 *  - Returns 0 on success, otherwise the error from the RPC layer or
 	 *    the reply status.
 	 */
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uio_uio_resid(uiop) & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 
 	/*
 	 * There is no point in reading a lot more than uio_resid, however
 	 * adding one additional DIRBLKSIZ makes sense. Since uio_resid
 	 * and nm_readdirsize are both exact multiples of DIRBLKSIZ, this
 	 * will never make readsize > nm_readdirsize.
 	 */
 	readsize = nmp->nm_readdirsize;
 	if (readsize > uio_uio_resid(uiop))
 		readsize = uio_uio_resid(uiop) + DIRBLKSIZ;
 
 	*attrflagp = 0;
 	if (eofp)
 		*eofp = 0;
 	/* Remember the starting residual so "nothing copied" can be detected. */
 	tresid = uio_uio_resid(uiop);
 	cookie.lval[0] = cookiep->nfsuquad[0];
 	cookie.lval[1] = cookiep->nfsuquad[1];
 	nd->nd_mrep = NULL;
 
 	/*
 	 * For NFSv4, first create the "." and ".." entries.
 	 */
 	if (NFSHASNFSV4(nmp)) {
 		reqsize = 6 * NFSX_UNSIGNED;
 		NFSGETATTR_ATTRBIT(&dattrbits);
 		NFSZERO_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE);
 		if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
 		    NFSATTRBIT_MOUNTEDONFILEID)) {
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 			gotmnton = 1;
 		} else {
 			/*
 			 * Must fake it. Use the fileno, except when the
 			 * fsid is != to that of the directory. For that
 			 * case, generate a fake fileno that is not the same.
 			 */
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID);
 			gotmnton = 0;
 		}
 
 		/*
 		 * Joy, oh joy. For V4 we get to hand craft '.' and '..'.
 		 */
 		if (uiop->uio_offset == 0) {
 			/* Lookupp + Getfh + Getattr to learn the parent's fileid. */
 			NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp);
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			error = nfscl_request(nd, vp, p, cred, stuff);
 			if (error)
 			    return (error);
 			dotfileid = 0;	/* Fake out the compiler. */
 			if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			    error = nfsm_loadattr(nd, &nfsva);
 			    if (error != 0)
 				goto nfsmout;
 			    dotfileid = nfsva.na_fileid;
 			}
 			if (nd->nd_repstat == 0) {
 			    NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			    /* Skip over the returned file handle; only its length is used. */
 			    len = fxdr_unsigned(int, *(tl + 4));
 			    if (len > 0 && len <= NFSX_V4FHMAX)
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 			    else
 				error = EPERM;
 			    if (!error) {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				/* Sentinel: lets us tell if the server set it. */
 				nfsva.na_mntonfileno = 0xffffffff;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, NULL, p, cred);
 				if (error) {
 				    dotdotfileid = dotfileid;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != 0xffffffff)
 					dotdotfileid = nfsva.na_mntonfileno;
 				    else
 					dotdotfileid = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dotdotfileid = nfsva.na_fileid;
 				} else {
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dotdotfileid = fakefileno;
 				}
 			    }
 			} else if (nd->nd_repstat == NFSERR_NOENT) {
 			    /*
 			     * Lookupp returns NFSERR_NOENT when we are
 			     * at the root, so just use the current dir.
 			     */
 			    nd->nd_repstat = 0;
 			    dotdotfileid = dotfileid;
 			} else {
 			    error = nd->nd_repstat;
 			}
 			mbuf_freem(nd->nd_mrep);
 			if (error)
 			    return (error);
 			nd->nd_mrep = NULL;
 			/* Hand-build the "." entry at the start of the buffer. */
 			dp = (struct dirent *)uio_iov_base(uiop);
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotfileid;
 			dp->d_namlen = 1;
 			*((uint64_t *)dp->d_name) = 0;	/* Zero pad it. */
 			dp->d_name[0] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uio_uio_resid_add(uiop, -(dp->d_reclen));
 			uiop->uio_offset += dp->d_reclen;
 			uio_iov_base_add(uiop, dp->d_reclen);
 			uio_iov_len_add(uiop, -(dp->d_reclen));
 			/* Followed immediately by the ".." entry. */
 			dp = (struct dirent *)uio_iov_base(uiop);
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotdotfileid;
 			dp->d_namlen = 2;
 			*((uint64_t *)dp->d_name) = 0;
 			dp->d_name[0] = '.';
 			dp->d_name[1] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uio_uio_resid_add(uiop, -(dp->d_reclen));
 			uiop->uio_offset += dp->d_reclen;
 			uio_iov_base_add(uiop, dp->d_reclen);
 			uio_iov_len_add(uiop, -(dp->d_reclen));
 		}
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR);
 	} else {
 		reqsize = 5 * NFSX_UNSIGNED;
 	}
 
 
 	/*
 	 * Loop around doing readdir rpc's of size readsize.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		*attrflagp = 0;
 		NFSCL_REQSTART(nd, NFSPROC_READDIR, vp);
 		if (nd->nd_flag & ND_NFSV2) {
 			/* NFSv2 only sends the low 32 bits of the cookie. */
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = cookie.lval[1];
 			*tl = txdr_unsigned(readsize);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, reqsize);
 			*tl++ = cookie.lval[0];
 			*tl++ = cookie.lval[1];
 			/* A zero cookie is sent with a zero cookie verifier. */
 			if (cookie.qval == 0) {
 				*tl++ = 0;
 				*tl++ = 0;
 			} else {
 				NFSLOCKNODE(dnp);
 				*tl++ = dnp->n_cookieverf.nfsuquad[0];
 				*tl++ = dnp->n_cookieverf.nfsuquad[1];
 				NFSUNLOCKNODE(dnp);
 			}
 			if (nd->nd_flag & ND_NFSV4) {
 				*tl++ = txdr_unsigned(readsize);
 				*tl = txdr_unsigned(readsize);
 				(void) nfsrv_putattrbit(nd, &attrbits);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_GETATTR);
 				(void) nfsrv_putattrbit(nd, &dattrbits);
 			} else {
 				*tl = txdr_unsigned(readsize);
 			}
 		}
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (!(nd->nd_flag & ND_NFSV2)) {
 			if (nd->nd_flag & ND_NFSV3)
 				error = nfscl_postop_attr(nd, nap, attrflagp,
 				    stuff);
 			if (!nd->nd_repstat && !error) {
 				/* Save the cookie verifier for the next request. */
 				NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
 				NFSLOCKNODE(dnp);
 				dnp->n_cookieverf.nfsuquad[0] = *tl++;
 				dnp->n_cookieverf.nfsuquad[1] = *tl;
 				NFSUNLOCKNODE(dnp);
 			}
 		}
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		more_dirs = fxdr_unsigned(int, *tl);
 		if (!more_dirs)
 			tryformoredirs = 0;
 
 		/* loop through the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			if (nd->nd_flag & ND_NFSV4) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 				len = fxdr_unsigned(int, *tl);
 			} else if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				nfsva.na_fileid = fxdr_hyper(tl);
 				tl += 2;
 				len = fxdr_unsigned(int, *tl);
 			} else {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				nfsva.na_fileid =
 				    fxdr_unsigned(long, *tl++);
 				len = fxdr_unsigned(int, *tl);
 			}
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			tlen = roundup2(len, 8);
 			if (tlen == len)
 				tlen += 8;  /* To ensure null termination. */
 			left = DIRBLKSIZ - blksiz;
 			/*
 			 * If the entry will not fit in the current DIRBLKSIZ
 			 * block, stretch the previous record to the end of
 			 * the block and start a new one.
 			 */
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) {
 				dp->d_reclen += left;
 				uio_iov_base_add(uiop, left);
 				uio_iov_len_add(uiop, -(left));
 				uio_uio_resid_add(uiop, -(left));
 				uiop->uio_offset += left;
 				blksiz = 0;
 			}
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER >
 			    uio_uio_resid(uiop))
 				bigenough = 0;
 			if (bigenough) {
 				dp = (struct dirent *)uio_iov_base(uiop);
 				dp->d_off = 0;
 				dp->d_namlen = len;
 				dp->d_reclen = _GENERIC_DIRLEN(len) +
 				    NFSX_HYPER;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uio_uio_resid_add(uiop, -(DIRHDSIZ));
 				uiop->uio_offset += DIRHDSIZ;
 				uio_iov_base_add(uiop, DIRHDSIZ);
 				uio_iov_len_add(uiop, -(DIRHDSIZ));
 				/* Copy the name out of the reply into the buffer. */
 				error = nfsm_mbufuio(nd, uiop, len);
 				if (error)
 					goto nfsmout;
 				cp = uio_iov_base(uiop);
 				tlen -= len;
 				*cp = '\0';	/* null terminate */
 				cp += tlen;	/* points to cookie storage */
 				tl2 = (u_int32_t *)cp;
 				uio_iov_base_add(uiop, (tlen + NFSX_HYPER));
 				uio_iov_len_add(uiop, -(tlen + NFSX_HYPER));
 				uio_uio_resid_add(uiop, -(tlen + NFSX_HYPER));
 				uiop->uio_offset += (tlen + NFSX_HYPER);
 			} else {
 				/* No room: skip the name but keep parsing the reply. */
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 				if (error)
 					goto nfsmout;
 			}
 			if (nd->nd_flag & ND_NFSV4) {
 				rderr = 0;
 				nfsva.na_mntonfileno = 0xffffffff;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, &rderr, p, cred);
 				if (error)
 					goto nfsmout;
 				NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			} else if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 			} else {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				ncookie.lval[0] = 0;
 				ncookie.lval[1] = *tl++;
 			}
 			if (bigenough) {
 			    if (nd->nd_flag & ND_NFSV4) {
 				if (rderr) {
 				    dp->d_fileno = 0;
 				} else {
 				    if (gotmnton) {
 					if (nfsva.na_mntonfileno != 0xffffffff)
 					    dp->d_fileno = nfsva.na_mntonfileno;
 					else
 					    dp->d_fileno = nfsva.na_fileid;
 				    } else if (nfsva.na_filesid[0] ==
 					dnp->n_vattr.na_filesid[0] &&
 					nfsva.na_filesid[1] ==
 					dnp->n_vattr.na_filesid[1]) {
 					dp->d_fileno = nfsva.na_fileid;
 				    } else {
 					do {
 					    fakefileno--;
 					} while (fakefileno ==
 					    nfsva.na_fileid);
 					dp->d_fileno = fakefileno;
 				    }
 				    dp->d_type = vtonfs_dtype(nfsva.na_type);
 				}
 			    } else {
 				dp->d_fileno = nfsva.na_fileid;
 			    }
 			    /*
 			     * Store this entry's cookie after its name and
 			     * advance both the caller's and our own cookie.
 			     */
 			    *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] =
 				ncookie.lval[0];
 			    *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] =
 				ncookie.lval[1];
 			}
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 			if (tryformoredirs)
 				more_dirs = !eof;
 			if (nd->nd_flag & ND_NFSV4) {
 				error = nfscl_postop_attr(nd, nap, attrflagp,
 				    stuff);
 				if (error)
 					goto nfsmout;
 			}
 		}
 		mbuf_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uio_iov_base_add(uiop, left);
 		uio_iov_len_add(uiop, -(left));
 		uio_uio_resid_add(uiop, -(left));
 		uiop->uio_offset += left;
 	}
 
 	/*
 	 * If returning no data, assume end of file.
 	 * If not bigenough, return not end of file, since you aren't
 	 *    returning all the data
 	 * Otherwise, return the eof flag from the server.
 	 */
 	if (eofp) {
 		if (tresid == ((size_t)(uio_uio_resid(uiop))))
 			*eofp = 1;
 		else if (!bigenough)
 			*eofp = 0;
 		else
 			*eofp = eof;
 	}
 
 	/*
 	 * Add extra empty records to any remaining DIRBLKSIZ chunks.
 	 */
 	while (uio_uio_resid(uiop) > 0 && uio_uio_resid(uiop) != tresid) {
 		dp = (struct dirent *)uio_iov_base(uiop);
 		dp->d_type = DT_UNKNOWN;
 		dp->d_fileno = 0;
 		dp->d_namlen = 0;
 		dp->d_name[0] = '\0';
 		tl = (u_int32_t *)&dp->d_name[4];
 		*tl++ = cookie.lval[0];
 		*tl = cookie.lval[1];
 		dp->d_reclen = DIRBLKSIZ;
 		uio_iov_base_add(uiop, DIRBLKSIZ);
 		uio_iov_len_add(uiop, -(DIRBLKSIZ));
 		uio_uio_resid_add(uiop, -(DIRBLKSIZ));
 		uiop->uio_offset += DIRBLKSIZ;
 	}
 
 nfsmout:
 	/* Release any reply still held (error paths jump here). */
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 #ifndef APPLE
 /*
  * NFS V3 readdir plus RPC. Used in place of nfsrpc_readdir().
  * (Also used for NFS V4 when mount flag set.)
  * (ditto above w.r.t. multiple of DIRBLKSIZ, etc.)
  *
  * In addition to filling the uio with directory entries (each followed
  * by its NFSX_HYPER sized cookie), every entry that comes back with both
  * attributes and a file handle has its nfsnode looked up via nfscl_nget(),
  * its attributes loaded with nfscl_loadattrcache() and, subject to the
  * checks below, is passed to cache_enter_time().
  *
  * *cookiep is advanced to the cookie of the last entry copied out,
  * *attrflagp/nap are filled in by nfscl_postop_attr() and *eofp (when
  * eofp is non-NULL) reports whether the end of the directory was reached.
  * Returns 0 on success, otherwise the RPC layer error or reply status.
  */
 APPLESTATIC int
 nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     int *eofp, void *stuff)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	vnode_t newvp = NULLVP;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nameidata nami, *ndp = &nami;
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsnode *dnp = VTONFS(vp), *np;
 	struct nfsvattr nfsva;
 	struct nfsfh *nfhp;
 	nfsquad_t cookie, ncookie;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int attrflag, tryformoredirs = 1, eof = 0, gotmnton = 0;
 	int isdotdot = 0, unlocknewvp = 0;
 	/*
 	 * Fileids are 64 bits on the wire for NFSv3 and NFSv4, so they are
 	 * held in uint64_t. A plain long would truncate them on platforms
 	 * where long is 32 bits.
 	 */
 	uint64_t dotfileid, dotdotfileid = 0, fileno = 0;
 	char *cp;
 	nfsattrbit_t attrbits, dattrbits;
 	size_t tresid;
 	u_int32_t *tl2 = NULL, fakefileno = 0xffffffff, rderr;
 	struct timespec dctime;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uio_uio_resid(uiop) & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 	timespecclear(&dctime);
 	*attrflagp = 0;
 	if (eofp != NULL)
 		*eofp = 0;
 	ndp->ni_dvp = vp;
 	nd->nd_mrep = NULL;
 	cookie.lval[0] = cookiep->nfsuquad[0];
 	cookie.lval[1] = cookiep->nfsuquad[1];
 	/* Remember the starting residual so "nothing copied" can be detected. */
 	tresid = uio_uio_resid(uiop);
 
 	/*
 	 * For NFSv4, first create the "." and ".." entries.
 	 */
 	if (NFSHASNFSV4(nmp)) {
 		NFSGETATTR_ATTRBIT(&dattrbits);
 		NFSZERO_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID);
 		if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
 		    NFSATTRBIT_MOUNTEDONFILEID)) {
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 			gotmnton = 1;
 		} else {
 			/*
 			 * Must fake it. Use the fileno, except when the
 			 * fsid is != to that of the directory. For that
 			 * case, generate a fake fileno that is not the same.
 			 */
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID);
 			gotmnton = 0;
 		}
 
 		/*
 		 * Joy, oh joy. For V4 we get to hand craft '.' and '..'.
 		 */
 		if (uiop->uio_offset == 0) {
 			/* Lookupp + Getfh + Getattr to learn the parent's fileid. */
 			NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp);
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			error = nfscl_request(nd, vp, p, cred, stuff);
 			if (error)
 			    return (error);
 			dotfileid = 0;	/* Fake out the compiler. */
 			if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			    error = nfsm_loadattr(nd, &nfsva);
 			    if (error != 0)
 				goto nfsmout;
 			    /* The directory's ctime is passed to cache_enter_time(). */
 			    dctime = nfsva.na_ctime;
 			    dotfileid = nfsva.na_fileid;
 			}
 			if (nd->nd_repstat == 0) {
 			    NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			    /* Skip over the returned file handle; only its length is used. */
 			    len = fxdr_unsigned(int, *(tl + 4));
 			    if (len > 0 && len <= NFSX_V4FHMAX)
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 			    else
 				error = EPERM;
 			    if (!error) {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				/* Sentinel: lets us tell if the server set it. */
 				nfsva.na_mntonfileno = 0xffffffff;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, NULL, p, cred);
 				if (error) {
 				    dotdotfileid = dotfileid;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != 0xffffffff)
 					dotdotfileid = nfsva.na_mntonfileno;
 				    else
 					dotdotfileid = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dotdotfileid = nfsva.na_fileid;
 				} else {
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dotdotfileid = fakefileno;
 				}
 			    }
 			} else if (nd->nd_repstat == NFSERR_NOENT) {
 			    /*
 			     * Lookupp returns NFSERR_NOENT when we are
 			     * at the root, so just use the current dir.
 			     */
 			    nd->nd_repstat = 0;
 			    dotdotfileid = dotfileid;
 			} else {
 			    error = nd->nd_repstat;
 			}
 			mbuf_freem(nd->nd_mrep);
 			if (error)
 			    return (error);
 			nd->nd_mrep = NULL;
 			/* Hand-build the "." entry at the start of the buffer. */
 			dp = (struct dirent *)uio_iov_base(uiop);
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotfileid;
 			dp->d_namlen = 1;
 			*((uint64_t *)dp->d_name) = 0;	/* Zero pad it. */
 			dp->d_name[0] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uio_uio_resid_add(uiop, -(dp->d_reclen));
 			uiop->uio_offset += dp->d_reclen;
 			uio_iov_base_add(uiop, dp->d_reclen);
 			uio_iov_len_add(uiop, -(dp->d_reclen));
 			/* Followed immediately by the ".." entry. */
 			dp = (struct dirent *)uio_iov_base(uiop);
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotdotfileid;
 			dp->d_namlen = 2;
 			*((uint64_t *)dp->d_name) = 0;
 			dp->d_name[0] = '.';
 			dp->d_name[1] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uio_uio_resid_add(uiop, -(dp->d_reclen));
 			uiop->uio_offset += dp->d_reclen;
 			uio_iov_base_add(uiop, dp->d_reclen);
 			uio_iov_len_add(uiop, -(dp->d_reclen));
 		}
 		NFSREADDIRPLUS_ATTRBIT(&attrbits);
 		if (gotmnton)
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 	}
 
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		*attrflagp = 0;
 		NFSCL_REQSTART(nd, NFSPROC_READDIRPLUS, vp);
 		NFSM_BUILD(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 		*tl++ = cookie.lval[0];
 		*tl++ = cookie.lval[1];
 		/* A zero cookie is sent with a zero cookie verifier. */
 		if (cookie.qval == 0) {
 			*tl++ = 0;
 			*tl++ = 0;
 		} else {
 			NFSLOCKNODE(dnp);
 			*tl++ = dnp->n_cookieverf.nfsuquad[0];
 			*tl++ = dnp->n_cookieverf.nfsuquad[1];
 			NFSUNLOCKNODE(dnp);
 		}
 		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
 		*tl = txdr_unsigned(nmp->nm_readdirsize);
 		if (nd->nd_flag & ND_NFSV4) {
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &dattrbits);
 		}
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (nd->nd_flag & ND_NFSV3)
 			error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		if ((nd->nd_flag & ND_NFSV3) != 0 && *attrflagp != 0)
 			dctime = nap->na_ctime;
 		/* Cookie verifier (two words) followed by the first "more" flag. */
 		NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 		NFSLOCKNODE(dnp);
 		dnp->n_cookieverf.nfsuquad[0] = *tl++;
 		dnp->n_cookieverf.nfsuquad[1] = *tl++;
 		NFSUNLOCKNODE(dnp);
 		more_dirs = fxdr_unsigned(int, *tl);
 		if (!more_dirs)
 			tryformoredirs = 0;
 
 		/* loop through the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			if (nd->nd_flag & ND_NFSV4) {
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 			} else {
 				/*
 				 * The NFSv3 fileid occupies two XDR words;
 				 * decode all 64 bits (as nfsrpc_readdir()
 				 * does) instead of only the low word.
 				 */
 				fileno = fxdr_hyper(tl);
 				tl += 2;
 			}
 			len = fxdr_unsigned(int, *tl);
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			tlen = roundup2(len, 8);
 			if (tlen == len)
 				tlen += 8;  /* To ensure null termination. */
 			left = DIRBLKSIZ - blksiz;
 			/*
 			 * If the entry will not fit in the current DIRBLKSIZ
 			 * block, stretch the previous record to the end of
 			 * the block and start a new one.
 			 */
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) {
 				dp->d_reclen += left;
 				uio_iov_base_add(uiop, left);
 				uio_iov_len_add(uiop, -(left));
 				uio_uio_resid_add(uiop, -(left));
 				uiop->uio_offset += left;
 				blksiz = 0;
 			}
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER >
 			    uio_uio_resid(uiop))
 				bigenough = 0;
 			if (bigenough) {
 				dp = (struct dirent *)uio_iov_base(uiop);
 				dp->d_off = 0;
 				dp->d_namlen = len;
 				dp->d_reclen = _GENERIC_DIRLEN(len) +
 				    NFSX_HYPER;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uio_uio_resid_add(uiop, -(DIRHDSIZ));
 				uiop->uio_offset += DIRHDSIZ;
 				uio_iov_base_add(uiop, DIRHDSIZ);
 				uio_iov_len_add(uiop, -(DIRHDSIZ));
 				/* The componentname points at the name in the buffer. */
 				cnp->cn_nameptr = uio_iov_base(uiop);
 				cnp->cn_namelen = len;
 				NFSCNHASHZERO(cnp);
 				error = nfsm_mbufuio(nd, uiop, len);
 				if (error)
 					goto nfsmout;
 				cp = uio_iov_base(uiop);
 				tlen -= len;
 				*cp = '\0';
 				cp += tlen;	/* points to cookie storage */
 				tl2 = (u_int32_t *)cp;
 				if (len == 2 && cnp->cn_nameptr[0] == '.' &&
 				    cnp->cn_nameptr[1] == '.')
 					isdotdot = 1;
 				else
 					isdotdot = 0;
 				uio_iov_base_add(uiop, (tlen + NFSX_HYPER));
 				uio_iov_len_add(uiop, -(tlen + NFSX_HYPER));
 				uio_uio_resid_add(uiop, -(tlen + NFSX_HYPER));
 				uiop->uio_offset += (tlen + NFSX_HYPER);
 			} else {
 				/* No room: skip the name but keep parsing the reply. */
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 				if (error)
 					goto nfsmout;
 			}
 			nfhp = NULL;
 			if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 				attrflag = fxdr_unsigned(int, *tl);
 				if (attrflag) {
 				  error = nfsm_loadattr(nd, &nfsva);
 				  if (error)
 					goto nfsmout;
 				}
 				NFSM_DISSECT(tl,u_int32_t *,NFSX_UNSIGNED);
 				if (*tl) {
 					error = nfsm_getfh(nd, &nfhp);
 					if (error)
 					    goto nfsmout;
 				}
 				/* A handle without attributes is of no use here. */
 				if (!attrflag && nfhp != NULL) {
 					FREE((caddr_t)nfhp, M_NFSFH);
 					nfhp = NULL;
 				}
 			} else {
 				rderr = 0;
 				nfsva.na_mntonfileno = 0xffffffff;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, &nfhp,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, &rderr, p, cred);
 				if (error)
 					goto nfsmout;
 			}
 
 			if (bigenough) {
 			    if (nd->nd_flag & ND_NFSV4) {
 				if (rderr) {
 				    dp->d_fileno = 0;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != 0xffffffff)
 					dp->d_fileno = nfsva.na_mntonfileno;
 				    else
 					dp->d_fileno = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dp->d_fileno = nfsva.na_fileid;
 				} else {
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dp->d_fileno = fakefileno;
 				}
 			    } else {
 				dp->d_fileno = fileno;
 			    }
 			    /*
 			     * Store this entry's cookie after its name and
 			     * advance both the caller's and our own cookie.
 			     */
 			    *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] =
 				ncookie.lval[0];
 			    *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] =
 				ncookie.lval[1];
 
 			    if (nfhp != NULL) {
 				if (NFSRV_CMPFH(nfhp->nfh_fh, nfhp->nfh_len,
 				    dnp->n_fhp->nfh_fh, dnp->n_fhp->nfh_len)) {
 				    /* Entry is the directory itself; reuse vp. */
 				    VREF(vp);
 				    newvp = vp;
 				    unlocknewvp = 0;
 				    FREE((caddr_t)nfhp, M_NFSFH);
 				    np = dnp;
 				} else if (isdotdot != 0) {
 				    /*
 				     * Skip doing a nfscl_nget() call for "..".
 				     * There's a race between acquiring the nfs
 				     * node here and lookups that look for the
 				     * directory being read (in the parent).
 				     * It would try to get a lock on ".." here,
 				     * owning the lock on the directory being
 				     * read. Lookup will hold the lock on ".."
 				     * and try to acquire the lock on the
 				     * directory being read.
 				     * If the directory is unlocked/relocked,
 				     * then there is a LOR with the buflock
 				     * vp is relocked.
 				     */
 				    free(nfhp, M_NFSFH);
 				} else {
 				    /*
 				     * NOTE(review): on failure error stays set
 				     * but processing continues; confirm that
 				     * is the intended behaviour.
 				     */
 				    error = nfscl_nget(vnode_mount(vp), vp,
 				      nfhp, cnp, p, &np, NULL, LK_EXCLUSIVE);
 				    if (!error) {
 					newvp = NFSTOV(np);
 					unlocknewvp = 1;
 				    }
 				}
 				nfhp = NULL;
 				if (newvp != NULLVP) {
 				    error = nfscl_loadattrcache(&newvp,
 					&nfsva, NULL, NULL, 0, 0);
 				    if (error) {
 					if (unlocknewvp)
 					    vput(newvp);
 					else
 					    vrele(newvp);
 					goto nfsmout;
 				    }
 				    dp->d_type =
 					vtonfs_dtype(np->n_vattr.na_type);
 				    ndp->ni_vp = newvp;
 				    NFSCNHASH(cnp, HASHINIT);
 				    /*
 				     * Directories are only entered when the
 				     * parent's ctime (dctime) is known.
 				     */
 				    if (cnp->cn_namelen <= NCHNAMLEN &&
 					(newvp->v_type != VDIR ||
 					 dctime.tv_sec != 0)) {
 					cache_enter_time(ndp->ni_dvp,
 					    ndp->ni_vp, cnp,
 					    &nfsva.na_ctime,
 					    newvp->v_type != VDIR ? NULL :
 					    &dctime);
 				    }
 				    if (unlocknewvp)
 					vput(newvp);
 				    else
 					vrele(newvp);
 				    newvp = NULLVP;
 				}
 			    }
 			} else if (nfhp != NULL) {
 			    FREE((caddr_t)nfhp, M_NFSFH);
 			}
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 			if (tryformoredirs)
 				more_dirs = !eof;
 			if (nd->nd_flag & ND_NFSV4) {
 				error = nfscl_postop_attr(nd, nap, attrflagp,
 				    stuff);
 				if (error)
 					goto nfsmout;
 			}
 		}
 		mbuf_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		dp->d_reclen += left;
 		uio_iov_base_add(uiop, left);
 		uio_iov_len_add(uiop, -(left));
 		uio_uio_resid_add(uiop, -(left));
 		uiop->uio_offset += left;
 	}
 
 	/*
 	 * If returning no data, assume end of file.
 	 * If not bigenough, return not end of file, since you aren't
 	 *    returning all the data
 	 * Otherwise, return the eof flag from the server.
 	 */
 	if (eofp != NULL) {
 		if (tresid == uio_uio_resid(uiop))
 			*eofp = 1;
 		else if (!bigenough)
 			*eofp = 0;
 		else
 			*eofp = eof;
 	}
 
 	/*
 	 * Add extra empty records to any remaining DIRBLKSIZ chunks.
 	 */
 	while (uio_uio_resid(uiop) > 0 && uio_uio_resid(uiop) != tresid) {
 		dp = (struct dirent *)uio_iov_base(uiop);
 		dp->d_type = DT_UNKNOWN;
 		dp->d_fileno = 0;
 		dp->d_namlen = 0;
 		dp->d_name[0] = '\0';
 		tl = (u_int32_t *)&dp->d_name[4];
 		*tl++ = cookie.lval[0];
 		*tl = cookie.lval[1];
 		dp->d_reclen = DIRBLKSIZ;
 		uio_iov_base_add(uiop, DIRBLKSIZ);
 		uio_iov_len_add(uiop, -(DIRBLKSIZ));
 		uio_uio_resid_add(uiop, -(DIRBLKSIZ));
 		uiop->uio_offset += DIRBLKSIZ;
 	}
 
 nfsmout:
 	/* Release any reply still held (error paths jump here). */
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 #endif	/* !APPLE */
 
 /*
  * Nfs commit rpc
  *
  * Builds a Commit request for cnt bytes at offset (followed by a Getattr
  * op for NFSv4), sends it, and compares the write verifier in the reply
  * against the one saved in the mount structure. When they differ the
  * saved verifier is replaced and NFSERR_STALEWRITEVERF is reported.
  * *attrflagp is cleared on entry; nap/attrflagp are handed to
  * nfscl_wcc_data() and, for NFSv4, nfscl_postop_attr().
  * Returns 0, the RPC layer error, or the reply status.
  */
 APPLESTATIC int
 nfsrpc_commit(vnode_t vp, u_quad_t offset, int cnt, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff)
 {
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t getattrbits;
 	u_int32_t *argp;
 	int error;
 
 	*attrflagp = 0;
 
 	/* Arguments: 64 bit offset followed by the 32 bit count. */
 	NFSCL_REQSTART(nd, NFSPROC_COMMIT, vp);
 	NFSM_BUILD(argp, u_int32_t *, 3 * NFSX_UNSIGNED);
 	txdr_hyper(offset, argp);
 	argp[2] = txdr_unsigned(cnt);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/*
 		 * And do a Getattr op.
 		 */
 		NFSM_BUILD(argp, u_int32_t *, NFSX_UNSIGNED);
 		*argp = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&getattrbits);
 		(void) nfsrv_putattrbit(nd, &getattrbits);
 	}
 
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	error = nfscl_wcc_data(nd, vp, nap, attrflagp, NULL, stuff);
 	if (error != 0 || nd->nd_repstat != 0)
 		goto nfsmout;
 
 	/* Check the returned write verifier against the saved one. */
 	NFSM_DISSECT(argp, u_int32_t *, NFSX_VERF);
 	NFSLOCKMNT(nmp);
 	if (NFSBCMP(nmp->nm_verf, argp, NFSX_VERF) != 0) {
 		NFSBCOPY(argp, nmp->nm_verf, NFSX_VERF);
 		nd->nd_repstat = NFSERR_STALEWRITEVERF;
 	}
 	NFSUNLOCKMNT(nmp);
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 nfsmout:
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * NFS byte range lock rpc.
  * (Mostly just calls one of the three lower level RPC routines.)
  *
  * The flock in fl is converted to an offset/length pair (size is only
  * used for SEEK_END) and then, depending on op:
  *  - F_GETLK: nfscl_lockt() is consulted first and nfsrpc_lockt() is
  *    called only when it returns 0.
  *  - F_UNLCK (with fl->l_type == F_UNLCK): nfscl_relbytelock() is called
  *    repeatedly, doing nfsrpc_locku() for each lock owner it returns.
  *  - F_SETLK: nfscl_getbytelock() followed by nfsrpc_lock() unless the
  *    lock was handled locally.
  * Any other combination yields EINVAL.
  * The whole operation is retried for the recoverable errors listed at the
  * bottom of the loop; expired/bad stateid errors are retried at most 4
  * times, after which EIO is returned.
  */
 APPLESTATIC int
 nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl,
     int reclaim, struct ucred *cred, NFSPROC_T *p, void *id, int flags)
 {
 	struct nfscllockowner *lp;
 	struct nfsclclient *clp;
 	struct nfsfh *nfhp;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	u_int64_t off, len;
 	off_t start, end;
 	u_int32_t clidrev = 0;
 	int error = 0, newone = 0, expireret = 0, retrycnt, donelocally;
 	int callcnt, dorpc;
 
 	/*
 	 * Convert the flock structure into a start and end and do POSIX
 	 * bounds checking.
 	 */
 	switch (fl->l_whence) {
 	case SEEK_SET:
 	case SEEK_CUR:
 		/*
 		 * Caller is responsible for adding any necessary offset
 		 * when SEEK_CUR is used.
 		 */
 		start = fl->l_start;
 		off = fl->l_start;
 		break;
 	case SEEK_END:
 		start = size + fl->l_start;
 		off = size + fl->l_start;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (start < 0)
 		return (EINVAL);
 	if (fl->l_len != 0) {
 		/* end is only computed to detect a range that wraps. */
 		end = start + fl->l_len - 1;
 		if (end < start)
 			return (EINVAL);
 	}
 
 	/* A zero l_len is sent to the server as NFS64BITSSET. */
 	len = fl->l_len;
 	if (len == 0)
 		len = NFS64BITSSET;
 	retrycnt = 0;
 	do {
 	    nd->nd_repstat = 0;
 	    if (op == F_GETLK) {
 		error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
 		if (error)
 			return (error);
 		error = nfscl_lockt(vp, clp, off, len, fl, p, id, flags);
 		if (!error) {
 			clidrev = clp->nfsc_clientidrev;
 			error = nfsrpc_lockt(nd, vp, clp, off, len, fl, cred,
 			    p, id, flags);
 		} else if (error == -1) {
 			/* -1 from nfscl_lockt() means no RPC is done and it is not an error. */
 			error = 0;
 		}
 		nfscl_clientrelease(clp);
 	    } else if (op == F_UNLCK && fl->l_type == F_UNLCK) {
 		/*
 		 * We must loop around for all lockowner cases.
 		 */
 		callcnt = 0;
 		error = nfscl_getcl(vnode_mount(vp), cred, p, 1, &clp);
 		if (error)
 			return (error);
 		do {
 		    error = nfscl_relbytelock(vp, off, len, cred, p, callcnt,
 			clp, id, flags, &lp, &dorpc);
 		    /*
 		     * If it returns a NULL lp, we're done.
 		     */
 		    if (lp == NULL) {
 			if (callcnt == 0)
 			    nfscl_clientrelease(clp);
 			else
 			    nfscl_releasealllocks(clp, vp, p, id, flags);
 			return (error);
 		    }
 		    if (nmp->nm_clp != NULL)
 			clidrev = nmp->nm_clp->nfsc_clientidrev;
 		    else
 			clidrev = 0;
 		    /*
 		     * If the server doesn't support Posix lock semantics,
 		     * only allow locks on the entire file, since it won't
 		     * handle overlapping byte ranges.
 		     * There might still be a problem when a lock
 		     * upgrade/downgrade (read<->write) occurs, since the
 		     * server "might" expect an unlock first?
 		     */
 		    if (dorpc && (lp->nfsl_open->nfso_posixlock ||
 			(off == 0 && len == NFS64BITSSET))) {
 			/*
 			 * Since the lock records will go away, we must
 			 * wait for grace and delay here.
 			 */
 			do {
 			    error = nfsrpc_locku(nd, nmp, lp, off, len,
 				NFSV4LOCKT_READ, cred, p, 0);
 			    if ((nd->nd_repstat == NFSERR_GRACE ||
 				 nd->nd_repstat == NFSERR_DELAY) &&
 				error == 0)
 				(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
 				    "nfs_advlock");
 			} while ((nd->nd_repstat == NFSERR_GRACE ||
 			    nd->nd_repstat == NFSERR_DELAY) && error == 0);
 		    }
 		    callcnt++;
 		} while (error == 0 && nd->nd_repstat == 0);
 		nfscl_releasealllocks(clp, vp, p, id, flags);
 	    } else if (op == F_SETLK) {
 		error = nfscl_getbytelock(vp, off, len, fl->l_type, cred, p,
 		    NULL, 0, id, flags, NULL, NULL, &lp, &newone, &donelocally);
 		/* Nothing more to do on failure or when no RPC is needed. */
 		if (error || donelocally) {
 			return (error);
 		}
 		if (nmp->nm_clp != NULL)
 			clidrev = nmp->nm_clp->nfsc_clientidrev;
 		else
 			clidrev = 0;
 		nfhp = VTONFS(vp)->n_fhp;
 		/* Without Posix lock semantics only whole-file locks are allowed. */
 		if (!lp->nfsl_open->nfso_posixlock &&
 		    (off != 0 || len != NFS64BITSSET)) {
 			error = EINVAL;
 		} else {
 			error = nfsrpc_lock(nd, nmp, vp, nfhp->nfh_fh,
 			    nfhp->nfh_len, lp, newone, reclaim, off,
 			    len, fl->l_type, cred, p, 0);
 		}
 		if (!error)
 			error = nd->nd_repstat;
 		nfscl_lockrelease(lp, error, newone);
 	    } else {
 		error = EINVAL;
 	    }
 	    if (!error)
 	        error = nd->nd_repstat;
 	    /*
 	     * Nap before retrying for the errors below; for an expired or
 	     * bad stateid, call nfscl_hasexpired() and count the retry.
 	     */
 	    if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		error == NFSERR_STALEDONTRECOVER ||
 		error == NFSERR_STALECLIENTID || error == NFSERR_DELAY ||
 		error == NFSERR_BADSESSION) {
 		(void) nfs_catnap(PZERO, error, "nfs_advlock");
 	    } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID)
 		&& clidrev != 0) {
 		expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		retrycnt++;
 	    }
 	} while (error == NFSERR_GRACE ||
 	    error == NFSERR_STALECLIENTID || error == NFSERR_DELAY ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_BADSESSION ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/* Give up with EIO once the expired/bad stateid retries are used up. */
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The lower level routine for the LockT case.
  *
  * Builds a LockT request for the byte range (off, len) on vp and sends it
  * with nfscl_request().  The lock owner string put on the wire is the name
  * filled in by nfscl_filllockowner() followed by the file handle of vp.
  *
  * When the request itself succeeds:
  * - nd_repstat == 0: fl->l_type is set to F_UNLCK.
  * - nd_repstat == NFSERR_DENIED: nd_repstat is cleared and fl is filled in
  *   from the conflicting lock in the reply (l_whence, l_start, l_len,
  *   l_type); l_pid is always set to 0.
  * - any other status is left in nd->nd_repstat; for NFSERR_STALECLIENTID
  *   recovery is also initiated on clp.
  *
  * Returns the error from nfscl_request() (returning before the
  * mbuf_freem() below), or an error hit while parsing the reply.
  */
 APPLESTATIC int
 nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp,
     struct nfsclclient *clp, u_int64_t off, u_int64_t len, struct flock *fl,
     struct ucred *cred, NFSPROC_T *p, void *id, int flags)
 {
 	u_int32_t *tl;
 	int error, type, size;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	struct nfsclsession *tsep;
 
 	nmp = VFSTONFS(vp->v_mount);
 	NFSCL_REQSTART(nd, NFSPROC_LOCKT, vp);
 	/* 7 words: lock type (1), offset (2), length (2), clientid (2). */
 	NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED);
 	if (fl->l_type == F_RDLCK)
 		*tl++ = txdr_unsigned(NFSV4LOCKT_READ);
 	else
 		*tl++ = txdr_unsigned(NFSV4LOCKT_WRITE);
 	txdr_hyper(off, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	tl += 2;
 	/* The clientid is copied as stored, without txdr conversion. */
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	/* Owner string: lock owner name followed by the file handle. */
 	nfscl_filllockowner(id, own, flags);
 	np = VTONFS(vp);
 	NFSBCOPY(np->n_fhp->nfh_fh, &own[NFSV4CL_LOCKNAMELEN],
 	    np->n_fhp->nfh_len);
 	(void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + np->n_fhp->nfh_len);
 	error = nfscl_request(nd, vp, p, cred, NULL);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* No conflicting lock was reported. */
 		fl->l_type = F_UNLCK;
 	} else if (nd->nd_repstat == NFSERR_DENIED) {
 		/* A conflict is a valid answer, so clear the status. */
 		nd->nd_repstat = 0;
 		fl->l_whence = SEEK_SET;
 		/*
 		 * 8 words: offset (2), length (2), lock type (1), two words
 		 * of the lock_owner that are skipped, owner string length (1).
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
 		fl->l_start = fxdr_hyper(tl);
 		tl += 2;
 		len = fxdr_hyper(tl);
 		tl += 2;
 		/* An all-ones length is reported to the caller as l_len 0. */
 		if (len == NFS64BITSSET)
 			fl->l_len = 0;
 		else
 			fl->l_len = len;
 		type = fxdr_unsigned(int, *tl++);
 		if (type == NFSV4LOCKT_WRITE)
 			fl->l_type = F_WRLCK;
 		else
 			fl->l_type = F_RDLCK;
 		/*
 		 * XXX For now, I have no idea what to do with the
 		 * conflicting lock_owner, so I'll just set the pid == 0
 		 * and skip over the lock_owner.
 		 */
 		fl->l_pid = (pid_t)0;
 		tl += 2;
 		/* Validate the owner string length before skipping it. */
 		size = fxdr_unsigned(int, *tl);
 		if (size < 0 || size > NFSV4_OPAQUELIMIT)
 			error = EBADRPC;
 		if (!error)
 			error = nfsm_advance(nd, NFSM_RNDUP(size), -1);
 	} else if (nd->nd_repstat == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(clp);
 	/* NFSM_DISSECT() jumps here with error set if the reply is short. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Lower level function that performs the LockU RPC.
  *
  * Sends an unlock of the given type for the byte range (off, len) using
  * the seqid and stateid held in lp, on the file handle of lp's open.
  * If syscred is non-zero, ND_USEGSSNAME is set on the request.
  *
  * On a reply with nd_repstat == 0, lp->nfsl_stateid is replaced by the
  * stateid in the reply.  For NFSERR_STALESTATEID, recovery is initiated
  * on the owning client.  The reply status itself is left in
  * nd->nd_repstat; the return value only reports request or parse errors.
  */
 static int
 nfsrpc_locku(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfscllockowner *lp, u_int64_t off, u_int64_t len,
     u_int32_t type, struct ucred *cred, NFSPROC_T *p, int syscred)
 {
 	u_int32_t *tl;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_LOCKU, nmp, lp->nfsl_open->nfso_fh,
 	    lp->nfsl_open->nfso_fhlen, NULL, NULL);
 	/* Lock type (1), seqid (1), stateid, offset (2), length (2). */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(type);
 	*tl = txdr_unsigned(lp->nfsl_seqid);
 	/*
 	 * When nfstest_outofseq is non-zero, randomly send seqid + 1
 	 * instead (presumably a test knob for out-of-sequence handling).
 	 */
 	if (nfstest_outofseq &&
 	    (arc4random() % nfstest_outofseq) == 0)
 		*tl = txdr_unsigned(lp->nfsl_seqid + 1);
 	tl++;
 	/* For NFSv4.1 the stateid seqid word is sent as 0. */
 	if (NFSHASNFSV4N(nmp))
 		*tl++ = 0;
 	else
 		*tl++ = lp->nfsl_stateid.seqid;
 	*tl++ = lp->nfsl_stateid.other[0];
 	*tl++ = lp->nfsl_stateid.other[1];
 	*tl++ = lp->nfsl_stateid.other[2];
 	txdr_hyper(off, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	if (syscred)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	/* Note: the seqid update is done before the error check here. */
 	NFSCL_INCRSEQID(lp->nfsl_seqid, nd);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Save the updated lock stateid exactly as received. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		lp->nfsl_stateid.seqid = *tl++;
 		lp->nfsl_stateid.other[0] = *tl++;
 		lp->nfsl_stateid.other[1] = *tl++;
 		lp->nfsl_stateid.other[2] = *tl;
 	} else if (nd->nd_repstat == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp);
 	/* NFSM_DISSECT() jumps here with error set if the reply is short. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The actual Lock RPC.
  *
  * Builds and sends a Lock request for the byte range (off, len) on the
  * file handle nfhp/fhlen.  Two argument forms are generated:
  * - newone != 0: the open owner's seqid and the open stateid are sent,
  *   followed by the lock seqid, the clientid and the owner string (lock
  *   owner name followed by the file handle).
  * - newone == 0: the existing lock stateid and lock seqid are sent.
  * If syscred is non-zero, ND_USEGSSNAME is set on the request.
  *
  * On a reply with nd_repstat == 0, lp->nfsl_stateid is replaced by the
  * stateid in the reply.  For NFSERR_DENIED the conflicting lock
  * description is parsed only far enough to skip it.  For
  * NFSERR_STALESTATEID, recovery is initiated on the owning client.
  * The reply status is left in nd->nd_repstat; the return value only
  * reports request or parse errors.
  */
 APPLESTATIC int
 nfsrpc_lock(struct nfsrv_descript *nd, struct nfsmount *nmp, vnode_t vp,
     u_int8_t *nfhp, int fhlen, struct nfscllockowner *lp, int newone,
     int reclaim, u_int64_t off, u_int64_t len, short type, struct ucred *cred,
     NFSPROC_T *p, int syscred)
 {
 	u_int32_t *tl;
 	int error, size;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	struct nfsclsession *tsep;
 
 	nfscl_reqstart(nd, NFSPROC_LOCK, nmp, nfhp, fhlen, NULL, NULL);
 	/*
 	 * 7 words: lock type (1), reclaim (1), offset (2), length (2) and
 	 * the new-lock-owner boolean (1), which is filled in below.
 	 */
 	NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED);
 	if (type == F_RDLCK)
 		*tl++ = txdr_unsigned(NFSV4LOCKT_READ);
 	else
 		*tl++ = txdr_unsigned(NFSV4LOCKT_WRITE);
 	*tl++ = txdr_unsigned(reclaim);
 	txdr_hyper(off, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	tl += 2;
 	if (newone) {
 	    *tl = newnfs_true;
 	    /* Open seqid (1), open stateid, lock seqid (1), clientid (2). */
 	    NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID +
 		2 * NFSX_UNSIGNED + NFSX_HYPER);
 	    *tl++ = txdr_unsigned(lp->nfsl_open->nfso_own->nfsow_seqid);
 	    /* For NFSv4.1 the stateid seqid word is sent as 0. */
 	    if (NFSHASNFSV4N(nmp))
 		*tl++ = 0;
 	    else
 		*tl++ = lp->nfsl_open->nfso_stateid.seqid;
 	    *tl++ = lp->nfsl_open->nfso_stateid.other[0];
 	    *tl++ = lp->nfsl_open->nfso_stateid.other[1];
 	    *tl++ = lp->nfsl_open->nfso_stateid.other[2];
 	    *tl++ = txdr_unsigned(lp->nfsl_seqid);
 	    tsep = nfsmnt_mdssession(nmp);
 	    *tl++ = tsep->nfsess_clientid.lval[0];
 	    *tl = tsep->nfsess_clientid.lval[1];
 	    /* Owner string: lock owner name followed by the file handle. */
 	    NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN);
 	    NFSBCOPY(nfhp, &own[NFSV4CL_LOCKNAMELEN], fhlen);
 	    (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen);
 	} else {
 	    *tl = newnfs_false;
 	    /* Lock stateid followed by the lock seqid (1). */
 	    NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED);
 	    if (NFSHASNFSV4N(nmp))
 		*tl++ = 0;
 	    else
 		*tl++ = lp->nfsl_stateid.seqid;
 	    *tl++ = lp->nfsl_stateid.other[0];
 	    *tl++ = lp->nfsl_stateid.other[1];
 	    *tl++ = lp->nfsl_stateid.other[2];
 	    *tl = txdr_unsigned(lp->nfsl_seqid);
 	    /*
 	     * When nfstest_outofseq is non-zero, randomly send seqid + 1
 	     * instead (presumably a test knob for out-of-sequence handling).
 	     */
 	    if (nfstest_outofseq &&
 		(arc4random() % nfstest_outofseq) == 0)
 		    *tl = txdr_unsigned(lp->nfsl_seqid + 1);
 	}
 	if (syscred)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	/* The open owner's seqid was only sent in the newone form. */
 	if (newone)
 	    NFSCL_INCRSEQID(lp->nfsl_open->nfso_own->nfsow_seqid, nd);
 	NFSCL_INCRSEQID(lp->nfsl_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		/* Save the lock stateid exactly as received. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		lp->nfsl_stateid.seqid = *tl++;
 		lp->nfsl_stateid.other[0] = *tl++;
 		lp->nfsl_stateid.other[1] = *tl++;
 		lp->nfsl_stateid.other[2] = *tl;
 	} else if (nd->nd_repstat == NFSERR_DENIED) {
 		/*
 		 * Only the 8th word (the conflicting owner string length)
 		 * is used, so that the owner string can be skipped.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
 		size = fxdr_unsigned(int, *(tl + 7));
 		if (size < 0 || size > NFSV4_OPAQUELIMIT)
 			error = EBADRPC;
 		if (!error)
 			error = nfsm_advance(nd, NFSM_RNDUP(size), -1);
 	} else if (nd->nd_repstat == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp);
 	/* NFSM_DISSECT() jumps here with error set if the reply is short. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs statfs rpc
  * (always called with the vp for the mount point)
  *
  * For an NFSv4 mount a Getattr is done and nfsv4_loadattr() fills in
  * sbp, fsp and nap; on success the mount's fsid is set from
  * nap->na_filesid and *attrflagp is set to 1.
  * Otherwise an FSSTAT RPC is done and the reply is decoded into sbp here,
  * using 64 bit fields when the mount is NFSv3 and 32 bit fields if not.
  * For an NFSv3 reply, the post-op attributes are loaded into nap first.
  *
  * Returns 0, the error from nfscl_request() or from parsing, or the
  * status from the reply (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_statfs(vnode_t vp, struct nfsstatfs *sbp, struct nfsfsinfo *fsp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     void *stuff)
 {
 	u_int32_t *tl = NULL;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	int error;
 
 	*attrflagp = 0;
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (NFSHASNFSV4(nmp)) {
 		/*
 		 * For V4, you actually do a getattr.
 		 */
 		NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp);
 		NFSSTATFS_GETATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_USEGSSNAME;
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (nd->nd_repstat == 0) {
 			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 			    NULL, NULL, sbp, fsp, NULL, 0, NULL, NULL, NULL, p,
 			    cred);
 			if (!error) {
 				/* Record the fsid returned for the mount. */
 				nmp->nm_fsid[0] = nap->na_filesid[0];
 				nmp->nm_fsid[1] = nap->na_filesid[1];
 				NFSSETHASSETFSID(nmp);
 				*attrflagp = 1;
 			}
 		} else {
 			error = nd->nd_repstat;
 		}
 		if (error)
 			goto nfsmout;
 	} else {
 		NFSCL_REQSTART(nd, NFSPROC_FSSTAT, vp);
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		/* Post-op attributes are only parsed for an NFSv3 reply. */
 		if (nd->nd_flag & ND_NFSV3) {
 			error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 			if (error)
 				goto nfsmout;
 		}
 		if (nd->nd_repstat) {
 			error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		NFSM_DISSECT(tl, u_int32_t *,
 		    NFSX_STATFS(nd->nd_flag & ND_NFSV3));
 	}
 	/*
 	 * Decode the FSSTAT result.  tl is only set on the non-V4 path above.
 	 * NOTE(review): this relies on NFSHASNFSV3() being false for an
 	 * NFSv4 mount, since tl is still NULL in that case - confirm.
 	 */
 	if (NFSHASNFSV3(nmp)) {
 		sbp->sf_tbytes = fxdr_hyper(tl); tl += 2;
 		sbp->sf_fbytes = fxdr_hyper(tl); tl += 2;
 		sbp->sf_abytes = fxdr_hyper(tl); tl += 2;
 		sbp->sf_tfiles = fxdr_hyper(tl); tl += 2;
 		sbp->sf_ffiles = fxdr_hyper(tl); tl += 2;
 		sbp->sf_afiles = fxdr_hyper(tl); tl += 2;
 		sbp->sf_invarsec = fxdr_unsigned(u_int32_t, *tl);
 	} else if (NFSHASNFSV4(nmp) == 0) {
 		/* Neither V3 nor V4: five 32 bit counters. */
 		sbp->sf_tsize = fxdr_unsigned(u_int32_t, *tl++);
 		sbp->sf_bsize = fxdr_unsigned(u_int32_t, *tl++);
 		sbp->sf_blocks = fxdr_unsigned(u_int32_t, *tl++);
 		sbp->sf_bfree = fxdr_unsigned(u_int32_t, *tl++);
 		sbp->sf_bavail = fxdr_unsigned(u_int32_t, *tl);
 	}
 	/* NFSM_DISSECT() also jumps here with error set on a short reply. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs pathconf rpc
  *
  * For an NFSv4 mount a Getattr is done and nfsv4_loadattr() fills in pc
  * and nap; *attrflagp is set to 1 on success.
  * Otherwise a PATHCONF RPC is done: the post-op attributes are loaded
  * into nap and the six pathconf values are decoded into pc.
  *
  * Returns 0, the error from nfscl_request() or from parsing, or the
  * status from the reply (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     void *stuff)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	u_int32_t *tl;
 	nfsattrbit_t attrbits;
 	int error;
 
 	*attrflagp = 0;
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (NFSHASNFSV4(nmp)) {
 		/*
 		 * For V4, you actually do a getattr.
 		 */
 		NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp);
 		NFSPATHCONF_GETATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_USEGSSNAME;
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		if (nd->nd_repstat == 0) {
 			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 			    pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p,
 			    cred);
 			if (!error)
 				*attrflagp = 1;
 		} else {
 			error = nd->nd_repstat;
 		}
 	} else {
 		NFSCL_REQSTART(nd, NFSPROC_PATHCONF, vp);
 		error = nfscl_request(nd, vp, p, cred, stuff);
 		if (error)
 			return (error);
 		/* A parse error takes precedence over the reply status. */
 		error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 		if (nd->nd_repstat && !error)
 			error = nd->nd_repstat;
 		if (!error) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_V3PATHCONF);
 			pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl++);
 			pc->pc_namemax = fxdr_unsigned(u_int32_t, *tl++);
 			pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl++);
 			pc->pc_chownrestricted =
 			    fxdr_unsigned(u_int32_t, *tl++);
 			pc->pc_caseinsensitive =
 			    fxdr_unsigned(u_int32_t, *tl++);
 			pc->pc_casepreserving = fxdr_unsigned(u_int32_t, *tl);
 		}
 	}
 	/* NFSM_DISSECT() jumps here with error set if the reply is short. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * NFS version 3 FSINFO RPC.
  *
  * Loads the post-op attributes into nap (setting *attrflagp as
  * nfscl_postop_attr() sees fit) and, when the reply status is good,
  * decodes the FSINFO result into fsp.
  *
  * Returns 0, the error from nfscl_request() or from parsing, or the
  * status from the reply (nd_repstat).
  */
 APPLESTATIC int
 nfsrpc_fsinfo(vnode_t vp, struct nfsfsinfo *fsp, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	u_int32_t *tl;
 	int error;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_FSINFO, vp);
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	/* A parse error takes precedence over the reply status. */
 	error = nfscl_postop_attr(nd, nap, attrflagp, stuff);
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	if (error != 0)
 		goto nfsmout;
 
 	/* Seven 32 bit values, a 64 bit size, a time (2 words), a flag word. */
 	NFSM_DISSECT(tl, u_int32_t *, NFSX_V3FSINFO);
 	fsp->fs_rtmax = fxdr_unsigned(u_int32_t, tl[0]);
 	fsp->fs_rtpref = fxdr_unsigned(u_int32_t, tl[1]);
 	fsp->fs_rtmult = fxdr_unsigned(u_int32_t, tl[2]);
 	fsp->fs_wtmax = fxdr_unsigned(u_int32_t, tl[3]);
 	fsp->fs_wtpref = fxdr_unsigned(u_int32_t, tl[4]);
 	fsp->fs_wtmult = fxdr_unsigned(u_int32_t, tl[5]);
 	fsp->fs_dtpref = fxdr_unsigned(u_int32_t, tl[6]);
 	tl += 7;
 	fsp->fs_maxfilesize = fxdr_hyper(tl);
 	tl += 2;
 	fxdr_nfsv3time(tl, &fsp->fs_timedelta);
 	tl += 2;
 	fsp->fs_properties = fxdr_unsigned(u_int32_t, *tl);
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Renew RPC.
  *
  * When dsp is non-NULL, the request is built and sent against dsp's
  * session, and on dsp's socket if it has one; otherwise the mount's own
  * socket is used.  Returns 0 right away if the client has no mount
  * attached, else the request error or the status from the reply.
  */
 APPLESTATIC int
 nfsrpc_renew(struct nfsclclient *clp, struct nfsclds *dsp, struct ucred *cred,
     NFSPROC_T *p)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsmount *nmp = clp->nfsc_nmp;
 	struct nfsclsession *dssep, *mdssep;
 	struct nfssockreq *sockp;
 	u_int32_t *tl;
 	int error;
 
 	if (nmp == NULL)
 		return (0);
 
 	/* Session to run the request on; NULL when no dsp was given. */
 	dssep = (dsp != NULL) ? &dsp->nfsclds_sess : NULL;
 	nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, dssep);
 	if (!NFSHASNFSV4N(nmp)) {
 		/* NFSv4.1 just uses a Sequence Op and not a Renew. */
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		mdssep = nfsmnt_mdssession(nmp);
 		tl[0] = mdssep->nfsess_clientid.lval[0];
 		tl[1] = mdssep->nfsess_clientid.lval[1];
 	}
 
 	/* Fall back to the MDS socket unless dsp supplies its own. */
 	sockp = &nmp->nm_sockreq;
 	if (dsp != NULL && dsp->nfsclds_sockp != NULL)
 		sockp = dsp->nfsclds_sockp;
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, sockp, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, dssep);
 	if (error == 0) {
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * This function performs the Releaselockowner RPC.
  *
  * For NFSv4.1 a FreeStateID of the lock stateid is done instead.  For
  * NFSv4.0 the clientid and the owner string (lock owner name followed by
  * the file handle fh/fhlen) are sent.
  * Returns the request error or the status from the reply.
  */
 APPLESTATIC int
 nfsrpc_rellockown(struct nfsmount *nmp, struct nfscllockowner *lp,
     uint8_t *fh, int fhlen, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsclsession *tsep;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	u_int32_t *tl;
 	int error;
 
 	if (!NFSHASNFSV4N(nmp)) {
 		nfscl_reqstart(nd, NFSPROC_RELEASELCKOWN, nmp, NULL, 0, NULL,
 		    NULL);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		tsep = nfsmnt_mdssession(nmp);
 		tl[0] = tsep->nfsess_clientid.lval[0];
 		tl[1] = tsep->nfsess_clientid.lval[1];
 		NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN);
 		NFSBCOPY(fh, &own[NFSV4CL_LOCKNAMELEN], fhlen);
 		(void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen);
 	} else {
 		/* For NFSv4.1, do a FreeStateID. */
 		nfscl_reqstart(nd, NFSPROC_FREESTATEID, nmp, NULL, 0, NULL,
 		    NULL);
 		nfsm_stateidtom(nd, &lp->nfsl_stateid, NFSSTATEID_PUTSTATEID);
 	}
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * This function performs the Compound to get the mount pt FH.
  *
  * The compound starts with the op set up by nfscl_reqstart() for
  * NFSPROC_PUTROOTFH, adds one Lookup per non-empty component of dirpath
  * and ends with a Getfh.  dirpath is modified temporarily while it is
  * being split, but every '/' that is overwritten is put back.
  *
  * On success the file handle from the reply is copied to nmp->nm_fh and
  * its length is stored in nmp->nm_fhsize.
  * Returns the request error, a parse error, or the reply status
  * (NFSERR_BADXDR if the file handle length is out of range).
  */
 APPLESTATIC int
 nfsrpc_getdirpath(struct nfsmount *nmp, u_char *dirpath, struct ucred *cred,
     NFSPROC_T *p)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	u_char *cp, *cp2;
 	int error, cnt, len, setnil;
 	u_int32_t *opcntp;
 
 	/* opcntp is set to point at the compound's operation count. */
 	nfscl_reqstart(nd, NFSPROC_PUTROOTFH, nmp, NULL, 0, &opcntp, NULL);
 	cp = dirpath;
 	cnt = 0;
 	do {
 		setnil = 0;
 		/* Skip any run of leading '/' characters. */
 		while (*cp == '/')
 			cp++;
 		/* Find the end of this component. */
 		cp2 = cp;
 		while (*cp2 != '\0' && *cp2 != '/')
 			cp2++;
 		/* Terminate it in place so that strlen() can be used. */
 		if (*cp2 == '/') {
 			setnil = 1;
 			*cp2 = '\0';
 		}
 		/* Empty components (e.g. a trailing '/') add no Lookup. */
 		if (cp2 != cp) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_LOOKUP);
 			nfsm_strtom(nd, cp, strlen(cp));
 			cnt++;
 		}
 		/* Restore the '/' and step past it. */
 		if (setnil)
 			*cp2++ = '/';
 		cp = cp2;
 	} while (*cp != '\0');
 	/* Total ops: the Lookups plus the first op, Getfh (and Sequence). */
 	if (NFSHASNFSV4N(nmp))
 		/* Has a Sequence Op done by nfscl_reqstart(). */
 		*opcntp = txdr_unsigned(3 + cnt);
 	else
 		*opcntp = txdr_unsigned(2 + cnt);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETFH);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 		NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * Skip 2 words for each Lookup plus 2 more (presumably an
 		 * opcode/status pair per operation) to reach the file handle
 		 * length.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, (3 + 2 * cnt) * NFSX_UNSIGNED);
 		tl += (2 + 2 * cnt);
 		if ((len = fxdr_unsigned(int, *tl)) <= 0 ||
 			len > NFSX_FHMAX) {
 			nd->nd_repstat = NFSERR_BADXDR;
 		} else {
 			nd->nd_repstat = nfsrv_mtostr(nd, nmp->nm_fh, len);
 			if (nd->nd_repstat == 0)
 				nmp->nm_fhsize = len;
 		}
 	}
 	error = nd->nd_repstat;
 	/* NFSM_DISSECT() jumps here with error set if the reply is short. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Delegreturn RPC.
  *
  * The delegation stateid in dp is sent on dp's file handle, with the
  * stateid seqid word zeroed for NFSv4.1.  If syscred is non-zero,
  * ND_USEGSSNAME is set on the request.
  * Returns the request error or the status from the reply.
  */
 APPLESTATIC int
 nfsrpc_delegreturn(struct nfscldeleg *dp, struct ucred *cred,
     struct nfsmount *nmp, NFSPROC_T *p, int syscred)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	u_int32_t *tl;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_DELEGRETURN, nmp, dp->nfsdl_fh,
 	    dp->nfsdl_fhlen, NULL, NULL);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 	if (!NFSHASNFSV4N(nmp))
 		tl[0] = dp->nfsdl_stateid.seqid;
 	else
 		tl[0] = 0;
 	tl[1] = dp->nfsdl_stateid.other[0];
 	tl[2] = dp->nfsdl_stateid.other[1];
 	tl[3] = dp->nfsdl_stateid.other[2];
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * nfs getacl call.
  *
  * Does a Getattr asking only for the ACL attribute and lets
  * nfsv4_loadattr() fill in aclp.  Returns EOPNOTSUPP unless nfsrv_useacl
  * is set and the mount is NFSv4; else the request error, the reply status
  * or the nfsv4_loadattr() result.
  */
 APPLESTATIC int
 nfsrpc_getacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct acl *aclp, void *stuff)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	int error;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp))
 		return (EOPNOTSUPP);
 
 	NFSCL_REQSTART(nd, NFSPROC_GETACL, vp);
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else
 		error = nfsv4_loadattr(nd, vp, NULL, NULL, NULL, 0, NULL,
 		    NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, p, cred);
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs setacl call.
  *
  * Hands the ACL to nfsrpc_setattr().  Returns EOPNOTSUPP unless
  * nfsrv_useacl is set and the mount is NFSv4.
  */
 APPLESTATIC int
 nfsrpc_setacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct acl *aclp, void *stuff)
 {
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp))
 		return (EOPNOTSUPP);
 	return (nfsrpc_setattr(vp, NULL, aclp, cred, p, NULL, NULL, stuff));
 }
 
 /*
  * Do the RPC that sets the ACL attribute.
  *
  * The stateid at stateidp is put in the request followed by the ACL
  * attribute built from aclp.  Returns EOPNOTSUPP if the mount is not
  * NFSv4; else the request error or the status from the reply.
  */
 static int
 nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct acl *aclp, nfsv4stateid_t *stateidp, void *stuff)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	int error;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	if (!NFSHASNFSV4(nmp))
 		return (EOPNOTSUPP);
 
 	NFSCL_REQSTART(nd, NFSPROC_SETACL, vp);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
 	(void) nfsv4_fillattr(nd, vnode_mount(vp), vp, aclp, NULL, NULL, 0,
 	    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0);
 	error = nfscl_request(nd, vp, p, cred, stuff);
 	if (error != 0)
 		return (error);
 
 	/* The pre/postop attributes in the reply are of no interest. */
 	mbuf_freem(nd->nd_mrep);
 	error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Exchange ID.
  *
  * Sends the client owner (nfsboottime.tv_sec and clp->nfsc_rev followed
  * by the clp->nfsc_id string), exchflags, NFSV4EXCH_SP4NONE and one
  * implementation id.
  *
  * On success a struct nfsclds is allocated and returned via *dspp, with
  * the clientid, sequenceid, flags and server owner string taken from the
  * reply and its mutexes and session slots initialized.  *dspp is NULL on
  * any failure.
  * Returns the request error, a parse error, or the status from the reply.
  */
 int
 nfsrpc_exchangeid(struct nfsmount *nmp, struct nfsclclient *clp,
     struct nfssockreq *nrp, uint32_t exchflags, struct nfsclds **dspp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl, v41flags;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfsclds *dsp;
 	struct timespec verstime;
 	int error, len;
 
 	*dspp = NULL;
 	nfscl_reqstart(nd, NFSPROC_EXCHANGEID, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec);	/* Client owner */
 	*tl = txdr_unsigned(clp->nfsc_rev);
 	(void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen);
 
 	NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(exchflags);
 	*tl++ = txdr_unsigned(NFSV4EXCH_SP4NONE);
 
 	/* Set the implementation id4 */
 	*tl = txdr_unsigned(1);
 	(void) nfsm_strtom(nd, "freebsd.org", strlen("freebsd.org"));
 	(void) nfsm_strtom(nd, version, strlen(version));
 	/* The implementation id carries a fixed timestamp. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4TIME);
 	verstime.tv_sec = 1293840000;		/* Jan 1, 2011 */
 	verstime.tv_nsec = 0;
 	txdr_nfsv4time(&verstime, tl);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	NFSCL_DEBUG(1, "exchangeid err=%d reps=%d\n", error,
 	    (int)nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * 8 words: clientid (2), sequenceid (1), flags (1), three
 		 * words that are not examined here and the server owner
 		 * string length (1).
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 6 * NFSX_UNSIGNED + NFSX_HYPER);
 		/* Check the length before it is used to size the malloc. */
 		len = fxdr_unsigned(int, *(tl + 7));
 		if (len < 0 || len > NFSV4_OPAQUELIMIT) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/* Room for the server owner string plus one more byte. */
 		dsp = malloc(sizeof(struct nfsclds) + len + 1, M_NFSCLDS,
 		    M_WAITOK | M_ZERO);
 		dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew;
 		dsp->nfsclds_servownlen = len;
 		/* The clientid is stored as received, without conversion. */
 		dsp->nfsclds_sess.nfsess_clientid.lval[0] = *tl++;
 		dsp->nfsclds_sess.nfsess_clientid.lval[1] = *tl++;
 		dsp->nfsclds_sess.nfsess_sequenceid =
 		    fxdr_unsigned(uint32_t, *tl++);
 		v41flags = fxdr_unsigned(uint32_t, *tl);
 		/* pNFS is only enabled if the mount asked for it as well. */
 		if ((v41flags & NFSV4EXCH_USEPNFSMDS) != 0 &&
 		    NFSHASPNFSOPT(nmp)) {
 			NFSCL_DEBUG(1, "set PNFS\n");
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_PNFS;
 			NFSUNLOCKMNT(nmp);
 			dsp->nfsclds_flags |= NFSCLDS_MDS;
 		}
 		if ((v41flags & NFSV4EXCH_USEPNFSDS) != 0)
 			dsp->nfsclds_flags |= NFSCLDS_DS;
 		if (len > 0)
 			nd->nd_repstat = nfsrv_mtostr(nd,
 			    dsp->nfsclds_serverown, len);
 		if (nd->nd_repstat == 0) {
 			mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 			mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession",
 			    NULL, MTX_DEF);
 			nfscl_initsessionslots(&dsp->nfsclds_sess);
 			*dspp = dsp;
 		} else
 			/* Copying the owner string failed; drop the entry. */
 			free(dsp, M_NFSCLDS);
 	}
 	error = nd->nd_repstat;
 	/* NFSM_DISSECT() also jumps here with error set on a short reply. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Create Session.
  *
  * Sends the clientid from sep, sequenceid, the create flags and the fore
  * and back channel attributes, followed by the callback program number
  * and a single AUTH_SYS callback credential with uid and gid 0.
  * NFSV4CRSESS_PERSIST is requested unless the mount is read-only, and
  * NFSV4CRSESS_CONNBACKCHAN only if callbacks are enabled, at least one
  * nfscbd is running and mds is non-zero.
  *
  * On success the session id, sequenceid, maximum cached response size
  * and the fore/back channel slot counts from the reply are stored in
  * sep.  nmp->nm_wsize and nmp->nm_rsize are halved as needed to fit the
  * maximum request/response sizes in the reply, and NFSSTA_SESSPERSIST is
  * set on the mount when the server granted persistence and mds is
  * non-zero.
  *
  * Returns the request error, a parse error (NFSERR_BADXDR if the reply
  * carries an impossible rdma ird count), or the status from the reply.
  */
 int
 nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep,
     struct nfssockreq *nrp, uint32_t sequenceid, int mds, struct ucred *cred,
     NFSPROC_T *p)
 {
-	uint32_t crflags, *tl;
+	uint32_t crflags, maxval, *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	int error, irdcnt;
 
 	nfscl_reqstart(nd, NFSPROC_CREATESESSION, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
 	*tl++ = sep->nfsess_clientid.lval[0];
 	*tl++ = sep->nfsess_clientid.lval[1];
 	*tl++ = txdr_unsigned(sequenceid);
 	crflags = (NFSMNT_RDONLY(nmp->nm_mountp) ? 0 : NFSV4CRSESS_PERSIST);
 	if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0 && mds != 0)
 		crflags |= NFSV4CRSESS_CONNBACKCHAN;
 	*tl = txdr_unsigned(crflags);
 
 	/* Fill in fore channel attributes. */
 	NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 	*tl++ = 0;				/* Header pad size */
-	*tl++ = txdr_unsigned(100000);		/* Max request size */
-	*tl++ = txdr_unsigned(100000);		/* Max response size */
+	*tl++ = txdr_unsigned(nmp->nm_wsize + NFS_MAXXDR);/* Max request size */
+	*tl++ = txdr_unsigned(nmp->nm_rsize + NFS_MAXXDR);/* Max reply size */
 	*tl++ = txdr_unsigned(4096);		/* Max response size cached */
 	*tl++ = txdr_unsigned(20);		/* Max operations */
 	*tl++ = txdr_unsigned(64);		/* Max slots */
 	*tl = 0;				/* No rdma ird */
 
 	/* Fill in back channel attributes. */
 	NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 	*tl++ = 0;				/* Header pad size */
 	*tl++ = txdr_unsigned(10000);		/* Max request size */
 	*tl++ = txdr_unsigned(10000);		/* Max response size */
 	*tl++ = txdr_unsigned(4096);		/* Max response size cached */
 	*tl++ = txdr_unsigned(4);		/* Max operations */
 	*tl++ = txdr_unsigned(NFSV4_CBSLOTS);	/* Max slots */
 	*tl = 0;				/* No rdma ird */
 
 	NFSM_BUILD(tl, uint32_t *, 8 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFS_CALLBCKPROG);	/* Call back prog # */
 
 	/* Allow AUTH_SYS callbacks as uid, gid == 0. */
 	*tl++ = txdr_unsigned(1);		/* Auth_sys only */
 	*tl++ = txdr_unsigned(AUTH_SYS);	/* AUTH_SYS type */
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec); /* time stamp */
 	*tl++ = 0;				/* Null machine name */
 	*tl++ = 0;				/* Uid == 0 */
 	*tl++ = 0;				/* Gid == 0 */
 	*tl = 0;				/* No additional gids */
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG,
 	    NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Session id, sequenceid and the flags that were granted. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
 		    2 * NFSX_UNSIGNED);
 		bcopy(tl, sep->nfsess_sessionid, NFSX_V4SESSIONID);
 		tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 		sep->nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++);
 		crflags = fxdr_unsigned(uint32_t, *tl);
 		if ((crflags & NFSV4CRSESS_PERSIST) != 0 && mds != 0) {
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_SESSPERSIST;
 			NFSUNLOCKMNT(nmp);
 		}
 
 		/* Get the fore channel slot count. */
 		NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED);
-		tl += 3;		/* Skip the other counts. */		
+		tl++;			/* Skip the header pad size. */
+
+		/* Make sure nm_wsize is small enough. */
+		maxval = fxdr_unsigned(uint32_t, *tl++);
+		while (maxval < nmp->nm_wsize + NFS_MAXXDR) {
+			if (nmp->nm_wsize > 8096)
+				nmp->nm_wsize /= 2;
+			else
+				break;
+		}
+
+		/* Make sure nm_rsize is small enough. */
+		maxval = fxdr_unsigned(uint32_t, *tl++);
+		while (maxval < nmp->nm_rsize + NFS_MAXXDR) {
+			if (nmp->nm_rsize > 8096)
+				nmp->nm_rsize /= 2;
+			else
+				break;
+		}
+
 		sep->nfsess_maxcache = fxdr_unsigned(int, *tl++);
 		tl++;
 		sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++);
 		NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots);
 		irdcnt = fxdr_unsigned(int, *tl);
 		/*
 		 * The rdma ird array holds at most one element.  Reject any
 		 * other count instead of using a value chosen by the server
 		 * as the size of the next dissect.
 		 */
 		if (irdcnt < 0 || irdcnt > 1) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		if (irdcnt > 0)
 			NFSM_DISSECT(tl, uint32_t *, irdcnt * NFSX_UNSIGNED);
 
 		/* and the back channel slot count. */
 		NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 		tl += 5;
 		sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl);
 		NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots);
 	}
 	error = nd->nd_repstat;
 	/* NFSM_DISSECT() also jumps here with error set on a short reply. */
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Destroy Session.
  *
  * Sends the session id of the session returned by nfsmnt_mdssession().
  * clp is not used.  Returns the request error or the reply status.
  */
 int
 nfsrpc_destroysession(struct nfsmount *nmp, struct nfsclclient *clp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsclsession *mdssep;
 	uint32_t *tl;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_DESTROYSESSION, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID);
 	mdssep = nfsmnt_mdssession(nmp);
 	bcopy(mdssep->nfsess_sessionid, tl, NFSX_V4SESSIONID);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Destroy Client.
  *
  * Sends the clientid of the session returned by nfsmnt_mdssession().
  * clp is not used.  Returns the request error or the reply status.
  */
 int
 nfsrpc_destroyclient(struct nfsmount *nmp, struct nfsclclient *clp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqd, *nd = &reqd;
 	struct nfsclsession *mdssep;
 	uint32_t *tl;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_DESTROYCLIENT, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	mdssep = nfsmnt_mdssession(nmp);
 	tl[0] = mdssep->nfsess_clientid.lval[0];
 	tl[1] = mdssep->nfsess_clientid.lval[1];
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutGet.
  *
  * Asks for a layout of type NFSLAYOUT_NFSV4_1_FILES with the given
  * iomode for the range (offset, len, minlen) of the file handle
  * fhp/fhlen, using the stateid at stateidp.
  *
  * On success *retonclosep is set to 0 or 1, *stateidp is updated from
  * the reply and every layout in the reply whose iomode matches that of
  * the first one is linked onto flhp, kept in increasing offset order.
  * Layouts with a different iomode are freed.
  *
  * Returns the request error, a parse error (NFSERR_BADXDR for counts or
  * types that are not acceptable), or the status from the reply.
  * NOTE(review): on a parse error part way through the reply, layouts
  * already linked onto flhp are left there - confirm the callers expect it.
  */
 int
 nfsrpc_layoutget(struct nfsmount *nmp, uint8_t *fhp, int fhlen, int iomode,
     uint64_t offset, uint64_t len, uint64_t minlen, int layoutlen,
     nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp,
     struct ucred *cred, NFSPROC_T *p, void *stuff)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsfh *nfhp;
 	struct nfsclflayout *flp, *prevflp, *tflp;
 	int cnt, error, gotiomode, fhcnt, nfhlen, i, j;
 	uint8_t *cp;
 	uint64_t retlen;
 
 	/* flp is non-NULL only while a layout is built but not yet linked. */
 	flp = NULL;
 	gotiomode = -1;
 	nfscl_reqstart(nd, NFSPROC_LAYOUTGET, nmp, fhp, fhlen, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED + 3 * NFSX_HYPER +
 	    NFSX_STATEID);
 	*tl++ = newnfs_false;		/* Don't signal availability. */
 	*tl++ = txdr_unsigned(NFSLAYOUT_NFSV4_1_FILES);
 	*tl++ = txdr_unsigned(iomode);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	tl += 2;
 	txdr_hyper(minlen, tl);
 	tl += 2;
 	*tl++ = txdr_unsigned(stateidp->seqid);
 	NFSCL_DEBUG(4, "layget seq=%d\n", (int)stateidp->seqid);
 	*tl++ = stateidp->other[0];
 	*tl++ = stateidp->other[1];
 	*tl++ = stateidp->other[2];
 	*tl = txdr_unsigned(layoutlen);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Return-on-close flag (1), stateid, layout count (1). */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_STATEID);
 		if (*tl++ != 0)
 			*retonclosep = 1;
 		else
 			*retonclosep = 0;
 		stateidp->seqid = fxdr_unsigned(uint32_t, *tl++);
 		NFSCL_DEBUG(4, "retoncls=%d stseq=%d\n", *retonclosep,
 		    (int)stateidp->seqid);
 		stateidp->other[0] = *tl++;
 		stateidp->other[1] = *tl++;
 		stateidp->other[2] = *tl++;
 		cnt = fxdr_unsigned(int, *tl);
 		NFSCL_DEBUG(4, "layg cnt=%d\n", cnt);
 		if (cnt <= 0 || cnt > 10000) {
 			/* Don't accept more than 10000 layouts in reply. */
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		for (i = 0; i < cnt; i++) {
 			/* Dissect all the way to the file handle cnt. */
 			NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_HYPER +
 			    6 * NFSX_UNSIGNED + NFSX_V4DEVICEID);
 			/*
 			 * Peek at the file handle count, the last word just
 			 * dissected, so the layout can be sized before the
 			 * rest of it is decoded.
 			 */
 			fhcnt = fxdr_unsigned(int, *(tl + 11 +
 			    NFSX_V4DEVICEID / NFSX_UNSIGNED));
 			NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt);
 			if (fhcnt < 0 || fhcnt > 100) {
 				/* Don't accept more than 100 file handles. */
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/* Extra file handle pointers beyond the first one. */
 			if (fhcnt > 1)
 				flp = malloc(sizeof(*flp) + (fhcnt - 1) *
 				    sizeof(struct nfsfh *),
 				    M_NFSFLAYOUT, M_WAITOK);
 			else
 				flp = malloc(sizeof(*flp),
 				    M_NFSFLAYOUT, M_WAITOK);
 			flp->nfsfl_flags = 0;
 			/* Counts the handles stored so far, for cleanup. */
 			flp->nfsfl_fhcnt = 0;
 			flp->nfsfl_devp = NULL;
 			flp->nfsfl_off = fxdr_hyper(tl); tl += 2;
 			retlen = fxdr_hyper(tl); tl += 2;
 			/* Offset plus length wraps around 64 bits. */
 			if (flp->nfsfl_off + retlen < flp->nfsfl_off)
 				flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off;
 			else
 				flp->nfsfl_end = flp->nfsfl_off + retlen;
 			flp->nfsfl_iomode = fxdr_unsigned(int, *tl++);
 			/* The first layout fixes the iomode to be kept. */
 			if (gotiomode == -1)
 				gotiomode = flp->nfsfl_iomode;
 			NFSCL_DEBUG(4, "layg reqiom=%d retiom=%d\n", iomode,
 			    (int)flp->nfsfl_iomode);
 			if (fxdr_unsigned(int, *tl++) !=
 			    NFSLAYOUT_NFSV4_1_FILES) {
 				printf("NFSv4.1: got non-files layout\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/* One word is stepped over before the device id. */
 			NFSBCOPY(++tl, flp->nfsfl_dev, NFSX_V4DEVICEID);
 			tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
 			flp->nfsfl_util = fxdr_unsigned(uint32_t, *tl++);
 			NFSCL_DEBUG(4, "flutil=0x%x\n", flp->nfsfl_util);
 			flp->nfsfl_stripe1 = fxdr_unsigned(uint32_t, *tl++);
 			flp->nfsfl_patoff = fxdr_hyper(tl); tl += 2;
 			/* Must agree with the count peeked at above. */
 			if (fxdr_unsigned(int, *tl) != fhcnt) {
 				printf("EEK! bad fhcnt\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			for (j = 0; j < fhcnt; j++) {
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 				nfhlen = fxdr_unsigned(int, *tl);
 				if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) {
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				nfhp = malloc(sizeof(*nfhp) + nfhlen - 1,
 				    M_NFSFH, M_WAITOK);
 				/*
 				 * Hook the handle onto flp before the next
 				 * dissect, which may jump to nfsmout.
 				 */
 				flp->nfsfl_fh[j] = nfhp;
 				flp->nfsfl_fhcnt++;
 				nfhp->nfh_len = nfhlen;
 				NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen));
 				NFSBCOPY(cp, nfhp->nfh_fh, nfhlen);
 			}
 			if (flp->nfsfl_iomode == gotiomode) {
 				/* Keep the list in increasing offset order. */
 				tflp = LIST_FIRST(flhp);
 				prevflp = NULL;
 				while (tflp != NULL &&
 				    tflp->nfsfl_off < flp->nfsfl_off) {
 					prevflp = tflp;
 					tflp = LIST_NEXT(tflp, nfsfl_list);
 				}
 				if (prevflp == NULL)
 					LIST_INSERT_HEAD(flhp, flp, nfsfl_list);
 				else
 					LIST_INSERT_AFTER(prevflp, flp,
 					    nfsfl_list);
 			} else {
 				printf("nfscl_layoutget(): got wrong iomode\n");
 				nfscl_freeflayout(flp);
 			}
 			/* Linked or freed, so nfsmout must not free it. */
 			flp = NULL;
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	/* Free a layout that was still being built when the error hit. */
 	if (error != 0 && flp != NULL)
 		nfscl_freeflayout(flp);
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Get Device Info.
  *
  * Sends a GETDEVICEINFO request for "deviceid" with the given "layouttype"
  * over the mount's nm_sockreq and parses the reply into a newly allocated
  * struct nfscldevinfo (the stripe indices followed by one DS pointer per
  * address list).
  *
  * notifybitsp - if non-NULL and *notifybitsp is non-zero, that word is sent
  *	as the notification bitmap; when the reply carries a bitmap, its
  *	first word is stored back through the pointer.
  * ndip - set to the new devinfo on success and to NULL on any failure.
  *
  * Returns 0 on success, the error from newnfs_request(), the server's
  * nd_repstat, NFSERR_BADXDR when the reply fails a sanity check, or EPERM
  * when an address list yielded no usable address.  A failed NFSM_DISSECT
  * also ends up at nfsmout (the macro is defined elsewhere).
  */
 int
 nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype,
     uint32_t *notifybitsp, struct nfscldevinfo **ndip, struct ucred *cred,
     NFSPROC_T *p)
 {
 	uint32_t cnt, rawindex, *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct sockaddr_storage ss;
 	struct nfsclds *dsp = NULL, **dspp;
 	struct nfscldevinfo *ndi;
 	int addrcnt, bitcnt, error, i, isudp, j, pos, safilled, stripecnt;
 	uint8_t stripeindex;
 
 	*ndip = NULL;
 	ndi = NULL;
 	nfscl_reqstart(nd, NFSPROC_GETDEVICEINFO, nmp, NULL, 0, NULL, NULL);
 	/* Arguments: deviceid, layout type, max reply count, bitmap length. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4DEVICEID + 3 * NFSX_UNSIGNED);
 	NFSBCOPY(deviceid, tl, NFSX_V4DEVICEID);
 	tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(layouttype);
 	*tl++ = txdr_unsigned(100000);
 	if (notifybitsp != NULL && *notifybitsp != 0) {
 		*tl = txdr_unsigned(1);		/* One word of bits. */
 		NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(*notifybitsp);
 	} else
 		*tl = txdr_unsigned(0);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Layout type, one skipped word, then the stripe count. */
 		NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED);
 		if (layouttype != fxdr_unsigned(int, *tl++))
 			printf("EEK! devinfo layout type not same!\n");
 		stripecnt = fxdr_unsigned(int, *++tl);
 		NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt);
 		if (stripecnt < 1 || stripecnt > 4096) {
 			printf("NFS devinfo stripecnt %d: out of range\n",
 			    stripecnt);
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/*
 		 * Pull in all the stripe indices plus the following word,
 		 * which is the number of address lists.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, (stripecnt + 1) * NFSX_UNSIGNED);
 		addrcnt = fxdr_unsigned(int, *(tl + stripecnt));
 		NFSCL_DEBUG(4, "addrcnt=%d\n", addrcnt);
 		if (addrcnt < 1 || addrcnt > 128) {
 			printf("NFS devinfo addrcnt %d: out of range\n",
 			    addrcnt);
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 
 		/*
 		 * Now we know how many stripe indices and addresses, so
 		 * we can allocate the structure the correct size.
 		 * "i" is the number of pointer sized slots needed to hold
 		 * the one byte stripe indices.
 		 */
 		i = (stripecnt * sizeof(uint8_t)) / sizeof(struct nfsclds *)
 		    + 1;
 		NFSCL_DEBUG(4, "stripeindices=%d\n", i);
 		ndi = malloc(sizeof(*ndi) + (addrcnt + i) *
 		    sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK | M_ZERO);
 		NFSBCOPY(deviceid, ndi->nfsdi_deviceid, NFSX_V4DEVICEID);
 		ndi->nfsdi_refcnt = 0;
 		ndi->nfsdi_stripecnt = stripecnt;
 		ndi->nfsdi_addrcnt = addrcnt;
 		/* Fill in the stripe indices. */
 		for (i = 0; i < stripecnt; i++) {
 			/*
 			 * Range check the full 32 bit value from the wire
 			 * before narrowing it, so that an out of range
 			 * index (for example 256) cannot alias a valid one.
 			 */
 			rawindex = fxdr_unsigned(uint32_t, *tl++);
 			NFSCL_DEBUG(4, "stripeind=%d\n", (int)rawindex);
 			if (rawindex >= (uint32_t)addrcnt) {
 				printf("NFS devinfo stripeindex %d: too big\n",
 				    (int)rawindex);
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/* addrcnt <= 128, so the index fits in a uint8_t. */
 			stripeindex = (uint8_t)rawindex;
 			nfsfldi_setstripeindex(ndi, i, stripeindex);
 		}
 
 		/* Now, dissect the server address(es). */
 		safilled = 0;
 		for (i = 0; i < addrcnt; i++) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			cnt = fxdr_unsigned(uint32_t, *tl);
 			if (cnt == 0) {
 				printf("NFS devinfo 0 len addrlist\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			dspp = nfsfldi_addr(ndi, i);
 			pos = arc4random() % cnt;	/* Choose one. */
 			/*
 			 * safilled: 0 - nothing filled in for this list,
 			 * 1 - filled in with a different address family
 			 * than the mount's, 2 - filled in with the mount's
 			 * address family.
 			 */
 			safilled = 0;
 			for (j = 0; j < cnt; j++) {
 				/* EPERM from here is tolerated; skip entry. */
 				error = nfsv4_getipaddr(nd, &ss, &isudp);
 				if (error != 0 && error != EPERM) {
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				if (error == 0 && isudp == 0) {
 					/*
 					 * The algorithm is:
 					 * - use "pos" entry if it is of the
 					 *   same af_family or none of them
 					 *   is of the same af_family
 					 * else
 					 * - use the first one of the same
 					 *   af_family.
 					 */
 					if ((safilled == 0 && ss.ss_family ==
 					     nmp->nm_nam->sa_family) ||
 					    (j == pos &&
 					     (safilled == 0 || ss.ss_family ==
 					      nmp->nm_nam->sa_family)) ||
 					    (safilled == 1 && ss.ss_family ==
 					     nmp->nm_nam->sa_family)) {
 						error = nfsrpc_fillsa(nmp, &ss,
 						    &dsp, p);
 						if (error == 0) {
 							*dspp = dsp;
 							if (ss.ss_family ==
 							 nmp->nm_nam->sa_family)
 								safilled = 2;
 							else
 								safilled = 1;
 						}
 					}
 				}
 			}
 			/* No usable address in this list, so give up. */
 			if (safilled == 0)
 				break;
 		}
 
 		/* And the notify bits. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		if (safilled != 0) {
 			bitcnt = fxdr_unsigned(int, *tl);
 			if (bitcnt > 0) {
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 				if (notifybitsp != NULL)
 					*notifybitsp =
 					    fxdr_unsigned(uint32_t, *tl);
 			}
 			/*
 			 * NOTE(review): "error" may still hold a tolerated
 			 * EPERM or an nfsrpc_fillsa() failure from the last
 			 * address entry here; confirm whether that is meant
 			 * to fail the whole RPC when safilled != 0.
 			 */
 			*ndip = ndi;
 		} else
 			error = EPERM;
 	}
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	if (error != 0 && ndi != NULL) {
 		nfscl_freedevinfo(ndi);
 		/* Never hand a freed structure back to the caller. */
 		*ndip = NULL;
 	}
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutCommit.
  *
  * Builds a LAYOUTCOMMIT request for the file handle fh/fhlen covering the
  * byte range off/len, using the layout stateid in stateidp, and sends it
  * on the mount's nm_sockreq.
  *
  * reclaim - non-zero sets the reclaim boolean in the request.
  * lastbyte - last byte written; it is clamped into [off, off + len - 1]
  *	before being sent.
  * layouttype, layoutupdatecnt, layp - layout type and the opaque layout
  *	update body (layoutupdatecnt bytes at layp, zero padded to an XDR
  *	boundary).  "stuff" is not used by this function.
  *
  * Returns the error from newnfs_request() or else the reply status
  * (nd_repstat).
  */
 int
 nfsrpc_layoutcommit(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim,
     uint64_t off, uint64_t len, uint64_t lastbyte, nfsv4stateid_t *stateidp,
     int layouttype, int layoutupdatecnt, uint8_t *layp, struct ucred *cred,
     NFSPROC_T *p, void *stuff)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error, outcnt, i;
 	uint8_t *cp;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTCOMMIT, nmp, fh, fhlen, NULL, NULL);
 	/*
 	 * Fixed size part: offset, length, reclaim, stateid, the
 	 * "new offset" boolean + last byte, the "time modify" boolean,
 	 * layout type and the length of the layout update.
 	 */
 	NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED + 3 * NFSX_HYPER +
 	    NFSX_STATEID);
 	txdr_hyper(off, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	tl += 2;
 	if (reclaim != 0)
 		*tl++ = newnfs_true;
 	else
 		*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(stateidp->seqid);
 	*tl++ = stateidp->other[0];
 	*tl++ = stateidp->other[1];
 	*tl++ = stateidp->other[2];
 	/* A last byte written offset always follows. */
 	*tl++ = newnfs_true;
 	/* Keep lastbyte within the range being committed. */
 	if (lastbyte < off)
 		lastbyte = off;
 	else if (lastbyte >= (off + len))
 		lastbyte = off + len - 1;
 	txdr_hyper(lastbyte, tl);
 	tl += 2;
 	/* No modify time is supplied. */
 	*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(layouttype);
 	*tl = txdr_unsigned(layoutupdatecnt);
 	if (layoutupdatecnt > 0) {
 		KASSERT(layouttype != NFSLAYOUT_NFSV4_1_FILES,
 		    ("Must be nil for Files Layout"));
 		/* Copy the opaque update and zero the XDR pad bytes. */
 		outcnt = NFSM_RNDUP(layoutupdatecnt);
 		NFSM_BUILD(cp, uint8_t *, outcnt);
 		NFSBCOPY(layp, cp, layoutupdatecnt);
 		cp += layoutupdatecnt;
 		for (i = 0; i < (outcnt - layoutupdatecnt); i++)
 			*cp++ = 0x0;
 	}
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	/* Nothing in the reply body is parsed; only the status is used. */
 	error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutReturn.
  *
  * Builds a LAYOUTRETURN request for the file handle fh/fhlen and sends it
  * on the mount's nm_sockreq.
  *
  * reclaim - non-zero sets the reclaim boolean in the request.
  * layouttype, iomode, layoutreturn - copied into the request as is.
  * offset, len, stateidp, layoutcnt, layp - only sent when layoutreturn is
  *	NFSLAYOUTRETURN_FILE; layp is copied as layoutcnt opaque bytes,
  *	zero padded to an XDR boundary.
  * stateidp - also updated from the reply when the server returns a
  *	stateid.  "stuff" is not used by this function.
  *
  * Returns the error from newnfs_request(), the reply status (nd_repstat)
  * or whatever error a failed NFSM_DISSECT leaves behind.
  */
 int
 nfsrpc_layoutreturn(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim,
     int layouttype, uint32_t iomode, int layoutreturn, uint64_t offset,
     uint64_t len, nfsv4stateid_t *stateidp, int layoutcnt, uint32_t *layp,
     struct ucred *cred, NFSPROC_T *p, void *stuff)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error, outcnt, i;
 	uint8_t *cp;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTRETURN, nmp, fh, fhlen, NULL, NULL);
 	/* Arguments common to every kind of layout return. */
 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
 	if (reclaim != 0)
 		*tl++ = newnfs_true;
 	else
 		*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(layouttype);
 	*tl++ = txdr_unsigned(iomode);
 	*tl = txdr_unsigned(layoutreturn);
 	if (layoutreturn == NFSLAYOUTRETURN_FILE) {
 		/* Byte range, stateid and the opaque body length. */
 		NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID +
 		    NFSX_UNSIGNED);
 		txdr_hyper(offset, tl);
 		tl += 2;
 		txdr_hyper(len, tl);
 		tl += 2;
 		NFSCL_DEBUG(4, "layoutret stseq=%d\n", (int)stateidp->seqid);
 		*tl++ = txdr_unsigned(stateidp->seqid);
 		*tl++ = stateidp->other[0];
 		*tl++ = stateidp->other[1];
 		*tl++ = stateidp->other[2];
 		*tl = txdr_unsigned(layoutcnt);
 		if (layoutcnt > 0) {
 			/* Copy the opaque body and zero the XDR pad bytes. */
 			outcnt = NFSM_RNDUP(layoutcnt);
 			NFSM_BUILD(cp, uint8_t *, outcnt);
 			NFSBCOPY(layp, cp, layoutcnt);
 			cp += layoutcnt;
 			for (i = 0; i < (outcnt - layoutcnt); i++)
 				*cp++ = 0x0;
 		}
 	}
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* A non-zero first word means a stateid follows. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		if (*tl != 0) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID);
 			stateidp->seqid = fxdr_unsigned(uint32_t, *tl++);
 			stateidp->other[0] = *tl++;
 			stateidp->other[1] = *tl++;
 			stateidp->other[2] = *tl;
 		}
 	} else
 		error = nd->nd_repstat;
 nfsmout:
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Acquire a layout and devinfo, if possible. The caller must have acquired
  * a reference count on the nfsclclient structure before calling this.
  * Return the layout in lypp with a reference count on it, if successful.
  *
  * If nfscl_getlayout() already has a layout with a file layout covering
  * "off", that one is returned.  Otherwise a LayoutGet RPC is done, the
  * device info for each returned file layout is looked up (doing a
  * GetDeviceInfo RPC when it is not already known) and the result is
  * handed to nfscl_layout().
  *
  * Returns 0 with *lypp set, EIO if the layout has been recalled, or the
  * error from the failing RPC/helper, in which case *lypp is NULL.
  */
 static int
 nfsrpc_getlayout(struct nfsmount *nmp, vnode_t vp, struct nfsfh *nfhp,
     int iomode, uint32_t *notifybitsp, nfsv4stateid_t *stateidp, uint64_t off,
     struct nfscllayout **lypp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfscllayout *lyp;
 	struct nfsclflayout *flp, *tflp;
 	struct nfscldevinfo *dip;
 	struct nfsclflayouthead flh;
 	int error = 0, islocked, layoutlen, recalled, retonclose;
 	nfsv4stateid_t stateid;
 	struct nfsclsession *tsep;
 
 	*lypp = NULL;
 	/*
 	 * If lyp is returned non-NULL, there will be a refcnt (shared lock)
 	 * on it, iff flp != NULL or a lock (exclusive lock) on it iff
 	 * flp == NULL.
 	 */
 	lyp = nfscl_getlayout(nmp->nm_clp, nfhp->nfh_fh, nfhp->nfh_len,
 	    off, &flp, &recalled);
 	islocked = 0;
 	if (lyp == NULL || flp == NULL) {
 		if (recalled != 0)
 			return (EIO);
 		LIST_INIT(&flh);
 		tsep = nfsmnt_mdssession(nmp);
 		/* Largest layout body that is asked for in the reply. */
 		layoutlen = tsep->nfsess_maxcache -
 		    (NFSX_STATEID + 3 * NFSX_UNSIGNED);
 		if (lyp == NULL) {
 			/*
 			 * No layout yet, so use the caller's stateid with
 			 * a seqid of 0 and ask for the whole file.
 			 */
 			stateid.seqid = 0;
 			stateid.other[0] = stateidp->other[0];
 			stateid.other[1] = stateidp->other[1];
 			stateid.other[2] = stateidp->other[2];
 			error = nfsrpc_layoutget(nmp, nfhp->nfh_fh,
 			    nfhp->nfh_len, iomode, (uint64_t)0, UINT64_MAX,
 			    (uint64_t)0, layoutlen, &stateid, &retonclose,
 			    &flh, cred, p, NULL);
 		} else {
 			/*
 			 * A layout exists (exclusively locked, see above)
 			 * but does not cover "off", so use its stateid and
 			 * ask for the range starting at "off".
 			 */
 			islocked = 1;
 			stateid.seqid = lyp->nfsly_stateid.seqid;
 			stateid.other[0] = lyp->nfsly_stateid.other[0];
 			stateid.other[1] = lyp->nfsly_stateid.other[1];
 			stateid.other[2] = lyp->nfsly_stateid.other[2];
 			error = nfsrpc_layoutget(nmp, nfhp->nfh_fh,
 			    nfhp->nfh_len, iomode, off, UINT64_MAX,
 			    (uint64_t)0, layoutlen, &stateid, &retonclose,
 			    &flh, cred, p, NULL);
 		}
 		if (error == 0)
 			LIST_FOREACH(tflp, &flh, nfsfl_list) {
 				/*
 				 * First try to attach an already known
 				 * devinfo; only do the RPC if that fails.
 				 */
 				error = nfscl_adddevinfo(nmp, NULL, tflp);
 				if (error != 0) {
 					error = nfsrpc_getdeviceinfo(nmp,
 					    tflp->nfsfl_dev,
 					    NFSLAYOUT_NFSV4_1_FILES,
 					    notifybitsp, &dip, cred, p);
 					if (error != 0)
 						break;
 					error = nfscl_adddevinfo(nmp, dip,
 					    tflp);
 					if (error != 0)
 						printf(
 						    "getlayout: cannot add\n");
 				}
 			}
 		if (error == 0) {
 			/*
 			 * nfscl_layout() always returns with the nfsly_lock
 			 * set to a refcnt (shared lock).
 			 */
 			error = nfscl_layout(nmp, vp, nfhp->nfh_fh,
 			    nfhp->nfh_len, &stateid, retonclose, &flh, &lyp,
 			    cred, p);
 			if (error == 0)
 				*lypp = lyp;
 		} else if (islocked != 0)
 			/* Drop the exclusive lock acquired above. */
 			nfsv4_unlock(&lyp->nfsly_lock, 0);
 	} else
 		*lypp = lyp;
 	return (error);
 }
 
 /*
  * Do a TCP connection plus exchange id and create session.
  * If successful, a "struct nfsclds" is linked into the list for the
  * mount point and a pointer to it is returned.
  *
  * ssp - the DS address (AF_INET or AF_INET6); any other family gets EPERM.
  * dspp - set to the session structure to use, which is either an existing
  *	entry on nmp->nm_sess that is usable for a DS or the newly created
  *	one.
  *
  * Returns 0 on success, EPERM if the mount has no nm_clp or the address
  * family is not handled, or the error from newnfs_connect(),
  * nfsrpc_exchangeid() or nfsrpc_createsession().
  */
 static int
 nfsrpc_fillsa(struct nfsmount *nmp, struct sockaddr_storage *ssp,
     struct nfsclds **dspp, NFSPROC_T *p)
 {
 	struct sockaddr_in *msad, *sad, *ssd;
 	struct sockaddr_in6 *msad6, *sad6, *ssd6;
 	struct nfsclclient *clp;
 	struct nfssockreq *nrp;
 	struct nfsclds *dsp, *tdsp;
 	int error;
 	enum nfsclds_state retv;
 	uint32_t sequenceid;
 
 	KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 	    ("nfsrpc_fillsa: NULL nr_cred"));
 	NFSLOCKCLSTATE();
 	clp = nmp->nm_clp;
 	NFSUNLOCKCLSTATE();
 	if (clp == NULL)
 		return (EPERM);
 	if (ssp->ss_family == AF_INET) {
 		ssd = (struct sockaddr_in *)ssp;
 		NFSLOCKMNT(nmp);
 
 		/*
 		 * Check to see if we already have a session for this
 		 * address that is usable for a DS.
 		 * Note that the MDS's address is in a different place
 		 * than the sessions already acquired for DS's.
 		 */
 		msad = (struct sockaddr_in *)nmp->nm_sockreq.nr_nam;
 		tdsp = TAILQ_FIRST(&nmp->nm_sess);
 		while (tdsp != NULL) {
 			if (msad != NULL && msad->sin_family == AF_INET &&
 			    ssd->sin_addr.s_addr == msad->sin_addr.s_addr &&
 			    ssd->sin_port == msad->sin_port &&
 			    (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 &&
 			    tdsp->nfsclds_sess.nfsess_defunct == 0) {
 				*dspp = tdsp;
 				NFSUNLOCKMNT(nmp);
 				NFSCL_DEBUG(4, "fnd same addr\n");
 				return (0);
 			}
 			/* For the rest, the address is in nfsclds_sockp. */
 			tdsp = TAILQ_NEXT(tdsp, nfsclds_list);
 			if (tdsp != NULL && tdsp->nfsclds_sockp != NULL)
 				msad = (struct sockaddr_in *)
 				    tdsp->nfsclds_sockp->nr_nam;
 			else
 				msad = NULL;
 		}
 		NFSUNLOCKMNT(nmp);
 
 		/* No IP address match, so look for new/trunked one. */
 		sad = malloc(sizeof(*sad), M_SONAME, M_WAITOK | M_ZERO);
 		sad->sin_len = sizeof(*sad);
 		sad->sin_family = AF_INET;
 		sad->sin_port = ssd->sin_port;
 		sad->sin_addr.s_addr = ssd->sin_addr.s_addr;
 		nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO);
 		nrp->nr_nam = (struct sockaddr *)sad;
 	} else if (ssp->ss_family == AF_INET6) {
 		ssd6 = (struct sockaddr_in6 *)ssp;
 		NFSLOCKMNT(nmp);
 
 		/*
 		 * Check to see if we already have a session for this
 		 * address that is usable for a DS.
 		 * Note that the MDS's address is in a different place
 		 * than the sessions already acquired for DS's.
 		 */
 		msad6 = (struct sockaddr_in6 *)nmp->nm_sockreq.nr_nam;
 		tdsp = TAILQ_FIRST(&nmp->nm_sess);
 		while (tdsp != NULL) {
 			if (msad6 != NULL && msad6->sin6_family == AF_INET6 &&
 			    IN6_ARE_ADDR_EQUAL(&ssd6->sin6_addr,
 			    &msad6->sin6_addr) &&
 			    ssd6->sin6_port == msad6->sin6_port &&
 			    (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 &&
 			    tdsp->nfsclds_sess.nfsess_defunct == 0) {
 				*dspp = tdsp;
 				NFSUNLOCKMNT(nmp);
 				return (0);
 			}
 			/* For the rest, the address is in nfsclds_sockp. */
 			tdsp = TAILQ_NEXT(tdsp, nfsclds_list);
 			if (tdsp != NULL && tdsp->nfsclds_sockp != NULL)
 				msad6 = (struct sockaddr_in6 *)
 				    tdsp->nfsclds_sockp->nr_nam;
 			else
 				msad6 = NULL;
 		}
 		NFSUNLOCKMNT(nmp);
 
 		/* No IP address match, so look for new/trunked one. */
 		sad6 = malloc(sizeof(*sad6), M_SONAME, M_WAITOK | M_ZERO);
 		sad6->sin6_len = sizeof(*sad6);
 		sad6->sin6_family = AF_INET6;
 		sad6->sin6_port = ssd6->sin6_port;
 		NFSBCOPY(&ssd6->sin6_addr, &sad6->sin6_addr,
 		    sizeof(struct in6_addr));
 		nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO);
 		nrp->nr_nam = (struct sockaddr *)sad6;
 	} else
 		return (EPERM);
 
 	nrp->nr_sotype = SOCK_STREAM;
 	mtx_init(&nrp->nr_mtx, "nfssock", NULL, MTX_DEF);
 	nrp->nr_prog = NFS_PROG;
 	nrp->nr_vers = NFS_VER4;
 
 	/*
 	 * Use the credentials that were used for the mount, which are
 	 * in nmp->nm_sockreq.nr_cred for newnfs_connect() etc.
 	 * Ref. counting the credentials with crhold() is probably not
 	 * necessary, since nm_sockreq.nr_cred won't be crfree()'d until
 	 * unmount, but I did it anyhow.
 	 */
 	nrp->nr_cred = crhold(nmp->nm_sockreq.nr_cred);
 
 	/*
 	 * dsp must start out NULL.  When newnfs_connect() fails,
 	 * nfsrpc_exchangeid() is never called, and the final
 	 * "dsp != NULL" check below would otherwise read an
 	 * uninitialized pointer and go on to use nrp after it has
 	 * been freed.
 	 */
 	dsp = NULL;
 	error = newnfs_connect(nmp, nrp, NULL, p, 0);
 	NFSCL_DEBUG(3, "DS connect=%d\n", error);
 
 	/* Now, do the exchangeid and create session. */
 	if (error == 0) {
 		error = nfsrpc_exchangeid(nmp, clp, nrp, NFSV4EXCH_USEPNFSDS,
 		    &dsp, nrp->nr_cred, p);
 		NFSCL_DEBUG(3, "DS exchangeid=%d\n", error);
 		if (error != 0)
 			newnfs_disconnect(nrp);
 	}
 	if (error == 0) {
 		/* From here on, nrp is owned (and freed) via dsp. */
 		dsp->nfsclds_sockp = nrp;
 		NFSLOCKMNT(nmp);
 		retv = nfscl_getsameserver(nmp, dsp, &tdsp);
 		NFSCL_DEBUG(3, "getsame ret=%d\n", retv);
 		if (retv == NFSDSP_USETHISSESSION) {
 			NFSUNLOCKMNT(nmp);
 			/*
 			 * If there is already a session for this server,
 			 * use it.
 			 */
 			(void)newnfs_disconnect(nrp);
 			nfscl_freenfsclds(dsp);
 			*dspp = tdsp;
 			return (0);
 		}
 		if (retv == NFSDSP_SEQTHISSESSION)
 			sequenceid = tdsp->nfsclds_sess.nfsess_sequenceid;
 		else
 			sequenceid = dsp->nfsclds_sess.nfsess_sequenceid;
 		NFSUNLOCKMNT(nmp);
 		error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess,
 		    nrp, sequenceid, 0, nrp->nr_cred, p);
 		NFSCL_DEBUG(3, "DS createsess=%d\n", error);
 	} else {
 		/* Connect or exchangeid failed, so nrp is still ours. */
 		NFSFREECRED(nrp->nr_cred);
 		NFSFREEMUTEX(&nrp->nr_mtx);
 		free(nrp->nr_nam, M_SONAME);
 		free(nrp, M_NFSSOCKREQ);
 	}
 	if (error == 0) {
 		NFSCL_DEBUG(3, "add DS session\n");
 		/*
 		 * Put it at the end of the list. That way the list
 		 * is ordered by when the entry was added. This matters
 		 * since the one done first is the one that should be
 		 * used for sequencing any subsequent create sessions.
 		 */
 		NFSLOCKMNT(nmp);
 		TAILQ_INSERT_TAIL(&nmp->nm_sess, dsp, nfsclds_list);
 		NFSUNLOCKMNT(nmp);
 		*dspp = dsp;
 	} else if (dsp != NULL) {
 		/* Create session failed; this also frees nrp. */
 		newnfs_disconnect(nrp);
 		nfscl_freenfsclds(dsp);
 	}
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Reclaim Complete.
  * The single argument word is always sent as false.  Returns the error
  * from newnfs_request() or else the status from the reply.
  */
 int
 nfsrpc_reclaimcomplete(struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqdesc, *nd;
 	uint32_t *tl;
 	int error;
 
 	nd = &reqdesc;
 	nfscl_reqstart(nd, NFSPROC_RECLAIMCOMPL, nmp, NULL, 0, NULL, NULL);
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = newnfs_false;
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		/* The RPC itself worked, so report the reply status. */
 		error = nd->nd_repstat;
 		mbuf_freem(nd->nd_mrep);
 	}
 	return (error);
 }
 
 /*
  * Reset the slot tables of a session: free any saved callback reply,
  * zero every callback slot, zero the slot sequence numbers and clear the
  * nfsess_slots field.
  */
 static void
 nfscl_initsessionslots(struct nfsclsession *sep)
 {
 	int slot;
 
 	slot = 0;
 	while (slot < NFSV4_CBSLOTS) {
 		if (sep->nfsess_cbslots[slot].nfssl_reply != NULL)
 			m_freem(sep->nfsess_cbslots[slot].nfssl_reply);
 		NFSBZERO(&sep->nfsess_cbslots[slot], sizeof(struct nfsslot));
 		slot++;
 	}
 	slot = 0;
 	while (slot < 64) {
 		sep->nfsess_slotseq[slot] = 0;
 		slot++;
 	}
 	sep->nfsess_slots = 0;
 }
 
 /*
  * Called to try and do an I/O operation via an NFSv4.1 Data Server (DS).
  *
  * Finds (or acquires via nfsrpc_getlayout()) a layout for the file and
  * then loops, handing each piece of the uio that is covered by a file
  * layout to nfscl_doflayoutio().
  *
  * rwaccess - NFSV4OPEN_ACCESSREAD or NFSV4OPEN_ACCESSWRITE.
  * iomode, must_commit - passed through to nfscl_doflayoutio().
  *
  * Returns 0 when the I/O was done, EIO when pNFS cannot be used for this
  * file/mount, or the error from the failing helper.  If no layout can be
  * acquired, NNOLAYOUT is set on the nfsnode so that later calls return
  * EIO right away.
  */
 int
 nfscl_doiods(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     uint32_t rwaccess, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfscllayout *layp;
 	struct nfscldevinfo *dip;
 	struct nfsclflayout *rflp;
 	nfsv4stateid_t stateid;
 	struct ucred *newcred;
 	uint64_t lastbyte, len, off, oresid, xfer;
 	int eof, error, iolaymode, recalled;
 	void *lckp;
 
 	if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 ||
 	    (np->n_flag & NNOLAYOUT) != 0)
 		return (EIO);
 	/* Now, get a reference cnt on the clientid for this mount. */
 	if (nfscl_getref(nmp) == 0)
 		return (EIO);
 
 	/* Find an appropriate stateid. */
 	newcred = NFSNEWCRED(cred);
 	error = nfscl_getstateid(vp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
 	    rwaccess, 1, newcred, p, &stateid, &lckp);
 	if (error != 0) {
 		NFSFREECRED(newcred);
 		nfscl_relref(nmp);
 		return (error);
 	}
 	/* Search for a layout for this file. */
 	off = uiop->uio_offset;
 	layp = nfscl_getlayout(nmp->nm_clp, np->n_fhp->nfh_fh,
 	    np->n_fhp->nfh_len, off, &rflp, &recalled);
 	if (layp == NULL || rflp == NULL) {
 		if (recalled != 0) {
 			/*
 			 * Drop the lock reference from nfscl_getstateid(),
 			 * just like every other exit below does, so that
 			 * it is not leaked when the layout was recalled.
 			 */
 			if (lckp != NULL)
 				nfscl_lockderef(lckp);
 			NFSFREECRED(newcred);
 			nfscl_relref(nmp);
 			return (EIO);
 		}
 		if (layp != NULL) {
 			nfscl_rellayout(layp, (rflp == NULL) ? 1 : 0);
 			layp = NULL;
 		}
 		/* Try and get a Layout, if it is supported. */
 		if (rwaccess == NFSV4OPEN_ACCESSWRITE ||
 		    (np->n_flag & NWRITEOPENED) != 0)
 			iolaymode = NFSLAYOUTIOMODE_RW;
 		else
 			iolaymode = NFSLAYOUTIOMODE_READ;
 		error = nfsrpc_getlayout(nmp, vp, np->n_fhp, iolaymode,
 		    NULL, &stateid, off, &layp, newcred, p);
 		if (error != 0) {
 			/* Do not try pNFS for this file again. */
 			NFSLOCKNODE(np);
 			np->n_flag |= NNOLAYOUT;
 			NFSUNLOCKNODE(np);
 			if (lckp != NULL)
 				nfscl_lockderef(lckp);
 			NFSFREECRED(newcred);
 			if (layp != NULL)
 				nfscl_rellayout(layp, 0);
 			nfscl_relref(nmp);
 			return (error);
 		}
 	}
 
 	/*
 	 * Loop around finding a layout that works for the first part of
 	 * this I/O operation, and then call the function that actually
 	 * does the RPC.
 	 */
 	eof = 0;
 	len = (uint64_t)uiop->uio_resid;
 	while (len > 0 && error == 0 && eof == 0) {
 		off = uiop->uio_offset;
 		error = nfscl_findlayoutforio(layp, off, rwaccess, &rflp);
 		if (error == 0) {
 			/* Limit the transfer to the size of the layout. */
 			oresid = xfer = (uint64_t)uiop->uio_resid;
 			if (xfer > (rflp->nfsfl_end - rflp->nfsfl_off))
 				xfer = rflp->nfsfl_end - rflp->nfsfl_off;
 			dip = nfscl_getdevinfo(nmp->nm_clp, rflp->nfsfl_dev,
 			    rflp->nfsfl_devp);
 			if (dip != NULL) {
 				error = nfscl_doflayoutio(vp, uiop, iomode,
 				    must_commit, &eof, &stateid, rwaccess, dip,
 				    layp, rflp, off, xfer, newcred, p);
 				nfscl_reldevinfo(dip);
 				lastbyte = off + xfer - 1;
 				if (error == 0) {
 					/* Remember the highest byte done. */
 					NFSLOCKCLSTATE();
 					if (lastbyte > layp->nfsly_lastbyte)
 						layp->nfsly_lastbyte = lastbyte;
 					NFSUNLOCKCLSTATE();
 				} else if (error == NFSERR_OPENMODE &&
 				    rwaccess == NFSV4OPEN_ACCESSREAD) {
 					NFSLOCKMNT(nmp);
 					nmp->nm_state |= NFSSTA_OPENMODE;
 					NFSUNLOCKMNT(nmp);
 				}
 			} else
 				error = EIO;
 			/* Account for what the uio says was transferred. */
 			if (error == 0)
 				len -= (oresid - (uint64_t)uiop->uio_resid);
 		}
 	}
 	if (lckp != NULL)
 		nfscl_lockderef(lckp);
 	NFSFREECRED(newcred);
 	nfscl_rellayout(layp, 0);
 	nfscl_relref(nmp);
 	return (error);
 }
 
 /*
  * Find a file layout that will handle the first bytes of the requested
  * range and return the information from it needed to do the I/O operation.
  *
  * Both lists are walked only while nfsfl_off <= off.  Among the file
  * layouts that contain "off", the one with the largest nfsfl_end is
  * returned via retflpp.  For a read, the read list is scanned and then
  * the rw list; otherwise only the rw list is scanned.
  * Returns 0 if one was found, else EIO (retflpp is then left untouched).
  */
 int
 nfscl_findlayoutforio(struct nfscllayout *lyp, uint64_t off, uint32_t rwaccess,
     struct nfsclflayout **retflpp)
 {
 	struct nfsclflayout *cand, *best;
 	uint32_t pass;
 
 	best = NULL;
 	pass = rwaccess;
 	do {
 		cand = (pass == NFSV4OPEN_ACCESSREAD) ?
 		    LIST_FIRST(&lyp->nfsly_flayread) :
 		    LIST_FIRST(&lyp->nfsly_flayrw);
 		for (; cand != NULL && cand->nfsfl_off <= off;
 		    cand = LIST_NEXT(cand, nfsfl_list)) {
 			/* Skip the ones that end at or before "off". */
 			if (cand->nfsfl_end <= off)
 				continue;
 			if (best == NULL || best->nfsfl_end < cand->nfsfl_end)
 				best = cand;
 		}
 		/* After the read list, do the rw list and then stop. */
 		pass = (pass == NFSV4OPEN_ACCESSREAD) ?
 		    NFSV4OPEN_ACCESSWRITE : 0;
 	} while (pass != 0);
 	if (best == NULL)
 		return (EIO);
 	/* This one covers the most bytes starting at off. */
 	*retflpp = best;
 	return (0);
 }
 
 /*
  * Do I/O using an NFSv4.1 file layout.
  *
  * Splits the range off/len into stripe unit sized pieces and does a
  * read (rwflag == FREAD) or write RPC to the DS selected by the stripe
  * index for each piece.
  *
  * dp - the device info holding the stripe indices and DS addresses.
  * lyp - the layout; NFSLY_WRITTEN is set in it after a successful write.
  * flp - the file layout giving the stripe unit, pattern offset, first
  *	stripe index and the DS file handles.
  *
  * Returns 0 if all of it was done, EIO if the layout is unusable (zero
  * stripe unit size or a stripe without a file handle), or the error from
  * the read/write RPC.
  */
 static int
 nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp,
     struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off,
     uint64_t len, struct ucred *cred, NFSPROC_T *p)
 {
 	uint64_t io_off, rel_off, stripe_unit_size, transfer, xfer;
 	int commit_thru_mds, error = 0, stripe_index, stripe_pos;
 	struct nfsnode *np;
 	struct nfsfh *fhp;
 	struct nfsclds **dspp;
 
 	np = VTONFS(vp);
 	rel_off = off - flp->nfsfl_patoff;
 	/*
 	 * Per RFC 5661, the stripe unit size is nfl_util with the low
 	 * order 6 flag bits masked off (NFL4_UFLG_STRIPE_UNIT_SIZE_MASK
 	 * is 0xFFFFFFC0).  It must not be shifted right, since that
 	 * would make the stripe unit 64 times too small.
 	 */
 	stripe_unit_size = flp->nfsfl_util & 0xffffffc0;
 	/* A zero stripe unit from the server would divide by zero below. */
 	if (stripe_unit_size == 0)
 		return (EIO);
 	stripe_pos = (rel_off / stripe_unit_size + flp->nfsfl_stripe1) %
 	    dp->nfsdi_stripecnt;
 	/* The first piece only goes to the end of its stripe unit. */
 	transfer = stripe_unit_size - (rel_off % stripe_unit_size);
 
 	/* Loop around, doing I/O for each stripe unit. */
 	while (len > 0 && error == 0) {
 		stripe_index = nfsfldi_stripeindex(dp, stripe_pos);
 		dspp = nfsfldi_addr(dp, stripe_index);
 		if (len > transfer)
 			xfer = transfer;
 		else
 			xfer = len;
 		if ((flp->nfsfl_util & NFSFLAYUTIL_DENSE) != 0) {
 			/* Dense layout. */
 			if (stripe_pos >= flp->nfsfl_fhcnt)
 				return (EIO);
 			fhp = flp->nfsfl_fh[stripe_pos];
 			/*
 			 * NOTE(review): rel_off is not advanced along with
 			 * "off" at the bottom of the loop; confirm that
 			 * io_off is right for a dense layout when more
 			 * than one stripe unit is done.
 			 */
 			io_off = (rel_off / (stripe_unit_size *
 			    dp->nfsdi_stripecnt)) * stripe_unit_size +
 			    rel_off % stripe_unit_size;
 		} else {
 			/* Sparse layout. */
 			if (flp->nfsfl_fhcnt > 1) {
 				if (stripe_index >= flp->nfsfl_fhcnt)
 					return (EIO);
 				fhp = flp->nfsfl_fh[stripe_index];
 			} else if (flp->nfsfl_fhcnt == 1)
 				fhp = flp->nfsfl_fh[0];
 			else
 				/* No DS file handles, so use the file's. */
 				fhp = np->n_fhp;
 			io_off = off;
 		}
 		if ((flp->nfsfl_util & NFSFLAYUTIL_COMMIT_THRU_MDS) != 0)
 			commit_thru_mds = 1;
 		else
 			commit_thru_mds = 0;
 		if (rwflag == FREAD)
 			error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp,
 			    io_off, xfer, fhp, cred, p);
 		else {
 			error = nfsrpc_writeds(vp, uiop, iomode, must_commit,
 			    stateidp, *dspp, io_off, xfer, fhp, commit_thru_mds,
 			    cred, p);
 			if (error == 0) {
 				NFSLOCKCLSTATE();
 				lyp->nfsly_flags |= NFSLY_WRITTEN;
 				NFSUNLOCKCLSTATE();
 			}
 		}
 		if (error == 0) {
 			/* Subsequent pieces are whole stripe units. */
 			transfer = stripe_unit_size;
 			stripe_pos = (stripe_pos + 1) % dp->nfsdi_stripecnt;
 			len -= xfer;
 			off += xfer;
 		}
 	}
 	return (error);
 }
 
 /*
  * The actual read RPC done to a DS.
  *
  * Reads up to "len" bytes at offset io_off of the DS file fhp, using the
  * session in dsp, and copies the returned data into uiop.
  *
  * eofp - set from the first word of a successful reply.
  * dsp - the DS; if it has no nfsclds_sockp, the MDS socket of the mount
  *	is used instead.
  *
  * Returns the error from newnfs_request(), the reply status (nd_repstat),
  * an error left by a failed dissect macro, or the result of
  * nfsm_mbufuio().
  */
 static int
 nfsrpc_readds(vnode_t vp, struct uio *uiop, nfsv4stateid_t *stateidp, int *eofp,
     struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int error, retlen;
 	struct nfsrv_descript nfsd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfssockreq *nrp;
 
 	nd->nd_mrep = NULL;
 	nfscl_reqstart(nd, NFSPROC_READDS, nmp, fhp->nfh_fh, fhp->nfh_len,
 	    NULL, &dsp->nfsclds_sess);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO);
 	/* Arguments: 64 bit offset followed by the byte count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3);
 	txdr_hyper(io_off, tl);
 	*(tl + 2) = txdr_unsigned(len);
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, &dsp->nfsclds_sess);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat != 0) {
 		error = nd->nd_repstat;
 		goto nfsmout;
 	}
 	NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 	*eofp = fxdr_unsigned(int, *tl);
 	/*
 	 * NFSM_STRSIZ is defined elsewhere; presumably it gets the data
 	 * length into retlen and checks it against len - confirm.
 	 */
 	NFSM_STRSIZ(retlen, len);
 	error = nfsm_mbufuio(nd, uiop, retlen);
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The actual write RPC done to a DS.
  *
  * Writes "len" bytes from uiop at offset io_off of the DS file fhp, using
  * the session in dsp.
  *
  * iomode - on entry the stability level requested; once a reply has been
  *	received, it is set to the commitment level to report (FILESYNC
  *	unless a successful reply said otherwise).
  * must_commit - set to 1 when the write verifier in the reply differs
  *	from the saved one.
  * dsp - the DS; if it has no nfsclds_sockp, the MDS socket of the mount
  *	is used instead.
  * commit_thru_mds - non-zero means the verifier is tracked in the mount
  *	point (nm_verf) instead of in dsp (nfsclds_verf).
  *
  * Returns the error from newnfs_request(), the reply status (nd_repstat),
  * NFSERR_IO if the server wrote 0 bytes, or an error left by a failed
  * NFSM_DISSECT.
  */
 static int
 nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len,
     struct nfsfh *fhp, int commit_thru_mds, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	int error, rlen, commit, committed = NFSWRITE_FILESYNC;
 	int32_t backup;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfssockreq *nrp;
 
 	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
 	nd->nd_mrep = NULL;
 	nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh, fhp->nfh_len,
 	    NULL, &dsp->nfsclds_sess);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO);
 	/* Arguments: 64 bit offset, stability level and byte count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	txdr_hyper(io_off, tl);
 	tl += 2;
 	*tl++ = txdr_unsigned(*iomode);
 	*tl = txdr_unsigned(len);
 	/* Append the data; this advances the uio (see rollback below). */
 	nfsm_uiombuf(nd, uiop, len);
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, &dsp->nfsclds_sess);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat != 0) {
 		/*
 		 * In case the rpc gets retried, roll
 		 * the uio fields changed by nfsm_uiombuf()
 		 * back.
 		 */
 		uiop->uio_offset -= len;
 		uio_uio_resid_add(uiop, len);
 		uio_iov_base_add(uiop, -len);
 		uio_iov_len_add(uiop, len);
 		error = nd->nd_repstat;
 	} else {
 		/* Reply: bytes written, commitment level and verifier. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
 		rlen = fxdr_unsigned(int, *tl++);
 		if (rlen == 0) {
 			error = NFSERR_IO;
 			goto nfsmout;
 		} else if (rlen < len) {
 			/* Short write, so back the uio up by the rest. */
 			backup = len - rlen;
 			uio_iov_base_add(uiop, -(backup));
 			uio_iov_len_add(uiop, backup);
 			uiop->uio_offset -= backup;
 			uio_uio_resid_add(uiop, backup);
 			len = rlen;
 		}
 		commit = fxdr_unsigned(int, *tl++);
 
 		/*
 		 * Return the lowest commitment level
 		 * obtained by any of the RPCs.
 		 */
 		if (committed == NFSWRITE_FILESYNC)
 			committed = commit;
 		else if (committed == NFSWRITE_DATASYNC &&
 		    commit == NFSWRITE_UNSTABLE)
 			committed = commit;
 		/*
 		 * Save the write verifier the first time and flag
 		 * must_commit whenever it changes after that.
 		 */
 		if (commit_thru_mds != 0) {
 			NFSLOCKMNT(nmp);
 			if (!NFSHASWRITEVERF(nmp)) {
 				NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 				NFSSETWRITEVERF(nmp);
 	    		} else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) {
 				*must_commit = 1;
 				NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 			}
 			NFSUNLOCKMNT(nmp);
 		} else {
 			NFSLOCKDS(dsp);
 			if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) {
 				NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 				dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF;
 			} else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) {
 				*must_commit = 1;
 				NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 			}
 			NFSUNLOCKDS(dsp);
 		}
 	}
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		mbuf_freem(nd->nd_mrep);
 	*iomode = committed;
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * Free up the nfsclds structure.
  */
 void
 nfscl_freenfsclds(struct nfsclds *dsp)
 {
 	int i;
 
 	if (dsp == NULL)
 		return;
 	if (dsp->nfsclds_sockp != NULL) {
 		NFSFREECRED(dsp->nfsclds_sockp->nr_cred);
 		NFSFREEMUTEX(&dsp->nfsclds_sockp->nr_mtx);
 		free(dsp->nfsclds_sockp->nr_nam, M_SONAME);
 		free(dsp->nfsclds_sockp, M_NFSSOCKREQ);
 	}
 	NFSFREEMUTEX(&dsp->nfsclds_mtx);
 	NFSFREEMUTEX(&dsp->nfsclds_sess.nfsess_mtx);
 	for (i = 0; i < NFSV4_CBSLOTS; i++) {
 		if (dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply != NULL)
 			m_freem(
 			    dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply);
 	}
 	free(dsp, M_NFSCLDS);
 }
 
 static enum nfsclds_state
 nfscl_getsameserver(struct nfsmount *nmp, struct nfsclds *newdsp,
     struct nfsclds **retdspp)
 {
 	struct nfsclds *dsp, *cur_dsp;
 
 	/*
 	 * Search the list of nfsclds structures for one with the same
 	 * server.
 	 */
 	cur_dsp = NULL;
 	TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
 		if (dsp->nfsclds_servownlen == newdsp->nfsclds_servownlen &&
 		    dsp->nfsclds_servownlen != 0 &&
 		    !NFSBCMP(dsp->nfsclds_serverown, newdsp->nfsclds_serverown,
 		    dsp->nfsclds_servownlen) &&
 		    dsp->nfsclds_sess.nfsess_defunct == 0) {
 			NFSCL_DEBUG(4, "fnd same fdsp=%p dsp=%p flg=0x%x\n",
 			    TAILQ_FIRST(&nmp->nm_sess), dsp,
 			    dsp->nfsclds_flags);
 			/* Server major id matches. */
 			if ((dsp->nfsclds_flags & NFSCLDS_DS) != 0) {
 				*retdspp = dsp;
 				return (NFSDSP_USETHISSESSION);
 			}
 
 			/*
 			 * Note the first match, so it can be used for
 			 * sequence'ing new sessions.
 			 */
 			if (cur_dsp == NULL)
 				cur_dsp = dsp;
 		}
 	}
 	if (cur_dsp != NULL) {
 		*retdspp = cur_dsp;
 		return (NFSDSP_SEQTHISSESSION);
 	}
 	return (NFSDSP_NOTFOUND);
 }
 
 #ifdef notyet
 /*
  * NFS commit rpc to a DS.
  *
  * Commits "cnt" bytes starting at "offset" of the DS file fhp, using the
  * session in dsp.  If dsp has no nfsclds_sockp, the MDS socket of the
  * mount is used.  "stuff" is not used by this function.
  *
  * Returns the error from newnfs_request(), the reply status (nd_repstat),
  * or NFSERR_STALEWRITEVERF when the verifier in the reply differs from
  * the one saved in dsp (which is then updated).
  */
 static int
 nfsrpc_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp,
     struct nfsfh *fhp, struct ucred *cred, NFSPROC_T *p, void *stuff)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 	struct nfssockreq *nrp;
 	int error;
 	
 	nfscl_reqstart(nd, NFSPROC_COMMITDS, nmp, fhp->nfh_fh, fhp->nfh_len,
 	    NULL, &dsp->nfsclds_sess);
 	/* Arguments: 64 bit offset followed by the byte count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	*tl = txdr_unsigned(cnt);
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, &dsp->nfsclds_sess);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Compare the returned verifier with the saved one. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF);
 		NFSLOCKDS(dsp);
 		if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) {
 			NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 			error = NFSERR_STALEWRITEVERF;
 		}
 		NFSUNLOCKDS(dsp);
 	}
 nfsmout:
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	mbuf_freem(nd->nd_mrep);
 	return (error);
 }
 #endif
 
Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c	(revision 320061)
+++ head/sys/kern/vfs_bio.c	(revision 320062)
@@ -1,5003 +1,5033 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
 #include "opt_compat.h"
 #include "opt_swap.h"
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
 struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_invalidate(struct buf *bp);
 static void vfs_vmio_truncate(struct buf *bp, int npages);
 static void vfs_vmio_extend(struct buf *bp, int npages, int size);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int buf_flush(struct vnode *vp, int);
 static int buf_recycle(bool);
 static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int);
 static void buf_release(void *, void **, int);
+static void maxbcachebuf_adjust(void);
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 #endif
 
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 static long bufspace;
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
 #else
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Physical memory used for buffers");
 #endif
 static long bufkvaspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
 static int bufdefragcnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer acquisition");
 static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static int numbufallocfails;
 SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
     "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static long notbufdflushes;
 SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
+int maxbcachebuf = MAXBCACHEBUF;
+SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
+    "Maximum size of a buffer cache block");
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign rbreqlock;
 
 /*
  * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
  */
 static struct rwlock_padalign nblock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request/wakeup point for the bufspace daemon.
  */
 static int bufspace_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
  * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static volatile int needsbuffer;
 
 /*
  * Synchronization for bwillwrite() waiters.
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
  */
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
 
 /* Maximum number of clean buffer queues. */
 #define	CLEAN_QUEUES	16
 
 /* Configured number of clean queues. */
 static int clean_queues;
 
 /* Maximum number of buffer queues. */
 #define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
  * Lock for each bufqueue
  */
 static struct mtx_padalign bqlocks[BUFFER_QUEUES];
 
 /*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long value;
 	int error;
 
 	value = *(long *)arg1;
 	error = sysctl_handle_long(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	mtx_lock(&rbreqlock);
 	if (arg1 == &hirunningspace) {
 		if (value < lorunningspace)
 			error = EINVAL;
 		else
 			hirunningspace = value;
 	} else {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		if (value > hirunningspace)
 			error = EINVAL;
 		else
 			lorunningspace = value;
 	}
 	mtx_unlock(&rbreqlock);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
 
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, arg1, arg2, req));
 	lvalue = *(long *)arg1;
 	if (lvalue > INT_MAX)
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	ivalue = lvalue;
 	return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
 #endif
 
 static int
 bqcleanq(void)
 {
 	static int nextq;
 
 	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
 }
 
 static int
 bqisclean(int qindex)
 {
 
 	return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
 }
 
 /*
  *	bqlock:
  *
  *	Return the appropriate queue lock based on the index.
  */
 static inline struct mtx *
 bqlock(int qindex)
 {
 
 	return (struct mtx *)&bqlocks[qindex];
 }
 
 /*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait) {
 		bdirtywait = 0;
 		wakeup(&bdirtywait);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
 bdirtysub(void)
 {
 
 	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
 	    (lodirtybuffers + hidirtybuffers) / 2)
 		bdirtywakeup();
 }
 
 /*
  *	bdirtyadd:
  *
  *	Increment the numdirtybuffers count by one and wakeup the buf 
  *	daemon if needed.
  */
 static void
 bdirtyadd(void)
 {
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
 	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
 	    (lodirtybuffers + hidirtybuffers) / 2)
 		bd_wakeup();
 }
 
 /*
  *	bufspace_wakeup:
  *
  *	Called when buffer space is potentially available for recovery.
  *	getnewbuf() will block on this flag when it is unable to free 
  *	sufficient buffer space.  Buffer space becomes recoverable when 
  *	bp's get placed back in the queues.
  */
 static void
 bufspace_wakeup(void)
 {
 
 	/*
 	 * If someone is waiting for bufspace, wake them up.
 	 *
 	 * Since needsbuffer is set prior to doing an additional queue
 	 * scan it is safe to check for the flag prior to acquiring the
 	 * lock.  The thread that is preparing to scan again before
 	 * blocking would discover the buf we released.
 	 */
 	if (needsbuffer) {
 		rw_rlock(&nblock);
 		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
 			wakeup(__DEVOLATILE(void *, &needsbuffer));
 		rw_runlock(&nblock);
 	}
 }
 
 /*
  *	bufspace_daemonwakeup:
  *
  *	Wakeup the daemon responsible for freeing clean bufs.
  */
 static void
 bufspace_daemonwakeup(void)
 {
 	rw_rlock(&nblock);
 	if (bufspace_request == 0) {
 		bufspace_request = 1;
 		wakeup(&bufspace_request);
 	}
 	rw_runlock(&nblock);
 }
 
 /*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bufspace, -diff);
 		bufspace_wakeup();
 	} else {
 		space = atomic_fetchadd_long(&bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bufspacethresh && space + diff >= bufspacethresh)
 			bufspace_daemonwakeup();
 	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	bufspace_reserve:
  *
  *	Reserve bufspace before calling allocbuf().  metadata has a
  *	different space limit than data.
  */
 static int
 bufspace_reserve(int size, bool metadata)
 {
 	long limit;
 	long space;
 
 	if (metadata)
 		limit = maxbufspace;
 	else
 		limit = hibufspace;
 	do {
 		space = bufspace;
 		if (space + size > limit)
 			return (ENOSPC);
 	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
 
 	/* Wake up the daemon on the transition. */
 	if (space < bufspacethresh && space + size >= bufspacethresh)
 		bufspace_daemonwakeup();
 
 	return (0);
 }
 
 /*
  *	bufspace_release:
  *
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
 bufspace_release(int size)
 {
 	atomic_subtract_long(&bufspace, size);
 	bufspace_wakeup();
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
  *	supplied.  needsbuffer must be set in a safe fashion prior to
  *	polling for space.  The operation must be re-tried on return.
  */
 static void
 bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
 
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	rw_wlock(&nblock);
 	while (needsbuffer != 0) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			rw_wunlock(&nblock);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 			    (td->td_pflags & TDP_NORUNNINGBUF);
 
 			/*
 			 * Play bufdaemon.  The getnewbuf() function
 			 * may be called while the thread owns lock
 			 * for another dirty buffer for the same
 			 * vnode, which makes it impossible to use
 			 * VOP_FSYNC() there, due to the buffer lock
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			rw_wlock(&nblock);
 			if (fl != 0)
 				continue;
 			if (needsbuffer == 0)
 				break;
 		}
 		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
 	rw_wunlock(&nblock);
 }
 
 
 /*
  *	bufspace_daemon:
  *
  *	buffer space management daemon.  Tries to maintain some marginal
  *	amount of free buffer space so that requesting processes neither
  *	block nor work to reclaim buffers.
  */
 static void
 bufspace_daemon(void)
 {
 	for (;;) {
 		kproc_suspend_check(bufspacedaemonproc);
 
 		/*
 		 * Free buffers from the clean queue until we meet our
 		 * targets.
 		 *
 		 * Theory of operation:  The buffer cache is most efficient
 		 * when some free buffer headers and space are always
 		 * available to getnewbuf().  This daemon attempts to prevent
 		 * the excessive blocking and synchronization associated
 		 * with shortfall.  It goes through three phases according
 		 * demand:
 		 *
 		 * 1)	The daemon wakes up voluntarily once per-second
 		 *	during idle periods when the counters are below
 		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
 		 *
 		 * 2)	The daemon wakes up as we cross the thresholds
 		 *	ahead of any potential blocking.  This may bounce
 		 *	slightly according to the rate of consumption and
 		 *	release.
 		 *
 		 * 3)	The daemon and consumers are starved for working
 		 *	clean buffers.  This is the 'bufspace' sleep below
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
 		while (bufspace > lobufspace ||
 		    numfreebuffers < hifreebuffers) {
 			if (buf_recycle(false) != 0) {
 				atomic_set_int(&needsbuffer, 1);
 				if (buf_recycle(false) != 0) {
 					rw_wlock(&nblock);
 					if (needsbuffer)
 						rw_sleep(__DEVOLATILE(void *,
 						    &needsbuffer), &nblock,
 						    PRIBIO|PDROP, "bufspace",
 						    hz/10);
 					else
 						rw_wunlock(&nblock);
 				}
 			}
 			maybe_yield();
 		}
 
 		/*
 		 * Re-check our limits under the exclusive nblock.
 		 */
 		rw_wlock(&nblock);
 		if (bufspace < bufspacethresh &&
 		    numfreebuffers > lofreebuffers) {
 			bufspace_request = 0;
 			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
 			    "-", hz);
 		} else
 			rw_wunlock(&nblock);
 	}
 }
 
 static struct kproc_desc bufspace_kp = {
 	"bufspacedaemon",
 	bufspace_daemon,
 	&bufspacedaemonproc
 };
 SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
     &bufspace_kp);
 
 /*
  *	bufmallocadjust:
  *
  *	Adjust the reported bufspace for a malloc managed buffer, possibly
  *	waking any waiters.
  */
 static void
 bufmallocadjust(struct buf *bp, int bufsize)
 {
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0)
 		atomic_subtract_long(&bufmallocspace, -diff);
 	else
 		atomic_add_long(&bufmallocspace, diff);
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up processes that are waiting on asynchronous writes to fall
  *	below lorunningspace.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq) {
 		runningbufreq = 0;
 		wakeup(&runningbufreq);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Decrement the outstanding write count according.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long space, bspace;
 
 	bspace = bp->b_runningbufspace;
 	if (bspace == 0)
 		return;
 	space = atomic_fetchadd_long(&runningbufspace, -bspace);
 	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
 	    space, bspace));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Only acquire the lock and wakeup on the transition from exceeding
 	 * the threshold to falling below it.
 	 */
 	if (space < lorunningspace)
 		return;
 	if (space - bspace > lorunningspace)
 		return;
 	runningwakeup();
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
  *	running.  This routine is used in async-write situations to
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
  *	This does NOT turn an async write into a sync write.  It waits  
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  This function clears the B_CACHE
  *	bit if the newly extended portion of the buffer does not contain
  *	valid data.
  */
 static __inline void
 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
     vm_offset_t size, vm_page_t m)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (bp->b_flags & B_CACHE) {
 		int base = (foff + off) & PAGE_MASK;
 		if (vm_page_is_valid(m, base, size) == 0)
 			bp->b_flags &= ~B_CACHE;
 	}
 }
 
 /* Wake up the buffer daemon if necessary */
 static __inline void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request == 0) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
 	mtx_unlock(&bdlock);
 }
 
 /*
+ * Adjust the maxbcachebuf tunable.
+ */
+static void
+maxbcachebuf_adjust(void)
+{
+	int i;
+
+	/*
+	 * Must be a power of 2 >= MAXBSIZE; halve to avoid i * 2 overflow.
+	 */
+	i = 2;
+	while (i <= maxbcachebuf / 2)
+		i *= 2;
+	maxbcachebuf = i;
+	if (maxbcachebuf < MAXBSIZE)
+		maxbcachebuf = MAXBSIZE;
+	if (maxbcachebuf > MAXPHYS)
+		maxbcachebuf = MAXPHYS;
+	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
+		printf("maxbcachebuf=%d\n", maxbcachebuf);
+}
+
+/*
  * bd_speedup - speedup the buffer cache flushing code
  */
 void
 bd_speedup(void)
 {
 	int needwake;
 
 	mtx_lock(&bdlock);
 	needwake = 0;
 	if (bd_speedupreq == 0 || bd_request == 0)
 		needwake = 1;
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (needwake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 #ifndef NSWBUF_MIN
 #define	NSWBUF_MIN	16
 #endif
 
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more then once.  We CANNOT write to the memory area
  * being reserved at this time.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
+	maxbcachebuf_adjust();
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less then 16 and no more then 256.
 	 */
 	nswbuf = min(nbuf / 4, 256);
 	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 
 	/*
 	 * Reserve space for the buffer cache buffers
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /*
  * Initialize the buffer subsystem.  Called before use of any buffers.
  *
  * Initializes the buffer queue locks and free lists, places every buf
  * header in buf[0..nbuf) on QUEUE_EMPTY, and then derives the buffer
  * space, running space, dirty buffer and free buffer limits from nbuf.
  */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
-	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
+	KASSERT(maxbcachebuf >= MAXBSIZE,
+	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
+	    MAXBSIZE));
 	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
 	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
 	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
 		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	/*
 	 * The address obtained here is stored in b_data and b_kvabase of
 	 * every buffer that currently has no KVA of its own.
 	 */
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 #ifdef INVARIANTS
 		bq_len[QUEUE_EMPTY]++;
 #endif
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by metadata.  hibufspace is the nominal maximum
 	 * used by most other requests.  The differential is required to
 	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system. XXX This is less true with vmem.  We could use
 	 * PAGE_SIZE.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
-	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
+	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
 	lobufspace = (hibufspace / 20) * 19; /* 95% */
 	/* Midpoint between lobufspace and hibufspace. */
 	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
-	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
+	hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
 	    16 * 1024 * 1024), 1024 * 1024);
-	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
+	lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently into
 	 * the kernel space.  Even though this is accounted for in the buffer
 	 * allocation, we don't want the malloced region to grow uncontrolled.
 	 * The malloc scheme improves memory utilization significantly on
 	 * average (small) directories.
 	 */
 	maxbufmallocspace = hibufspace / 20;
 
 	/*
 	 * Reduce the chance of a deadlock occurring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
 	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
 	 * buffer space assuming BKVASIZE'd buffers.
 	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 	/*
 	 * lofreebuffers should be sufficient to avoid stalling waiting on
 	 * buf headers under heavy utilization.  The bufs in per-cpu caches
 	 * are counted as free but will be unavailable to threads executing
 	 * on other cpus.
 	 *
 	 * hifreebuffers is the free target for the bufspace daemon.  This
 	 * should be set appropriately to limit work per-iteration.
 	 */
 	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
 	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	/* Setup the kva and free list allocators. */
 	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
 	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
 	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
 	/*
 	 * Size the clean queue according to the amount of buffer space.
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
 	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
 
 }
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only sanity check for a buffer that is expected to be mapped:
  * neither b_kvabase nor b_data may still equal unmapped_buf, and b_data
  * must lie outside the MAXPHYS-sized range that starts at unmapped_buf.
  */
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT(unmapped_buf != bp->b_kvabase,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(unmapped_buf != bp->b_data,
 	    ("mapped buf: b_data was not updated %p", bp));
 	/* b_data must not point anywhere inside the unmapped_buf range. */
 	KASSERT(!(bp->b_data >= unmapped_buf &&
 	    bp->b_data < unmapped_buf + MAXPHYS),
 	    ("b_data + b_offset unmapped %p", bp));
 }
 
 /*
  * INVARIANTS-only sanity check for a buffer that is expected to be
  * unmapped: its b_data must still be the unmapped_buf placeholder address.
  */
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_data == unmapped_buf,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 /*
  * BUF_CHECK_MAPPED()/BUF_CHECK_UNMAPPED() run the mapping sanity checks
  * when INVARIANTS is defined and expand to empty statements otherwise.
  */
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 /*
  * Report whether a buffer still counts as busy:  either it is valid
  * (B_INVAL clear) and its lock is held, or it is a delayed write
  * (B_DELWRI set) that has not been invalidated.
  *
  * Returns 1 if the buffer is busy, 0 otherwise.
  */
 static int
 isbufbusy(struct buf *bp)
 {
 
 	/* Valid buffer whose lock is currently held. */
 	if ((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp))
 		return (1);
 
 	/* Delayed-write buffer that is still valid. */
 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  *
  * Repeatedly syncs the filesystems until no buffer is reported busy by
  * isbufbusy() or the retry budget is exhausted, then either unmounts all
  * filesystems (everything synced and not panicking) or reports how many
  * buffers were given up on.  Swap is turned off in both cases.
  *
  * show_busybufs selects diagnostics for buffers that are still busy after
  * the sync loop:  > 0 prints one line per busy buffer plus its lock state,
  * > 1 additionally prints the buffer's vnode.
  */
 void
 bufshutdown(int show_busybufs)
 {
 	static int first_buf_printf = 1;
 	struct buf *bp;
 	int iter, nbusy, pbusy;
 #ifndef PREEMPTION
 	int subiter;
 #endif
 
 	/*
 	 * Sync filesystems for shutdown
 	 */
 	wdog_kern_pat(WD_LASTVAL);
 	sys_sync(curthread, NULL);
 
 	/*
 	 * With soft updates, some buffers that are
 	 * written will be remarked as dirty until other
 	 * buffers are written.
 	 */
 	for (iter = pbusy = 0; iter < 20; iter++) {
 		/* Count the busy buffers, walking buf[] from the end. */
 		nbusy = 0;
 		for (bp = &buf[nbuf]; --bp >= buf; )
 			if (isbufbusy(bp))
 				nbusy++;
 		if (nbusy == 0) {
 			if (first_buf_printf)
 				printf("All buffers synced.");
 			break;
 		}
 		if (first_buf_printf) {
 			printf("Syncing disks, buffers remaining... ");
 			first_buf_printf = 0;
 		}
 		printf("%d ", nbusy);
 		/*
 		 * Progress was made since the previous pass: restart the
 		 * retry budget.
 		 */
 		if (nbusy < pbusy)
 			iter = 0;
 		pbusy = nbusy;
 
 		wdog_kern_pat(WD_LASTVAL);
 		sys_sync(curthread, NULL);
 
 		/* The pause below grows with iter in both variants. */
 #ifdef PREEMPTION
 		/*
 		 * Drop Giant and spin for a while to allow
 		 * interrupt threads to run.
 		 */
 		DROP_GIANT();
 		DELAY(50000 * iter);
 		PICKUP_GIANT();
 #else
 		/*
 		 * Drop Giant and context switch several times to
 		 * allow interrupt threads to run.
 		 */
 		DROP_GIANT();
 		for (subiter = 0; subiter < 50 * iter; subiter++) {
 			thread_lock(curthread);
 			mi_switch(SW_VOL, NULL);
 			thread_unlock(curthread);
 			DELAY(1000);
 		}
 		PICKUP_GIANT();
 #endif
 	}
 	printf("\n");
 	/*
 	 * Count only busy local buffers to prevent forcing
 	 * a fsck if we're just a client of a wedged NFS server
 	 *
 	 * NOTE(review): the local-only filter is inside the "#if 0" block
 	 * below, so as compiled every busy buffer is counted.
 	 */
 	nbusy = 0;
 	for (bp = &buf[nbuf]; --bp >= buf; ) {
 		if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
 			if (bp->b_dev == NULL) {
 				TAILQ_REMOVE(&mountlist,
 				    bp->b_vp->v_mount, mnt_list);
 				continue;
 			}
 #endif
 			nbusy++;
 			if (show_busybufs > 0) {
 				printf(
 	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
 				    nbusy, bp, bp->b_vp, bp->b_flags,
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 				BUF_LOCKPRINTINFO(bp);
 				if (show_busybufs > 1)
 					vn_printf(bp->b_vp,
 					    "vnode content: ");
 			}
 		}
 	}
 	if (nbusy) {
 		/*
 		 * Failed to sync all blocks. Indicate this and don't
 		 * unmount filesystems (thus forcing an fsck on reboot).
 		 */
 		printf("Giving up on %d buffers\n", nbusy);
 		DELAY(5000000);	/* 5 seconds */
 	} else {
 		if (!first_buf_printf)
 			printf("Final sync complete\n");
 		/*
 		 * Unmount filesystems
 		 */
 		if (panicstr == NULL)
 			vfs_unmountall();
 	}
 	swapoff_all();
 	DELAY(100000);		/* wait for console output to finish */
 }
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the appropriate free list.
  *
  *	bp is the buffer to queue; it must be exclusively locked unless the
  *	target is QUEUE_EMPTY.  qindex is the requested queue; a request
  *	for QUEUE_CLEAN is resolved to a specific clean queue below.
  *	B_AGE buffers go to the head of the queue, all others to the tail.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct mtx *olock, *nlock;
 
 	if (qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 
 	/*
 	 * Stick to the same clean queue for the lifetime of the buf to
 	 * limit locking below.  Otherwise pick one sequentially.
 	 */
 	if (qindex == QUEUE_CLEAN) {
 		if (bqisclean(bp->b_qindex))
 			qindex = bp->b_qindex;
 		else
 			qindex = bqcleanq();
 	}
 
 	/*
 	 * Handle delayed bremfree() processing.
 	 *
 	 * If the buffer is still linked on its old queue, unlink it under
 	 * the old queue's lock.  When old and new queue share a lock it is
 	 * simply kept; otherwise the old lock is dropped before the new one
 	 * is taken.  Either way nlock is held on exit from this statement.
 	 */
 	nlock = bqlock(qindex);
 	if (bp->b_flags & B_REMFREE) {
 		olock = bqlock(bp->b_qindex);
 		mtx_lock(olock);
 		bremfreel(bp);
 		if (olock != nlock) {
 			mtx_unlock(olock);
 			mtx_lock(nlock);
 		}
 	} else
 		mtx_lock(nlock);
 
 	/* The buffer must not be linked on any queue at this point. */
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("binsfree: free buffer onto another queue???");
 
 	bp->b_qindex = qindex;
 	if (bp->b_flags & B_AGE)
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 	else
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	bq_len[bp->b_qindex]++;
 #endif
 	mtx_unlock(nlock);
 }
 
 /*
  * buf_free:
  *
  *	Free a buffer to the buf zone once it no longer has valid contents.
  *
  *	Completes any pending delayed queue removal, drops the read and
  *	write credentials, releases dependencies and KVA, unlocks the
  *	buffer and returns it to buf_zone.  Panics if a background write
  *	is still in progress on the buffer.
  */
 static void
 buf_free(struct buf *bp)
 {
 
 	/* Finish a removal that bremfree() only marked. */
 	if (bp->b_flags & B_REMFREE)
 		bremfreef(bp);
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("losing buffer 1");
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 	/* Account for the freed header and wake anyone waiting for one. */
 	atomic_add_int(&numfreebuffers, 1);
 	bufspace_wakeup();
 }
 
 /*
  * buf_import:
  *
  *	UMA import callback: move up to cnt buf headers from the empty
  *	queue into store[].  Bufs live in a static array and much of their
  *	synchronization relies on type stable storage, so UMA serves only
  *	as a per-cpu cache in front of the global empty list.
  *
  *	Returns the number of entries actually placed in store[], which is
  *	less than cnt if the empty queue ran dry.
  */
 static int
 buf_import(void *arg, void **store, int cnt, int flags)
 {
 	struct buf *bp;
 	int nimported;
 
 	nimported = 0;
 	mtx_lock(&bqlocks[QUEUE_EMPTY]);
 	while (nimported < cnt &&
 	    (bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) != NULL) {
 		bremfreel(bp);
 		store[nimported++] = bp;
 	}
 	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
 
 	return (nimported);
 }
 
 /*
  * buf_release:
  *
  *	UMA release callback: hand each of the cnt bufs in store[] back to
  *	the empty queue.
  */
 static void
 buf_release(void *arg, void **store, int cnt)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < cnt) {
 		binsfree(store[idx], QUEUE_EMPTY);
 		idx++;
 	}
 }
 
 /*
  * buf_alloc:
  *
  *	Allocate an empty buffer header.
  *
  *	Returns an exclusively locked buffer with its fields reset, or NULL
  *	if buf_zone has nothing available; in that case the bufspace daemon
  *	is woken and numbufallocfails is incremented.  The allocation never
  *	sleeps (M_NOWAIT).
  */
 static struct buf *
 buf_alloc(void)
 {
 	struct buf *bp;
 
 	bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
 		bufspace_daemonwakeup();
 		atomic_add_int(&numbufallocfails, 1);
 		return (NULL);
 	}
 
 	/*
 	 * Wake-up the bufspace daemon on transition.
 	 * atomic_fetchadd_int() yields the value before the decrement, so
 	 * this fires when the count drops from lofreebuffers to one below.
 	 */
 	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
 		bufspace_daemonwakeup();
 
 	/*
 	 * NOTE(review): the panic text still names "getnewbuf_empty",
 	 * which is not this function's name.
 	 */
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
 
 	/* A header coming off the free cache must hold no resources. */
 	KASSERT(bp->b_vp == NULL,
 	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
 	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
 	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 	KASSERT(bp->b_npages == 0,
 	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
 	/* Reset the header to a pristine, unmapped state. */
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	bp->b_vflags = 0;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_bufobj = NULL;
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 	LIST_INIT(&bp->b_dep);
 
 	return (bp);
 }
 
 /*
  *	buf_qrecycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  *
  *	Returns 0 once one buffer has been invalidated and released, or
  *	ENOBUFS if the queue holds no buffer that could be recycled.
  */
 static int
 buf_qrecycle(int qindex, bool kva)
 {
 	struct buf *bp, *nbp;
 
 	if (kva)
 		atomic_add_int(&bufdefragcnt, 1);
 	nbp = NULL;
 	mtx_lock(&bqlocks[qindex]);
 	nbp = TAILQ_FIRST(&bufqueues[qindex]);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * release the bqlock).
 		 */
 		nbp = TAILQ_NEXT(bp, b_freelist);
 
 		/*
 		 * If we are defragging then we need a buffer with
 		 * some kva to reclaim.
 		 */
 		if (kva && bp->b_kvasize == 0)
 			continue;
 
 		/* Skip buffers whose lock cannot be taken without waiting. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == qindex,
 		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 		bremfreel(bp);
 		mtx_unlock(&bqlocks[qindex]);
 
 		/*
 		 * Requeue the background write buffer with error and
 		 * restart the scan.
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
 			mtx_lock(&bqlocks[qindex]);
 			nbp = TAILQ_FIRST(&bufqueues[qindex]);
 			continue;
 		}
 		/* Invalidate the victim and release it; the queue lock is
 		 * already dropped on this path. */
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
 	mtx_unlock(&bqlocks[qindex]);
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_recycle:
  *
  *	Visit each clean queue once, starting at the queue chosen by
  *	bqcleanq() and wrapping around, until buf_qrecycle() succeeds.
  *
  *	Returns 0 on success, ENOBUFS if no queue yielded a buffer.
  */
 static int
 buf_recycle(bool kva)
 {
 	int start, q;
 
 	start = bqcleanq();
 	q = start;
 	for (;;) {
 		if (buf_qrecycle(q, kva) == 0)
 			return (0);
 
 		/* Advance, wrapping back to the first clean queue. */
 		q++;
 		if (q == QUEUE_CLEAN + clean_queues)
 			q = QUEUE_CLEAN;
 		if (q == start)
 			break;
 	}
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_scan:
  *
  *	Try to recycle a buffer from the clean queues.  If the first
  *	attempt fails, needsbuffer is set and the bufspace daemon is woken
  *	before a second attempt, so the caller may bufspace_wait() after a
  *	failure without racing against a concurrent free.
  *
  *	Returns 0 on success or the error from buf_recycle().
  */
 static int
 buf_scan(bool defrag)
 {
 	int error;
 
 	error = buf_recycle(defrag);
 	if (error != 0) {
 		/*
 		 * Publish the need for a buffer first, then poll again:
 		 * a free that happened after the failed poll cannot be
 		 * missed this way.
 		 */
 		atomic_set_int(&needsbuffer, 1);
 		bufspace_daemonwakeup();
 		error = buf_recycle(defrag);
 		if (error != 0)
 			return (error);
 	}
 
 	atomic_add_int(&getnewbufrestarts, 1);
 	return (0);
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list.
  *
  *	Only the B_REMFREE flag is set here; the buffer stays linked on its
  *	queue.  The buffer must be exclusively locked, must be on a queue,
  *	and must not already be marked.
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
 }
 
 /*
  *	bremfreef:
  *
  *	Remove the buffer from its free list right away, taking and
  *	dropping the queue lock around the removal.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	/*
 	 * Look the lock up before the removal: bremfreel() resets
 	 * b_qindex to QUEUE_NONE.
 	 */
 	struct mtx *lock = bqlock(bp->b_qindex);
 
 	mtx_lock(lock);
 	bremfreel(bp);
 	mtx_unlock(lock);
 }
 
 /*
  *	bremfreel:
  *
  *	Removes a buffer from the free list, must be called with the
  *	correct qlock held.
  *
  *	On return b_qindex is QUEUE_NONE and B_REMFREE is cleared.  Unless
  *	the buffer sits on QUEUE_EMPTY it must be exclusively locked.
  */
 static void
 bremfreel(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
 	if (bp->b_qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
 	    bp->b_qindex));
 	bq_len[bp->b_qindex]--;
 #endif
 	/* Also completes any removal that bremfree() only marked. */
 	bp->b_qindex = QUEUE_NONE;
 	bp->b_flags &= ~B_REMFREE;
 }
 
 /*
  *	bufkva_free:
  *
  *	Free the kva allocation for a buffer.
  *
  *	Does nothing when b_kvasize is 0.  Otherwise the range is returned
  *	to buffer_arena, bufkvaspace is reduced accordingly, and the
  *	buffer's b_data/b_kvabase are reset to unmapped_buf.
  */
 static void
 bufkva_free(struct buf *bp)
 {
 
 #ifdef INVARIANTS
 	/* A buffer without KVA must carry the unmapped placeholder. */
 	if (bp->b_kvasize == 0) {
 		KASSERT(bp->b_kvabase == unmapped_buf &&
 		    bp->b_data == unmapped_buf,
 		    ("Leaked KVA space on %p", bp));
 	} else if (buf_mapped(bp))
 		BUF_CHECK_MAPPED(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 #endif
 	if (bp->b_kvasize == 0)
 		return;
 
 	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
 	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
 	atomic_add_int(&buffreekvacnt, 1);
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_kvasize = 0;
 }
 
 /*
  *	bufkva_alloc:
  *
  *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
  *
  *	Any KVA the buffer already owns is freed first.  maxsize is the
  *	number of bytes requested from buffer_arena.  With GB_UNMAPPED in
  *	gbflags (which requires GB_KVAALLOC) b_data is left at
  *	unmapped_buf; otherwise b_data is set to the new b_kvabase.
  *
  *	Returns 0 on success or the error from vmem_alloc(), in which case
  *	the buffer is left without KVA.
  */
 static int
 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 	int error;
 
 	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
 	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
 
 	bufkva_free(bp);
 
 	addr = 0;
 	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
 	if (error != 0) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		return (error);
 	}
 	bp->b_kvabase = (caddr_t)addr;
 	bp->b_kvasize = maxsize;
 	atomic_add_long(&bufkvaspace, bp->b_kvasize);
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 	}
 	return (0);
 }
 
 /*
  *	bufkva_reclaim:
  *
  *	vmem reclaim callback for the buffer arena.  Frees kva by recycling
  *	buffers that own some: at most five defragmenting scans are run,
  *	stopping early at the first scan that fails.
  */
 static void
 bufkva_reclaim(vmem_t *vmem, int flags)
 {
 	int attempts;
 
 	attempts = 5;
 	while (attempts-- > 0 && buf_scan(true) == 0)
 		continue;
 }
 
 
 /*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
  *
  * rablkno and rabsize are parallel arrays of cnt entries giving the block
  * number and size of each read-ahead candidate on vp.  cred, if not
  * NOCRED, is attached as read credential to buffers that have none.
  */
 void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
     int cnt, struct ucred * cred)
 {
 	struct buf *rabp;
 	int i;
 
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		/* Nothing to do for blocks that are already in memory. */
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
 			/* Charge the read to the current thread. */
 			if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 				if (racct_enable) {
 					PROC_LOCK(curproc);
 					racct_add_buf(curproc, rabp, 0);
 					PROC_UNLOCK(curproc);
 				}
 #endif /* RACCT */
 				curthread->td_ru.ru_inblock++;
 			}
 			rabp->b_flags |= B_ASYNC;
 			rabp->b_flags &= ~B_INVAL;
 			rabp->b_ioflags &= ~BIO_ERROR;
 			rabp->b_iocmd = BIO_READ;
 			if (rabp->b_rcred == NOCRED && cred != NOCRED)
 				rabp->b_rcred = crhold(cred);
 			vfs_busy_pages(rabp, 0);
 			BUF_KERNPROC(rabp);
 			rabp->b_iooffset = dbtob(rabp->b_blkno);
 			bstrategy(rabp);
 		} else {
 			/* Already valid: just give the buffer back. */
 			brelse(rabp);
 		}
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  * is set, the buffer is valid and we do not have to do anything, see
  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
  *
  * Always return a NULL buffer pointer (in bpp) when returning an error.
  *
  * blkno/size identify the block to read from vp; rablkno, rabsize and cnt
  * are passed on to breada(); flags is passed to getblk().
  *
  * Returns 0 on success, EBUSY if getblk() returned no buffer, or the
  * error from bufwait() if the read that was started here failed.
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
 	 */
 	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
 	if (bp == NULL)
 		return (EBUSY);
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(curproc);
 				racct_add_buf(curproc, bp, 0);
 				PROC_UNLOCK(curproc);
 			}
 #endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		++readwait;
 	}
 
 	/* Start the read-ahead before waiting for the main block. */
 	breada(vp, rablkno, rabsize, cnt, cred);
 
 	if (readwait) {
 		rv = bufwait(bp);
 		if (rv != 0) {
 			brelse(bp);
 			*bpp = NULL;
 		}
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  *
  * Returns ENXIO if the buffer's bufobj is marked BO_DEAD, 0 for an
  * invalid buffer or an asynchronous write, and the bufwait() result for
  * a synchronous write.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	/* The backing object is gone: discard the buffer. */
 	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
 		bp->b_flags |= B_INVAL | B_RELBUF;
 		bp->b_flags &= ~B_CACHE;
 		brelse(bp);
 		return (ENXIO);
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		barrierwrites++;
 
 	/*
 	 * Remember the flags as they were on entry; the sync/async
 	 * decisions below are based on this snapshot.
 	 */
 	oldflags = bp->b_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	/* Record whether the vnode, if any, has VV_MD set. */
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes
 	 *
 	 * space receives the value of runningbufspace from before this
 	 * buffer's size was added.
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 	if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, bp, 1);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		curthread->td_ru.ru_oublock++;
 	}
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	buf_track(bp, __func__);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		/* Synchronous write: wait, release, report the result. */
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 /*
  *	bufbdflush:
  *
  *	Limit the number of dirty buffers on a bufobj.  If bo's dirty count
  *	exceeds dirtybufthresh by more than 10, the vnode of bp is fsynced
  *	with MNT_NOWAIT.  If it merely exceeds dirtybufthresh, one dirty
  *	buffer of bo other than bp is written asynchronously.  Otherwise
  *	nothing is done.
  */
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 *
 		 * The bo lock is held at the top of every iteration; it is
 		 * dropped once a lockable candidate is found and retaken
 		 * only if that candidate is rejected.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			break;
 		}
 		/*
 		 * The loop ran to completion without writing anything, so
 		 * the bo lock is still held.  After the break above it has
 		 * already been released.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  *
  * The buffer must be held, must have a bufobj, and must not carry
  * B_BARRIER.  It is released (brelse() or bqrelse()) before returning.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 *
 	 * TDP_INBDFLUSH marks the thread for the duration of the flush so
 	 * that a nested bdwrite() takes the else branch instead.
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
 	buf_track(bp, __func__);
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty
 	 * pages.
 	 *
 	 * Mark the buffer pages as clean.  We need to do this here to
 	 * satisfy the vnode_pager and the pageout daemon, so that it
 	 * thinks that the pages have been "cleaned".  Note that since
 	 * the pages are in a delayed write buffer -- the VFS layer
 	 * "will" see that the pages get written out on the next sync,
 	 * or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn buffer into delayed write request.  We must clear BIO_READ and
  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
  *	itself to properly update it in the dirty/clean lists.  We mark it
  *	B_DONE to ensure that any asynchronization of the buffer properly
  *	clears B_DONE ( else a panic will occur later ).
  *
  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be on QUEUE_NONE.
  *
  *	NOTE(review): as written, the code below sets neither B_DONE (it is
  *	commented out) nor clears B_INVAL; the text above may be stale.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	/* A buffer merely marked for delayed removal is acceptable too. */
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 	bp->b_flags &= ~(B_RELBUF);
 	bp->b_iocmd = BIO_WRITE;
 
 	/* Only a clean-to-dirty transition touches lists and counters. */
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
 		bdirtyadd();
 	}
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI for buffer.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be on QUEUE_NONE.
  *
  *	B_DEFERRED is cleared unconditionally, whether or not the buffer
  *	was dirty.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	/* A buffer merely marked for delayed removal is acceptable too. */
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	/* Only a dirty-to-clean transition touches lists and counters. */
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub();
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
 	 */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Start an asynchronous write of the buffer: B_ASYNC is set and the
  *	buffer is handed to bwrite() without waiting for the result.  The
  *	buffer is released when the output completes.
  *
  *	Handling of B_INVAL buffers is left to bwrite() ( or the VOP
  *	routine behind it ).
  */
 void
 bawrite(struct buf *bp)
 {
 
 	bp->b_flags = bp->b_flags | B_ASYNC;
 	(void)bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Start an asynchronous barrier write of the buffer without waiting
  *	for it to complete.  The write barrier placed after this write
  *	means that this buffer and everything written before it reaches
  *	the disk before anything written afterwards.  The buffer is
  *	released when the output completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags = bp->b_flags | (B_ASYNC | B_BARRIER);
 	(void)bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	Called before any vnodes are locked by a thread that expects to
  *	write.  While the number of dirty buffers is at or above
  *	hidirtybuffers the caller sleeps here, so that the buffer cache is
  *	not starved by dirty buffers.  Blocking ahead of the vnode locks
  *	is meant to avoid a locked vnode keeping the system daemons from
  *	flushing related buffers.
  */
 void
 bwillwrite(void)
 {
 
 	/* Common case: below the limit, no lock needed. */
 	if (numdirtybuffers < hidirtybuffers)
 		return;
 
 	mtx_lock(&bdirtylock);
 	while (numdirtybuffers >= hidirtybuffers) {
 		bdirtywait = 1;
 		msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  * Report whether the number of dirty buffers has reached hidirtybuffers.
  * Returns 1 if so, 0 otherwise.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	if (numdirtybuffers < hidirtybuffers)
 		return (0);
 	return (1);
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  *
  *	A NULL bp is tolerated and ignored.  A recursively locked buffer
  *	only has one lock reference dropped and is otherwise left alone.
  *	A B_MANAGED buffer is handed to bqrelse() instead.
  */
 void
 brelse(struct buf *bp)
 {
 	int qindex;
 
 	/*
 	 * Many functions erroneously call brelse with a NULL bp under rare
 	 * error conditions. Simply return when called with a NULL bp.
 	 */
 	if (bp == NULL)
 		return;
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
 	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	/*
 	 * BV_BKGRDERR is set and no background write is in progress:
 	 * clear the error flag under the bufobj lock and mark the
 	 * buffer dirty again.
 	 */
 	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		bdirty(bp);
 	}
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
 	    !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  All errors except ENXIO (which
 		 * means the device is gone) are expected to be potentially
 		 * transient - underlying media might work if tried again
 		 * after EIO, and memory might be available after an ENOMEM.
 		 *
 		 * Do this also for buffers that failed with ENXIO, but have
 		 * non-empty dependencies - the soft updates code might need
 		 * to access the buffer to untangle them.
 		 *
 		 * Must clear BIO_ERROR to prevent pages from being scrapped.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed read I/O, or we were asked to free or not
 		 * cache the buffer, or we failed to write to a device that's
 		 * no longer present.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		/* Keep the dirty accounting in step with dropping B_DELWRI. */
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub();
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		/* Non-VMIO buffers are shrunk and detached right away. */
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 *
 	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
 	 * if B_DELWRI is set.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags affect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
 	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
 	    !(bp->b_vp->v_mount != NULL &&
 	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
 		vfs_vmio_invalidate(bp);
 		allocbuf(bp, 0);
 	}
 
 	/*
 	 * Invalid or B_RELBUF buffers, and B_NOREUSE buffers that are not
 	 * delayed writes, are resized to zero with allocbuf() and lose
 	 * their vnode association.  B_NOREUSE is consumed here.
 	 */
 	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
 		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	buf_track(bp, __func__);
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		buf_free(bp);
 		return;
 	}
 	/* buffers with junk contents */
 	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	binsfree(bp, qindex);
 
 	/* One-shot hint flags do not survive the release. */
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 	/* Only a release to the clean queue wakes bufspace waiters. */
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * A recursively locked buffer only has one lock reference dropped.  A
  * B_MANAGED buffer is not placed on any queue.  A clean buffer marked
  * B_NOREUSE is passed on to brelse().
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	/* QUEUE_NONE keeps the wakeup at "out" from firing for managed bufs. */
 	qindex = QUEUE_NONE;
 	if (BUF_LOCKRECURSED(bp)) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
 	if (bp->b_flags & B_MANAGED) {
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		goto out;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
 	    BV_BKGRDERR)) == BV_BKGRDERR) {
 		/* b_vflags is modified under the bufobj lock. */
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		qindex = QUEUE_DIRTY;
 	} else {
 		/* B_DELWRI is known to be clear on this branch. */
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
 		if ((bp->b_flags & B_NOREUSE) != 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	binsfree(bp, qindex);
 
 out:
 	buf_track(bp, __func__);
 	/* unlock */
 	BUF_UNLOCK(bp);
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
  * restore bogus pages.
  *
  * The buffer's pages are walked with the backing object write-locked.
  * Each bogus_page placeholder in b_pages[] is replaced with the page
  * looked up in the object; for reads, the part of each page covered by
  * the transfer is marked valid.  Every page has its shared busy state
  * dropped, the object's paging-in-progress count is reduced by b_npages,
  * and, if any placeholder was replaced and the buffer is mapped, the page
  * array is entered into the buffer's KVA again.
  */
 static void
 vfs_vmio_iodone(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	vm_object_t obj;
 	struct vnode *vp;
 	int i, iosize, resid;
 	bool bogus;
 
 	obj = bp->b_bufobj->bo_object;
 	KASSERT(obj->paging_in_progress >= bp->b_npages,
 	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
 	    obj->paging_in_progress, bp->b_npages));
 
 	vp = bp->b_vp;
 	KASSERT(vp->v_holdcnt > 0,
 	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
 	KASSERT(vp->v_object != NULL,
 	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
 
 	bogus = false;
 	/* Bytes actually transferred; consumed page by page below. */
 	iosize = bp->b_bcount - bp->b_resid;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		/* Bytes from foff to the next page boundary, capped at iosize. */
 		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 		if (resid > iosize)
 			resid = iosize;
 
 		/*
 		 * cleanup bogus pages, restoring the originals
 		 */
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			bogus = true;
 			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 			if (m == NULL)
 				panic("biodone: page disappeared!");
 			bp->b_pages[i] = m;
 		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we
 			 * only need to do this here in the read case.
 			 */
 			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
 			    resid)) == 0, ("vfs_vmio_iodone: page %p "
 			    "has unexpected dirty bits", m));
 			vfs_page_set_valid(bp, foff, m);
 		}
 		KASSERT(OFF_TO_IDX(foff) == m->pindex,
 		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
 		    (intmax_t)foff, (uintmax_t)m->pindex));
 
 		vm_page_sunbusy(m);
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		iosize -= resid;
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 	/* b_pages[] changed above, so the KVA mapping must be refreshed. */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  * Drop the buffer's wiring on page m and, when vm_page_unwire() reports
  * success, either free the page or hand it to the inactive queue.  The
  * whole operation runs with the page lock held.
  */
 static void
 vfs_vmio_unwire(struct buf *bp, vm_page_t m)
 {
 	bool gone;
 
 	vm_page_lock(m);
 	if (!vm_page_unwire(m, PQ_NONE)) {
 		/* Nothing further to decide for this page. */
 		vm_page_unlock(m);
 		return;
 	}
 
 	/*
 	 * Decide whether the page can be freed outright: a page with no
 	 * valid data is freed unless it is busied, and a valid page is
 	 * only offered for freeing when the buffer is B_DIRECT.
 	 */
 	gone = false;
 	if (m->valid == 0) {
 		if (!vm_page_busied(m)) {
 			vm_page_free(m);
 			gone = true;
 		}
 	} else if ((bp->b_flags & B_DIRECT) != 0) {
 		gone = vm_page_try_to_free(m);
 	}
 
 	/*
 	 * A surviving page is deactivated.  B_NOREUSE buffers tell the VM
 	 * that the page is unlikely to be needed again; otherwise normal
 	 * LRU ordering on the inactive queue is kept.
 	 */
 	if (!gone) {
 		if ((bp->b_flags & B_NOREUSE) == 0)
 			vm_page_deactivate(m);
 		else
 			vm_page_deactivate_noreuse(m);
 	}
 	vm_page_unlock(m);
 }
 
 /*
  * Perform page invalidation when a buffer is released.  The fully invalid
  * pages will be reclaimed later in vfs_vmio_truncate().
  *
  * The buffer's KVA mapping (if any) is removed, then every page is
  * detached from b_pages[], has the byte range covered by the buffer
  * marked invalid (unless the page still has wired mappings), and is
  * unwired.  On return b_npages is zero.
  */
 static void
 vfs_vmio_invalidate(struct buf *bp)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int i, resid, poffset, presid;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/*
 	 * Get the base offset and length of the buffer.  Note that
 	 * in the VMIO case if the buffer block size is not
 	 * page-aligned then b_data pointer may not be page-aligned.
 	 * But our b_pages[] array *IS* page aligned.
 	 *
 	 * block sizes less than DEV_BSIZE (usually 512) are not
 	 * supported due to the page granularity bits (m->valid,
 	 * m->dirty, etc...).
 	 *
 	 * See man buf(9) for more information
 	 */
 	obj = bp->b_bufobj->bo_object;
 	resid = bp->b_bufsize;
 	poffset = bp->b_offset & PAGE_MASK;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page)
 			panic("vfs_vmio_invalidate: Unexpected bogus page.");
 		bp->b_pages[i] = NULL;
 
 		/* Bytes of the buffer that fall within this page. */
 		presid = resid > (PAGE_SIZE - poffset) ?
 		    (PAGE_SIZE - poffset) : resid;
 		KASSERT(presid >= 0, ("brelse: extra page"));
 		/*
 		 * Wait out exclusive busy; the object lock is dropped
 		 * around the sleep and retaken before re-testing.
 		 */
 		while (vm_page_xbusied(m)) {
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(obj);
 			vm_page_busy_sleep(m, "mbncsh", true);
 			VM_OBJECT_WLOCK(obj);
 		}
 		if (pmap_page_wired_mappings(m) == 0)
 			vm_page_set_invalid(m, poffset, presid);
 		vfs_vmio_unwire(bp, m);
 		resid -= presid;
 		/* Only the first page can start at a non-zero offset. */
 		poffset = 0;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = 0;
 }
 
 /*
  * Shrink a VMIO buffer to desiredpages pages.  The pages past the new end
  * are unmapped from the buffer's KVA (when the buffer is mapped), removed
  * from b_pages[] and unwired.
  */
 static void
 vfs_vmio_truncate(struct buf *bp, int desiredpages)
 {
 	vm_object_t object;
 	vm_page_t pg;
 	int idx;
 
 	/* Already at the requested page count. */
 	if (desiredpages == bp->b_npages)
 		return;
 
 	if (!buf_mapped(bp)) {
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
 		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
 	}
 
 	/* The backing object may be absent; lock it only when present. */
 	object = bp->b_bufobj->bo_object;
 	if (object != NULL)
 		VM_OBJECT_WLOCK(object);
 	idx = desiredpages;
 	while (idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		KASSERT(pg != bogus_page, ("allocbuf: bogus page found"));
 		bp->b_pages[idx] = NULL;
 		vfs_vmio_unwire(bp, pg);
 		idx++;
 	}
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 
 	bp->b_npages = desiredpages;
 }
 
 /*
  * Byte granular extension of VMIO buffers.
  *
  * Grows bp to desiredpages pages by grabbing wired pages from the backing
  * object, re-evaluates B_CACHE for the newly covered byte range
  * [b_bcount, size), and maps the pages if the buffer is mapped.
  */
 static void
 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
 {
 	/*
 	 * We are growing the buffer, possibly in a
 	 * byte-granular fashion.
 	 */
 	vm_object_t obj;
 	vm_offset_t toff;
 	vm_offset_t tinc;
 	vm_page_t m;
 
 	/*
 	 * Step 1, bring in the VM pages from the object, allocating
 	 * them if necessary.  We must clear B_CACHE if these pages
 	 * are not valid for the range covered by the buffer.
 	 */
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	while (bp->b_npages < desiredpages) {
 		/*
 		 * We must allocate system pages since blocking
 		 * here could interfere with paging I/O, no
 		 * matter which process we are.
 		 *
 		 * Only exclusive busy can be tested here.
 		 * Blocking on shared busy might lead to
 		 * deadlocks once allocbuf() is called after
 		 * pages are vfs_busy_pages().
 		 */
 		m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
 		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
 		    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
 		/* A page with no valid data cannot satisfy B_CACHE. */
 		if (m->valid == 0)
 			bp->b_flags &= ~B_CACHE;
 		bp->b_pages[bp->b_npages] = m;
 		++bp->b_npages;
 	}
 
 	/*
 	 * Step 2.  We've loaded the pages into the buffer,
 	 * we have to figure out if we can still have B_CACHE
 	 * set.  Note that B_CACHE is set according to the
 	 * byte-granular range ( bcount and size ), not the
 	 * aligned range ( newbsize ).
 	 *
 	 * The VM test is against m->valid, which is DEV_BSIZE
 	 * aligned.  Needless to say, the validity of the data
 	 * needs to also be DEV_BSIZE aligned.  Note that this
 	 * fails with NFS if the server or some other client
 	 * extends the file's EOF.  If our buffer is resized,
 	 * B_CACHE may remain set! XXX
 	 */
 	toff = bp->b_bcount;
 	/* First step only runs to the end of the page containing toff. */
 	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 	while ((bp->b_flags & B_CACHE) && toff < size) {
 		vm_pindex_t pi;
 
 		if (tinc > (size - toff))
 			tinc = size - toff;
 		/* Index into b_pages[] of the page holding offset toff. */
 		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
 		m = bp->b_pages[pi];
 		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
 		toff += tinc;
 		tinc = PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 	/*
 	 * Step 3, fixup the KVA pmap.
 	 */
 	if (buf_mapped(bp))
 		bpmap_qenter(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 }
 
 /*
  * Decide whether the buffer at logical block lblkno can take part in a
  * clustered write.  Returns 1 when an in-core, immediately lockable,
  * valid B_DELWRI|B_CLUSTEROK buffer of exactly `size' bytes is mapped to
  * the expected disk block blkno, and 0 otherwise.  The candidate is
  * unlocked again before returning.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *cand;
 	int usable;
 
 	/* A block with no in-core buffer cannot join the cluster. */
 	cand = gbincore(&vp->v_bufobj, lblkno);
 	if (cand == NULL)
 		return (0);
 
 	/* Never wait for a busy buffer; treat it as unavailable. */
 	if (BUF_LOCK(cand, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/*
 	 * The candidate must be a valid, clusterable delayed write of the
 	 * right size whose block has been mapped (b_blkno differs from
 	 * b_lblkno) to the expected place on disk.
 	 */
 	usable = 0;
 	if (((cand->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 	    (B_DELWRI | B_CLUSTEROK)) &&
 	    (cand->b_bufsize == size) &&
 	    (cand->b_blkno != cand->b_lblkno) &&
 	    (cand->b_blkno == blkno))
 		usable = 1;
 
 	BUF_UNLOCK(cand);
 	return (usable);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better than the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the
  *	correct order, so we search for the cluster in both directions.
  *
  *	Returns the value produced by cluster_wbuild() when a cluster of
  *	more than one block is written, otherwise bp->b_bufsize for the
  *	single-buffer write.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	/* Keep the cluster unmapped if the buffer itself is unmapped. */
 	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather than at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		/* Count clusterable blocks following bp (i includes bp). */
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		/* Then extend backwards; j overshoots by one, fixed below. */
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  *	getnewbuf_kva:
  *
  *	Give an empty buf header the KVA that gbflags asks for.  Nothing
  *	is allocated for a GB_UNMAPPED request that lacks GB_KVAALLOC, or
  *	when the buffer's existing KVA already has the rounded-up size.
  *	Returns 0 on success and ENOSPC when bufkva_alloc() fails.
  */
 static int
 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
 {
 	int kvasize;
 
 	/* Unmapped buffer with no KVA reservation requested. */
 	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED)
 		return (0);
 
 	/*
 	 * In order to keep fragmentation sane we only allocate kva
 	 * in BKVASIZE chunks.  XXX with vmem we can do page size.
 	 */
 	kvasize = (maxsize + BKVAMASK) & ~BKVAMASK;
 	if (kvasize == bp->b_kvasize)
 		return (0);
 	if (bufkva_alloc(bp, kvasize, gbflags))
 		return (ENOSPC);
 	return (0);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	The caller is responsible for releasing the reserved bufspace after
  *	allocbuf() is called.
  *
  *	On failure any reservation and partially set up buffer are undone,
  *	bufspace_wait() is called, and NULL is returned.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
 {
 	struct buf *bp;
 	bool metadata, reserved;
 
 	bp = NULL;
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	/* No vnode, MD/system vnodes and character devices count as metadata. */
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = true;
 	else
 		metadata = false;
 	atomic_add_int(&getnewbufcalls, 1);
 	reserved = false;
 	/*
 	 * Each `continue' below jumps to the loop condition, so a failed
 	 * step is retried only if buf_scan() returns 0.  A KVA allocation
 	 * failure leaves the loop through `break' with bp still set.
 	 */
 	do {
 		if (reserved == false &&
 		    bufspace_reserve(maxsize, metadata) != 0)
 			continue;
 		reserved = true;
 		if ((bp = buf_alloc()) == NULL)
 			continue;
 		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
 			return (bp);
 		break;
 	} while(buf_scan(false) == 0);
 
 	/* Failure: give back the space reservation and the buffer, if any. */
 	if (reserved)
 		atomic_subtract_long(&bufspace, maxsize);
 	if (bp != NULL) {
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 	}
 	bufspace_wait(vp, gbflags, slpflag, slptimeo);
 
 	return (NULL);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  *
  *	buf_kp describes the daemon for kproc_start(): its name, its main
  *	function buf_daemon(), and &bufdaemonproc (presumably where the
  *	created process pointer is stored -- see struct kproc_desc).
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 /* Have kproc_start() launch the daemon at SI_SUB_KTHREAD_BUF. */
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 /*
  * Flush up to `target' dirty buffers, preferring buffers that have no
  * rollback dependencies.  Returns the number flushed by that first pass;
  * the result of the fallback pass is deliberately not counted, so 0 is
  * returned whenever the fallback had to be used.
  */
 static int
 buf_flush(struct vnode *vp, int target)
 {
 	int nflushed;
 
 	nflushed = flushbufqueues(vp, target, 0);
 	if (nflushed != 0)
 		return (nflushed);
 
 	/*
 	 * Could not find any buffers without rollback
 	 * dependencies, so just write the first one
 	 * in the hopes of eventually making progress.
 	 */
 	if (vp != NULL && target > 2)
 		target /= 2;
 	flushbufqueues(vp, target, 1);
 
 	return (nflushed);
 }
 
 /*
  * Main loop of the buffer daemon.  Never returns.  Each iteration flushes
  * dirty buffers down towards a low-water target and then sleeps on
  * bd_request: about one second when the target was reached, a tenth of
  * that when it was not.
  */
 static void
 buf_daemon()
 {
 	int lodirty;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
 	    SHUTDOWN_PRI_LAST);
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		/* bdlock is held here; it is dropped while flushing. */
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
 		lodirty = lodirtybuffers;
 		/* A speedup request lowers the target to half the current count. */
 		if (bd_speedupreq) {
 			lodirty = numdirtybuffers / 2;
 			bd_speedupreq = 0;
 		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.
 		 */
 		while (numdirtybuffers > lodirty) {
 			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
 				break;
 			kern_yield(PRI_USER);
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second and
 		 * then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush up to `target' buffers from the dirty queue.  We must
  *	be careful to free up B_INVAL buffers instead of writing them,
  *	which NFS is particularly sensitive to.
  *
  *	If lvp is not NULL only buffers belonging to that vnode are
  *	considered, and the vnode is expected to be locked by the caller.
  *	Buffers with rollback dependencies are skipped unless flushdeps is
  *	non-zero.  Returns the number of buffers written or released.
  *
  *	The queue is walked with a private sentinel entry that is moved
  *	past each buffer examined, so the position survives dropping the
  *	queue lock.
  */
 /* Counter exported as vfs.flushwithdeps. */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
 flushbufqueues(struct vnode *lvp, int target, int flushdeps)
 {
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int queue;
 	int error;
 	bool unlock;
 
 	flushed = 0;
 	queue = QUEUE_DIRTY;
 	bp = NULL;
 	/* The sentinel is a zeroed dummy buf used only as a list marker. */
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	while (flushed != target) {
 		maybe_yield();
 		mtx_lock(&bqlocks[queue]);
 		/* Step the sentinel past the next entry, or stop at the tail. */
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 			    b_freelist);
 		} else {
 			mtx_unlock(&bqlocks[queue]);
 			break;
 		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
 		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
 		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
 		    bp->b_vp != lvp)) {
 			mtx_unlock(&bqlocks[queue]);
 			continue;
 		}
 		/* Try-lock the buffer while the queue lock still pins it. */
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		mtx_unlock(&bqlocks[queue]);
 		if (error != 0)
 			continue;
 
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/* Invalid buffers are released rather than written. */
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (lvp == NULL) {
 			unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		} else {
 			/*
 			 * The caller already holds lvp locked; if the lock
 			 * is not exclusive, try to upgrade it.
 			 */
 			ASSERT_VOP_LOCKED(vp, "getbuf");
 			unlock = false;
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 		}
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			/* Only the buffer daemon uses the clustering write path. */
 			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
 			} else {
 				bremfree(bp);
 				bwrite(bp);
 				notbufdflushes++;
 			}
 			vn_finished_write(mp);
 			if (unlock)
 				VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 
 			/*
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
 			if (curproc == bufdaemonproc &&
 			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		/* Could not get the vnode lock: back out and move on. */
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Look up the buffer for block blkno in bufobj bo.  The lookup is done
  * under the bufobj read lock; the result of gbincore() is returned as is
  * and the buffer itself is not locked by this function.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *found;
 
 	BO_RLOCK(bo);
 	found = gbincore(bo, blkno);
 	BO_RUNLOCK(bo);
 
 	return (found);
 }
 
 /*
  * Returns 1 if no I/O is needed to access the block, 0 otherwise.  This
  * is like incore() except that, when no buffer exists, it also checks
  * whether the VM object backing the vnode holds valid pages for the
  * whole block.
  */
 static int
 inmem(struct vnode *vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t toff, tinc, size, pgoff;
 	vm_page_t m;
 	vm_ooffset_t off;
 	int resident;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	if (incore(&vp->v_bufobj, blkno))
 		return (1);
 	if (vp->v_mount == NULL)
 		return (0);
 	obj = vp->v_object;
 	if (obj == NULL)
 		return (0);
 
 	/* Step through the block in chunks of at most one page. */
 	size = PAGE_SIZE;
 	if (size > vp->v_mount->mnt_stat.f_iosize)
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	resident = 1;
 	VM_OBJECT_RLOCK(obj);
 	toff = 0;
 	while (toff < vp->v_mount->mnt_stat.f_iosize) {
 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 		if (m == NULL) {
 			resident = 0;
 			break;
 		}
 		/* Never let a chunk cross a page boundary. */
 		pgoff = (toff + off) & PAGE_MASK;
 		tinc = size;
 		if (tinc > PAGE_SIZE - pgoff)
 			tinc = PAGE_SIZE - pgoff;
 		if (vm_page_is_valid(m, pgoff, tinc) == 0) {
 			resident = 0;
 			break;
 		}
 		toff += tinc;
 	}
 	VM_OBJECT_RUNLOCK(obj);
 
 	return (resident);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intervention.
  *
  * Does nothing for non-VMIO buffers or buffers with b_bufsize == 0.
  * The per-page range that is marked valid and clean is chosen by
  * vfs_page_set_validclean() from the page's starting offset, so no end
  * offset is computed here.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	/* Fold page dirty bits into b_dirtyoff/b_dirtyend first. */
 	vfs_setdirty_locked_object(bp);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  * Widen bp's dirty range (b_dirtyoff, b_dirtyend) to cover every page of
  * the buffer that the VM system reports as dirty.  The backing object
  * must be write-locked by the caller.  Nothing is done unless the object
  * is flagged OBJ_MIGHTBEDIRTY.
  */
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_object_t object;
 	int i;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
 
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * Calculate the encompassing dirty range, boffset and eoffset,
 		 * (eoffset - boffset) bytes.
 		 */
 
 		/* First dirty page; i == b_npages if there is none. */
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty)
 				break;
 		}
 		/*
 		 * NOTE(review): if vm_offset_t is unsigned and b_offset is
 		 * not page aligned, i == 0 makes this subtraction wrap to a
 		 * very large value, so the "boffset < eoffset" test below
 		 * would fail -- confirm whether such buffers reach here.
 		 */
 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/* Last dirty page; i == -1 if there is none. */
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/*
 		 * Fit it to the buffer.
 		 */
 
 		if (eoffset > bp->b_bcount)
 			eoffset = bp->b_bcount;
 
 		/*
 		 * If we have a good dirty range, merge with the existing
 		 * dirty range.
 		 */
 
 		if (boffset < eoffset) {
 			if (bp->b_dirtyoff > boffset)
 				bp->b_dirtyoff = boffset;
 			if (bp->b_dirtyend < eoffset)
 				bp->b_dirtyend = eoffset;
 		}
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.
  * If an unmapped buffer is provided but a mapped buffer is requested, take
  * also care to properly setup mappings between pages and KVA.
  *
  * Two needs are distinguished: `need_mapping' (the buffer is unmapped but
  * the caller did not pass GB_UNMAPPED) and `need_kva' (the buffer has no
  * KVA at all and the caller passed GB_KVAALLOC).  If neither applies the
  * function returns without touching the buffer.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	need_mapping = bp->b_data == unmapped_buf &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = bp->b_kvabase == unmapped_buf &&
 	    bp->b_data == unmapped_buf &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && bp->b_kvabase != unmapped_buf) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 	offset = blkno * bsize;
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 	/* Retry the KVA allocation, waiting for buffer space in between. */
 	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
 		if ((gbflags & GB_NOWAIT_BD) != 0) {
 			/*
 			 * XXXKIB: defragmentation cannot
 			 * succeed, not sure what else to do.
 			 */
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		atomic_add_int(&mappingrestarts, 1);
 		bufspace_wait(bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
 		/* b_offset is handled by bpmap_qenter. */
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 		bpmap_qenter(bp);
 	}
 }
 
 /*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffer's B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *	
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
-	if (size > MAXBCACHEBUF)
-		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
-		    MAXBCACHEBUF);
+	if (size > maxbcachebuf)
+		panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
+		    maxbcachebuf);
 	/*
 	 * When unmapped_buf_allowed is false, strip both unmapped-buffer
 	 * request flags from the caller's flags.
 	 */
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 	/*
 	 * Restart point.  The lookup is redone from here whenever the
 	 * buffer lock returned ENOLCK, the buffer was released or written
 	 * out below, or a new buffer could not be obtained or lost a race.
 	 */
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 *
 		 * The bufobj lock is handed to BUF_TIMELOCK as the interlock
 		 * (LK_INTERLOCK); this path contains no explicit
 		 * BO_RUNLOCK(), so presumably the lock call drops it.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if (flags & GB_LOCK_NOWAIT)
 			lockflags |= LK_NOWAIT;
 
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		/* Managed buffers must not be on a queue; others are removed. */
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
 		 *
 		 * A non-VMIO buffer of the wrong size, or a VMIO buffer
 		 * whose KVA is too small for the request, is disposed of
 		 * (written if dirty or carrying dependencies, otherwise
 		 * released) and the lookup is restarted.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
 		/* The idle thread gives up when no free buffers remain. */
 		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
 			return NULL;
 
 		/*
 		 * Block size is DEV_BSIZE for disk vnodes, otherwise the
 		 * bufobj's block size; it converts blkno to a byte offset.
 		 */
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			/* Account for the offset of the data within its page. */
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
 			 *
 			 * There's an issue on low memory, 4BSD+non-preempt
 			 * systems (eg MIPS routers with 32MB RAM) where buffer
 			 * exhaustion occurs without sleeping for buffer
 			 * reclamation.  This just sticks in a loop and
 			 * constantly attempts to allocate a buffer, which
 			 * hits exhaustion and tries to wakeup bufdaemon.
 			 * This never happens because we never yield.
 			 *
 			 * The real solution is to identify and fix these cases
 			 * so we aren't effectively busy-waiting in a loop
 			 * until the reclamation path has cycles to run.
 			 */
 			kern_yield(PRI_USER);
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			bufspace_release(maxsize);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bufspace_release(maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
 	/*
 	 * A recursively locked buffer jumps straight here, skipping the
 	 * flag and size fixups as well as the trace and assertion above.
 	 */
 end:
 	buf_track(bp, __func__);
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
 }
 
 /*
  * Obtain a held buffer of the requested size that is not associated with
  * any vnode (getnewbuf() is called with a NULL vp).  The buffer is handed
  * back with B_INVAL set.
  *
  * The allocation is retried until it succeeds.  The only NULL return is
  * when GB_NOWAIT_BD was requested and the current thread has TDP_BUFNEED
  * set after a failed attempt.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *bp;
 	int kvasize;
 
 	/* Round the request up using BKVAMASK. */
 	kvasize = (size + BKVAMASK) & ~BKVAMASK;
 	for (;;) {
 		bp = getnewbuf(NULL, 0, 0, kvasize, flags);
 		if (bp != NULL)
 			break;
 		if ((flags & GB_NOWAIT_BD) &&
 		    (curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(bp, size);
 	bufspace_release(kvasize);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(bp);
 	return (bp);
 }
 
 /*
  * Shrink the backing store of a non-VMIO buffer to newbsize bytes.
  *
  * Page-backed buffers give the surplus pages back and have their space
  * accounting adjusted.  malloc-backed buffers are never partially shrunk:
  * the allocation is only freed when the new size is zero.
  */
 static void
 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
 {
 
 	if ((bp->b_flags & B_MALLOC) == 0) {
 		vm_hold_free_pages(bp, newbsize);
 		bufspace_adjust(bp, newbsize);
 		return;
 	}
 
 	/* malloc-backed: anything but a truncation to zero is a no-op. */
 	if (newbsize != 0)
 		return;
 	bufmallocadjust(bp, 0);
 	free(bp->b_data, M_BIOBUF);
 	bp->b_data = bp->b_kvabase;
 	bp->b_flags &= ~B_MALLOC;
 }
 
 /*
  * Grow the backing store of a non-VMIO buffer to newbsize bytes.
  *
  * A first, small allocation (at most half a page, while bufmallocspace
  * is below maxbufmallocspace) is satisfied with malloc().  Every other
  * growth uses wired pages; a buffer that was malloc-backed is converted,
  * with its old contents copied into the new pages.
  */
 static void
 vfs_nonvmio_extend(struct buf *bp, int newbsize)
 {
 	caddr_t oldbuf;
 	int oldsize;
 
 	/*
 	 * There is a potential smp race here that could let
 	 * bufmallocspace slightly exceed the maximum.  It is expected
 	 * to be rare enough not to matter.
 	 */
 	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
 	    bufmallocspace < maxbufmallocspace) {
 		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
 		bp->b_flags |= B_MALLOC;
 		bufmallocadjust(bp, newbsize);
 		return;
 	}
 
 	/*
 	 * Page-allocation path.  When leaving the malloc scheme, remember
 	 * the old allocation so it can be copied and freed once the pages
 	 * are in place.
 	 */
 	if ((bp->b_flags & B_MALLOC) != 0) {
 		oldbuf = bp->b_data;
 		oldsize = bp->b_bufsize;
 		bp->b_data = bp->b_kvabase;
 		bufmallocadjust(bp, 0);
 		bp->b_flags &= ~B_MALLOC;
 		newbsize = round_page(newbsize);
 	} else {
 		oldbuf = NULL;
 		oldsize = 0;
 	}
 	vm_hold_load_pages(bp, (vm_offset_t)bp->b_data + bp->b_bufsize,
 	    (vm_offset_t)bp->b_data + newbsize);
 	if (oldbuf != NULL) {
 		bcopy(oldbuf, bp->b_data, oldsize);
 		free(oldbuf, M_BIOBUF);
 	}
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!!
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  *
  * The buffer must be held by the caller.  'size' is the new requested
  * byte count, stored in b_bcount on return.  Every return path in this
  * function returns 1.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize;
 
 	BUF_ASSERT_HELD(bp);
 
 	/* Nothing to do when the requested size is already in effect. */
 	if (bp->b_bcount == size)
 		return (1);
 
 	/* A buffer with a KVA reservation cannot grow beyond it. */
 	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	/* The backing store is sized in whole DEV_BSIZE units. */
 	newbsize = roundup2(size, DEV_BSIZE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		/* Page-backed (non-malloc) buffers use whole pages. */
 		if ((bp->b_flags & B_MALLOC) == 0)
 			newbsize = round_page(newbsize);
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		if (newbsize < bp->b_bufsize)
 			vfs_nonvmio_truncate(bp, newbsize);
 		else if (newbsize > bp->b_bufsize)
 			vfs_nonvmio_extend(bp, newbsize);
 	} else {
 		int desiredpages;
 
 		/*
 		 * Number of pages needed to cover newbsize bytes starting
 		 * at the buffer's offset within its first page.
 		 */
 		desiredpages = (size == 0) ? 0 :
 		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize)
 			vfs_vmio_truncate(bp, desiredpages);
 		/* XXX This looks as if it should be newbsize > b_bufsize */
 		else if (size > bp->b_bcount)
 			vfs_vmio_extend(bp, desiredpages, size);
 		bufspace_adjust(bp, newbsize);
 	}
 	bp->b_bcount = size;		/* requested buffer size. */
 	return (1);
 }
 
 extern int inflight_transient_maps;
 
 /*
  * Complete a bio.
  *
  * A transient mapping, if present, is torn down first: the bio is marked
  * unmapped, its pages are removed from the kernel map, the address range
  * is returned to transient_arena and the in-flight counter is dropped.
  *
  * Completion is then delivered either through the bio's bio_done callback
  * or, when there is none, by setting BIO_DONE and waking sleepers on the
  * bio under its pool mutex.
  */
 void
 biodone(struct bio *bp)
 {
 	struct mtx *lock;
 	void (*callback)(struct bio *);
 	vm_offset_t va_start, va_end;
 
 	biotrack(bp, __func__);
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 		va_start = trunc_page((vm_offset_t)bp->bio_data);
 		va_end = round_page((vm_offset_t)bp->bio_data +
 		    bp->bio_length);
 		bp->bio_data = unmapped_buf;
 		pmap_qremove(va_start, atop(va_end - va_start));
 		vmem_free(transient_arena, va_start, va_end - va_start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 
 	/* Read the callback once; the bio is not touched after calling it. */
 	callback = bp->bio_done;
 	if (callback != NULL) {
 		callback(bp);
 		return;
 	}
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->bio_flags |= BIO_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep until the bio has BIO_DONE set, then report its status:
  * bio_error if it is non-zero, EIO if only BIO_ERROR is flagged,
  * and 0 otherwise.  'wchan' is the wait message passed to msleep().
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	for (;;) {
 		if ((bp->bio_flags & BIO_DONE) != 0)
 			break;
 		msleep(bp, lock, PRIBIO, wchan, 0);
 	}
 	mtx_unlock(lock);
 
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	return ((bp->bio_flags & BIO_ERROR) != 0 ? EIO : 0);
 }
 
 /*
  * Finish a bio: record a non-zero error on it (also setting BIO_ERROR),
  * end the devstat transaction when a devstat is supplied, and then
  * complete the bio through biodone().
  */
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error != 0) {
 		bp->bio_flags |= BIO_ERROR;
 		bp->bio_error = error;
 	}
 	if (stat != NULL) {
 		devstat_end_transaction_bio(stat, bp);
 	}
 	biodone(bp);
 }
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 /*
  * Forward 'location' to buf_track() for the buf referenced by the bio's
  * bio_track_bp field.  Only compiled when BUF_TRACKING or
  * FULL_BUF_TRACKING is defined.
  */
 void
 biotrack_buf(struct bio *bp, const char *location)
 {
 
 	buf_track(bp->bio_track_bp, location);
 }
 #endif
 
 /*
  *	bufwait:
  *
  *	Block until I/O on the buffer has completed and return its status.
  *	The buffer is left locked and B_DONE on return.  A pending B_EINTR
  *	is cleared and reported as EINTR; otherwise a BIO_ERROR yields
  *	b_error (or EIO when b_error is zero), and success yields 0.
  */
 int
 bufwait(struct buf *bp)
 {
 
 	/* The wait message distinguishes reads from everything else. */
 	bwait(bp, PRIBIO, bp->b_iocmd == BIO_READ ? "biord" : "biowr");
 
 	if ((bp->b_flags & B_EINTR) != 0) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		return (0);
 	return (bp->b_error != 0 ? bp->b_error : EIO);
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	If the buffer carries a b_iodone callback, the callback is detached
  *	from the buffer and invoked in place of bufdone_finish().  For
  *	writes, the bufobj's output count is dropped afterwards in either
  *	case.
  *
  *	bufdone_finish() is also responsible for setting B_CACHE in a
  *	B_VMIO bp.  In a non-VMIO bp, B_CACHE will be set on the next
  *	getblk() assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
  *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the completion routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	struct bufobj *dropobj;
 	void (*done)(struct buf *);
 
 	buf_track(bp, __func__);
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 
 	KASSERT(!(bp->b_flags & B_DONE), ("bufdone: bp %p already done", bp));
 	BUF_ASSERT_HELD(bp);
 
 	runningbufwakeup(bp);
 
 	/*
 	 * Capture the bufobj up front.  bufdone_finish() may brelse() the
 	 * buffer, so bp must not be dereferenced after completion runs.
 	 */
 	dropobj = (bp->b_iocmd == BIO_WRITE) ? bp->b_bufobj : NULL;
 
 	/* call optional completion function if requested */
 	done = bp->b_iodone;
 	if (done != NULL) {
 		bp->b_iodone = NULL;
 		done(bp);
 	} else
 		bufdone_finish(bp);
 
 	if (dropobj != NULL)
 		bufobj_wdrop(dropobj);
 }
 
 void
 bufdone_finish(struct buf *bp)
 {
 	BUF_ASSERT_HELD(bp);
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if (bp->b_flags & B_VMIO) {
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
 		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		if (bp->b_iocmd == BIO_READ &&
 		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
 		    !(bp->b_ioflags & BIO_ERROR))
 			bp->b_flags |= B_CACHE;
 		vfs_vmio_iodone(bp);
 	}
 
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
 	 * here in the async case. The sync case always needs to do a wakeup.
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
 		    (bp->b_ioflags & BIO_ERROR))
 			brelse(bp);
 		else
 			bqrelse(bp);
 	} else
 		bdone(bp);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistent.
  *
  * For a VMIO buffer, every page of the buffer has its shared-busy
  * reference dropped, and the object's paging-in-progress count is
  * reduced by the number of pages.  Entries that were replaced by
  * bogus_page are first restored from the backing object.  Non-VMIO
  * buffers only get the runningbufwakeup() call.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			/*
 			 * Look the real page up in the object and put it
 			 * back into the buffer's page array.
 			 */
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			/* A mapped buffer must have its mapping refreshed. */
 			if (buf_mapped(bp)) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Mark as valid the part of page 'm' that starts at object offset
  *	'off' and runs to the end of that page, clipped to the end of the
  *	buffer's data (b_offset + b_bcount, i.e. the file EOF rather than
  *	the buffer's allocation size).
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, pgend;
 
 	/* First page boundary strictly above off. */
 	pgend = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 
 	/* Never validate past the end of the buffer's data. */
 	bufend = bp->b_offset + bp->b_bcount;
 	if (pgend > bufend)
 		pgend = bufend;
 
 	/* An empty or inverted range leaves the page untouched. */
 	if (pgend > off)
 		vm_page_set_valid_range(m, off & PAGE_MASK, pgend - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Set the valid bits and clear the dirty bits of page 'm' for the
  *	range that starts at object offset 'off' and runs to the end of
  *	that page, clipped to the end of the buffer's data
  *	(b_offset + b_bcount, i.e. the file EOF rather than the buffer's
  *	allocation size).
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, pgend;
 
 	/* First page boundary strictly above off. */
 	pgend = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 
 	/* Never reach past the end of the buffer's data. */
 	bufend = bp->b_offset + bp->b_bcount;
 	if (pgend > bufend)
 		pgend = bufend;
 
 	/* An empty or inverted range leaves the page untouched. */
 	if (pgend <= off)
 		return;
 	vm_page_set_validclean(m, (vm_offset_t)(off & PAGE_MASK),
 	    (vm_offset_t)(pgend - off));
 }
 
 /*
  * Ensure that all buffer pages are not exclusive busied.  If any page is
  * exclusive busy, drain it.
  *
  * The buffer's VM object must be write-locked by the caller.  The lock
  * is dropped and reacquired around each sleep.
  */
 void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
 	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (vm_page_xbusied(m)) {
 			/*
 			 * Shared-busy every earlier page that is not yet
 			 * held this way before sleeping with the object
 			 * unlocked.
 			 */
 			for (; last_busied < i; last_busied++)
 				vm_page_sbusy(bp->b_pages[last_busied]);
 			/* Sleep until the exclusive busy state clears. */
 			while (vm_page_xbusied(m)) {
 				vm_page_lock(m);
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				vm_page_busy_sleep(m, "vbpage", true);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 			}
 		}
 	}
 	/* Drop the temporary shared-busy references taken above. */
 	for (i = 0; i < last_busied; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  *
  * A non-zero clear_modify write-protects each page and marks the
  * buffer's range within it valid and clean.  With clear_modify == 0,
  * fully valid pages of a buffer without B_CACHE are replaced by
  * bogus_page.  Non-VMIO buffers are left untouched.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 	bool bogus;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_WLOCK(obj);
 	/* Wait out any exclusive busy holders before touching the pages. */
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	bogus = false;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		/*
 		 * Buffers flagged B_CLUSTER skip the per-page
 		 * paging-in-progress and shared-busy accounting here.
 		 */
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in
 		 * partially instantiated buffers.  Partially
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus = true;
 		}
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	/*
 	 * If any entry was swapped for bogus_page and the buffer is
 	 * mapped, re-enter the whole page array into its mapping.
 	 */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Mark 'size' bytes starting 'base' bytes into the buffer as valid
  *	in the buffer's pages.  'base' is relative to b_offset, which may
  *	itself lie part-way into the first page.  Non-VMIO buffers are
  *	ignored.
  */
 void
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	vm_page_t pg;
 	int chunk, idx;
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
 	/*
 	 * Rebase onto the start of the first page.  The first chunk is
 	 * limited to what remains of the page containing 'base'.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	idx = base / PAGE_SIZE;
 	while (size > 0 && idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		if (chunk > size)
 			chunk = size;
 		vm_page_set_valid_range(pg, base & PAGE_MASK, chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first starts on a page boundary. */
 		chunk = PAGE_SIZE;
 		idx++;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  *
  *	Validity is tracked with one bit per DEV_BSIZE-sized chunk of a
  *	page, as the mask computations below show.
  */
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask, sa, ea, slide;
 
 	/* Anything that is not a pure VMIO buffer is simply zeroed. */
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	/*
 	 * Fast path: a page-aligned buffer smaller than one page.  A
 	 * partially valid page falls through to the general loop below.
 	 */
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		/* Already fully valid: nothing to clear. */
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		/* Entirely invalid: zero and validate in one step. */
 		if ((bp->b_pages[0]->valid & mask) == 0) {
 			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 	}
 	/*
 	 * General case: for each page, [sa, ea) is the byte range of the
 	 * page covered by the buffer; sa is non-zero only for the first
 	 * page.
 	 */
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		/* j is the index of the first valid bit for this range. */
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			/* Mixed validity: zero only the invalid chunks. */
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 /*
  * Zero 'size' bytes of the buffer starting at byte offset 'base'.
  * A mapped buffer is cleared through its b_data mapping.  An unmapped
  * one is cleared page by page through pmap_zero_page_area().
  */
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	vm_page_t pg;
 	int chunk, idx;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 		return;
 	}
 
 	BUF_CHECK_UNMAPPED(bp);
 	/* The first chunk is limited to what remains of base's page. */
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 	idx = base / PAGE_SIZE;
 	while (size > 0 && idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		if (chunk > size)
 			chunk = size;
 		pmap_zero_page_area(pg, base & PAGE_MASK, chunk);
 		base += chunk;
 		size -= chunk;
 		chunk = PAGE_SIZE;
 		idx++;
 	}
 }
 
 /*
  * Translate I/O request flags into buffer flags and, when 'release' is
  * true, release the buffer.
  *
  * IO_DIRECT sets B_DIRECT.  For VMIO or direct I/O on a buffer with no
  * dependencies, B_RELBUF is set (plus B_NOREUSE for IO_NOREUSE) and the
  * buffer is released with brelse().  In every other case a requested
  * release goes through bqrelse().
  */
 static void
 b_io_dismiss(struct buf *bp, int ioflag, bool release)
 {
 	bool to_vm;
 
 	KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0,
 	    ("buf %p non-VMIO noreuse", bp));
 
 	if ((ioflag & IO_DIRECT) != 0)
 		bp->b_flags |= B_DIRECT;
 
 	to_vm = (ioflag & (IO_VMIO | IO_DIRECT)) != 0 &&
 	    LIST_EMPTY(&bp->b_dep);
 	if (to_vm) {
 		bp->b_flags |= B_RELBUF;
 		if ((ioflag & IO_NOREUSE) != 0)
 			bp->b_flags |= B_NOREUSE;
 	}
 
 	if (!release)
 		return;
 	if (to_vm)
 		brelse(bp);
 	else
 		bqrelse(bp);
 }
 
 /*
  * Apply the I/O request flags in 'ioflag' to the buffer and release it.
  * Calls b_io_dismiss() with release set to true.
  */
 void
 vfs_bio_brelse(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, true);
 }
 
 /*
  * Apply the I/O request flags in 'ioflag' to the buffer without releasing
  * it.  Calls b_io_dismiss() with release set to false.
  */
 void
 vfs_bio_set_flags(struct buf *bp, int ioflag)
 {
 
 	b_io_dismiss(bp, ioflag, false);
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffer's address space.  The pages are anonymous and are
  * not associated with a file object.
  *
  * This routine fills the page-rounded range [from, to) of a mapped
  * buffer with newly allocated wired pages.  It enters each one into the
  * kernel map, records it in b_pages and updates b_npages.  Allocation is
  * retried after VM_WAIT until it succeeds.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t va;
 	vm_page_t pg;
 	int slot;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	slot = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (va = from; va < to; va += PAGE_SIZE, slot++) {
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.
 		 */
 		do {
 			pg = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 			    VM_ALLOC_COUNT((to - va) >> PAGE_SHIFT));
 			if (pg == NULL)
 				VM_WAIT;
 		} while (pg == NULL);
 		pmap_qenter(va, &pg, 1);
 		bp->b_pages[slot] = pg;
 	}
 	bp->b_npages = slot;
 }
 
 /*
  * Return to the VM system the pages of a mapped buffer that lie beyond
  * the first newbsize bytes (rounded up to a page).  The pages are
  * unmapped, unwired and freed, and b_npages is reduced to match.
  */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t keep_end;
 	vm_page_t pg;
 	int i, keep;
 
 	BUF_CHECK_MAPPED(bp);
 
 	keep_end = round_page((vm_offset_t)bp->b_data + newbsize);
 	keep = (keep_end - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (keep < bp->b_npages)
 		pmap_qremove(keep_end, bp->b_npages - keep);
 	for (i = keep; i < bp->b_npages; i++) {
 		pg = bp->b_pages[i];
 		bp->b_pages[i] = NULL;
 		/* A shared-busy page is reported here but still freed. */
 		if (vm_page_sbusied(pg))
 			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
 			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
 		pg->wire_count--;
 		vm_page_free(pg);
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 	}
 	bp->b_npages = keep;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * The pages behind the current process's b_data range are held and
  * recorded in b_pages.  They are entered at b_kvabase when 'mapbuf' is
  * non-zero or unmapped buffers are not allowed.  Otherwise b_data is set
  * to unmapped_buf.  b_bufsize, not b_bcount, determines the size, since
  * b_bcount might be modified by the driver.
  *
  * Returns 0 on success and -1 on failure.  Even if the caller believes
  * the address space is valid, a race or a smaller file mapped into a
  * larger space may make this fail, so all callers MUST check the
  * return value.
  *
  * This function only works with pager buffers.
  */
 int
 vmapbuf(struct buf *bp, int mapbuf)
 {
 	vm_prot_t access;
 	int held;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 
 	/* A read from the device writes into the pages. */
 	access = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		access |= VM_PROT_WRITE;
 
 	held = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, access, bp->b_pages,
 	    btoc(MAXPHYS));
 	if (held < 0)
 		return (-1);
 
 	bp->b_npages = held;
 	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
 	if (!mapbuf && unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 	} else {
 		pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, held);
 		bp->b_data = bp->b_kvabase + bp->b_offset;
 	}
 	return (0);
 }
 
 /*
  * Undo vmapbuf(): remove the kernel mapping of the buffer's pages if the
  * buffer is mapped, drop the hold on every page, and leave b_data set
  * to unmapped_buf.
  *
  * This function only works with pager buffers.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int count;
 
 	count = bp->b_npages;
 	if (buf_mapped(bp)) {
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), count);
 	}
 	vm_page_unhold_pages(bp->b_pages, count);
 
 	bp->b_data = unmapped_buf;
 }
 
 /*
  * Mark the buffer B_DONE and wake any thread sleeping on it.  Both
  * steps are done under the pool mutex selected for this buffer.
  */
 void
 bdone(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep until the buffer has B_DONE set.  'pri' and 'wchan' are passed
  * to msleep() as the priority and wait message.  The flag is tested
  * under the pool mutex selected for this buffer.
  */
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	for (;;) {
 		if ((bp->b_flags & B_DONE) != 0)
 			break;
 		msleep(bp, lock, pri, wchan, 0);
 	}
 	mtx_unlock(lock);
 }
 
 /*
  * Sync a bufobj by calling VOP_FSYNC() on the vnode obtained from
  * bo2vnode(), on behalf of the current thread.  Returns the VOP_FSYNC()
  * result.
  */
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 
 	return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i = 0;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  * Account for one more write in progress on the buffer object.  This is
  * the variant for callers that already hold the bufobj lock, which is
  * asserted rather than acquired.
  */
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	/* Name this function, not bufobj_wref(), so a panic points here. */
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 /*
  * Account for one more write in progress on the buffer object, taking
  * and releasing the bufobj lock around the update.
  */
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 
 	BO_LOCK(bo);
 	bo->bo_numoutput += 1;
 	BO_UNLOCK(bo);
 }
 
 /*
  * Account for the completion of one write on the buffer object.  When the
  * count of writes in progress reaches zero and BO_WWAIT is set, clear the
  * flag and wake up the threads sleeping on bo_numoutput.
  */
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	bo->bo_numoutput--;
 	if (bo->bo_numoutput == 0 && (bo->bo_flag & BO_WWAIT) != 0) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Set bio_data or bio_ma for struct bio from the struct buf.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	/* Mapped buffer: the bio simply borrows the buffer's data pointer. */
 	if (buf_mapped(bp)) {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 		return;
 	}
 
 	/*
 	 * Unmapped buffer: describe the data by the buffer's page array and
 	 * the offset into the first page, and flag the bio as unmapped.
 	 */
 	KASSERT(unmapped_buf_allowed, ("unmapped"));
 	bip->bio_ma = bp->b_pages;
 	bip->bio_ma_n = bp->b_npages;
 	bip->bio_data = unmapped_buf;
 	bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 	bip->bio_flags |= BIO_UNMAPPED;
 
 	/* The page array must cover exactly offset + length bytes. */
 	KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 	    PAGE_SIZE == bp->b_npages,
 	    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 	    (long long)bip->bio_length, bip->bio_ma_n));
 }
 
 /*
  * The MIPS pmap code currently doesn't handle aliased pages.
  * The VIPT caches may not handle page aliasing themselves, leading
  * to data corruption.
  *
  * As such, this code makes a system extremely unhappy if said
  * system doesn't support unaliasing the above situation in hardware.
  * Some "recent" systems (eg some mips24k/mips74k cores) don't enable
  * this feature at build time, so it has to be handled in software.
  *
  * Once the MIPS pmap/cache code grows to support this function on
  * earlier chips, it should be flipped back off.
  */
 #ifdef	__mips__
 static int buf_pager_relbuf = 1;
 #else
 static int buf_pager_relbuf = 0;
 #endif
 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
     &buf_pager_relbuf, 0,
     "Make buffer pager release buffers after reading");
 
 /*
  * The buffer pager.  It uses buffer reads to validate pages.
  *
  * In contrast to the generic local pager from vm/vnode_pager.c, this
  * pager correctly and easily handles volumes where the underlying
  * device block size is greater than the machine page size.  The
  * buffer cache transparently extends the requested page run to be
  * aligned at the block boundary, and does the necessary bogus page
  * replacements in the addends to avoid obliterating already valid
  * pages.
  *
  * The only non-trivial issue is that the exclusive busy state for
  * pages, which is assumed by the vm_pager_getpages() interface, is
  * incompatible with the VMIO buffer cache's desire to share-busy the
  * pages.  This function performs a trivial downgrade of the pages'
  * state before reading buffers, and a less trivial upgrade from the
  * shared-busy to excl-busy state after the read.
  *
  * Arguments:
  *	vp		vnode whose v_object holds the pages
  *	ma, count	the run of pages to validate
  *	rbehind, rahead	when not NULL, receive the number of pages before
  *			and after the run that are counted as paged in
  *	get_lblkno	maps a byte offset in the file to a logical block
  *	get_blksize	returns the size of a logical block
  *
  * Returns VM_PAGER_BAD when the last page starts at or beyond the end
  * of the object, VM_PAGER_ERROR when a buffer read fails, and
  * VM_PAGER_OK otherwise.
  */
 int
 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
     int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
     vbg_get_blksize_t get_blksize)
 {
 	vm_page_t m;
 	vm_object_t object;
 	struct buf *bp;
 	struct mount *mp;
 	daddr_t lbn, lbnp;
 	vm_ooffset_t la, lb, poff, poffe;
 	long bsize;
 	int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
 	bool redo, lpart;
 
 	object = vp->v_object;
 	mp = vp->v_mount;
 
 	/*
 	 * The read loop below may skip every page (all of them already
 	 * valid), in which case bread_gb() never assigns error.  Start
 	 * from success so that the tests after the loop do not read an
 	 * indeterminate value.
 	 */
 	error = 0;
 	la = IDX_TO_OFF(ma[count - 1]->pindex);
 	if (la >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
 	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
 
 	/*
 	 * Calculate read-ahead, behind and total pages.
 	 */
 	pgsin = count;
 	lb = IDX_TO_OFF(ma[0]->pindex);
 	pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
 	pgsin += pgsin_b;
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
 	if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
 		pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
 		    PAGE_SIZE) - la);
 	pgsin += pgsin_a;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, pgsin);
 
 	br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
 	    != 0) ? GB_UNMAPPED : 0;
 	VM_OBJECT_WLOCK(object);
 again:
 	for (i = 0; i < count; i++)
 		vm_page_busy_downgrade(ma[i]);
 	VM_OBJECT_WUNLOCK(object);
 
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 
 		/*
 		 * Pages are shared busy and the object lock is not
 		 * owned, which together allow for the pages'
 		 * invalidation.  The racy test for validity avoids
 		 * useless creation of the buffer for the most typical
 		 * case when invalidation is not used in redo or for
 		 * parallel read.  The shared->excl upgrade loop at
 		 * the end of the function catches the race in a
 		 * reliable way (protected by the object lock).
 		 */
 		if (m->valid == VM_PAGE_BITS_ALL)
 			continue;
 
 		poff = IDX_TO_OFF(m->pindex);
 		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
 		for (; poff < poffe; poff += bsize) {
 			lbn = get_lblkno(vp, poff);
 			if (lbn == lbnp)
 				goto next_page;
 			lbnp = lbn;
 
 			bsize = get_blksize(vp, lbn);
 			error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
 			    br_flags, &bp);
 			if (error != 0)
 				goto end_pages;
 			if (LIST_EMPTY(&bp->b_dep)) {
 				/*
 				 * Invalidation clears m->valid, but
 				 * may leave B_CACHE flag if the
 				 * buffer existed at the invalidation
 				 * time.  In this case, recycle the
 				 * buffer to do real read on next
 				 * bread() after redo.
 				 *
 				 * Otherwise B_RELBUF is not strictly
 				 * necessary, enable to reduce buf
 				 * cache pressure.
 				 */
 				if (buf_pager_relbuf ||
 				    m->valid != VM_PAGE_BITS_ALL)
 					bp->b_flags |= B_RELBUF;
 
 				bp->b_flags &= ~B_NOCACHE;
 				brelse(bp);
 			} else {
 				bqrelse(bp);
 			}
 		}
 		KASSERT(1 /* racy, enable for debugging */ ||
 		    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
 		    ("buf %d %p invalid", i, m));
 		if (i == count - 1 && lpart) {
 			VM_OBJECT_WLOCK(object);
 			if (m->valid != 0 &&
 			    m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, TRUE);
 			VM_OBJECT_WUNLOCK(object);
 		}
 next_page:;
 	}
 end_pages:
 
 	VM_OBJECT_WLOCK(object);
 	redo = false;
 	for (i = 0; i < count; i++) {
 		vm_page_sunbusy(ma[i]);
 		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
 
 		/*
 		 * Since the pages were only sbusy while neither the
 		 * buffer nor the object lock was held by us, or
 		 * reallocated while vm_page_grab() slept for busy
 		 * relinquish, they could have been invalidated.
 		 * Recheck the valid bits and re-read as needed.
 		 *
 		 * Note that the last page is made fully valid in the
 		 * read loop, and partial validity for the page at
 		 * index count - 1 could mean that the page was
 		 * invalidated or removed, so we must restart for
 		 * safety as well.
 		 */
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			redo = true;
 	}
 	if (redo && error == 0)
 		goto again;
 	VM_OBJECT_WUNLOCK(object);
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB "show buffer <addr>" command: print the fields of the struct buf
  * located at the given address.  Prints a usage message when no address
  * was supplied.
  */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
 #ifdef FULL_BUF_TRACKING
 	uint32_t i, j;
 #endif
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	/* Flag words, decoded with the matching PRINT_BUF_* bit names. */
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
 	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	/* Error, sizes, owning bufobj, data pointer and block numbers. */
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
 	db_printf("b_kvabase = %p, b_kvasize = %d\n",
 	    bp->b_kvabase, bp->b_kvasize);
 	/* List each page as (object, pindex, physical address). */
 	if (bp->b_npages) {
 		/* Shadows the uint32_t i declared under FULL_BUF_TRACKING. */
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			if (m != NULL)
 				db_printf("(%p, 0x%lx, 0x%lx)", m->object,
 				    (u_long)m->pindex,
 				    (u_long)VM_PAGE_TO_PHYS(m));
 			else
 				db_printf("( ??? )");
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 #if defined(FULL_BUF_TRACKING)
 	db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
 
 	/*
 	 * Walk the tracking slots backwards from the b_io_tcnt position,
 	 * skipping slots that are still NULL.
 	 */
 	i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
 	for (j = 1; j <= BUF_TRACKING_SIZE; j++) {
 		if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL)
 			continue;
 		db_printf(" %2u: %s\n", j,
 		    bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
 	}
 #elif defined(BUF_TRACKING)
 	db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
 #endif
 	/* Finish with the state of the buffer's lock. */
 	db_printf(" ");
 	BUF_LOCKPRINTINFO(bp);
 }
 
 /*
  * DDB "show lockedbufs" command: run "show buffer" on every locked entry
  * of the buf[] array, stopping early when the pager is quit.
  */
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int idx;
 
 	for (idx = 0; idx < nbuf; idx++) {
 		bp = &buf[idx];
 		if (!BUF_ISLOCKED(bp))
 			continue;
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * DDB "show vnodebufs <addr>" command: run "show buffer" on every buffer
  * on the clean list and then the dirty list of the bufobj embedded in the
  * vnode at the given address.
  */
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *target;
 	struct buf *cur;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	target = (struct vnode *)addr;
 
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(cur, &target->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)cur, 1, 0, NULL);
 		db_printf("\n");
 	}
 
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(cur, &target->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)cur, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (bp->b_qindex == QUEUE_EMPTY)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: head/sys/sys/param.h
===================================================================
--- head/sys/sys/param.h	(revision 320061)
+++ head/sys/sys/param.h	(revision 320062)
@@ -1,363 +1,369 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /*
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
 #define __FreeBSD_version 1200033	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #ifdef _KERNEL
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	63		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 /* Signals. */
 #include <sys/signal.h>
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without affecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
- *		basis by defining it in <machine/param.h>.  This should
- *		probably be done to increase its value, when MAXBCACHEBUF is
- *		defined as a larger value in <machine/param.h>.
+ *		basis by defining it in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use
  *		the KVM memory reserved for the buffer cache and will wind
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
+
+/*
+ * This variable is tuned via vfs.maxbcachebuf and is set to the value of
+ * MAXBCACHEBUF by default.
+ */
+#ifdef _KERNEL
+extern int	maxbcachebuf;
+#endif
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /*
  * Macros for counting and rounding.
  *
  * These are plain expression macros: each may evaluate its arguments more
  * than once, so do not pass expressions with side effects.  The *2
  * variants use bit masking and give meaningful results only when y is a
  * power of two.  Note that powerof2() is also true for x == 0.
  */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is a power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is a power of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef lint
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* lint */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 /*
  * Conversions between device blocks (DEV_BSIZE units) and pages.
  * dbtoc() rounds up to whole pages.  Both macros parenthesize their
  * argument so that an expression such as (a | b) converts correctly.
  */
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	(((db) + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
 
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */