diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index ab1428c06d87..c8592807f843 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -1,1877 +1,1877 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2015 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/cdefs.h>
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <machine/bus.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
 /*
  * Use the page pod tag for the TT hash.
  *
  * TT_HASH(icc, tt) extracts the tag from 'tt' with G_PPOD_TAG() and masks
  * it with icc->cmp_hash_mask, the mask that hashinit() fills in when
  * icl_cxgbei_new_conn() creates icc->cmp_table.
  * NOTE(review): the result is presumably a bucket index into
  * icc->cmp_table; no lookup is visible in this part of the file.
  */
 #define	TT_HASH(icc, tt)	(G_PPOD_TAG(tt) & (icc)->cmp_hash_mask)
 
 /*
  * Bundles a page pod reservation with a cxgbei_cmp entry.
  * NOTE(review): no user of this structure is visible in this part of the
  * file; presumably it is the per-transfer DDP state -- confirm.
  */
 struct cxgbei_ddp_state {
 	struct ppod_reservation prsv;
 	struct cxgbei_cmp cmp;
 };
 
 /* malloc(9) type used for cxgbei PDUs and the per-connection hash table. */
 static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)");
 
 /* The knobs below are exported under the kern.icl.cxgbei sysctl node. */
 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Chelsio iSCSI offload");
 /*
  * NOTE(review): no consumer of the two burst lengths is visible in this
  * part of the file.
  */
 static int first_burst_length = 8192;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN,
     &first_burst_length, 0, "First burst length");
 static int max_burst_length = 2 * 1024 * 1024;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN,
     &max_burst_length, 0, "Maximum burst length");
 /*
  * Lower bounds for the socket buffer sizes that icl_cxgbei_setsockopt()
  * passes to soreserve().
  */
 static int sendspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 /*
  * Acquired in icl_cxgbei_new_conn() and released in
  * icl_cxgbei_conn_free(), i.e. it tracks connections that currently exist.
  */
 static volatile u_int icl_cxgbei_ncons;
 
 /*
  * Prototypes for the icl_conn methods implemented in this file.  Each is
  * declared through its icl_conn_*_t function typedef (presumably
  * generated into icl_conn_if.h -- confirm) so the signatures stay in sync
  * with the interface.  icl_cxgbei_conn_pdu_free() is not listed here; it
  * is defined below without 'static'.
  */
 static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
 static icl_conn_pdu_data_segment_length_t
 				    icl_cxgbei_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_bio_t	icl_cxgbei_conn_pdu_append_bio;
 static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
 static icl_conn_pdu_get_bio_t	icl_cxgbei_conn_pdu_get_bio;
 static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
 static icl_conn_pdu_queue_cb_t	icl_cxgbei_conn_pdu_queue_cb;
 static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
 static icl_conn_free_t		icl_cxgbei_conn_free;
 static icl_conn_close_t		icl_cxgbei_conn_close;
 static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
 static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
 static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;
 
 /*
  * kobj(9) method table binding each icl_conn interface method to its
  * cxgbei implementation.  The { 0, 0 } entry terminates the table.
  */
 static kobj_method_t icl_cxgbei_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_cxgbei_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_bio, icl_cxgbei_conn_pdu_append_bio),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_bio, icl_cxgbei_conn_pdu_get_bio),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb),
 	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
 	{ 0, 0 }
 };
 
 /*
  * The class instance size is sizeof(struct icl_cxgbei_conn);
  * icl_cxgbei_new_conn() creates instances with kobj_create().
  */
 DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
 
 /*
  * icl_conn_pdu_free method.
  *
  * Frees the AHS, data and BHS mbuf chains of 'ip' and drops one reference
  * on the enclosing icl_cxgbei_pdu.  The PDU memory itself is released
  * only when that was the last reference; otherwise the remaining holder
  * is responsible for it.
  */
 void
 icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	icp = ip_to_icp(ip);
 	KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU"));
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 
 	/* Release every mbuf chain still attached to the PDU. */
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	m_freem(ip->ip_bhs_mbuf);
 
 	KASSERT(ic != NULL || icp->ref_cnt == 1,
 	    ("orphaned PDU has oustanding references"));
 
 	/* atomic_fetchadd_int() returns the old count: 1 means we were last. */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) {
 		free(icp, M_CXGBEI);
 #ifdef DIAGNOSTIC
 		if (__predict_true(ic != NULL))
 			refcount_release(&ic->ic_outstanding_pdus);
 #endif
 	}
 }
 
 static void
 icl_cxgbei_pdu_call_cb(struct icl_pdu *ip)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 
 	if (icp->cb != NULL)
 		icp->cb(ip, icp->error);
 #ifdef DIAGNOSTIC
 	if (__predict_true(ip->ip_conn != NULL))
 		refcount_release(&ip->ip_conn->ic_outstanding_pdus);
 #endif
 	free(icp, M_CXGBEI);
 }
 
 /*
  * Complete a PDU that will not be (or could not be) processed further.
  *
  * A non-zero 'error' is recorded for the completion callback.  All mbuf
  * chains are freed and their pointers cleared, then the PDU's own
  * reference is dropped and the callback path frees the PDU.
  */
 static void
 icl_cxgbei_pdu_done(struct icl_pdu *ip, int error)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	icp = ip_to_icp(ip);
 	if (error != 0)
 		icp->error = error;
 
 	m_freem(ip->ip_ahs_mbuf);
 	ip->ip_ahs_mbuf = NULL;
 	m_freem(ip->ip_data_mbuf);
 	ip->ip_data_mbuf = NULL;
 	m_freem(ip->ip_bhs_mbuf);
 	ip->ip_bhs_mbuf = NULL;
 
 	/*
 	 * Freeing ip_data_mbuf is expected to have released every other
 	 * reference, so the one dropped here must be the last.
 	 */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1)
 		__assert_unreachable();
 	else
 		icl_cxgbei_pdu_call_cb(ip);
 }
 
 /*
  * External-storage free routine installed by
  * icl_cxgbei_conn_pdu_append_data() for ICL_NOCOPY buffers.  ext_arg1
  * carries the owning icl_cxgbei_pdu.
  */
 static void
 icl_cxgbei_mbuf_done(struct mbuf *mb)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	icp = mb->m_ext.ext_arg1;
 
 	/*
 	 * NB: ref_cnt is not examined here because mb_free_mext() may
 	 * take its fast path and leave the count at 1 without
 	 * decrementing it.
 	 */
 	icl_cxgbei_pdu_call_cb(&icp->ip);
 }
 
 /*
  * Allocate a PDU that is not yet tied to a connection.
  *
  * 'flags' is passed to both malloc(9) and m_gethdr().  On success the
  * PDU holds one reference and a header mbuf containing a zeroed BHS.
  * Returns NULL if either allocation fails.
  */
 struct icl_pdu *
 icl_cxgbei_new_pdu(int flags)
 {
 	struct icl_cxgbei_pdu *icp;
 	struct mbuf *bhs_mbuf;
 
 	icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO);
 	if (__predict_false(icp == NULL))
 		return (NULL);
 
 	bhs_mbuf = m_gethdr(flags, MT_DATA);
 	if (__predict_false(bhs_mbuf == NULL)) {
 		free(icp, M_CXGBEI);
 		return (NULL);
 	}
 
 	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
 	icp->ref_cnt = 1;
 
 	/* The header mbuf holds exactly one BHS, initially all zeroes. */
 	bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
 	bhs_mbuf->m_pkthdr.len = bhs_mbuf->m_len;
 	icp->ip.ip_bhs_mbuf = bhs_mbuf;
 	icp->ip.ip_bhs = mtod(bhs_mbuf, struct iscsi_bhs *);
 	memset(icp->ip.ip_bhs, 0, sizeof(*icp->ip.ip_bhs));
 
 	return (&icp->ip);
 }
 
 /*
  * Associate a freshly allocated PDU with connection 'ic'.  Under
  * DIAGNOSTIC this also counts the PDU in ic->ic_outstanding_pdus; the
  * matching release happens when the PDU is freed.
  */
 void
 icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
 {
 
 	ip->ip_conn = ic;
 #ifdef DIAGNOSTIC
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 }
 
 /*
  * icl_conn_new_pdu method: allocate an icl_pdu with an empty BHS for the
  * caller to fill in and attach it to 'ic'.  Returns NULL when the
  * allocation fails.
  */
 static struct icl_pdu *
 icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_pdu *ip;
 
 	ip = icl_cxgbei_new_pdu(flags);
 	if (__predict_true(ip != NULL))
 		icl_cxgbei_new_pdu_set_conn(ip, ic);
 
 	return (ip);
 }
 
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 
 	len += request->ip_bhs->bhs_data_segment_len[0];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[1];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[2];
 
 	return (len);
 }
 
 /*
  * icl_conn_pdu_data_segment_length method.  'ic' is unused; the length
  * is read from the BHS of 'request'.
  */
 size_t
 icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 /*
  * Turn a queued PDU into a single mbuf chain ready for transmission.
  *
  * The data segment is zero-padded to a 4 byte boundary, the data segment
  * length is written into the BHS, and the data mbufs are linked behind
  * the BHS mbuf.  When the payload exceeds
  * ic_max_send_data_segment_length the header mbuf is tagged for ISO and
  * the BHS advertises only one maximum-sized segment.  The PDU's own
  * reference is dropped before returning, so 'icp' must not be used by
  * the caller afterwards.
  *
  * Returns the head of the chain (the former BHS mbuf).
  */
 static struct mbuf *
 finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 {
 	struct icl_pdu *ip = &icp->ip;
 	uint8_t ulp_submode, padding;
 	struct mbuf *m, *last;
 	struct iscsi_bhs *bhs;
 	int data_len;
 
 	/*
 	 * Fix up the data segment mbuf first.
 	 */
 	m = ip->ip_data_mbuf;
 	ulp_submode = icc->ulp_submode;
 	if (m != NULL) {
 		last = m_last(m);
 
 		/*
 		 * Round up the data segment to a 4B boundary.  Pad with 0 if
 		 * necessary.  There will definitely be room in the mbuf.
 		 */
 		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
 		if (padding != 0) {
 			MPASS(padding <= M_TRAILINGSPACE(last));
 			bzero(mtod(last, uint8_t *) + last->m_len, padding);
 			last->m_len += padding;
 		}
 	} else {
 		/* No data segment: drop the data digest bit for this PDU. */
 		MPASS(ip->ip_data_len == 0);
 		ulp_submode &= ~ULP_CRC_DATA;
 		padding = 0;
 	}
 
 	/*
 	 * Now the header mbuf that has the BHS.
 	 */
 	m = ip->ip_bhs_mbuf;
 	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
 	MPASS(m->m_len == sizeof(struct iscsi_bhs));
 
 	bhs = ip->ip_bhs;
 	data_len = ip->ip_data_len;
 	if (data_len > icc->ic.ic_max_send_data_segment_length) {
 		struct iscsi_bhs_data_in *bhsdi;
 		int flags;
 
 		KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p",
 		    __func__, padding, icp));
 		/*
 		 * NOTE(review): 1 and 2 are presumably the ISO type codes
 		 * for Data-Out and Data-In expected by the consumer of
 		 * set_mbuf_iscsi_iso_flags() -- confirm.
 		 */
 		switch (bhs->bhs_opcode) {
 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
 			flags = 1;
 			break;
 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
 			flags = 2;
 			break;
 		default:
 			panic("invalid opcode %#x for ISO", bhs->bhs_opcode);
 		}
 		/* The BHS describes a single maximum-sized data segment. */
 		data_len = icc->ic.ic_max_send_data_segment_length;
 		bhsdi = (struct iscsi_bhs_data_in *)bhs;
 		if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) {
 			/*
 			 * Firmware will set F on the final PDU in the
 			 * burst.
 			 */
 			flags |= CXGBE_ISO_F;
 			bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F;
 		}
 		set_mbuf_iscsi_iso(m, true);
 		set_mbuf_iscsi_iso_flags(m, flags);
 		set_mbuf_iscsi_iso_mss(m, data_len);
 	}
 
 	/* Store the 24-bit data segment length, most significant byte first. */
 	bhs->bhs_data_segment_len[2] = data_len;
 	bhs->bhs_data_segment_len[1] = data_len >> 8;
 	bhs->bhs_data_segment_len[0] = data_len >> 16;
 
 	/*
 	 * Extract mbuf chain from PDU.
 	 */
 	m->m_pkthdr.len += ip->ip_data_len + padding;
 	m->m_next = ip->ip_data_mbuf;
 	set_mbuf_ulp_submode(m, ulp_submode);
 	ip->ip_bhs_mbuf = NULL;
 	ip->ip_data_mbuf = NULL;
 	ip->ip_bhs = NULL;
 
 	/*
 	 * Drop PDU reference on icp.  Additional references might
 	 * still be held by zero-copy PDU buffers (ICL_NOCOPY).
 	 */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
 		icl_cxgbei_pdu_call_cb(ip);
 
 	return (m);
 }
 
 /*
  * Body of the per-connection transmit kthread ('arg' is the
  * icl_cxgbei_conn).
  *
  * The thread sleeps on &icc->tx_active until PDUs appear on
  * icc->sent_pdus, takes the whole list, finalizes each PDU into an mbuf
  * chain without holding the connection lock, and then appends the chains
  * to toep->ulp_pduq and pushes them with t4_push_pdus().  It exits once
  * ic_disconnecting is observed.
  */
 static void
 icl_cxgbei_tx_main(void *arg)
 {
 	struct epoch_tracker et;
 	struct icl_cxgbei_conn *icc = arg;
 	struct icl_conn *ic = &icc->ic;
 	struct toepcb *toep = icc->toep;
 	struct socket *so = ic->ic_socket;
 	struct inpcb *inp = sotoinpcb(so);
 	struct icl_pdu *ip;
 	struct mbuf *m;
 	struct mbufq mq;
 	STAILQ_HEAD(, icl_pdu) tx_pdus = STAILQ_HEAD_INITIALIZER(tx_pdus);
 
 	mbufq_init(&mq, INT_MAX);
 
 	ICL_CONN_LOCK(ic);
 	while (__predict_true(!ic->ic_disconnecting)) {
 		/* Idle until icl_cxgbei_conn_pdu_queue_cb() queues work. */
 		while (STAILQ_EMPTY(&icc->sent_pdus)) {
 			icc->tx_active = false;
 			mtx_sleep(&icc->tx_active, ic->ic_lock, 0, "-", 0);
 			if (__predict_false(ic->ic_disconnecting))
 				goto out;
 			MPASS(icc->tx_active);
 		}
 
 		/* Grab the whole queue and finalize it without the lock. */
 		STAILQ_SWAP(&icc->sent_pdus, &tx_pdus, icl_pdu);
 		ICL_CONN_UNLOCK(ic);
 
 		while ((ip = STAILQ_FIRST(&tx_pdus)) != NULL) {
 			STAILQ_REMOVE_HEAD(&tx_pdus, ip_next);
 
 			m = finalize_pdu(icc, ip_to_icp(ip));
 			M_ASSERTPKTHDR(m);
 			MPASS((m->m_pkthdr.len & 3) == 0);
 
 			mbufq_enqueue(&mq, m);
 		}
 
 		/* Recheck the connection state now that the lock is back. */
 		ICL_CONN_LOCK(ic);
 		if (__predict_false(ic->ic_disconnecting) ||
 		    __predict_false(ic->ic_socket == NULL)) {
 			mbufq_drain(&mq);
 			break;
 		}
 
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 
 		/* The inpcb lock is acquired before the connection lock is dropped. */
 		ICL_CONN_UNLOCK(ic);
 		if (__predict_false(inp->inp_flags & INP_DROPPED) ||
 		    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
 			mbufq_drain(&mq);
 		} else {
 			mbufq_concat(&toep->ulp_pduq, &mq);
 			t4_push_pdus(icc->sc, toep, 0);
 		}
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		ICL_CONN_LOCK(ic);
 	}
 out:
 	ICL_CONN_UNLOCK(ic);
 
 	kthread_exit();
 }
 
 /*
  * Body of the per-connection receive kthread ('arg' is the
  * icl_cxgbei_conn).
  *
  * Under the receive sockbuf lock the thread sleeps on &icc->rx_active
  * until PDUs appear on icc->rcvd_pdus, takes the whole list and, with
  * the lock dropped, hands each PDU to ic->ic_receive().  If the socket
  * can no longer receive, the PDUs are completed with ENOTCONN instead.
  * Before exiting it waits for icl_cxgbei_conn_close() to set rx_exiting.
  */
 static void
 icl_cxgbei_rx_main(void *arg)
 {
 	struct icl_cxgbei_conn *icc = arg;
 	struct icl_conn *ic = &icc->ic;
 	struct icl_pdu *ip;
 	struct sockbuf *sb;
 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
 	bool cantrcvmore;
 
 	sb = &ic->ic_socket->so_rcv;
 	SOCKBUF_LOCK(sb);
 	while (__predict_true(!ic->ic_disconnecting)) {
 		/* Idle until woken through &icc->rx_active. */
 		while (STAILQ_EMPTY(&icc->rcvd_pdus)) {
 			icc->rx_active = false;
 			mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
 			if (__predict_false(ic->ic_disconnecting))
 				goto out;
 			MPASS(icc->rx_active);
 		}
 
 		if (__predict_false(sbused(sb)) != 0) {
 			/*
 			 * PDUs were received before the tid
 			 * transitioned to ULP mode.  Convert
 			 * them to icl_cxgbei_pdus and insert
 			 * them into the head of rcvd_pdus.
 			 */
 			parse_pdus(icc, sb);
 		}
 		/* Sample the socket state while the sockbuf lock is held. */
 		cantrcvmore = (sb->sb_state & SBS_CANTRCVMORE) != 0;
 		MPASS(STAILQ_EMPTY(&rx_pdus));
 		STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
 		SOCKBUF_UNLOCK(sb);
 
 		/* Hand over PDUs to ICL. */
 		while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
 			STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
 			if (cantrcvmore)
 				icl_cxgbei_pdu_done(ip, ENOTCONN);
 			else
 				ic->ic_receive(ip);
 		}
 
 		SOCKBUF_LOCK(sb);
 	}
 out:
 	/*
 	 * Since ic_disconnecting is set before the SOCKBUF_MTX is
 	 * locked in icl_cxgbei_conn_close, the loop above can exit
 	 * before icl_cxgbei_conn_close can lock SOCKBUF_MTX and block
 	 * waiting for the thread exit.
 	 */
 	while (!icc->rx_exiting)
 		mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
 	SOCKBUF_UNLOCK(sb);
 
 	kthread_exit();
 }
 
 /*
  * Free routine for the unmapped (M_EXTPG) mbufs built by
  * icl_cxgbei_conn_pdu_append_bio() in ICL_NOCOPY mode.
  *
  * The pages themselves are owned by the PDU / I/O request and are left
  * alone; only the PDU reference taken when the mbuf was set up is
  * dropped, completing the PDU if it was the last one.
  */
 static void
 cxgbei_free_mext_pg(struct mbuf *m)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	M_ASSERTEXTPG(m);
 
 	icp = m->m_ext.ext_arg1;
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1)
 		return;
 	icl_cxgbei_pdu_call_cb(&icp->ip);
 }
 
 /*
  * Allocate an mbuf chain with room for 'len' bytes.
  *
  * As much of the request as possible is satisfied with MJUM16BYTES jumbo
  * clusters, always allocated M_NOWAIT.  If a jumbo allocation fails and
  * the caller passed M_WAITOK, the rest of the request falls back to
  * m_getm2() with the caller's flags; without M_WAITOK the whole request
  * fails.
  *
  * Returns the head of the chain, or NULL on failure, in which case
  * nothing remains allocated.  With len == 0 no mbuf is allocated and
  * NULL is returned.
  */
 static struct mbuf *
 cxgbei_getm(size_t len, int flags)
 {
 	struct mbuf *m, *m0, *m_tail;
 
 	m_tail = m0 = NULL;
 
 	/* Allocate as jumbo mbufs of size MJUM16BYTES. */
 	while (len >= MJUM16BYTES) {
 		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
 		if (__predict_false(m == NULL)) {
 			if ((flags & M_WAITOK) != 0) {
 				/* Fall back to non-jumbo mbufs. */
 				break;
 			}
 			/*
 			 * Do not leak the jumbo mbufs obtained by earlier
 			 * iterations; m_freem() accepts an empty chain.
 			 */
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m0 == NULL) {
 			m0 = m_tail = m;
 		} else {
 			m_tail->m_next = m;
 			m_tail = m;
 		}
 		len -= MJUM16BYTES;
 	}
 
 	/* Allocate mbuf chain for the remaining data. */
 	if (len != 0) {
 		m = m_getm2(NULL, len, flags, MT_DATA, 0);
 		if (__predict_false(m == NULL)) {
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m0 == NULL)
 			m0 = m;
 		else
 			m_tail->m_next = m;
 	}
 
 	return (m0);
 }
 
 /*
  * icl_conn_pdu_append_bio method: append 'len' bytes of the unmapped bio
  * 'bp', starting 'offset' bytes into the bio's data, to the data segment
  * of 'ip'.
  *
  * With ICL_NOCOPY the bio's pages are referenced directly from unmapped
  * (M_EXTPG) mbufs, each of which holds a reference on the PDU that is
  * dropped by cxgbei_free_mext_pg().  Otherwise the data is copied into a
  * chain obtained from cxgbei_getm(), mapping one bio page at a time.
  *
  * Returns 0 on success or ENOMEM if an mbuf could not be allocated.  In
  * the ICL_NOCOPY case mbufs completed before the failure stay attached
  * to the PDU.
  */
 int
 icl_cxgbei_conn_pdu_append_bio(struct icl_conn *ic, struct icl_pdu *ip,
     struct bio *bp, size_t offset, size_t len, int flags)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct mbuf *m, *m_tail;
 	vm_offset_t vaddr;
 	size_t page_offset, todo, mtodo;
 	bool mapped;
 	int i;
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	/* Find the current end of the PDU's data chain, if there is one. */
 	m_tail = ip->ip_data_mbuf;
 	if (m_tail != NULL)
 		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
 			;
 
 	/*
 	 * Translate 'offset' into an index into bp->bio_ma (i) and an
 	 * offset within that page.  Only the first page is shortened by
 	 * bio_ma_offset.
 	 */
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 	if (offset < PAGE_SIZE - bp->bio_ma_offset) {
 		page_offset = bp->bio_ma_offset + offset;
 		i = 0;
 	} else {
 		offset -= PAGE_SIZE - bp->bio_ma_offset;
 		for (i = 1; offset >= PAGE_SIZE; i++)
 			offset -= PAGE_SIZE;
 		page_offset = offset;
 	}
 
 	if (flags & ICL_NOCOPY) {
 		m = NULL;
 		while (len > 0) {
 			if (m == NULL) {
 				m = mb_alloc_ext_pgs(flags & ~ICL_NOCOPY,
-				    cxgbei_free_mext_pg);
+				    cxgbei_free_mext_pg, 0);
 				if (__predict_false(m == NULL))
 					return (ENOMEM);
 				/* Each unmapped mbuf pins the PDU. */
 				atomic_add_int(&icp->ref_cnt, 1);
 				m->m_ext.ext_arg1 = icp;
 				m->m_epg_1st_off = page_offset;
 			}
 
 			todo = MIN(len, PAGE_SIZE - page_offset);
 
 			m->m_epg_pa[m->m_epg_npgs] =
 			    VM_PAGE_TO_PHYS(bp->bio_ma[i]);
 			m->m_epg_npgs++;
 			m->m_epg_last_len = todo;
 			m->m_len += todo;
 			m->m_ext.ext_size += PAGE_SIZE;
 			MBUF_EXT_PGS_ASSERT_SANITY(m);
 
 			/* A full mbuf is linked in and a new one is started. */
 			if (m->m_epg_npgs == MBUF_PEXT_MAX_PGS) {
 				if (m_tail != NULL)
 					m_tail->m_next = m;
 				else
 					ip->ip_data_mbuf = m;
 				m_tail = m;
 				ip->ip_data_len += m->m_len;
 				m = NULL;
 			}
 
 			page_offset = 0;
 			len -= todo;
 			i++;
 		}
 
 		/* Link in the last, partially filled mbuf. */
 		if (m != NULL) {
 			if (m_tail != NULL)
 				m_tail->m_next = m;
 			else
 				ip->ip_data_mbuf = m;
 			ip->ip_data_len += m->m_len;
 		}
 		return (0);
 	}
 
 	m = cxgbei_getm(len, flags);
 	if (__predict_false(m == NULL))
 		return (ENOMEM);
 
 	if (ip->ip_data_mbuf == NULL) {
 		ip->ip_data_mbuf = m;
 		ip->ip_data_len = len;
 	} else {
 		m_tail->m_next = m;
 		ip->ip_data_len += len;
 	}
 
 	while (len > 0) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 
 		/*
 		 * Account for this page's share now: the copy loop below
 		 * counts 'todo' down to zero, so subtracting it afterwards
 		 * would never shrink 'len'.
 		 */
 		len -= todo;
 
 		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
 		    false);
 
 		/* Copy the page's bytes, spilling into following mbufs. */
 		do {
 			mtodo = min(todo, M_SIZE(m) - m->m_len);
 			memcpy(mtod(m, char *) + m->m_len, (char *)vaddr +
 			    page_offset, mtodo);
 			m->m_len += mtodo;
 			if (m->m_len == M_SIZE(m))
 				m = m->m_next;
 			page_offset += mtodo;
 			todo -= mtodo;
 		} while (todo > 0);
 
 		/* Unmap the same page that was mapped above. */
 		if (__predict_false(mapped))
 			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
 			    false);
 
 		page_offset = 0;
 		i++;
 	}
 
 	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
 	    ic->ic_hw_isomax));
 
 	return (0);
 }
 
 /*
  * icl_conn_pdu_append_data method: append 'len' bytes at 'addr' to the
  * data segment of 'ip'.
  *
  * With ICL_NOCOPY a single read-only mbuf is pointed at the caller's
  * buffer; it shares the PDU's reference count and
  * icl_cxgbei_mbuf_done() is its free routine.  Otherwise the bytes are
  * copied into a chain from cxgbei_getm().
  *
  * Returns 0 on success or ENOMEM if no mbuf could be allocated.
  */
 int
 icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
     const void *addr, size_t len, int flags)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct mbuf *m, *m_tail;
 	const char *src;
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	/* Locate the last mbuf of the existing data chain, if any. */
 	m_tail = ip->ip_data_mbuf;
 	if (m_tail != NULL) {
 		while (m_tail->m_next != NULL)
 			m_tail = m_tail->m_next;
 	}
 
 	if (flags & ICL_NOCOPY) {
 		m = m_get(flags & ~ICL_NOCOPY, MT_DATA);
 		if (m == NULL) {
 			ICL_WARN("failed to allocate mbuf");
 			return (ENOMEM);
 		}
 
 		/* Reference the caller's buffer instead of copying it. */
 		m->m_flags |= M_RDONLY;
 		m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt,
 		    icl_cxgbei_mbuf_done, icp, NULL);
 		m->m_len = len;
 
 		if (ip->ip_data_mbuf != NULL) {
 			m_tail->m_next = m;
 			ip->ip_data_len += len;
 		} else {
 			ip->ip_data_mbuf = m;
 			ip->ip_data_len = len;
 		}
 
 		return (0);
 	}
 
 	m = cxgbei_getm(len, flags);
 	if (__predict_false(m == NULL))
 		return (ENOMEM);
 
 	if (ip->ip_data_mbuf != NULL) {
 		m_tail->m_next = m;
 		ip->ip_data_len += len;
 	} else {
 		ip->ip_data_mbuf = m;
 		ip->ip_data_len = len;
 	}
 
 	/* Fill each new mbuf to capacity until the source is exhausted. */
 	src = (const char *)addr;
 	while (m != NULL) {
 		m->m_len = min(len, M_SIZE(m));
 		memcpy(mtod(m, void *), src, m->m_len);
 		src += m->m_len;
 		len -= m->m_len;
 		m = m->m_next;
 	}
 	MPASS(len == 0);
 
 	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
 	    ic->ic_hw_isomax));
 
 	return (0);
 }
 
 /*
  * icl_conn_pdu_get_bio method: copy 'len' bytes of the PDU's data
  * segment, starting at 'pdu_off', into the unmapped bio 'bp' starting
  * 'bio_off' bytes into the bio's data.
  *
  * Nothing is copied when the PDU is flagged ICPF_RX_DDP, because the
  * data has already been placed directly.
  */
 void
 icl_cxgbei_conn_pdu_get_bio(struct icl_conn *ic, struct icl_pdu *ip,
     size_t pdu_off, struct bio *bp, size_t bio_off, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	vm_offset_t vaddr;
 	size_t page_offset, todo;
 	bool mapped;
 	int i;
 
 	if (icp->icp_flags & ICPF_RX_DDP)
 		return; /* data is DDP'ed, no need to copy */
 
 	/*
 	 * Translate 'bio_off' into an index into bp->bio_ma (i) and an
 	 * offset within that page.  Only the first page is shortened by
 	 * bio_ma_offset.
 	 */
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 	if (bio_off < PAGE_SIZE - bp->bio_ma_offset) {
 		page_offset = bp->bio_ma_offset + bio_off;
 		i = 0;
 	} else {
 		bio_off -= PAGE_SIZE - bp->bio_ma_offset;
 		for (i = 1; bio_off >= PAGE_SIZE; i++)
 			bio_off -= PAGE_SIZE;
 		page_offset = bio_off;
 	}
 
 	/* Copy one destination page per iteration. */
 	while (len > 0) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 
 		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
 		    false);
 		m_copydata(ip->ip_data_mbuf, pdu_off, todo, (char *)vaddr +
 		    page_offset);
 		/* Unmap the same page that was mapped above. */
 		if (__predict_false(mapped))
 			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
 			    false);
 
 		page_offset = 0;
 		pdu_off += todo;
 		len -= todo;
 		i++;
 	}
 }
 
 /*
  * icl_conn_pdu_get_data method: copy 'len' bytes of the PDU's data
  * segment, starting at 'off', to 'addr'.  PDUs flagged ICPF_RX_DDP are
  * skipped because their data has already been placed directly.
  */
 void
 icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	if ((icp->icp_flags & ICPF_RX_DDP) == 0)
 		m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /*
  * icl_conn_pdu_queue method: queue 'ip' for transmission without a
  * completion callback.
  */
 void
 icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL);
 }
 
 /*
  * icl_conn_pdu_queue_cb method: queue 'ip' for transmission and remember
  * 'cb' (may be NULL) to be invoked when the PDU is finally freed.
  *
  * The connection lock must be held.  If the connection is going away or
  * the socket is not writeable the PDU is completed immediately with
  * ENOTCONN; otherwise it is appended to sent_pdus and the transmit
  * thread is woken if it was idle.
  */
 void
 icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
 			     icl_pdu_cb cb)
 {
 	struct icl_cxgbei_conn *icc;
 	struct icl_cxgbei_pdu *icp;
 	struct socket *so;
 
 	icc = ic_to_icc(ic);
 	icp = ip_to_icp(ip);
 	so = ic->ic_socket;
 
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 	/* The kernel doesn't generate PDUs with AHS. */
 	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
 
 	ICL_CONN_LOCK_ASSERT(ic);
 
 	icp->cb = cb;
 
 	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
 	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 		return;
 	}
 
 	STAILQ_INSERT_TAIL(&icc->sent_pdus, ip, ip_next);
 
 	/* An already active transmit thread will pick the PDU up itself. */
 	if (icc->tx_active)
 		return;
 	icc->tx_active = true;
 	wakeup(&icc->tx_active);
 }
 
 /*
  * Create a new cxgbei connection object.
  *
  * 'name' is stored as ic_name and 'lock' becomes the connection lock
  * (ic_lock); neither is copied.  The object is created with M_WAITOK.
  * Returns the embedded generic icl_conn.
  */
 static struct icl_conn *
 icl_cxgbei_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 
 	/* Released again in icl_cxgbei_conn_free(). */
 	refcount_acquire(&icl_cxgbei_ncons);
 
 	/*
 	 * The object uses malloc type M_CXGBE (not M_CXGBEI);
 	 * icl_cxgbei_conn_free() passes the same type to kobj_delete().
 	 */
 	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
 	STAILQ_INIT(&icc->rcvd_pdus);
 	STAILQ_INIT(&icc->sent_pdus);
 
 	/* Hash table indexed via TT_HASH(); the mask is stored alongside. */
 	icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask);
 	mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF);
 
 	ic = &icc->ic;
 	ic->ic_lock = lock;
 
 #ifdef DIAGNOSTIC
 	refcount_init(&ic->ic_outstanding_pdus, 0);
 #endif
 	ic->ic_name = name;
 	ic->ic_offload = "cxgbei";
 	ic->ic_unmapped = true;
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	return (ic);
 }
 
 /*
  * icl_conn_free method: undo icl_cxgbei_new_conn().  Destroys the cmp
  * lock and hash table, deletes the connection object and drops the
  * connection count.
  */
 void
 icl_cxgbei_conn_free(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	mtx_destroy(&icc->cmp_lock);
 	hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask);
 	/* M_CXGBE matches the type used by kobj_create() at creation. */
 	kobj_delete((struct kobj *)icc, M_CXGBE);
 	refcount_release(&icl_cxgbei_ncons);
 }
 
 static int
 icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
     int rspace)
 {
 	struct sockopt opt;
 	int error, one = 1, ss, rs;
 
 	ss = max(sendspace, sspace);
 	rs = max(recvspace, rspace);
 
 	error = soreserve(so, ss, rs);
 	if (error != 0)
 		return (error);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Disable Nagle.
 	 */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(so, &opt);
 	if (error != 0)
 		return (error);
 
 	return (0);
 }
 
 /*
  * Request/response structure used to find out the adapter offloading a socket.
  * The caller fills in 'so' and clears 'sc' before handing it to
  * t4_iterate(); find_offload_adapter() sets 'sc' when it finds a match.
  */
 struct find_ofld_adapter_rr {
 	struct socket *so;
 	struct adapter *sc;	/* result */
 };
 
 /*
  * t4_iterate() callback: check whether adapter 'sc' is the one
  * offloading the socket in the find_ofld_adapter_rr passed as 'arg' and,
  * if so, record it in the request.  Adapters visited after a match, or
  * without TOE enabled, are ignored.
  */
 static void
 find_offload_adapter(struct adapter *sc, void *arg)
 {
 	struct find_ofld_adapter_rr *fa = arg;
 	struct socket *so = fa->so;
 	struct tom_data *td = sc->tom_softc;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* Non-TCP were filtered out earlier. */
 	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
 
 	/* Nothing to do if already found, or TOE is off on this adapter. */
 	if (fa->sc != NULL || td == NULL)
 		return;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		tp = intotcpcb(inp);
 		/* Offloaded, and by this adapter's TOE device? */
 		if ((tp->t_flags & TF_TOE) != 0 && tp->tod == &td->tod)
 			fa->sc = sc;
 	}
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Report whether the adapter has no external memory enabled, judging by
  * the A_MA_TARGET_MEM_ENABLE register.  F_EXT_MEM1_ENABLE is only
  * considered on T5 adapters.
  */
 static bool
 is_memfree(struct adapter *sc)
 {
 	uint32_t em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 
 	return ((em & F_EXT_MEM_ENABLE) == 0 &&
 	    !(is_t5(sc) && (em & F_EXT_MEM1_ENABLE) != 0));
 }
 
 /*
  * Send a FW_FLOWC_WR work request for the connection's tid carrying a
  * single parameter, FW_FLOWC_MNEM_TXDATAPLEN_MAX = 'maxlen'.
  *
  * The request consumes tx credits and one tx descriptor slot of 'toep'.
  * Panics if the work request cannot be allocated.
  *
  * XXXNP: move this to t4_tom.
  */
 static void
 send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	const u_int nparams = 1;
 	u_int flowclen;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	/* The work request is allocated in multiples of 16 bytes. */
 	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
 	flowc->mnemval[0].val = htobe32(maxlen);
 
 	/* Charge the request against the connection's tx credits. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* Advance the descriptor producer index, wrapping at txsd_total. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Update the connection's TCB for iSCSI: set the ULP type to
  * ULP_MODE_ISCSI with 'ulp_submode' as the raw ULP value, then set
  * TF_RX_FLOW_CONTROL_DISABLE in the TCB flags.
  */
 static void
 set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode)
 {
 	uint64_t mask, val;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x",
 	    __func__, toep->tid, ulp_submode);
 
 	/* Replace both the ULP type and the raw ULP fields. */
 	mask = V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW);
 	val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, mask, val,
 	    0, 0);
 
 	/* Here the mask and the value are the same single flag. */
 	mask = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
 	val = mask;
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, mask, val, 0, 0);
 }
 
 /*
  * icl_conn_handoff method: take over the offloaded TCP socket behind
  * file descriptor 'fd' and switch its connection to iSCSI ULP mode.
  *
  * The socket is detached from the file, the adapter offloading it is
  * located, the PDU size limits and ULP submode are derived from the
  * negotiated parameters in 'ic', the firmware is programmed, and the
  * transmit and receive threads are started.  Must be called without the
  * connection lock held.
  *
  * Returns 0 on success or an errno value.  Once the socket has been
  * stored in ic->ic_socket, any later failure goes through
  * icl_cxgbei_conn_close().
  *
  * XXXNP: Who is responsible for cleaning up the socket if this returns with an
  * error?  Review all error paths.
  *
  * XXXNP: What happens to the socket's fd reference if the operation is
  * successful, and how does that affect the socket's life cycle?
  */
 int
 icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	cap_rights_t rights;
 	u_int max_iso_payload, max_rx_pdu_len, max_tx_pdu_len;
 	int error, max_iso_pdus;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 	ic->ic_disconnecting = false;
 	ic->ic_socket = so;
 	/* Detach the socket from the file so userland can no longer use it. */
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	/* Find the adapter offloading this socket. */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	if (fa.sc == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	icc->sc = fa.sc;
 
 	/* Largest PDU in each direction: BHS + data segment + digests. */
 	max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length;
 	max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length;
 	if (ic->ic_header_crc32c) {
 		max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
 		max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
 	}
 	if (ic->ic_data_crc32c) {
 		max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
 		max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
 	}
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		error = ENOTCONN;
 		goto out;
 	}
 
 	/*
 	 * socket could not have been "unoffloaded" if here.
 	 */
 	MPASS(tp->t_flags & TF_TOE);
 	MPASS(tp->tod != NULL);
 	MPASS(tp->t_toe != NULL);
 	toep = tp->t_toe;
 	MPASS(toep->vi->adapter == icc->sc);
 
 	/* Refuse a connection that is already in some ULP mode. */
 	if (ulp_mode(toep) != ULP_MODE_NONE) {
 		INP_WUNLOCK(inp);
 		error = EINVAL;
 		goto out;
 	}
 
 	icc->toep = toep;
 
 	/* Enable header/data digest handling as negotiated. */
 	icc->ulp_submode = 0;
 	if (ic->ic_header_crc32c)
 		icc->ulp_submode |= ULP_CRC_HEADER;
 	if (ic->ic_data_crc32c)
 		icc->ulp_submode |= ULP_CRC_DATA;
 
 	/*
 	 * ISO is used only when the adapter advertises it (tt.iso), is
 	 * T5 or later and is_memfree() is false.  ic_hw_isomax is the
 	 * payload that fits in the PDUs of one ISO request.
 	 */
 	if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 &&
 	    !is_memfree(icc->sc)) {
 		max_iso_payload = rounddown(CXGBEI_MAX_ISO_PAYLOAD,
 		    tp->t_maxseg);
 		max_iso_pdus = max_iso_payload / max_tx_pdu_len;
 		ic->ic_hw_isomax = max_iso_pdus *
 		    ic->ic_max_send_data_segment_length;
 	} else
 		max_iso_pdus = 1;
 
 	toep->params.ulp_mode = ULP_MODE_ISCSI;
 	toep->ulpcb = icc;
 
 	send_iscsi_flowc_wr(icc->sc, toep,
 	    roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg));
 	set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
 	INP_WUNLOCK(inp);
 
 	error = kthread_add(icl_cxgbei_tx_main, icc, NULL, &icc->tx_thread, 0,
 	    0, "%stx (cxgbei)", ic->ic_name);
 	if (error != 0)
 		goto out;
 
 	error = kthread_add(icl_cxgbei_rx_main, icc, NULL, &icc->rx_thread, 0,
 	    0, "%srx (cxgbei)", ic->ic_name);
 	if (error != 0)
 		goto out;
 
 	error = icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len);
 out:
 	/* Reached on success too; close is only called for failures. */
 	if (error != 0)
 		icl_cxgbei_conn_close(ic);
 	return (error);
 }
 
 /*
  * Close an offloaded iSCSI connection.
  *
  * Marks the connection as disconnecting, waits for the transmit and
  * receive threads to stop processing it, completes every PDU still on the
  * sent/received queues with ENOTCONN, detaches the connection from its
  * toepcb, closes the socket and finally waits until
  * TPF_WAITING_FOR_FINAL has been cleared on the toepcb.
  *
  * The connection lock must not be held by the caller.  The function
  * returns early without doing anything if a close is already in progress
  * or if the connection has no socket.
  */
 void
 icl_cxgbei_conn_close(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_pdu *ip;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct toepcb *toep = icc->toep;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	ICL_CONN_LOCK(ic);
 	so = ic->ic_socket;
 	/* Close already in progress, or no socket attached: nothing to do. */
 	if (ic->ic_disconnecting || so == NULL) {
 		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
 		    __func__, icc, ic->ic_disconnecting, so);
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_disconnecting = true;
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 
 	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
 	    icc);
 
 	/*
 	 * Wait for the transmit thread to stop processing
 	 * this connection.
 	 */
 	if (icc->tx_thread != NULL) {
 		wakeup(&icc->tx_active);
 		mtx_sleep(icc->tx_thread, ic->ic_lock, 0, "conclo", 0);
 	}
 
 	/* Discard PDUs queued for TX. */
 	while (!STAILQ_EMPTY(&icc->sent_pdus)) {
 		ip = STAILQ_FIRST(&icc->sent_pdus);
 		STAILQ_REMOVE_HEAD(&icc->sent_pdus, ip_next);
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 	}
 	ICL_CONN_UNLOCK(ic);
 
 	inp = sotoinpcb(so);
 	sb = &so->so_rcv;
 
 	/*
 	 * Wait for the receive thread to stop processing this
 	 * connection.
 	 */
 	SOCKBUF_LOCK(sb);
 	if (icc->rx_thread != NULL) {
 		icc->rx_exiting = true;
 		wakeup(&icc->rx_active);
 		mtx_sleep(icc->rx_thread, SOCKBUF_MTX(sb), 0, "conclo", 0);
 	}
 
 	/*
 	 * Discard received PDUs not passed to the iSCSI layer.
 	 */
 	while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
 		ip = STAILQ_FIRST(&icc->rcvd_pdus);
 		STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	INP_WLOCK(inp);
 	if (toep != NULL) {	/* NULL if connection was never offloaded. */
 		toep->ulpcb = NULL;
 
 		/* Discard mbufs queued for TX. */
 		mbufq_drain(&toep->ulp_pduq);
 
 		/*
 		 * Grab a reference to use when waiting for the final
 		 * CPL to be received.  If toep->inp is NULL, then
 		 * final_cpl_received() has already been called (e.g.
 		 * due to the peer sending a RST).
 		 */
 		if (toep->inp != NULL) {
 			toep = hold_toepcb(toep);
 			toep->flags |= TPF_WAITING_FOR_FINAL;
 		} else
 			toep = NULL;
 	}
 	INP_WUNLOCK(inp);
 
 	/* Detach the socket from the connection before closing it. */
 	ICL_CONN_LOCK(ic);
 	ic->ic_socket = NULL;
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * XXXNP: we should send RST instead of FIN when PDUs held in various
 	 * queues were purged instead of delivered reliably but soabort isn't
 	 * really general purpose and wouldn't do the right thing here.
 	 */
 	soclose(so);
 
 	/*
 	 * Wait for the socket to fully close.  This ensures any
 	 * pending received data has been received (and in particular,
 	 * any data that would be received by DDP has been handled).
 	 * Callers assume that it is safe to free buffers for tasks
 	 * and transfers after this function returns.
 	 */
 	if (toep != NULL) {
 		/* The flag is tested under the pool mutex selected by toep. */
 		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
 
 		mtx_lock(lock);
 		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
 			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
 		mtx_unlock(lock);
 		/* Drop the reference taken with hold_toepcb() above. */
 		free_toepcb(toep);
 	}
 }
 
 /*
  * Record transfer tag 'tt' in 'cmp' and link 'cmp' at the head of the
  * matching chain of the connection's cmp_table.  With INVARIANTS, assert
  * that no entry with the same tag is already on that chain.
  */
 static void
 cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
     uint32_t tt)
 {
 #ifdef INVARIANTS
 	struct cxgbei_cmp *other;
 #endif
 
 	cmp->tt = tt;
 
 	mtx_lock(&icc->cmp_lock);
 #ifdef INVARIANTS
 	/* A tag may appear at most once in its hash chain. */
 	LIST_FOREACH(other, &icc->cmp_table[TT_HASH(icc, tt)], link)
 		KASSERT(other->tt != tt, ("%s: duplicate cmp", __func__));
 #endif
 	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
 	mtx_unlock(&icc->cmp_lock);
 }
 
 /*
  * Look up the entry for transfer tag 'tt' in the connection's cmp_table.
  * Returns the matching entry, or NULL if the chain holds no such tag.
  * cmp_lock is held only for the duration of the search.
  */
 struct cxgbei_cmp *
 cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
 {
 	struct cxgbei_cmp *iter, *found = NULL;
 
 	mtx_lock(&icc->cmp_lock);
 	LIST_FOREACH(iter, &icc->cmp_table[TT_HASH(icc, tt)], link) {
 		if (iter->tt == tt) {
 			found = iter;
 			break;
 		}
 	}
 	mtx_unlock(&icc->cmp_lock);
 
 	return (found);
 }
 
 /*
  * Unlink 'cmp' from the connection's cmp_table.  With INVARIANTS, panic if
  * 'cmp' is not on the chain selected by its transfer tag.
  */
 static void
 cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
 {
 #ifdef INVARIANTS
 	struct cxgbei_cmp *iter;
 	bool present = false;
 #endif
 
 	mtx_lock(&icc->cmp_lock);
 
 #ifdef INVARIANTS
 	LIST_FOREACH(iter, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
 		if (iter == cmp) {
 			present = true;
 			break;
 		}
 	}
 	if (!present)
 		panic("%s: could not find cmp", __func__);
 #endif
 	LIST_REMOVE(cmp, link);
 	mtx_unlock(&icc->cmp_lock);
 }
 
 /*
  * Per-task setup for a CAM SCSI I/O (struct ccb_scsiio).
  *
  * For data-in requests of at least ci->ddp_threshold bytes on a live
  * connection, try to reserve page pods for the data buffer and queue the
  * work requests that write them, so that the tag placed in *ittp is a
  * hardware DDP tag.  In every other case (including any failure while
  * reserving or writing page pods) *ittp is rewritten so that it cannot be a
  * valid hardware DDP tag and *arg is left NULL.
  *
  * On DDP success *arg is set to the page pod reservation embedded in the
  * newly allocated struct cxgbei_ddp_state.
  * NOTE(review): icl_cxgbei_conn_task_done() treats that pointer as the
  * struct cxgbei_ddp_state itself, which presumably relies on 'prsv' being
  * its first member -- confirm against the structure definition.
  *
  * The connection lock must be held by the caller.  Always returns 0; a
  * non-zero internal rc only bumps the rx_iscsi_ddp_setup_error counter.
  */
 int
 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct inpcb *inp;
 	struct mbufq mq;
 	uint32_t itt;
 	int rc = 0;
 
 	ICL_CONN_LOCK_ASSERT(ic);
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
 	    csio->dxfer_len < ci->ddp_threshold || ic->ic_disconnecting ||
 	    ic->ic_socket == NULL) {
 no_ddp:
 		/*
 		 * No DDP for this I/O.  Allocate an ITT (based on the one
 		 * passed in) that cannot be a valid hardware DDP tag in the
 		 * iSCSI region.
 		 */
 		itt = *ittp & M_PPOD_TAG;
 		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
 		*ittp = htobe32(itt);
 		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
 		/* rc is non-zero only when a DDP setup step failed. */
 		if (rc != 0)
 			counter_u64_add(
 			    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
 		return (0);
 	}
 
 	/*
 	 * Reserve resources for DDP, update the itt that should be used in the
 	 * PDU, and save DDP specific state for this I/O in *arg.
 	 */
 	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
 	if (ddp == NULL) {
 		rc = ENOMEM;
 		goto no_ddp;
 	}
 	prsv = &ddp->prsv;
 
 	/* Page pod work requests are collected here before being queued. */
 	mbufq_init(&mq, INT_MAX);
 	switch (csio->ccb_h.flags & CAM_DATA_MASK) {
 	case CAM_DATA_BIO:
 		rc = t4_alloc_page_pods_for_bio(pr,
 		    (struct bio *)csio->data_ptr, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_bio(sc, toep, prsv,
 		    (struct bio *)csio->data_ptr, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 		break;
 	case CAM_DATA_VADDR:
 		rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
 		    csio->dxfer_len, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_buf(sc, toep, prsv,
 		    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 		break;
 	default:
 		/* Other CAM data layouts are not handled here. */
 		free(ddp, M_CXGBEI);
 		rc = EINVAL;
 		goto no_ddp;
 	}
 
 	/*
 	 * Do not get inp from toep->inp as the toepcb might have
 	 * detached already.
 	 */
 	inp = sotoinpcb(ic->ic_socket);
 	INP_WLOCK(inp);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		/* rc is 0 here, so this fallback is not counted as an error. */
 		INP_WUNLOCK(inp);
 		mbufq_drain(&mq);
 		t4_free_page_pods(prsv);
 		free(ddp, M_CXGBEI);
 		goto no_ddp;
 	}
 	mbufq_concat(&toep->ulp_pduq, &mq);
 	INP_WUNLOCK(inp);
 
 	ddp->cmp.last_datasn = -1;
 	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*ittp = htobe32(prsv->prsv_tag);
 	*arg = prsv;
 	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
 	return (0);
 }
 
 /*
  * Release the DDP state attached to a task, if any: remove its entry from
  * the connection's cmp_table, free its page pods and free the state.
  * A NULL 'arg' means no DDP state exists and nothing is done.
  */
 void
 icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 {
 	struct cxgbei_ddp_state *ddp;
 
 	if (arg == NULL)
 		return;
 
 	ddp = arg;
 	cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
 	t4_free_page_pods(&ddp->prsv);
 	free(ddp, M_CXGBEI);
 }
 
 /*
  * Decide whether a scatter/gather list can be used for DDP.
  *
  * The last entry only needs an address aligned to 4 bytes.  Every earlier
  * entry must start on a page boundary and have a length that is a multiple
  * of PAGE_SIZE.  With INVARIANTS the lengths are also summed and asserted
  * to equal 'xferlen'.
  */
 static inline bool
 ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
 {
 	int i, last;
 #ifdef INVARIANTS
 	int total_len;
 #endif
 
 	MPASS(entries > 0);
 
 	last = entries - 1;
 	if (((vm_offset_t)sg[last].addr & 3U) != 0)
 		return (false);
 #ifdef INVARIANTS
 	total_len = sg[last].len;
 #endif
 
 	/* Walk the remaining entries from the end towards the start. */
 	for (i = last - 1; i >= 0; i--) {
 		if (((vm_offset_t)sg[i].addr & PAGE_MASK) != 0)
 			return (false);
 		if ((sg[i].len % PAGE_SIZE) != 0)
 			return (false);
 #ifdef INVARIANTS
 		total_len += sg[i].len;
 #endif
 	}
 
 	MPASS(total_len == xferlen);
 	return (true);
 }
 
 /*
  * Accessor (usable as an lvalue) for the pointer stored in the
  * CTL_PRIV_FRONTEND2 private slot of an I/O header.  This file keeps the
  * I/O's struct cxgbei_ddp_state pointer there.
  */
 #define io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
 
 /*
  * Per-transfer setup for a CTL I/O (struct ctl_scsiio).
  *
  * On the first call for an I/O (ext_data_filled == 0) this tries to reserve
  * page pods for the kernel buffer or S/G list and to queue the work
  * requests that write them; on success the DDP state is saved in the I/O,
  * *tttp is set to the hardware tag and *arg is set to the ctl_scsiio.
  *
  * On later calls the reservation saved in the I/O is reused, with the
  * alias bits of its tag incremented so that the new tag differs from the
  * previous one.
  *
  * Whenever DDP is not used, *tttp is rewritten so that it cannot be a valid
  * hardware DDP tag, *arg is left untouched and 0 is returned.
  *
  * Returns 0, or ECONNRESET if the connection is disconnecting, has no
  * socket, or its inp has been dropped by the time the page pod work
  * requests are about to be queued.
  */
 int
 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, struct icl_pdu *ip,
     union ctl_io *io, uint32_t *tttp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct ctl_scsiio *ctsio = &io->scsiio;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct ctl_sg_entry *sgl, sg_entry;
 	struct inpcb *inp;
 	struct mbufq mq;
 	int sg_entries = ctsio->kern_sg_entries;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if (ctsio->ext_data_filled == 0) {
 		int first_burst;
 #ifdef INVARIANTS
 		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 		MPASS(ic == ip->ip_conn);
 		MPASS(ip->ip_bhs_mbuf != NULL);
 #endif
 		first_burst = icl_pdu_data_segment_length(ip);
 
 		/*
 		 * Note that ICL calls conn_transfer_setup even if the first
 		 * burst had everything and there's nothing left to transfer.
 		 *
 		 * NB: The CTL frontend might have provided a buffer
 		 * whose length (kern_data_len) is smaller than the
 		 * FirstBurstLength of unsolicited data.  Treat those
 		 * as an empty transfer.
 		 */
 		xferlen = ctsio->kern_data_len;
 		if (xferlen < first_burst ||
 		    xferlen - first_burst < ci->ddp_threshold) {
 no_ddp:
 			/*
 			 * No DDP for this transfer.  Allocate a TTT (based on
 			 * the one passed in) that cannot be a valid hardware
 			 * DDP tag in the iSCSI region.
 			 */
 			ttt = *tttp & M_PPOD_TAG;
 			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
 			*tttp = htobe32(ttt);
 			MPASS(io_to_ddp_state(io) == NULL);
 			/* rc is non-zero only when a DDP setup step failed. */
 			if (rc != 0)
 				counter_u64_add(
 				    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
 			return (0);
 		}
 
 		/* Present a plain buffer as a one-entry S/G list. */
 		if (sg_entries == 0) {
 			sgl = &sg_entry;
 			sgl->len = xferlen;
 			sgl->addr = (void *)ctsio->kern_data_ptr;
 			sg_entries = 1;
 		} else
 			sgl = (void *)ctsio->kern_data_ptr;
 
 		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
 			goto no_ddp;
 
 		/*
 		 * Reserve resources for DDP, update the ttt that should be used
 		 * in the PDU, and save DDP specific state for this I/O.
 		 */
 		MPASS(io_to_ddp_state(io) == NULL);
 		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
 		if (ddp == NULL) {
 			rc = ENOMEM;
 			goto no_ddp;
 		}
 		prsv = &ddp->prsv;
 
 		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		/* Page pod work requests are collected here before queueing. */
 		mbufq_init(&mq, INT_MAX);
 		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
 		    xferlen, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		/*
 		 * Do not get inp from toep->inp as the toepcb might
 		 * have detached already.
 		 */
 		ICL_CONN_LOCK(ic);
 		if (ic->ic_disconnecting || ic->ic_socket == NULL) {
 			ICL_CONN_UNLOCK(ic);
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			return (ECONNRESET);
 		}
 		inp = sotoinpcb(ic->ic_socket);
 		/* Take the inp lock before dropping the connection lock. */
 		INP_WLOCK(inp);
 		ICL_CONN_UNLOCK(ic);
 		if ((inp->inp_flags & INP_DROPPED) != 0) {
 			INP_WUNLOCK(inp);
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			return (ECONNRESET);
 		}
 		mbufq_concat(&toep->ulp_pduq, &mq);
 		INP_WUNLOCK(inp);
 
 		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
 		    first_burst;
 		ddp->cmp.last_datasn = -1;
 		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 		*tttp = htobe32(prsv->prsv_tag);
 		io_to_ddp_state(io) = ddp;
 		*arg = ctsio;
 		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
 		return (0);
 	}
 
 	/*
 	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
 	 * that a DDP buffer is being used for the I/O.
 	 */
 	ddp = io_to_ddp_state(ctsio);
 	if (ddp == NULL)
 		goto no_ddp;
 	prsv = &ddp->prsv;
 
 	/* Advance the alias field of the tag, wrapping within its mask. */
 	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
 	alias++;
 	prsv->prsv_tag &= ~pr->pr_alias_mask;
 	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;
 
 	ddp->cmp.last_datasn = -1;
 	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*tttp = htobe32(prsv->prsv_tag);
 	*arg = ctsio;
 
 	return (0);
 }
 
 /*
  * Finish one transfer of a CTL I/O that was set up with DDP.  'arg' is the
  * ctl_scsiio recorded at setup time, or NULL if DDP was not used.
  *
  * The cmp entry for the transfer is always removed.  The page pods and the
  * DDP state itself are released only once all data has been received
  * (kern_data_len == ext_data_filled) or the connection is disconnecting.
  */
 void
 icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
 {
 	struct ctl_scsiio *ctsio = arg;
 	struct cxgbei_ddp_state *ddp;
 
 	if (ctsio == NULL)
 		return;
 
 	ddp = io_to_ddp_state(ctsio);
 	MPASS(ddp != NULL);
 
 	cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
 
 	/* More data is expected on a live connection: keep the state. */
 	if (ctsio->kern_data_len != ctsio->ext_data_filled &&
 	    !ic->ic_disconnecting)
 		return;
 
 	t4_free_page_pods(&ddp->prsv);
 	free(ddp, M_CXGBEI);
 	io_to_ddp_state(ctsio) = NULL;
 }
 
 #ifdef COMPAT_FREEBSD13
 /*
  * t4_iterate() callback: lower the send/receive data segment limits in the
  * struct icl_drv_limits passed via 'arg' to what this adapter supports.
  * Adapters on which the synchronized op cannot be started, or on which the
  * iSCSI ULD is not active, leave the limits unchanged.
  */
 static void
 cxgbei_limits(struct adapter *sc, void *arg)
 {
 	struct icl_drv_limits *idl = arg;
 	struct cxgbei_data *ci;
 	int max_dsl;
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
 		return;
 
 	if (!uld_active(sc, ULD_ISCSI))
 		goto done;
 
 	ci = sc->iscsi_ulp_softc;
 	MPASS(ci != NULL);
 
 	/* Receive direction. */
 	max_dsl = ci->max_rx_data_len;
 	if (idl->idl_max_recv_data_segment_length > max_dsl)
 		idl->idl_max_recv_data_segment_length = max_dsl;
 
 	/* Send direction. */
 	max_dsl = ci->max_tx_data_len;
 	if (idl->idl_max_send_data_segment_length > max_dsl)
 		idl->idl_max_send_data_segment_length = max_dsl;
 
 done:
 	end_synchronized_op(sc, LOCK_HELD);
 }
 #endif
 
 static int
 cxgbei_limits_fd(struct icl_drv_limits *idl, int fd)
 {
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct adapter *sc;
 	struct cxgbei_data *ci;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(curthread, fd,
 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	/* Find the adapter offloading this socket. */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	if (fa.sc == NULL) {
 		fdrop(fp, curthread);
 		return (ENXIO);
 	}
 	fdrop(fp, curthread);
 
 	sc = fa.sc;
 	error = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims");
 	if (error != 0)
 		return (error);
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		ci = sc->iscsi_ulp_softc;
 		MPASS(ci != NULL);
 
 		idl->idl_max_recv_data_segment_length = ci->max_rx_data_len;
 		idl->idl_max_send_data_segment_length = ci->max_tx_data_len;
 	} else
 		error = ENXIO;
 
 	end_synchronized_op(sc, LOCK_HELD);
 
 	return (error);
 }
 
 /*
  * Limits callback registered with ICL.  Seeds 'idl' with defaults and then
  * narrows the data segment lengths: per socket via cxgbei_limits_fd(), or,
  * with COMPAT_FREEBSD13 and socket == 0, across all adapters.
  */
 static int
 icl_cxgbei_limits(struct icl_drv_limits *idl, int socket)
 {
 	/* Maximum allowed by the RFC; clipped to the adapter's limits below. */
 	const int rfc_max_dsl = (1 << 24) - 1;
 
 	idl->idl_max_recv_data_segment_length = rfc_max_dsl;
 	idl->idl_max_send_data_segment_length = rfc_max_dsl;
 
 	/* These are somewhat arbitrary. */
 	idl->idl_max_burst_length = max_burst_length;
 	idl->idl_first_burst_length = first_burst_length;
 
 #ifdef COMPAT_FREEBSD13
 	/* No socket given: clip against every adapter instead. */
 	if (socket == 0) {
 		t4_iterate(cxgbei_limits, idl);
 		return (0);
 	}
 #endif
 
 	return (cxgbei_limits_fd(idl, socket));
 }
 
 /*
  * Module load hook: reset the connection count and register the "cxgbei"
  * ICL backend.  Returns whatever icl_register() returns.
  */
 int
 icl_cxgbei_mod_load(void)
 {
 
 	refcount_init(&icl_cxgbei_ncons, 0);
 
 	return (icl_register("cxgbei", false, -100, icl_cxgbei_limits,
 	    icl_cxgbei_new_conn));
 }
 
 /*
  * Module unload hook: refuse with EBUSY while any connection is counted in
  * icl_cxgbei_ncons, otherwise unregister the "cxgbei" ICL backend.
  */
 int
 icl_cxgbei_mod_unload(void)
 {
 
 	if (icl_cxgbei_ncons == 0) {
 		icl_unregister("cxgbei", false);
 		return (0);
 	}
 
 	return (EBUSY);
 }
 #endif
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 0a40bbda3f3f..8cafac61fa8b 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1,2451 +1,2451 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include <dev/iscsi/iscsi_proto.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
 
 /*
  * Build and send the FW_FLOWC_WR work request for toep's tid.
  *
  * The WR always carries PFNVFN, CH, PORT, IQID, SNDBUF and MSS.  When 'tp'
  * is non-NULL the MSS is toep->params.emss and SNDNXT/RCVNXT are added from
  * the tcpcb; when 'tp' is NULL a fixed MSS of 512 is used.  A SCHEDCLASS
  * parameter is appended if the toepcb has a traffic class (tc_idx != -1).
  *
  * One tx descriptor and the WR's credits are charged to the toepcb, and
  * TPF_FLOWC_WR_SENT is set.  Must be called at most once per toepcb
  * (asserted).  Panics if the work request cannot be allocated.
  */
 void
 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	/* Must match the number of FLOWC_PARAM() uses below (asserted). */
 	if (tp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (toep->params.tc_idx != -1) {
 		MPASS(toep->params.tc_idx >= 0 &&
 		    toep->params.tc_idx < sc->params.nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 	/* Append one mnemonic/value pair and advance paramidx. */
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
 	if (tp) {
 		FLOWC_PARAM(MSS, toep->params.emss);
 		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
 	} else
 		FLOWC_PARAM(MSS, 512);
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
 	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
 
 	if (toep->params.tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	/* Charge the WR to the current tx descriptor; it carries no payload. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* Advance the producer index, wrapping at the end of the ring. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  *
  * A rate of 0 kbps unbinds the connection from any traffic class
  * (tc_idx -1).  Otherwise a class is reserved for the rate with
  * t4_reserve_cl_rl_kbps().  If the resulting class differs from the
  * current one, a one-parameter FLOWC WR (SCHEDCLASS; 0xff for "unbind") is
  * sent, consuming one tx descriptor and its credits.  The previously held
  * class, if any, is then released and toep->params.tc_idx updated.
  *
  * Returns 0 on success, the error from t4_reserve_cl_rl_kbps(), or ENOMEM
  * if credits, descriptors or the work request are unavailable (in which
  * case the newly reserved class is released again).
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
 	}
 
 	/* Only tell the firmware when the class actually changes. */
 	if (toep->params.tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16),
 		    &toep->ofld_txq->wrq)) == NULL) {
 			/* Give back the class reserved above. */
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		/* Charge the WR to the current tx descriptor (no payload). */
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	/* Drop the reference on the class that was in use until now. */
 	if (toep->params.tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
 	toep->params.tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 /*
  * Abort the offloaded connection by sending a CPL_ABORT_REQ with
  * CPL_ABORT_SEND_RST.  Does nothing if an abort is already in progress
  * (TPF_ABORT_SHUTDOWN set); otherwise sets that flag.
  *
  * The sequence number placed in the request is tp->snd_nxt, unless the inp
  * has been dropped, in which case the caller-supplied 'snd_nxt' is used.
  * If the inp has not been dropped and still has a socket, the socket's
  * send buffer is flushed.
  *
  * The inp must be write-locked and the flowc WR must already have been
  * sent (both asserted).  Panics if the work request cannot be allocated.
  */
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	/* rsvd1 is 1 when no tx data has been sent on this tid, else 0. */
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
 	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Called when a connection is established to translate the TCP options
  * reported by HW to FreeBSD's native format.
  *
  * 'opt' is the option word in host byte order.  From it this derives the
  * MTU index and t_maxseg (MTU minus IP/IPv6 and TCP header sizes), the
  * effective MSS (further reduced when timestamps are in use), and the
  * timestamp, SACK and window scaling state in both the tcpcb and
  * toep->params.  The inp must be locked.
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct adapter *sc = td_adapter(toep->td);
 
 	INP_LOCK_ASSERT(inp);
 
 	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
 	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
 
 	/* The effective MSS also excludes the timestamp option, if used. */
 	toep->params.emss = tp->t_maxseg;
 	if (G_TCPOPT_TSTAMP(opt)) {
 		toep->params.tstamp = 1;
 		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 	} else
 		toep->params.tstamp = 0;
 
 	if (G_TCPOPT_SACK(opt)) {
 		toep->params.sack = 1;
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	} else {
 		toep->params.sack = 0;
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 	}
 
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Doing window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	} else
 		toep->params.wscale = 0;
 
 	CTR6(KTR_CXGBE,
 	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
 	    toep->tid, toep->params.mtu_idx, toep->params.emss,
 	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
 }
 
 /*
  * Completes some final bits of initialization for just established connections
  * and changes their state to TCPS_ESTABLISHED.
  *
  * The ISNs are from the exchange of SYNs.
  *
  * 'iss' and 'irs' are the local and peer initial sequence numbers; 'opt' is
  * the hardware TCP option word in big-endian order (it is converted with
  * be16toh() before use).  The connection must be in SYN_SENT or
  * SYN_RECEIVED and the inp write-locked (both asserted).  After the tcpcb
  * is initialized the flowc WR is sent and the socket is marked connected.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	uint16_t tcpopt = be16toh(opt);
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tcp_state_change(tp, TCPS_ESTABLISHED);
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	/* Receive side: the window is opt0_bufsize scaled by 1KB (<< 10). */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/* Send side: all send pointers start just past the ISS. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 	send_flowc_wr(toep, tp);
 
 	soisconnected(so);
 }
 
 /*
  * Send a CPL_RX_DATA_ACK on the control queue carrying 'credits' rx
  * credits for the connection.  Returns the number of credits sent:
  * 'credits' on success, or 0 if no work request could be allocated.
  */
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	const uint32_t dack_bits = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	wr = alloc_wrqe(sizeof(*ack), toep->ctrlq);
 	if (wr == NULL)
 		return (0);	/* Nothing was sent. */
 
 	ack = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(dack_bits | V_RX_CREDITS(credits));
 	t4_wrq_tx(sc, wr);
 
 	return (credits);
 }
 
 /*
  * Return receive window credits to the hardware if enough socket buffer
  * space has opened up.  The candidate credit count is the amount by which
  * the free space in the receive buffer exceeds tp->rcv_wnd.  Credits are
  * sent when the window is small, the credit count is large, or the data in
  * the buffer plus the window falls below the low-water mark; rcv_wnd and
  * rcv_adv grow by whatever send_rx_credits() reports as sent.
  *
  * Both the inp write lock and the receive sockbuf lock must be held.
  */
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
 	long space;
 	int rx_credits;
 
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	space = sbspace(sb);
 	rx_credits = space > tp->rcv_wnd ? space - tp->rcv_wnd : 0;
 	if (rx_credits <= 0)
 		return;
 
 	if (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
 	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
 	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, tp->t_toe, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 }
 
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 
 	SOCKBUF_LOCK(sb);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  *
  * The request is sent at most once per toepcb: if TPF_FIN_SENT is already
  * set the call is ignored.  Otherwise TPF_FIN_SENT is set and TPF_SEND_FIN
  * cleared before the work request is handed to t4_l2t_send().  The flowc
  * WR must already have been sent (asserted).  Always returns 0; panics if
  * the work request cannot be allocated.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_close_con_req *req;
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	/* The immediate data is everything in the request after the WR header. */
         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
         req->wr.wr_lo = cpu_to_be64(0);
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags |= TPF_FIN_SENT;
 	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 /*
  * Tx credit limits.  One credit corresponds to 16 bytes of work request.
  *
  * MAX_OFLD_TX_CREDITS: credits in the largest work request.
  * MIN_OFLD_TX_CREDITS: credits for a tx data WR header plus one byte.
  * MIN_ISO_TX_CREDITS:  credits for the ISO CPL.
  * MIN_TX_CREDITS(iso): minimum needed to send anything, with the ISO CPL
  *                      included when 'iso' is non-zero.
  */
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 #define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
 #define MIN_TX_CREDITS(iso)						\
 	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
 
 /*
  * Maximum amount of immediate data we could stuff in a WR.
  *
  * Returns 0 when 'tx_credits' is below the minimum for a WR.  Otherwise the
  * result is the space left after the tx data WR header (and the ISO CPL
  * when 'iso' is set) in either one EQ descriptor or the bytes covered by
  * 'tx_credits', whichever limit applies.
  */
 static inline int
 max_imm_payload(int tx_credits, int iso)
 {
 	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
 	const int n = 1;	/* Use no more than one desc for imm. data WR */
 	int avail;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	/* Bytes available to the WR: capped at 'n' descriptors. */
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		avail = n * EQ_ESIZE;
 	else
 		avail = tx_credits * 16;
 
 	return (avail - sizeof(struct fw_ofld_tx_data_wr) - iso_cpl_size);
 }
 
 /*
  * Maximum number of SGL entries we could stuff in a WR.
  *
  * Returns 0 when 'tx_credits' is below the minimum for a WR.  The first
  * segment is held by the ulptx_sgl itself; the credits beyond the minimum
  * are then consumed 24 bytes at a time for two segments each, with a
  * 16-byte remainder accommodating one more.
  */
 static inline int
 max_dsgl_nsegs(int tx_credits, int iso)
 {
 	int nseg, sge_pair_credits, pair_bytes;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);
 	pair_bytes = sge_pair_credits * 16;
 
 	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
 	nseg = 1 + 2 * (pair_bytes / 24);
 	if (pair_bytes % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 /*
  * Fill in the fw_ofld_tx_data_wr header at 'dst'.
  *
  * fw_wr_opcode	work request opcode
  * immdlen	length of immediate data that follows the header
  * plen		payload length reported in the WR
  * credits	length of the WR in 16B units
  * shove	value for the TX_SHOVE flag
  * ulp_submode	value for the TX_ULP_SUBMODE field
  */
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
     unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
     int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *hdr = dst;
 	uint32_t flags;
 
 	/* Assemble the flags word in host order and store it once below. */
 	flags = V_TX_ULP_MODE(ulp_mode(toep)) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove);
 
 	if (toep->params.tx_align > 0) {
 		if (plen < 2 * toep->params.emss) {
 			/* Less than two segments' worth of payload. */
 			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
 		} else {
 			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD;
 			if (toep->params.nagle != 0)
 				flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE;
 		}
 	}
 
 	hdr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	hdr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	hdr->lsodisable_to_flags = htobe32(flags);
 	hdr->plen = htobe32(plen);
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		if (m->m_flags & M_EXTPG)
 			rc = sglist_append_mbuf_epg(&sg, m,
 			    mtod(m, vm_offset_t), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  * i.e. 16 + 16 + 480 = 512 bytes, with 1 entry in the ulptx_sgl itself and
  * 40 more in the sge pairs that follow it.
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  *
  * The caller must hold the inp write lock.  The function returns without
  * sending anything if the connection is being aborted or tx is suspended.
  * Each pass through the main loop builds and sends at most one work request
  * (immediate data or DSGL) and records it in the toep's tx descriptor ring.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool nomap_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
 	    ulp_mode(toep) == ULP_MODE_TLS ||
 	    ulp_mode(toep) == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	/* Apply a pending pacing rate change; clear the flag only on success. */
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		/* Limits for the one work request built in this iteration. */
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits, 0);
 		max_nsegs = max_dsgl_nsegs(tx_credits, 0);
 
 		SOCKBUF_LOCK(sb);
 		/* The drop is performed once, on the first iteration only. */
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		/*
 		 * Start right after the last mbuf that was transmitted, or at
 		 * the head of the send buffer if sb_sndptr isn't set.
 		 */
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		nomap_mbuf_seen = false;
 		/*
 		 * Gather mbufs for this WR, accumulating the payload length
 		 * and the number of SGL segments, until the data runs out or
 		 * the WR is full.
 		 */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			/* Stop at the first mbuf that isn't ready yet. */
 			if ((m->m_flags & M_NOTAVAIL) != 0)
 				break;
 			if (m->m_flags & M_EXTPG) {
 #ifdef KERN_TLS
 				/*
 				 * An mbuf with a TLS session switches this
 				 * connection to KTLS tx.  If nothing has been
 				 * gathered ahead of it, hand over to
 				 * t4_push_ktls right away; otherwise send what
 				 * was gathered so far first.
 				 */
 				if (m->m_epg_tls != NULL) {
 					toep->flags |= TPF_KTLS;
 					if (plen == 0) {
 						SOCKBUF_UNLOCK(sb);
 						t4_push_ktls(sc, toep, 0);
 						return;
 					}
 					break;
 				}
 #endif
 				n = sglist_count_mbuf_epg(m,
 				    mtod(m, vm_offset_t), m->m_len);
 			} else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					/*
 					 * Writers still have to be woken up
 					 * for the bytes dropped above.  The
 					 * sockbuf is unlocked on either path.
 					 */
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(so,
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (m->m_flags & M_EXTPG)
 				nomap_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		/*
 		 * Ask for a completion if the send buffer is more than 5/8
 		 * full and the payload sent without a completion request
 		 * (including this WR) is at least a quarter of sb_hiwat.
 		 */
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		/*
 		 * Send buffer autosizing: grow sb_hiwat by V_tcp_autosndbuf_inc
 		 * (capped at V_tcp_autosndbuf_max) when the buffer is at least
 		 * 7/8 full.  Autosizing is turned off for this sockbuf if the
 		 * reservation fails.
 		 */
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		/* Either path leaves the sockbuf unlocked (asserted below). */
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
 			    ("%s: nothing to send, but m != NULL is ready",
 			    __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * Shove only if this WR takes everything that was walked (m is
 		 * NULL) and the stack hasn't indicated that more is coming.
 		 */
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !nomap_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					&toep->ofld_txq->wrq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
 			    credits, shove, 0);
 			/* The payload is copied in right after the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			/*
 			 * The ulptx_sgl covers the first segment.  Every
 			 * additional pair of segments takes 24 bytes and a
 			 * trailing unpaired segment takes 16.
 			 */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16),
 			    &toep->ofld_txq->wrq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
 			    credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/*
 			 * Zero the 8 bytes past the SGL when wr_len isn't a
 			 * multiple of 16 (the WR was allocated rounded up).
 			 */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* Charge the WR to the connection's credit accounting. */
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/*
 		 * Also ask for a completion when credits are running low and
 		 * enough of them have been used without a completion request.
 		 */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		/* RDMA mode requests a completion for every WR. */
 		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		/* Publish the last mbuf covered by this WR. */
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/*
 		 * Record the WR's payload length and credits in the next tx
 		 * descriptor.  The producer index wraps at txsd_total.
 		 */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Free packets from the head of 'q' until 'plen' bytes worth of them have
  * been released.  Only whole packets are consumed: asking for more than is
  * queued, or for part of a packet, trips an assertion.
  */
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *pkt;
 	int remaining, pktlen;
 
 	for (remaining = plen; remaining > 0; remaining -= pktlen) {
 		pkt = mbufq_dequeue(q);
 
 		/* Too many credits. */
 		MPASS(pkt != NULL);
 		M_ASSERTPKTHDR(pkt);
 
 		/* Partial credits. */
 		pktlen = pkt->m_pkthdr.len;
 		MPASS(remaining >= pktlen);
 
 		m_freem(pkt);
 	}
 }
 
 /*
  * Not a bit in the TCB, but is a bit in the ulp_submode field of the
  * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
  *
  * G_TX_ULP_SUBMODE() is applied to the WR flag so that the result can be
  * OR'ed directly into a ulp_submode value, which is how
  * write_iscsi_mbuf_wr() uses it.
  */
 #define	ULP_ISO		G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
 
 /*
  * Fill in the cpl_tx_data_iso header at 'dst'.
  *
  * ulp_submode	header/data CRC selection (ULP_CRC_HEADER, ULP_CRC_DATA)
  * flags	CXGBE_ISO_* flags of the original PDU
  * mss		value for the mpdu field (written in units of 4 bytes)
  * len		total length, including the template BHS
  * npdu		not used here
  */
 static void
 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
     int len, int npdu)
 {
 	struct cpl_tx_data_iso *iso_cpl = dst;
 	/*
 	 * The firmware sets the 'F' bit on the final PDU if either this large
 	 * PDU is flagged as the "last" slice or the data payload adds up to
 	 * exactly burst_size bytes.  To keep the second rule from ever
 	 * firing, burst_size is set artificially high (len also counts the
 	 * template BHS), and "last" is requested only when the original PDU
 	 * had 'F' set.
 	 */
 	const unsigned int burst = len;
 	const unsigned int is_last = !!(flags & CXGBE_ISO_F);
 	const int want_hdr_crc = !!(ulp_submode & ULP_CRC_HEADER);
 	const int want_data_crc = !!(ulp_submode & ULP_CRC_DATA);
 	uint32_t op;
 
 	op = V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
 	    V_CPL_TX_DATA_ISO_FIRST(1) |
 	    V_CPL_TX_DATA_ISO_LAST(is_last) |
 	    V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
 	    V_CPL_TX_DATA_ISO_HDRCRC(want_hdr_crc) |
 	    V_CPL_TX_DATA_ISO_PLDCRC(want_data_crc) |
 	    V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
 	    V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags));
 	iso_cpl->op_to_scsi = htonl(op);
 
 	/* Lengths; mpdu and burst_size are expressed in 4-byte units. */
 	iso_cpl->ahs_len = 0;
 	iso_cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
 	iso_cpl->burst_size = htonl(DIV_ROUND_UP(burst, 4));
 	iso_cpl->len = htonl(len);
 
 	/* Everything else is zero. */
 	iso_cpl->reserved2_seglen_offset = htonl(0);
 	iso_cpl->datasn_offset = htonl(0);
 	iso_cpl->buffer_offset = htonl(0);
 	iso_cpl->reserved3 = 0;
 }
 
 /*
  * Build the work request for the packet 'sndptr' (an iSCSI PDU or a raw
  * work request) without sending it.
  *
  * Returns the allocated wrqe, or NULL if the connection's tx credits can't
  * accommodate the packet or the wrqe couldn't be allocated.
  *
  * For a PDU this advances tp->snd_nxt and tp->snd_max and updates the
  * offload tx queue's iSCSI counters.  toep->tx_credits is only read here;
  * it is not decremented.
  */
 static struct wrqe *
 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 {
 	struct mbuf *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct cpl_tx_data_iso *cpl_iso;
 	void *p;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, imm_data, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove, npdu, wr_len;
 	uint16_t iso_mss;
 	/* Extra bytes of sequence space per PDU, indexed by ulp_submode. */
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 	bool iso, nomap_mbuf_seen;
 
 	M_ASSERTPKTHDR(sndptr);
 
 	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 	/*
 	 * A raw work request is copied into the wrqe as is.  None of the PDU
 	 * accounting further down applies to it.
 	 */
 	if (mbuf_raw_wr(sndptr)) {
 		plen = sndptr->m_pkthdr.len;
 		KASSERT(plen <= SGE_MAX_WR_LEN,
 		    ("raw WR len %u is greater than max WR len", plen));
 		if (plen > tx_credits * 16)
 			return (NULL);
 
 		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 
 		m_copydata(sndptr, 0, plen, wrtod(wr));
 		return (wr);
 	}
 
 	iso = mbuf_iscsi_iso(sndptr);
 	max_imm = max_imm_payload(tx_credits, iso);
 	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
 	iso_mss = mbuf_iscsi_iso_mss(sndptr);
 
 	plen = 0;
 	nsegs = 0;
 	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 	nomap_mbuf_seen = false;
 	/*
 	 * Total up the length and the SGL segments of the whole chain.  Unlike
 	 * t4_push_frames there is no partial send: the entire PDU fits in one
 	 * WR or NULL is returned.
 	 */
 	for (m = sndptr; m != NULL; m = m->m_next) {
 		int n;
 
 		if (m->m_flags & M_EXTPG)
 			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
 			    m->m_len);
 		else
 			n = sglist_count(mtod(m, void *), m->m_len);
 
 		nsegs += n;
 		plen += m->m_len;
 
 		/*
 		 * This mbuf would send us _over_ the nsegs limit.
 		 * Suspend tx because the PDU can't be sent out.
 		 */
 		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
 			return (NULL);
 
 		if (m->m_flags & M_EXTPG)
 			nomap_mbuf_seen = true;
 		if (max_nsegs_1mbuf < n)
 			max_nsegs_1mbuf = n;
 	}
 
 	if (__predict_false(toep->flags & TPF_FIN_SENT))
 		panic("%s: excess tx.", __func__);
 
 	/*
 	 * We have a PDU to send.  All of it goes out in one WR so 'm'
 	 * is NULL.  A PDU's length is always a multiple of 4.
 	 */
 	MPASS(m == NULL);
 	MPASS((plen & 3) == 0);
 	MPASS(sndptr->m_pkthdr.len == plen);
 
 	shove = !(tp->t_flags & TF_MORETOCOME);
 
 	/*
 	 * plen doesn't include header and data digests, which are
 	 * generated and inserted in the right places by the TOE, but
 	 * they do occupy TCP sequence space and need to be accounted
 	 * for.
 	 */
 	ulp_submode = mbuf_ulp_submode(sndptr);
 	MPASS(ulp_submode < nitems(ulp_extra_len));
 	/* With ISO the payload after the BHS is split into iso_mss chunks. */
 	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
 	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
 	/* Every PDU after the first adds a BHS that isn't part of plen. */
 	if (iso)
 		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
 	wr_len = sizeof(*txwr);
 	if (iso)
 		wr_len += sizeof(struct cpl_tx_data_iso);
 	if (plen <= max_imm && !nomap_mbuf_seen) {
 		/* Immediate data tx */
 		imm_data = plen;
 		wr_len += plen;
 		nsegs = 0;
 	} else {
 		/* DSGL tx */
 		imm_data = 0;
 		/*
 		 * The ulptx_sgl covers the first segment.  Every additional
 		 * pair of segments takes 24 bytes and a trailing unpaired
 		 * segment takes 16.
 		 */
 		wr_len += sizeof(struct ulptx_sgl) +
 		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 	}
 
 	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX: how will we recover from this? */
 		return (NULL);
 	}
 	txwr = wrtod(wr);
 	credits = howmany(wr->wr_len, 16);
 
 	/* 'p' is left pointing at where the payload or the SGL goes. */
 	if (iso) {
 		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
 		    imm_data + sizeof(struct cpl_tx_data_iso),
 		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
 		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
 		MPASS(plen == sndptr->m_pkthdr.len);
 		write_tx_data_iso(cpl_iso, ulp_submode,
 		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
 		p = cpl_iso + 1;
 	} else {
 		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
 		    adjusted_plen, credits, shove, ulp_submode);
 		p = txwr + 1;
 	}
 
 	if (imm_data != 0) {
 		m_copydata(sndptr, 0, plen, p);
 	} else {
 		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
 		/*
 		 * Zero the 8 bytes past the SGL when wr_len isn't a multiple
 		 * of 16 (the WR was allocated rounded up).
 		 */
 		if (wr_len & 0xf) {
 			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
 			*pad = 0;
 		}
 	}
 
 	KASSERT(toep->tx_credits >= credits,
 	    ("%s: not enough credits: credits %u "
 		"toep->tx_credits %u tx_credits %u nsegs %u "
 		"max_nsegs %u iso %d", __func__, credits,
 		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
 
 	/* Sequence space includes the digests and any extra BHSs. */
 	tp->snd_nxt += adjusted_plen;
 	tp->snd_max += adjusted_plen;
 
 	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
 	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
 	if (iso)
 		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);
 
 	return (wr);
 }
 
 /*
  * Send queued iSCSI PDUs and/or a FIN to the peer.
  *
  * PDUs waiting to be sent are on toep->ulp_pduq.  Each one is turned into a
  * single work request by write_iscsi_mbuf_wr() and, once sent, is moved to
  * toep->ulp_pdu_reclaimq where it stays until its bytes are dropped.
  *
  * drop is the number of bytes to release.  Anything still sitting in so_snd
  * is dropped first and the remainder is taken from the reclaim queue.
  *
  * The caller must hold the inp write lock.  The function returns without
  * sending anything if the connection is being aborted or tx is suspended.
  */
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_wr_hdr *wrhdr;
 	struct wrqe *wr;
 	u_int plen, credits;
 	struct inpcb *inp = toep->inp;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	if (drop) {
 		struct socket *so = inp->inp_socket;
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		/*
 		 * An unlocked read is ok here as the data should only
 		 * transition from a non-zero value to either another
 		 * non-zero value or zero.  Once it is zero it should
 		 * stay zero.
 		 *
 		 * The branch hint covers the whole comparison; it used to
 		 * wrap only the sbused() call.
 		 */
 		if (__predict_false(sbused(sb) > 0)) {
 			SOCKBUF_LOCK(sb);
 			sbu = sbused(sb);
 			if (sbu > 0) {
 				/*
 				 * The data transmitted before the
 				 * tid's ULP mode changed to ISCSI is
 				 * still in so_snd.  Incoming credits
 				 * should account for so_snd first.
 				 */
 				sbdrop_locked(sb, min(sbu, drop));
 				drop -= min(sbu, drop);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		/* Whatever is left of drop is for PDUs that were sent. */
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 	}
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		/*
 		 * The PDU stays at the head of the queue if no WR could be
 		 * built for it; tx is suspended until someone resumes it.
 		 */
 		wr = write_iscsi_mbuf_wr(toep, sndptr);
 		if (wr == NULL) {
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 		plen = sndptr->m_pkthdr.len;
 		credits = howmany(wr->wr_len, 16);
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* Keep the PDU around until its bytes are dropped. */
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 
 		/*
 		 * Ensure there are enough credits for a full-sized WR
 		 * as page pod WRs can be full-sized.
 		 *
 		 * NOTE(review): tx_credits is counted in 16-byte units (see
 		 * MAX_OFLD_TX_CREDITS) while SGE_MAX_WR_LEN looks like a byte
 		 * count -- confirm the units of this threshold.
 		 */
 		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			wrhdr = wrtod(wr);
 			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/*
 		 * Record the WR's payload length and credits in the next tx
 		 * descriptor.  The producer index wraps at txsd_total.
 		 */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Hand the connection to the tx routine that matches its mode: iSCSI PDUs,
  * KTLS records, or plain socket buffer data.
  */
 static inline void
 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 {
 
 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, drop);
 		return;
 	}
 
 	if (toep->flags & TPF_KTLS) {
 		t4_push_ktls(sc, toep, drop);
 		return;
 	}
 
 	t4_push_frames(sc, toep, drop);
 }
 
 /*
  * Push out whatever is pending on the offloaded connection behind 'tp'.
  * The inp must be write-locked and not dropped.  Always returns 0.
  */
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 #ifdef INVARIANTS
 	struct inpcb *inp = tptoinpcb(tp);
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* Nothing to drop from the send buffer on this path. */
 	t4_push_data(tod->tod_softc, toep, 0);
 	return (0);
 }
 
 /*
  * Note that a FIN has been requested on the offloaded connection behind
  * 'tp'.  If the connection is at least ESTABLISHED the tx path is run right
  * away; it sends the FIN once there is nothing left to transmit.  Always
  * returns 0.
  */
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tptoinpcb(tp);
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 
 	/* Not established yet: leave the flag set and send nothing for now. */
 	if (tp->t_state < TCPS_ESTABLISHED)
 		return (0);
 
 	t4_push_data(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Reset the offloaded connection behind 'tp' via send_reset().  The inp must
  * be write-locked and not dropped, and the flowc work request must already
  * have been sent.  Always returns 0.
  */
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tptoinpcb(tp);
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  *
  * Handler for CPL_PEER_CLOSE.  Marks the socket as unable to receive more,
  * sets tp->rcv_nxt from the CPL, and moves the TCP state machine along.  In
  * FIN_WAIT_2 the connection is detached from the TOE and put in TIME_WAIT.
  * Always returns 0.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    toep->ddp.flags, inp);
 
 	/* Nothing to do for a connection that is already being aborted. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	/* Let DDP deal with the close if it has a buffer active. */
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 	so = inp->inp_socket;
 	socantrcvmore(so);
 
 	if (ulp_mode(toep) == ULP_MODE_RDMA ||
 	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
 		/*
 		 * There might be data received via DDP before the FIN
 		 * not reported to the driver.  Just assume the
 		 * sequence number in the CPL is correct as the
 		 * sequence number of the FIN.
 		 */
 	} else {
 		/* The FIN takes up one sequence number. */
 		KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
 		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 		    be32toh(cpl->rcv_nxt)));
 	}
 
 	tp->rcv_nxt = be32toh(cpl->rcv_nxt);
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * Detach the connection from the TOE and start TIME_WAIT.
 		 * The inp is unlocked after tcp_twstart (asserted below), so
 		 * it is locked again for final_cpl_received.  There is no
 		 * unlock after that call here; presumably final_cpl_received
 		 * releases the lock -- confirm against its definition.
 		 */
 		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 		t4_pcb_detach(NULL, tp);
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  *
  * Handler for CPL_CLOSE_CON_RPL.  Sets tp->snd_una from the CPL and moves
  * the TCP state machine along.  In CLOSING and LAST_ACK no more CPLs are
  * expected for the tid and final_cpl_received is called.  Always returns 0.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	/* Nothing to do for a connection that is already being aborted. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 		t4_pcb_detach(NULL, tp);
 		tcp_twstart(tp);
 release:
 		/*
 		 * Shared with TCPS_LAST_ACK below.  The inp must be unlocked
 		 * on arrival here; it is locked again for final_cpl_received.
 		 * There is no unlock after that call here; presumably
 		 * final_cpl_received releases the lock -- confirm against its
 		 * definition.
 		 */
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		/*
 		 * The inp is unlocked here only if tcp_close returns non-NULL;
 		 * presumably a NULL return means tcp_close already released
 		 * the lock -- confirm.  Either way the release path expects
 		 * the inp to be unlocked.
 		 */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		/* The peer's FIN has been seen already if we can't rcv more. */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Send a CPL_ABORT_RPL for 'tid' on the given offload tx queue.
  * 'rst_status' goes in the cmd field of the reply.  Panics if the work
  * request can't be allocated.
  */
 void
 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
     int rst_status)
 {
 	struct cpl_abort_rpl *rpl;
 	struct wrqe *wr = alloc_wrqe(sizeof(*rpl), &ofld_txq->wrq);
 
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 
 	rpl = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid);
 	rpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Translate the abort reason reported by the hardware into an errno value.
  * A reset (or bad SYN) is EPIPE if the connection is in CLOSE_WAIT and
  * ECONNRESET otherwise, the various timeouts are ETIMEDOUT, and anything
  * else is EIO.
  */
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 
 	if (abort_reason == CPL_ERR_BAD_SYN ||
 	    abort_reason == CPL_ERR_CONN_RESET) {
 		if (tp->t_state == TCPS_CLOSE_WAIT)
 			return (EPIPE);
 		return (ECONNRESET);
 	}
 
 	if (abort_reason == CPL_ERR_XMIT_TIMEDOUT ||
 	    abort_reason == CPL_ERR_PERSIST_TIMEDOUT ||
 	    abort_reason == CPL_ERR_FINWAIT2_TIMEDOUT ||
 	    abort_reason == CPL_ERR_KEEPALIVE_TIMEDOUT)
 		return (ETIMEDOUT);
 
 	return (EIO);
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  *
  * Handler for CPL_ABORT_REQ_RSS.  Negative advice is ignored and synqe tids
  * are passed on to do_abort_req_synqe.  Otherwise, unless an abort is
  * already in progress, the connection is torn down here: the socket error
  * is set from the abort status and the tcpcb is closed.  A CPL_ABORT_RPL is
  * sent in reply in both of those cases.  Always returns 0 except on the
  * synqe path, which returns whatever do_abort_req_synqe returns.
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	/* Saved up front; it is used for the reply after the teardown. */
 	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		/*
 		 * The inp is locked again if tcp_close returns NULL;
 		 * presumably tcp_close released the lock in that case --
 		 * confirm.  final_cpl_received is called with it held.
 		 */
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	final_cpl_received(toep);
 done:
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  *
  * Handler for CPL_ABORT_RPL_RSS.  Synqe tids are passed on to
  * do_abort_rpl_synqe.  Otherwise the reply must be for an abort that was
  * already in progress (TPF_ABORT_SHUTDOWN), and final_cpl_received is called
  * with the inp write-locked.  Returns 0 except on the synqe path, which
  * returns whatever do_abort_rpl_synqe returns.
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	/*
 	 * There is no unlock after final_cpl_received here; presumably it
 	 * releases the inp lock -- confirm against its definition.
 	 */
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_DATA: payload received on an offloaded connection.
  *
  * The CPL header is stripped from 'm' and the remaining payload is appended
  * to the socket's receive buffer after the tcpcb receive state (rcv_nxt,
  * rcv_wnd, t_rcvtime) has been advanced.  The mbuf is freed instead when the
  * tid still refers to a synq entry, when the inp has been dropped, or when
  * the socket can no longer receive (in which case the connection is also
  * dropped with ECONNRESET).  Always returns 0.
  */
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
 	int len;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & INP_DROPPED) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
 	   toep->flags & TPF_TLS_RECEIVE)) {
 		/* Received "raw" data on a TLS socket. */
 		CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
 		    __func__, tid, len);
 		/*
 		 * NOTE(review): returns with no visible INP_WUNLOCK;
 		 * presumably do_rx_data_tls() consumes 'm' and drops the
 		 * inp lock -- confirm.
 		 */
 		do_rx_data_tls(cpl, toep, m);
 		return (0);
 	}
 
 	/*
 	 * A sequence number ahead of rcv_nxt is recorded as ddp_placed; it is
 	 * only consumed by insert_ddp_data() below when DDP was turned off.
 	 */
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	if (tp->rcv_wnd < len) {
 		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	/* Lock order used here: inp, then DDP (if in DDP mode), then so_rcv. */
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * All locks were released above; the inp lock is re-taken
 		 * inside the vnet/epoch section before dropping the
 		 * connection.  The inp is unlocked here only when tcp_drop()
 		 * returns a non-NULL tcpcb.
 		 */
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop autosizing this buffer if it could not be grown. */
 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* True when the CPL's ddp_off disagrees with our DDP_ON flag. */
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else if (cpl->ddp_off == 1) {
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			} else {
 				/*
 				 * Data was received while still
 				 * ULP_MODE_NONE, just fall through.
 				 */
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	/* Hand the payload to the socket buffer. */
 	sbappendstream_locked(sb, m, 0);
 	t4_rcvd_locked(&toep->td->tod, tp);
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP &&
 	    (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	if (toep->flags & TPF_TLS_STARTING)
 		tls_received_starting_data(sc, toep, sb, len);
 	/* The assertion below shows sorwakeup_locked() returns with sb unlocked. */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Handler for CPL_FW4_ACK: tx credits coming back for a tid.
  *
  * Walks the tx descriptor ring from txsd_cidx, returning 'credits' to the
  * toepcb and summing the payload length (plen) those descriptors covered.
  * If the CPL carries a valid sequence number, snd_una is updated.  Then
  * either a suspended transmit is resumed, or plen bytes are dropped from
  * so_snd (or, for iSCSI, from the PDU reclaim queue) and writers are woken.
  * Credits for a synq entry or for a connection being aborted are ignored.
  * Always returns 0.
  */
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	/* Nothing to account for once the connection is being aborted. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		/* Only logged; the value is still applied below. */
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	/*
 	 * Retire whole descriptors until the returned credits are used up;
 	 * the ring index wraps to 0 at txsd_total.
 	 */
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	/* All credits are back: reset the no-completion counters. */
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		/*
 		 * NOTE(review): the trace text names t4_push_frames but the
 		 * call below is t4_push_data().
 		 */
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		t4_push_data(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the
 				 * tid's ULP mode changed to ISCSI is
 				 * still in so_snd.  Incoming credits
 				 * should account for so_snd first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			/* Whatever is left of plen is charged to queued PDUs. */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			/* Space was freed; let pending AIO writes proceed. */
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Send a CPL_SET_TCB_FIELD work request for toep's tid on 'wrq'.
  *
  * 'word', 'mask' and 'val' are copied into the request in big-endian form.
  * When 'reply' is zero the request is flagged F_NO_REPLY; otherwise the
  * reply is steered to the toep's offload rx queue and tagged with 'cookie'.
  * If the request goes out on an offload queue (EQ_OFLD), a tx descriptor
  * with zero payload length is consumed and the toep's credits are charged.
  * Panics if the work request cannot be allocated.
  */
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct cpl_set_tcb_field *req;
 	struct ofld_tx_sdesc *sd;
 	struct wrqe *wr;
 	uint16_t ctrl;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	/* Assemble the reply control word in host order, swap it once. */
 	ctrl = V_QUEUENO(toep->ofld_rxq->iq.abs_id);
 	if (!reply)
 		ctrl |= F_NO_REPLY;
 
 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 	req->reply_ctrl = htobe16(ctrl);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 
 	if (wrq->eq.type == EQ_OFLD) {
 		/* Account for the request in the toep's tx descriptor ring. */
 		sd = &toep->txsd[toep->txsd_pidx];
 		sd->plen = 0;
 		sd->tx_credits = howmany(sizeof(*req), 16);
 		KASSERT(toep->tx_credits >= sd->tx_credits &&
 		    toep->txsd_avail > 0,
 		    ("%s: not enough credits (%d)", __func__,
 		    toep->tx_credits));
 		toep->tx_credits -= sd->tx_credits;
 		toep->txsd_avail--;
 		toep->txsd_pidx++;
 		if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Register this file's CPL handlers.  CPL_ABORT_RPL_RSS and CPL_FW4_ACK are
  * registered as shared handlers under CPL_COOKIE_TOM; the others are
  * registered directly.  Undone by t4_uninit_cpl_io_handlers().
  */
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 }
 
 /*
  * Counterpart of t4_init_cpl_io_handlers(): re-registers every opcode
  * installed there with a NULL handler.
  */
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend1' field in AIO jobs to hold an error that should
  * be reported when the job is completed, the 'backend3' field to
  * store the amount of data sent by the AIO job so far, and the
  * 'backend4' field to hold a reference count on the job.
  *
  * Each unmapped mbuf holds a reference on the job as does the queue
  * so long as the job is queued.
  */
 #define	aio_error	backend1	/* errno, stored as a pointer-sized value */
 #define	aio_sent	backend3	/* bytes handed to the socket so far */
 #define	aio_refs	backend4	/* refcount; last release completes job */
 
 #ifdef VERBOSE_TRACES
 /*
  * Trace helper: map an AIO job to the tid of the offloaded connection
  * behind the job's file (file -> socket -> tcpcb -> toepcb).
  */
 static int
 jobtotid(struct kaiocb *job)
 {
 	struct socket *sock = job->fd_file->f_data;
 	struct toepcb *te = sototcpcb(sock)->t_toe;
 
 	return (te->tid);
 }
 #endif
 
 /*
  * Drop one reference on an AIO tx job.  The caller that releases the last
  * reference completes the job: with the byte count if anything was sent
  * (a recorded error is discarded in that case), via aio_cancel() for
  * ECANCELED, or with -1 and the recorded error otherwise.
  */
 static void
 aiotx_free_job(struct kaiocb *job)
 {
 	long sent;
 	int err;
 
 	/* Somebody else still holds a reference; nothing more to do. */
 	if (!refcount_release(&job->aio_refs))
 		return;
 
 	sent = job->aio_sent;
 	err = (intptr_t)job->aio_error;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, sent, err);
 #endif
 	/* A partial transfer is reported as success with a short count. */
 	if (sent != 0)
 		err = 0;
 
 	if (err == 0) {
 		job->msgsnd = 1;
 		aio_complete(job, sent, 0);
 	} else if (err == ECANCELED) {
 		aio_cancel(job);
 	} else {
 		aio_complete(job, -1, err);
 	}
 }
 
 /*
  * Free routine installed on the unmapped mbufs built by alloc_aiotx_mbuf().
  * Unwires every page the mbuf references and drops the mbuf's reference on
  * the owning AIO job (stored in ext_arg1).
  */
 static void
 aiotx_free_pgs(struct mbuf *m)
 {
 	struct kaiocb *job;
 	int i;
 
 	M_ASSERTEXTPG(m);
 	job = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(job));
 #endif
 
 	i = 0;
 	while (i < m->m_epg_npgs) {
 		vm_page_unwire(PHYS_TO_VM_PAGE(m->m_epg_pa[i]), PQ_ACTIVE);
 		i++;
 	}
 
 	aiotx_free_job(job);
 }
 
 /*
  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
  * of an AIO job.
  *
  * The user pages starting at aio_buf + aio_sent are held and recorded in
  * one mbuf per MBUF_PEXT_MAX_PGS pages.  Every mbuf takes a reference on
  * the job, released again by aiotx_free_pgs().  Returns the head of the
  * chain, or NULL if no pages could be held at all.  If holding pages fails
  * part-way through, the chain built so far is returned, so the result may
  * describe fewer than 'len' bytes.
  */
 static struct mbuf *
 alloc_aiotx_mbuf(struct kaiocb *job, int len)
 {
 	struct vmspace *vm;
 	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
 	struct mbuf *m, *top, *last;
 	vm_map_t map;
 	vm_offset_t start;
 	int i, mlen, npages, pgoff;
 
 	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
 	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
 	    job, len));
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
 	pgoff = start & PAGE_MASK;
 
 	top = NULL;
 	last = NULL;
 	while (len > 0) {
 		/*
 		 * Only the first mbuf can start mid-page (pgoff is reset to 0
 		 * at the bottom of the loop), so every mbuf except possibly
 		 * the last one ends on a page boundary.
 		 */
 		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
 		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
 		    ("%s: next start (%#jx + %#x) is not page aligned",
 		    __func__, (uintmax_t)start, mlen));
 
 		npages = vm_fault_quick_hold_pages(map, start, mlen,
 		    VM_PROT_WRITE, pgs, nitems(pgs));
 		if (npages < 0)
 			break;
 
-		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
+		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY);
 		m->m_epg_1st_off = pgoff;
 		m->m_epg_npgs = npages;
 		if (npages == 1) {
 			KASSERT(mlen + pgoff <= PAGE_SIZE,
 			    ("%s: single page is too large (off %d len %d)",
 			    __func__, pgoff, mlen));
 			m->m_epg_last_len = mlen;
 		} else {
 			/*
 			 * Bytes in the last page: the total minus the
 			 * (possibly partial) first page and the full middle
 			 * pages.
 			 */
 			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
 			    (npages - 2) * PAGE_SIZE;
 		}
 		for (i = 0; i < npages; i++)
 			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
 
 		m->m_len = mlen;
 		m->m_ext.ext_size = npages * PAGE_SIZE;
 		m->m_ext.ext_arg1 = job;
 		/* This mbuf's reference; dropped in aiotx_free_pgs(). */
 		refcount_acquire(&job->aio_refs);
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
 		    __func__, jobtotid(job), m, job, npages);
 #endif
 
 		/* Append to the chain. */
 		if (top == NULL)
 			top = m;
 		else
 			last->m_next = m;
 		last = m;
 
 		len -= mlen;
 		start += mlen;
 		pgoff = 0;
 	}
 
 	return (top);
 }
 
 /*
  * Push as much of an AIO write job into the socket as currently fits.
  *
  * Entered with so_snd locked; the lock is dropped immediately and re-taken
  * before returning, so it is held again on every return path.  The job is
  * either completed/failed through aiotx_free_job(), or put back at the head
  * of toep->aiotx_jobq (with its cancel function re-armed) when the socket
  * buffer has no room or a blocking request is not yet fully sent.
  */
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	u_int sent;
 	int error, len;
 	bool moretocome, sendmore;
 
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(job->fd_file->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Inline sosend_generic(). */
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		/* Report and clear the pending socket error. */
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			SOCK_IO_SEND_UNLOCK(so);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than
 	 * a single sndbuf at a time.
 	 */
 	len = sbspace(sb);
 	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
 		len = job->uaiocb.aio_nbytes - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (len > toep->params.sndbuf) {
 		len = toep->params.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	/* Other queued jobs also count as "more to come". */
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(len != 0);
 
 	m = alloc_aiotx_mbuf(job, len);
 	if (m == NULL) {
 		SOCK_IO_SEND_UNLOCK(so);
 		error = EFAULT;
 		goto out;
 	}
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		SOCK_IO_SEND_UNLOCK(so);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	/* Count what the chain actually holds rather than 'len'. */
 	sent = m_length(m, NULL);
 	job->aio_sent += sent;
 	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);
 
 	/* The socket buffer owns the chain from here on. */
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tcp_output(tp);
 		if (error < 0) {
 			/*
 			 * A negative return is treated as "inp already
 			 * unlocked" (see the assertion) and converted to a
 			 * positive errno.
 			 */
 			INP_UNLOCK_ASSERT(inp);
 			SOCK_IO_SEND_UNLOCK(so);
 			error = -error;
 			goto out;
 		}
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	SOCK_IO_SEND_UNLOCK(so);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a blocking socket and the request has not been
 	 * fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		/* Returns with so_snd still locked, as on the 'out' path. */
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop the queue's
 	 * reference to the job.  Any mbufs in flight should still
 	 * hold a reference, but this drops the reference that the
 	 * queue owns while it is waiting to queue mbufs to the
 	 * socket.
 	 */
 	aiotx_free_job(job);
 	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);
 
 out:
 	if (error) {
 		job->aio_error = (void *)(intptr_t)error;
 		aiotx_free_job(job);
 	}
 	/* Non-NULL only if the chain was never appended to the socket. */
 	m_freem(m);
 	SOCKBUF_LOCK(sb);
 }
 
 /*
  * Task handler that drains toep->aiotx_jobq while the socket is writeable.
  * Each job whose cancel function can still be cleared is handed to
  * t4_aiotx_process_job(); jobs already being cancelled are skipped.  On
  * exit the toep is marked as having no task in flight and the socket and
  * toepcb references taken by t4_aiotx_queue_toep() are dropped.
  */
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct toepcb *toep;
 	struct sockbuf *snd;
 	struct socket *so;
 	struct kaiocb *job;
 
 	toep = context;
 	so = toep->aiotx_so;
 	snd = &so->so_snd;
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	SOCKBUF_LOCK(snd);
 	for (;;) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		if (job == NULL || !sowriteable(so))
 			break;
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		if (aio_clear_cancel_function(job))
 			t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_so = NULL;
 	SOCKBUF_UNLOCK(snd);
 	NET_EPOCH_EXIT(et);
 
 	free_toepcb(toep);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Schedule the AIO tx task for 'toep' unless one is already pending
  * (toep->aiotx_so != NULL).  Scheduling takes a reference on the socket and
  * on the toepcb; both are released by t4_aiotx_task().  The send buffer
  * lock must be held.
  */
 static void
 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
 #endif
 	if (toep->aiotx_so == NULL) {
 		soref(so);
 		toep->aiotx_so = so;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->aiotx_task);
 	}
 }
 
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	job->aio_error = (void *)(intptr_t)ECANCELED;
 	aiotx_free_job(job);
 }
 
 /*
  * Queue an AIO write job on an offloaded socket.
  *
  * Returns EOPNOTSUPP for anything other than LIO_WRITE, when the adapter's
  * tx_zcopy tunable is off, or when tls_tx_key() is true for the
  * connection.  Otherwise the job gets its initial (queue) reference, is
  * appended to toep->aiotx_jobq, the tx task is scheduled if the socket is
  * writeable, and 0 is returned.
  */
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	struct sockbuf *snd = &so->so_snd;
 
 	/* Only zero-copy capable, non-TLS writes are handled here. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE || !sc->tt.tx_zcopy ||
 	    tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	refcount_init(&job->aio_refs, 1);
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(so, toep);
 	SOCKBUF_UNLOCK(snd);
 
 	return (0);
 }
 
 /*
  * Initialize the AIO tx state of a toepcb: an empty job queue and the task
  * that runs t4_aiotx_task() with the toep as its argument.
  */
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c
index 832ff8135ec5..812793a9fba3 100644
--- a/sys/dev/iscsi/icl_soft.c
+++ b/sys/dev/iscsi/icl_soft.c
@@ -1,1778 +1,1778 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2012 The FreeBSD Foundation
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * Software implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/gsb_crc32.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <vm/uma.h>
 #include <vm/vm_page.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 /*
  * Receive-side parsing states, named after the PDU component being read.
  * NOTE(review): presumably the values stored in icl_soft_conn.receive_state
  * -- confirm against the receive path.
  */
 #define ICL_CONN_STATE_BHS		1
 #define ICL_CONN_STATE_AHS		2
 #define ICL_CONN_STATE_HEADER_DIGEST	3
 #define ICL_CONN_STATE_DATA		4
 #define ICL_CONN_STATE_DATA_DIGEST	5
 
 /*
  * Per-connection state of the software backend.  The generic struct
  * icl_conn is embedded as the first member; code in this file casts an
  * icl_conn pointer directly to a struct icl_soft_conn pointer.
  */
 struct icl_soft_conn {
 	struct icl_conn	 ic;
 
 	/* soft specific stuff goes here. */
 	STAILQ_HEAD(, icl_pdu) to_send;	/* queue of PDUs */
 	struct cv	 send_cv;
 	struct cv	 receive_cv;
 	struct icl_pdu	*receive_pdu;
 	size_t		 receive_len;	/* reset to 0 by icl_pdu_receive_data_segment() */
 	int		 receive_state;	/* presumably ICL_CONN_STATE_* -- confirm */
 	bool		 receive_running;
 	bool		 check_send_space;
 	bool		 send_running;
 };
 
 /*
  * Software backend PDU.  The generic struct icl_pdu is embedded as the
  * first member so the two pointer types can be cast to each other.
  */
 struct icl_soft_pdu {
 	struct icl_pdu	 ip;
 
 	/* soft specific stuff goes here. */
 	u_int		 ref_cnt;	/* must be 0 when freed; dropped in icl_soft_pdu_done() */
 	icl_pdu_cb	 cb;		/* optional; called with (ip, error) */
 	int		 error;		/* last non-zero error passed to icl_soft_pdu_done() */
 };
 
 /*
  * Tunables of the software backend, exported as read/write sysctls under
  * the kern.icl.soft node (CTLFLAG_RWTUN).
  */
 SYSCTL_NODE(_kern_icl, OID_AUTO, soft, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software iSCSI");
 static int coalesce = 1;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, coalesce, CTLFLAG_RWTUN,
     &coalesce, 0, "Try to coalesce PDUs before sending");
 static int partial_receive_len = 256 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
     &partial_receive_len, 0, "Minimum read size for partially received "
     "data segment");
 static int max_data_segment_length = 256 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, max_data_segment_length, CTLFLAG_RWTUN,
     &max_data_segment_length, 0, "Maximum data segment length");
 static int first_burst_length = 1024 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, first_burst_length, CTLFLAG_RWTUN,
     &first_burst_length, 0, "First burst length");
 static int max_burst_length = 1024 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, max_burst_length, CTLFLAG_RWTUN,
     &max_burst_length, 0, "Maximum burst length");
 static int sendspace = 1536 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 static int recvspace = 1536 * 1024;
 SYSCTL_INT(_kern_icl_soft, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 /* Malloc type and UMA zone; soft PDUs are allocated from the zone. */
 static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
 static uma_zone_t icl_soft_pdu_zone;
 
 static volatile u_int	icl_ncons;
 
 STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
 
 /* Prototypes for the software implementations of the icl_conn methods. */
 static icl_conn_new_pdu_t	icl_soft_conn_new_pdu;
 static icl_conn_pdu_free_t	icl_soft_conn_pdu_free;
 static icl_conn_pdu_data_segment_length_t
 				    icl_soft_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_bio_t	icl_soft_conn_pdu_append_bio;
 static icl_conn_pdu_append_data_t	icl_soft_conn_pdu_append_data;
 static icl_conn_pdu_get_bio_t	icl_soft_conn_pdu_get_bio;
 static icl_conn_pdu_get_data_t	icl_soft_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_soft_conn_pdu_queue;
 static icl_conn_pdu_queue_cb_t	icl_soft_conn_pdu_queue_cb;
 static icl_conn_handoff_t	icl_soft_conn_handoff;
 static icl_conn_free_t		icl_soft_conn_free;
 static icl_conn_close_t		icl_soft_conn_close;
 static icl_conn_task_setup_t	icl_soft_conn_task_setup;
 static icl_conn_task_done_t	icl_soft_conn_task_done;
 static icl_conn_transfer_setup_t	icl_soft_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_soft_conn_transfer_done;
 #ifdef ICL_KERNEL_PROXY
 static icl_conn_connect_t	icl_soft_conn_connect;
 #endif
 
 /* kobj method table binding each icl_conn method to its soft version. */
 static kobj_method_t icl_soft_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_soft_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_bio, icl_soft_conn_pdu_append_bio),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_bio, icl_soft_conn_pdu_get_bio),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_soft_conn_pdu_queue_cb),
 	KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_soft_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_soft_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_soft_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_soft_conn_transfer_done),
 #ifdef ICL_KERNEL_PROXY
 	KOBJMETHOD(icl_conn_connect, icl_soft_conn_connect),
 #endif
 	{ 0, 0 }
 };
 
 /* Instances of the class are sized as struct icl_soft_conn. */
 DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_soft_conn));
 
 static void
 icl_conn_fail(struct icl_conn *ic)
 {
 	if (ic->ic_socket == NULL)
 		return;
 
 	/*
 	 * XXX
 	 */
 	ic->ic_socket->so_error = EDOOFUS;
 	(ic->ic_error)(ic);
 }
 
 /*
  * Free a PDU together with its BHS, AHS and data mbuf chains and return it
  * to the PDU zone.  The PDU must not be referenced any more (ref_cnt == 0).
  * With DIAGNOSTIC, the connection's outstanding PDU count is dropped.
  */
 static void
 icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 
 	KASSERT(isp->ref_cnt == 0, ("freeing active PDU"));
 	m_freem(ip->ip_bhs_mbuf);
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	uma_zfree(icl_soft_pdu_zone, isp);
 #ifdef DIAGNOSTIC
 	refcount_release(&ic->ic_outstanding_pdus);
 #endif
 }
 
 static void
 icl_soft_pdu_call_cb(struct icl_pdu *ip)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 
 	if (isp->cb != NULL)
 		isp->cb(ip, isp->error);
 #ifdef DIAGNOSTIC
 	refcount_release(&ip->ip_conn->ic_outstanding_pdus);
 #endif
 	uma_zfree(icl_soft_pdu_zone, isp);
 }
 
 static void
 icl_soft_pdu_done(struct icl_pdu *ip, int error)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 
 	if (error != 0)
 		isp->error = error;
 
 	m_freem(ip->ip_bhs_mbuf);
 	ip->ip_bhs_mbuf = NULL;
 	m_freem(ip->ip_ahs_mbuf);
 	ip->ip_ahs_mbuf = NULL;
 	m_freem(ip->ip_data_mbuf);
 	ip->ip_data_mbuf = NULL;
 
 	if (atomic_fetchadd_int(&isp->ref_cnt, -1) == 1)
 		icl_soft_pdu_call_cb(ip);
 }
 
 /*
  * mbuf callback: ext_arg1 of the mbuf carries the owning soft PDU, which is
  * completed through icl_soft_pdu_call_cb().
  */
 static void
 icl_soft_mbuf_done(struct mbuf *mb)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)mb->m_ext.ext_arg1;
 
 	icl_soft_pdu_call_cb(&isp->ip);
 }
 
 /*
  * Allocate icl_pdu with empty BHS to fill up by the caller.
  *
  * 'flags' is passed to both the zone allocation (with M_ZERO added) and
  * the BHS mbuf allocation.  Returns NULL if either allocation fails; in
  * that case nothing is left allocated.  On success the BHS is zeroed and
  * the BHS mbuf length is set to sizeof(struct iscsi_bhs).
  */
 struct icl_pdu *
 icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_soft_pdu *isp;
 	struct icl_pdu *ip;
 
 #ifdef DIAGNOSTIC
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 	isp = uma_zalloc(icl_soft_pdu_zone, flags | M_ZERO);
 	if (isp == NULL) {
 		ICL_WARN("failed to allocate soft PDU");
 #ifdef DIAGNOSTIC
 		refcount_release(&ic->ic_outstanding_pdus);
 #endif
 		return (NULL);
 	}
 	ip = &isp->ip;
 	ip->ip_conn = ic;
 
 	/* The BHS must fit into the data area of a packet header mbuf. */
 	CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN);
 	ip->ip_bhs_mbuf = m_gethdr(flags, MT_DATA);
 	if (ip->ip_bhs_mbuf == NULL) {
 		ICL_WARN("failed to allocate BHS mbuf");
 		/* Also drops the DIAGNOSTIC outstanding-PDU count. */
 		icl_soft_conn_pdu_free(ic, ip);
 		return (NULL);
 	}
 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
 
 	return (ip);
 }
 
 /*
  * Return the AHS length in bytes; the BHS field counts four-byte words.
  */
 static int
 icl_pdu_ahs_length(const struct icl_pdu *request)
 {
 
 	return (request->ip_bhs->bhs_total_ahs_len * 4);
 }
 
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 
 	len += request->ip_bhs->bhs_data_segment_len[0];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[1];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[2];
 
 	return (len);
 }
 
 /*
  * icl_conn method wrapper around icl_pdu_data_segment_length(); the
  * connection argument is not used.
  */
 size_t
 icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 /*
  * Store the low 24 bits of 'len' in the BHS data segment length field,
  * most significant byte first.
  */
 static void
 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
 {
 	struct iscsi_bhs *bhs = response->ip_bhs;
 
 	bhs->bhs_data_segment_len[0] = len >> 16;
 	bhs->bhs_data_segment_len[1] = len >> 8;
 	bhs->bhs_data_segment_len[2] = len;
 }
 
 /*
  * Number of pad bytes (0-3) needed to round the PDU's data length up to a
  * multiple of four.
  */
 static size_t
 icl_pdu_padding(const struct icl_pdu *ip)
 {
 	size_t rem = ip->ip_data_len % 4;
 
 	return (rem == 0 ? 0 : 4 - rem);
 }
 
 static size_t
 icl_pdu_size(const struct icl_pdu *response)
 {
 	size_t len;
 
 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
 
 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
 	    icl_pdu_padding(response);
 	if (response->ip_conn->ic_header_crc32c)
 		len += ISCSI_HEADER_DIGEST_SIZE;
 	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
 		len += ISCSI_DATA_DIGEST_SIZE;
 
 	return (len);
 }
 
 /*
  * Copy the first 's' bytes of the chain *r into 'buf' and consume them:
  * the bytes are trimmed from the chain, leading mbufs left empty are
  * freed, and both *r and the remaining byte count *rs are updated.
  */
 static void
 icl_soft_receive_buf(struct mbuf **r, size_t *rs, void *buf, size_t s)
 {
 	struct mbuf *head = *r;
 
 	*rs -= s;
 	m_copydata(head, 0, s, buf);
 	m_adj(head, s);
 	while (head != NULL && head->m_len == 0)
 		head = m_free(head);
 	*r = head;
 }
 
 /*
  * Detach the AHS, if the BHS announces one, from the front of the chain
  * *r.  The AHS bytes become request->ip_ahs_mbuf, *r is advanced to the
  * rest of the chain and *rs is reduced by the AHS length.
  */
 static void
 icl_pdu_receive_ahs(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 
 	request->ip_ahs_len = icl_pdu_ahs_length(request);
 	if (request->ip_ahs_len != 0) {
 		request->ip_ahs_mbuf = *r;
 		*rs -= request->ip_ahs_len;
 		*r = m_split(request->ip_ahs_mbuf, request->ip_ahs_len,
 		    M_WAITOK);
 	}
 }
 
 /*
  * m_apply() callback: fold 'len' bytes at 'data' into the running CRC32C
  * that 'arg' points to.  Always returns 0.
  */
 static int
 mbuf_crc32c_helper(void *arg, void *data, u_int len)
 {
 	uint32_t *digestp = arg;
 
 	*digestp = calculate_crc32c(*digestp, data, len);
 	return (0);
 }
 
 /*
  * Compute the CRC32C of the first 'len' bytes of the chain m0, starting
  * from an all-ones seed and inverting the final value.
  */
 static uint32_t
 icl_mbuf_to_crc32c(struct mbuf *m0, size_t len)
 {
 	uint32_t crc = 0xffffffff;
 
 	m_apply(m0, 0, len, mbuf_crc32c_helper, &crc);
 
 	return (crc ^ 0xffffffff);
 }
 
 /*
  * Verify the header digest of a received PDU.
  *
  * Does nothing and returns 0 when header digests are not enabled on the
  * connection.  Otherwise the received digest is consumed from the chain *r
  * (updating *rs) and compared with the CRC32C computed over the BHS and,
  * if present, the AHS.  Returns 0 on match, -1 on mismatch.
  */
 static int
 icl_pdu_check_header_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 	uint32_t received_digest, valid_digest;
 
 	if (request->ip_conn->ic_header_crc32c == false)
 		return (0);
 
 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
 	icl_soft_receive_buf(r, rs, &received_digest, ISCSI_HEADER_DIGEST_SIZE);
 
 	/*
 	 * Temporarily attach AHS to BHS to calculate header digest.  The
 	 * digest covers both, so the length must include the AHS; checksumming
 	 * only ISCSI_BHS_SIZE bytes would never reach the attached AHS mbuf.
 	 * ip_ahs_len is 0 when there is no AHS.
 	 * NOTE(review): relies on ip_ahs_len having been set by
 	 * icl_pdu_receive_ahs() before this is called -- confirm.
 	 */
 	request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
 	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf,
 	    ISCSI_BHS_SIZE + request->ip_ahs_len);
 	request->ip_bhs_mbuf->m_next = NULL;
 	if (received_digest != valid_digest) {
 		ICL_WARN("header digest check failed; got 0x%x, "
 		    "should be 0x%x", received_digest, valid_digest);
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Return the number of bytes that should be waiting in the receive socket
  * before icl_pdu_receive_data_segment() gets called.
  *
  * The result is 0 for a PDU without a data segment.  Otherwise it is
  * based on the part of the segment not received yet: partial_receive_len
  * when more than that is outstanding, else the outstanding amount
  * rounded up to a multiple of four.
  */
 static size_t
 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
 {
 	size_t len, tail;
 
 	len = icl_pdu_data_segment_length(request);
 	if (len == 0)
 		return (0);
 
 	/* Only what has not been pulled out of the socket buffer counts. */
 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
 	len -= request->ip_data_len;
 
 	/*
 	 * Waiting for the full data segment to be delivered to the
 	 * socket might badly affect performance due to TCP window
 	 * scaling, so a large remainder is requested in pieces.
 	 */
 	if (len > partial_receive_len) {
 #if 0
 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
 		    len, partial_receive_len));
 #endif
 		return (partial_receive_len);
 	}
 
 	/*
 	 * This is the last piece, so the padding has to come with it:
 	 * icl_pdu_receive_data_segment() could not otherwise tell
 	 * whether the full data segment was received with or without
 	 * its padding.
 	 */
 	tail = len % 4;
 	if (tail != 0)
 		len += 4 - tail;
 
 #if 0
 	ICL_DEBUG("need %zd bytes of data", len));
 #endif
 
 	return (len);
 }
 
 /*
  * Detach the currently available part of the data segment from the
  * receive chain '*r' and append it to request->ip_data_mbuf.
  *
  * '*rs' is the number of bytes held in '*r' and is reduced by what gets
  * consumed.  '*more_neededp' is set to true when the segment is still
  * incomplete afterwards; isc->receive_len is then reloaded with the
  * amount of data to wait for.  Every path visible here returns 0.
  */
 static int
 icl_pdu_receive_data_segment(struct icl_pdu *request, struct mbuf **r,
     size_t *rs, bool *more_neededp)
 {
 	struct icl_soft_conn *isc;
 	size_t len, padding = 0;
 	struct mbuf *m;
 
 	isc = (struct icl_soft_conn *)request->ip_conn;
 
 	*more_neededp = false;
 	isc->receive_len = 0;
 
 	len = icl_pdu_data_segment_length(request);
 	if (len == 0)
 		return (0);
 
 	/* Padding is derived from the full segment length, not the rest. */
 	if ((len % 4) != 0)
 		padding = 4 - (len % 4);
 
 	/*
 	 * Account for already received parts of data segment.
 	 */
 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
 	len -= request->ip_data_len;
 
 	if (len + padding > *rs) {
 		/*
 		 * Not enough data in the socket buffer.  Receive as much
 		 * as we can.  Don't receive padding, since, obviously, it's
 		 * not the end of data segment yet.
 		 */
 #if 0
 		ICL_DEBUG("limited from %zd to %zd",
 		    len + padding, *rs - padding));
 #endif
 		len = *rs - padding;
 		*more_neededp = true;
 		padding = 0;
 	}
 
 	/*
 	 * Must not try to receive padding without at least one byte
 	 * of actual data segment.
 	 */
 	if (len > 0) {
 		/* 'm' keeps the first len + padding bytes; '*r' the rest. */
 		m = *r;
 		*r = m_split(m, len + padding, M_WAITOK);
 		*rs -= len + padding;
 
 		if (request->ip_data_mbuf == NULL)
 			request->ip_data_mbuf = m;
 		else
 			m_cat(request->ip_data_mbuf, m);
 
 		/* ip_data_len counts payload only; padding is excluded. */
 		request->ip_data_len += len;
 	} else
 		ICL_DEBUG("len 0");
 
 	if (*more_neededp)
 		isc->receive_len = icl_pdu_data_segment_receive_len(request);
 
 	return (0);
 }
 
 /*
  * Consume the data digest from the receive chain and compare it with
  * the CRC32C computed over the received data segment.  Returns 0 when
  * data digests are not enabled, when the PDU carries no data, or when
  * the digests match; returns -1 on mismatch.
  */
 static int
 icl_pdu_check_data_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 	uint32_t expected, wire;
 
 	if (!request->ip_conn->ic_data_crc32c)
 		return (0);
 	if (request->ip_data_len == 0)
 		return (0);
 
 	CTASSERT(sizeof(wire) == ISCSI_DATA_DIGEST_SIZE);
 	icl_soft_receive_buf(r, rs, &wire, ISCSI_DATA_DIGEST_SIZE);
 
 	/*
 	 * ip_data_mbuf holds the padding as well, and the digest is
 	 * supposed to cover it, so the length is rounded up to four.
 	 */
 	expected = icl_mbuf_to_crc32c(request->ip_data_mbuf,
 	    roundup2(request->ip_data_len, 4));
 	if (wire == expected)
 		return (0);
 
 	ICL_WARN("data digest check failed; got 0x%x, "
 	    "should be 0x%x", wire, expected);
 	return (-1);
 }
 
 /*
  * Somewhat contrary to the name, this attempts to receive only one
  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
  *
  * Each call advances isc->receive_state by one step (BHS -> AHS ->
  * header digest -> data -> data digest) and stores in isc->receive_len
  * how many bytes the next step needs.  '*r' / '*rs' are the receive
  * chain and its byte count.  The completed PDU is returned from the
  * data digest step; NULL is returned after every other step and after
  * any failure, in which case icl_conn_fail() has been called.
  */
 static struct icl_pdu *
 icl_conn_receive_pdu(struct icl_soft_conn *isc, struct mbuf **r, size_t *rs)
 {
 	struct icl_conn *ic = &isc->ic;
 	struct icl_pdu *request;
 	size_t len;
 	int error = 0;
 	bool more_needed;
 
 	/* A new PDU is allocated at the BHS step; later steps reuse it. */
 	if (isc->receive_state == ICL_CONN_STATE_BHS) {
 		KASSERT(isc->receive_pdu == NULL,
 		    ("isc->receive_pdu != NULL"));
 		request = icl_soft_conn_new_pdu(ic, M_NOWAIT);
 		if (request == NULL) {
 			ICL_DEBUG("failed to allocate PDU; "
 			    "dropping connection");
 			icl_conn_fail(ic);
 			return (NULL);
 		}
 		isc->receive_pdu = request;
 	} else {
 		KASSERT(isc->receive_pdu != NULL,
 		    ("isc->receive_pdu == NULL"));
 		request = isc->receive_pdu;
 	}
 
 	switch (isc->receive_state) {
 	case ICL_CONN_STATE_BHS:
 		//ICL_DEBUG("receiving BHS");
 		icl_soft_receive_buf(r, rs, request->ip_bhs,
 		    sizeof(struct iscsi_bhs));
 
 		/*
 		 * We don't enforce any limit for AHS length;
 		 * its length is stored in 8 bit field.
 		 */
 
 		len = icl_pdu_data_segment_length(request);
 		if (len > ic->ic_max_recv_data_segment_length) {
 			ICL_WARN("received data segment "
 			    "length %zd is larger than negotiated; "
 			    "dropping connection", len);
 			error = EINVAL;
 			break;
 		}
 
 		isc->receive_state = ICL_CONN_STATE_AHS;
 		isc->receive_len = icl_pdu_ahs_length(request);
 		break;
 
 	case ICL_CONN_STATE_AHS:
 		//ICL_DEBUG("receiving AHS");
 		icl_pdu_receive_ahs(request, r, rs);
 		isc->receive_state = ICL_CONN_STATE_HEADER_DIGEST;
 		if (ic->ic_header_crc32c == false)
 			isc->receive_len = 0;
 		else
 			isc->receive_len = ISCSI_HEADER_DIGEST_SIZE;
 		break;
 
 	case ICL_CONN_STATE_HEADER_DIGEST:
 		//ICL_DEBUG("receiving header digest");
 		error = icl_pdu_check_header_digest(request, r, rs);
 		if (error != 0) {
 			ICL_DEBUG("header digest failed; "
 			    "dropping connection");
 			break;
 		}
 
 		isc->receive_state = ICL_CONN_STATE_DATA;
 		isc->receive_len = icl_pdu_data_segment_receive_len(request);
 		break;
 
 	case ICL_CONN_STATE_DATA:
 		//ICL_DEBUG("receiving data segment");
 		error = icl_pdu_receive_data_segment(request, r, rs,
 		    &more_needed);
 		if (error != 0) {
 			ICL_DEBUG("failed to receive data segment;"
 			    "dropping connection");
 			break;
 		}
 
 		/* Stay in the DATA state until the whole segment is in. */
 		if (more_needed)
 			break;
 
 		isc->receive_state = ICL_CONN_STATE_DATA_DIGEST;
 		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
 			isc->receive_len = 0;
 		else
 			isc->receive_len = ISCSI_DATA_DIGEST_SIZE;
 		break;
 
 	case ICL_CONN_STATE_DATA_DIGEST:
 		//ICL_DEBUG("receiving data digest");
 		error = icl_pdu_check_data_digest(request, r, rs);
 		if (error != 0) {
 			ICL_DEBUG("data digest failed; "
 			    "dropping connection");
 			break;
 		}
 
 		/*
 		 * We've received complete PDU; reset the receive state machine
 		 * and return the PDU.
 		 */
 		isc->receive_state = ICL_CONN_STATE_BHS;
 		isc->receive_len = sizeof(struct iscsi_bhs);
 		isc->receive_pdu = NULL;
 		return (request);
 
 	default:
 		panic("invalid receive_state %d\n", isc->receive_state);
 	}
 
 	if (error != 0) {
 		/*
 		 * Don't free the PDU; it's pointed to by isc->receive_pdu
 		 * and will get freed in icl_soft_conn_close().
 		 */
 		icl_conn_fail(ic);
 	}
 
 	return (NULL);
 }
 
 /*
  * Run the receive state machine over the buffered data in '*r' ('*rs'
  * bytes) and pass every completed PDU to the connection's ic_receive
  * callback.  Returns when the connection is disconnecting, when the
  * next step needs more data than is buffered, or after dropping the
  * connection because a PDU carried an AHS.
  */
 static void
 icl_conn_receive_pdus(struct icl_soft_conn *isc, struct mbuf **r, size_t *rs)
 {
 	struct icl_conn *ic = &isc->ic;
 	struct icl_pdu *pdu;
 
 	while (!ic->ic_disconnecting) {
 		/*
 		 * Keep stepping until a PDU is complete or the buffered
 		 * data cannot satisfy the next step.
 		 */
 		if (*rs < isc->receive_len) {
 #if 0
 			ICL_DEBUG("not enough data; have %zd, need %zd",
 			    *rs, isc->receive_len);
 #endif
 			return;
 		}
 
 		pdu = icl_conn_receive_pdu(isc, r, rs);
 		if (pdu == NULL)
 			continue;
 
 		if (pdu->ip_ahs_len > 0) {
 			ICL_WARN("received PDU with unsupported "
 			    "AHS; opcode 0x%x; dropping connection",
 			    pdu->ip_bhs->bhs_opcode);
 			icl_soft_conn_pdu_free(ic, pdu);
 			icl_conn_fail(ic);
 			return;
 		}
 
 		(ic->ic_receive)(pdu);
 	}
 }
 
 /*
  * Kernel thread body: drain the socket receive buffer into a local mbuf
  * chain and feed that chain to icl_conn_receive_pdus().  'arg' is the
  * struct icl_soft_conn.  The loop ends when ic_disconnecting is set, on
  * a socket error with nothing to read, or on a failed or short
  * soreceive().  On exit the thread frees any leftover data, clears
  * receive_running and signals send_cv.
  */
 static void
 icl_receive_thread(void *arg)
 {
 	struct icl_soft_conn *isc = arg;
 	struct icl_conn *ic = &isc->ic;
 	size_t available, read = 0;
 	struct socket *so;
 	struct mbuf *m, *r = NULL;
 	struct uio uio;
 	int error, flags;
 
 	so = ic->ic_socket;
 
 	/* 'r' holds data read but not yet consumed; 'read' is its size. */
 	for (;;) {
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (ic->ic_disconnecting) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			break;
 		}
 
 		/*
 		 * Set the low watermark, to be checked by
 		 * soreadable() in icl_soupcall_receive()
 		 * to avoid unnecessary wakeups until there
 		 * is enough data received to read the PDU.
 		 */
 		available = sbavail(&so->so_rcv);
 		if (read + available < isc->receive_len) {
 			so->so_rcv.sb_lowat = isc->receive_len - read;
 			cv_wait(&isc->receive_cv, SOCKBUF_MTX(&so->so_rcv));
 			/* Back to a watermark above the buffer's hiwat. */
 			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
 			available = sbavail(&so->so_rcv);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		if (available == 0) {
 			if (so->so_error != 0) {
 				ICL_DEBUG("connection error %d; "
 				    "dropping connection", so->so_error);
 				icl_conn_fail(ic);
 				break;
 			}
 			continue;
 		}
 
 		/* Take everything that is available, as an mbuf chain. */
 		memset(&uio, 0, sizeof(uio));
 		uio.uio_resid = available;
 		flags = MSG_DONTWAIT;
 		error = soreceive(so, NULL, &uio, &m, NULL, &flags);
 		if (error != 0) {
 			ICL_DEBUG("soreceive error %d", error);
 			break;
 		}
 		if (uio.uio_resid != 0) {
 			m_freem(m);
 			ICL_DEBUG("short read");
 			break;
 		}
 		if (r)
 			m_cat(r, m);
 		else
 			r = m;
 		read += available;
 
 		icl_conn_receive_pdus(isc, &r, &read);
 	}
 
 	if (r)
 		m_freem(r);
 
 	/* icl_soft_conn_close() waits on send_cv for this flag to clear. */
 	ICL_CONN_LOCK(ic);
 	isc->receive_running = false;
 	cv_signal(&isc->send_cv);
 	ICL_CONN_UNLOCK(ic);
 	kthread_exit();
 }
 
 /*
  * Receive-side socket upcall: wake the receive thread once the socket
  * reports itself readable.  'arg' is the struct icl_soft_conn.  Always
  * returns SU_OK.
  */
 static int
 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
 {
 	struct icl_soft_conn *isc = arg;
 
 	if (soreadable(so))
 		cv_signal(&isc->receive_cv);
 
 	return (SU_OK);
 }
 
 /*
  * Turn a queued PDU into a single mbuf chain ready for transmission:
  * store the data segment length in the header, append the header digest
  * (if enabled), pad the data, append the data digest (if enabled), and
  * chain the data behind the BHS.  The packet header length of the BHS
  * mbuf is set to the total PDU size.
  *
  * Returns 0 on success and 1 when an m_append() fails.
  */
 static int
 icl_pdu_finalize(struct icl_pdu *request)
 {
 	size_t padding, pdu_len;
 	uint32_t digest, zero = 0;
 	int ok;
 	struct icl_conn *ic;
 
 	ic = request->ip_conn;
 
 	icl_pdu_set_data_segment_length(request, request->ip_data_len);
 
 	pdu_len = icl_pdu_size(request);
 
 	if (ic->ic_header_crc32c) {
 		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf,
 		    ISCSI_BHS_SIZE);
 		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
 		    (void *)&digest);
 		if (ok != 1) {
 			ICL_WARN("failed to append header digest");
 			return (1);
 		}
 	}
 
 	if (request->ip_data_len != 0) {
 		/* The padding bytes are copied out of the zeroed 'zero'. */
 		padding = icl_pdu_padding(request);
 		if (padding > 0) {
 			ok = m_append(request->ip_data_mbuf, padding,
 			    (void *)&zero);
 			if (ok != 1) {
 				ICL_WARN("failed to append padding");
 				return (1);
 			}
 		}
 
 		if (ic->ic_data_crc32c) {
 			/* The digest covers the data and its padding. */
 			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf,
 			    roundup2(request->ip_data_len, 4));
 
 			ok = m_append(request->ip_data_mbuf, sizeof(digest),
 			    (void *)&digest);
 			if (ok != 1) {
 				ICL_WARN("failed to append data digest");
 				return (1);
 			}
 		}
 
 		/* The data now lives on the BHS chain only. */
 		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
 		request->ip_data_mbuf = NULL;
 	}
 
 	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
 
 	return (0);
 }
 
 /*
  * Finalize and transmit PDUs from the head of 'queue' for as long as the
  * socket send buffer has room for them.  A PDU that does not fit stays
  * on 'queue' and the send low watermark is raised so that the socket
  * upcall reports once there is enough space.  When 'coalesce' is set,
  * following PDUs that also fit are merged into the same mbuf chain and
  * passed to a single sosend() call.  A finalize or send failure
  * completes the affected PDU(s) with an error and fails the connection.
  *
  * Must be called without the connection lock held.
  */
 static void
 icl_conn_send_pdus(struct icl_soft_conn *isc, struct icl_pdu_stailq *queue)
 {
 	struct icl_conn *ic = &isc->ic;
 	struct icl_pdu *request, *request2;
 	struct mbuf *m;
 	struct socket *so;
 	long available, size, size2;
 #ifdef DEBUG_COALESCED
 	int coalesced;
 #endif
 	int error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	so = ic->ic_socket;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * Check how much space we have for transmit.  We can't just
 	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
 	 * as it always frees the mbuf chain passed to it, even in case
 	 * of error.
 	 */
 	available = sbspace(&so->so_snd);
 	isc->check_send_space = false;
 
 	/*
 	 * Notify the socket upcall that we don't need wakeups
 	 * for the time being.
 	 */
 	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	while (!STAILQ_EMPTY(queue)) {
 		request = STAILQ_FIRST(queue);
 		size = icl_pdu_size(request);
 		if (available < size) {
 			/*
 			 * Set the low watermark, to be checked by
 			 * sowriteable() in icl_soupcall_send()
 			 * to avoid unnecessary wakeups until there
 			 * is enough space for the PDU to fit.
 			 */
 			SOCKBUF_LOCK(&so->so_snd);
 			/* Re-read the space; 'available' may be stale. */
 			available = sbspace(&so->so_snd);
 			if (available < size) {
 #if 1
 				ICL_DEBUG("no space to send; "
 				    "have %ld, need %ld",
 				    available, size);
 #endif
 				so->so_snd.sb_lowat = max(size,
 				    so->so_snd.sb_hiwat / 8);
 				SOCKBUF_UNLOCK(&so->so_snd);
 				return;
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 		}
 		STAILQ_REMOVE_HEAD(queue, ip_next);
 		error = icl_pdu_finalize(request);
 		if (error != 0) {
 			ICL_DEBUG("failed to finalize PDU; "
 			    "dropping connection");
 			icl_soft_pdu_done(request, EIO);
 			icl_conn_fail(ic);
 			return;
 		}
 		if (coalesce) {
 			/*
 			 * Keep appending following PDUs to the chain of
 			 * 'request' while they fit in the known space.
 			 */
 			m = request->ip_bhs_mbuf;
 			for (
 #ifdef DEBUG_COALESCED
 			    coalesced = 1
 #endif
 			    ; ;
 #ifdef DEBUG_COALESCED
 			    coalesced++
 #endif
 			    ) {
 				request2 = STAILQ_FIRST(queue);
 				if (request2 == NULL)
 					break;
 				size2 = icl_pdu_size(request2);
 				if (available < size + size2)
 					break;
 				STAILQ_REMOVE_HEAD(queue, ip_next);
 				error = icl_pdu_finalize(request2);
 				if (error != 0) {
 					ICL_DEBUG("failed to finalize PDU; "
 					    "dropping connection");
 					icl_soft_pdu_done(request, EIO);
 					icl_soft_pdu_done(request2, EIO);
 					icl_conn_fail(ic);
 					return;
 				}
 				/* Advance 'm' to the current chain tail. */
 				while (m->m_next)
 					m = m->m_next;
 				m_cat(m, request2->ip_bhs_mbuf);
 				request2->ip_bhs_mbuf = NULL;
 				request->ip_bhs_mbuf->m_pkthdr.len += size2;
 				size += size2;
 				icl_soft_pdu_done(request2, 0);
 			}
 #ifdef DEBUG_COALESCED
 			if (coalesced > 1) {
 				ICL_DEBUG("coalesced %d PDUs into %ld bytes",
 				    coalesced, size);
 			}
 #endif
 		}
 		available -= size;
 		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
 		    NULL, MSG_DONTWAIT, curthread);
 		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
 		if (error != 0) {
 			ICL_DEBUG("failed to send PDU, error %d; "
 			    "dropping connection", error);
 			icl_soft_pdu_done(request, error);
 			icl_conn_fail(ic);
 			return;
 		}
 		icl_soft_pdu_done(request, 0);
 	}
 }
 
 /*
  * Kernel thread body: move PDUs from isc->to_send onto a private queue
  * and transmit them with icl_conn_send_pdus(), sleeping on send_cv when
  * there is nothing more that can be done.  'arg' is the struct
  * icl_soft_conn.  The thread leaves its loop once ic_disconnecting is
  * set; it then returns unsent PDUs to isc->to_send, clears send_running
  * and signals send_cv.
  */
 static void
 icl_send_thread(void *arg)
 {
 	struct icl_soft_conn *isc;
 	struct icl_conn *ic;
 	struct icl_pdu_stailq queue;
 
 	isc = arg;
 	ic = &isc->ic;
 
 	STAILQ_INIT(&queue);
 
 	ICL_CONN_LOCK(ic);
 	for (;;) {
 		for (;;) {
 			/*
 			 * Populate the local queue from the main one.
 			 * This way the icl_conn_send_pdus() can go through
 			 * all the queued PDUs without holding any locks.
 			 */
 			if (STAILQ_EMPTY(&queue) || isc->check_send_space)
 				STAILQ_CONCAT(&queue, &isc->to_send);
 
 			/* The send path asserts that the lock is not held. */
 			ICL_CONN_UNLOCK(ic);
 			icl_conn_send_pdus(isc, &queue);
 			ICL_CONN_LOCK(ic);
 
 			/*
 			 * The icl_soupcall_send() was called since the last
 			 * call to sbspace(); go around;
 			 */
 			if (isc->check_send_space)
 				continue;
 
 			/*
 			 * Local queue is empty, but we still have PDUs
 			 * in the main one; go around.
 			 */
 			if (STAILQ_EMPTY(&queue) &&
 			    !STAILQ_EMPTY(&isc->to_send))
 				continue;
 
 			/*
 			 * There might be some stuff in the local queue,
 			 * which didn't get sent due to not having enough send
 			 * space.  Wait for socket upcall.
 			 */
 			break;
 		}
 
 		if (ic->ic_disconnecting) {
 			//ICL_DEBUG("terminating");
 			break;
 		}
 
 		cv_wait(&isc->send_cv, ic->ic_lock);
 	}
 
 	/*
 	 * We're exiting; move PDUs back to the main queue, so they can
 	 * get freed properly.  At this point ordering doesn't matter.
 	 */
 	STAILQ_CONCAT(&isc->to_send, &queue);
 
 	/* icl_soft_conn_close() waits on send_cv for this flag to clear. */
 	isc->send_running = false;
 	cv_signal(&isc->send_cv);
 	ICL_CONN_UNLOCK(ic);
 	kthread_exit();
 }
 
 /*
  * Send-side socket upcall: once the socket reports itself writeable,
  * flag that the send space should be checked again and wake the send
  * thread.  'arg' is the struct icl_soft_conn.  Always returns SU_OK.
  */
 static int
 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
 {
 	struct icl_soft_conn *isc = arg;
 	struct icl_conn *ic = &isc->ic;
 
 	if (sowriteable(so)) {
 		ICL_CONN_LOCK(ic);
 		isc->check_send_space = true;
 		ICL_CONN_UNLOCK(ic);
 
 		cv_signal(&isc->send_cv);
 	}
 
 	return (SU_OK);
 }
 
 static void
 icl_soft_free_mext_pg(struct mbuf *m)
 {
 	struct icl_soft_pdu *isp;
 
 	M_ASSERTEXTPG(m);
 
 	/*
 	 * Nothing to do for the pages; they are owned by the PDU /
 	 * I/O request.
 	 */
 
 	/* Drop reference on the PDU. */
 	isp = m->m_ext.ext_arg1;
 	if (atomic_fetchadd_int(&isp->ref_cnt, -1) == 1)
 		icl_soft_pdu_call_cb(&isp->ip);
 }
 
 /*
  * Append 'len' bytes of the unmapped bio 'bp', starting 'offset' bytes
  * into its data, to the data segment of 'request'.
  *
  * With ICL_NOCOPY set in 'flags' the bio's pages are referenced from
  * M_EXTPG mbufs, each of which holds a reference on the PDU that is
  * dropped by icl_soft_free_mext_pg().  Otherwise the data is copied
  * into freshly allocated mbufs through the direct map.  The remaining
  * bits of 'flags' are passed on to the mbuf allocator.
  *
  * Returns 0 on success or ENOMEM when an mbuf allocation fails.
  */
 static int
 icl_soft_conn_pdu_append_bio(struct icl_conn *ic, struct icl_pdu *request,
     struct bio *bp, size_t offset, size_t len, int flags)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request;
 	struct mbuf *m, *m_tail;
 	vm_offset_t vaddr;
 	size_t mtodo, page_offset, todo;
 	int i;
 
 	KASSERT(len > 0, ("len == 0"));
 
 	/* Find the current end of the PDU's data chain, if there is one. */
 	m_tail = request->ip_data_mbuf;
 	if (m_tail != NULL)
 		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
 			;
 
 	/*
 	 * Translate 'offset' into an index into bp->bio_ma[] and an offset
 	 * within that page; the first page starts at bio_ma_offset.
 	 */
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 	if (offset < PAGE_SIZE - bp->bio_ma_offset) {
 		page_offset = bp->bio_ma_offset + offset;
 		i = 0;
 	} else {
 		offset -= PAGE_SIZE - bp->bio_ma_offset;
 		for (i = 1; offset >= PAGE_SIZE; i++)
 			offset -= PAGE_SIZE;
 		page_offset = offset;
 	}
 
 	if (flags & ICL_NOCOPY) {
 		m = NULL;
 		while (len > 0) {
 			if (m == NULL) {
 				m = mb_alloc_ext_pgs(flags & ~ICL_NOCOPY,
-				    icl_soft_free_mext_pg);
+				    icl_soft_free_mext_pg, 0);
 				if (__predict_false(m == NULL))
 					return (ENOMEM);
 				atomic_add_int(&isp->ref_cnt, 1);
 				m->m_ext.ext_arg1 = isp;
 				m->m_epg_1st_off = page_offset;
 			}
 
 			todo = MIN(len, PAGE_SIZE - page_offset);
 
 			m->m_epg_pa[m->m_epg_npgs] =
 			    VM_PAGE_TO_PHYS(bp->bio_ma[i]);
 			m->m_epg_npgs++;
 			m->m_epg_last_len = todo;
 			m->m_len += todo;
 			m->m_ext.ext_size += PAGE_SIZE;
 			MBUF_EXT_PGS_ASSERT_SANITY(m);
 
 			/* This mbuf is full; link it in and start another. */
 			if (m->m_epg_npgs == MBUF_PEXT_MAX_PGS) {
 				if (m_tail != NULL)
 					m_tail->m_next = m;
 				else
 					request->ip_data_mbuf = m;
 				m_tail = m;
 				request->ip_data_len += m->m_len;
 				m = NULL;
 			}
 
 			page_offset = 0;
 			len -= todo;
 			i++;
 		}
 
 		/* Link in the last, partially filled mbuf. */
 		if (m != NULL) {
 			if (m_tail != NULL)
 				m_tail->m_next = m;
 			else
 				request->ip_data_mbuf = m;
 			request->ip_data_len += m->m_len;
 		}
 		return (0);
 	}
 
 	m = m_getm2(NULL, len, flags, MT_DATA, 0);
 	if (__predict_false(m == NULL))
 		return (ENOMEM);
 
 	if (request->ip_data_mbuf == NULL) {
 		request->ip_data_mbuf = m;
 		request->ip_data_len = len;
 	} else {
 		m_tail->m_next = m;
 		request->ip_data_len += len;
 	}
 
 	while (len > 0) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 		vaddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(bp->bio_ma[i]));
 
 		/*
 		 * Charge this page's share against 'len' before the inner
 		 * loop runs: that loop counts 'todo' down to zero, so
 		 * subtracting it afterwards would never shrink 'len' and
 		 * the outer loop would not terminate.
 		 */
 		len -= todo;
 
 		/* Spread the page's bytes over as many mbufs as needed. */
 		do {
 			mtodo = min(todo, M_SIZE(m) - m->m_len);
 			memcpy(mtod(m, char *) + m->m_len, (char *)vaddr +
 			    page_offset, mtodo);
 			m->m_len += mtodo;
 			if (m->m_len == M_SIZE(m))
 				m = m->m_next;
 			page_offset += mtodo;
 			todo -= mtodo;
 		} while (todo > 0);
 
 		page_offset = 0;
 		i++;
 	}
 
 	return (0);
 }
 
 /*
  * Append 'len' bytes at 'addr' to the data segment of 'request'.
  *
  * With ICL_NOCOPY set in 'flags' the caller's buffer is attached as
  * read-only external storage tied to the PDU's reference count;
  * otherwise the bytes are copied into newly allocated mbufs.  The
  * remaining bits of 'flags' are passed on to the mbuf allocator.
  *
  * Returns 0 on success or ENOMEM when an mbuf allocation fails.
  */
 static int
 icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
     const void *addr, size_t len, int flags)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request;
 	struct mbuf *chain, *cur;
 	size_t chunk, copied;
 
 	KASSERT(len > 0, ("len == 0"));
 
 	if ((flags & ICL_NOCOPY) == 0) {
 		chain = m_getm2(NULL, len, flags, MT_DATA, 0);
 		if (chain == NULL) {
 			ICL_WARN("failed to allocate mbuf for %zd bytes", len);
 			return (ENOMEM);
 		}
 
 		/* Fill every mbuf of the new chain in turn. */
 		copied = 0;
 		cur = chain;
 		while (cur != NULL) {
 			chunk = min(M_TRAILINGSPACE(cur), len - copied);
 			memcpy(mtod(cur, char *), (const char *)addr + copied,
 			    chunk);
 			cur->m_len = chunk;
 			copied += chunk;
 			cur = cur->m_next;
 		}
 		KASSERT(copied == len, ("%s: off != len", __func__));
 	} else {
 		chain = m_get(flags & ~ICL_NOCOPY, MT_DATA);
 		if (chain == NULL) {
 			ICL_WARN("failed to allocate mbuf");
 			return (ENOMEM);
 		}
 
 		chain->m_flags |= M_RDONLY;
 		m_extaddref(chain, __DECONST(char *, addr), len, &isp->ref_cnt,
 		    icl_soft_mbuf_done, isp, NULL);
 		chain->m_len = len;
 	}
 
 	/* The first piece starts the data chain; later ones extend it. */
 	if (request->ip_data_mbuf != NULL) {
 		m_cat(request->ip_data_mbuf, chain);
 		request->ip_data_len += len;
 	} else {
 		request->ip_data_mbuf = chain;
 		request->ip_data_len = len;
 	}
 
 	return (0);
 }
 
 /*
  * Copy 'len' bytes of the data segment of 'ip', starting at 'pdu_off',
  * into the pages of the unmapped bio 'bp', starting 'bio_off' bytes
  * into the bio's data.  The pages are written through the direct map.
  */
 void
 icl_soft_conn_pdu_get_bio(struct icl_conn *ic, struct icl_pdu *ip,
     size_t pdu_off, struct bio *bp, size_t bio_off, size_t len)
 {
 	vm_offset_t vaddr;
 	size_t first_len, page_offset, todo;
 	int i __unused;
 
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 
 	/* The first page only contributes what follows bio_ma_offset. */
 	first_len = PAGE_SIZE - bp->bio_ma_offset;
 	if (bio_off < first_len) {
 		i = 0;
 		page_offset = bp->bio_ma_offset + bio_off;
 	} else {
 		bio_off -= first_len;
 		i = 1 + bio_off / PAGE_SIZE;
 		page_offset = bio_off % PAGE_SIZE;
 	}
 
 	/* One copy per page; only the first page has a non-zero offset. */
 	for (; len > 0; len -= todo, pdu_off += todo, i++) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 
 		vaddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(bp->bio_ma[i]));
 		m_copydata(ip->ip_data_mbuf, pdu_off, todo, (char *)vaddr +
 		    page_offset);
 
 		page_offset = 0;
 	}
 }
 
 /*
  * Copy 'len' bytes of the data segment of 'ip', starting at offset
  * 'off', into the buffer at 'addr'.  'ic' is not used.
  */
 void
 icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 
 	m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /*
  * Queue 'ip' for transmission without a completion callback; this is
  * icl_soft_conn_pdu_queue_cb() with a NULL callback.
  */
 static void
 icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 
 	icl_soft_conn_pdu_queue_cb(ic, ip, NULL);
 }
 
 /*
  * Queue 'ip' for transmission and remember 'cb' as its callback.  The
  * connection lock must be held.  The PDU gains one reference; when the
  * connection is closing or has no socket the PDU is completed right
  * away with ENOTCONN instead of being queued.
  */
 static void
 icl_soft_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
     icl_pdu_cb cb)
 {
 	struct icl_soft_conn *isc = (struct icl_soft_conn *)ic;
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 	bool was_empty;
 
 	ICL_CONN_LOCK_ASSERT(ic);
 	isp->ref_cnt++;
 	isp->cb = cb;
 
 	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
 		ICL_DEBUG("icl_pdu_queue on closed connection");
 		icl_soft_pdu_done(ip, ENOTCONN);
 		return;
 	}
 
 	/*
 	 * A non-empty queue means somebody has already signaled the
 	 * send thread, so a wakeup is only needed for the first entry.
 	 */
 	was_empty = STAILQ_EMPTY(&isc->to_send);
 	STAILQ_INSERT_TAIL(&isc->to_send, ip, ip_next);
 	if (was_empty)
 		cv_signal(&isc->send_cv);
 }
 
 /*
  * Allocate and initialize a connection object.  'name' and 'lock' are
  * stored in the connection, not copied.  The global connection count
  * icl_ncons is bumped; icl_soft_conn_free() drops it again.  Returns
  * the embedded struct icl_conn.
  */
 static struct icl_conn *
 icl_soft_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_soft_conn *isc;
 	struct icl_conn *ic;
 
 	refcount_acquire(&icl_ncons);
 
 	isc = (struct icl_soft_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT,
 	    M_WAITOK | M_ZERO);
 	ic = &isc->ic;
 
 	/* Generic connection fields. */
 	ic->ic_name = name;
 	ic->ic_lock = lock;
 	ic->ic_offload = "None";
 	ic->ic_unmapped = PMAP_HAS_DMAP;
 #ifdef DIAGNOSTIC
 	refcount_init(&ic->ic_outstanding_pdus, 0);
 #endif
 
 	/* Send queue and the condition variables of the two threads. */
 	cv_init(&isc->receive_cv, "icl_rx");
 	cv_init(&isc->send_cv, "icl_tx");
 	STAILQ_INIT(&isc->to_send);
 
 	return (ic);
 }
 
 /*
  * Destroy a connection object created by icl_soft_new_conn(): tear down
  * its condition variables, delete the kobj and drop the global
  * connection count.
  */
 void
 icl_soft_conn_free(struct icl_conn *ic)
 {
 	struct icl_soft_conn *isc;
 
 	isc = (struct icl_soft_conn *)ic;
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 	cv_destroy(&isc->receive_cv);
 	cv_destroy(&isc->send_cv);
 	kobj_delete((struct kobj *)isc, M_ICL_SOFT);
 	refcount_release(&icl_ncons);
 }
 
 /*
  * Bring a connection with an attached socket into operation: reset the
  * receive state machine, reserve socket buffer space, set TCP_NODELAY,
  * register the send and receive upcalls and start the send and receive
  * threads.
  *
  * Returns 0 on success, EINVAL when no socket is attached, or the error
  * from soreserve(), sosetopt() or kthread_add(); on those errors the
  * connection is closed before returning.
  */
 static int
 icl_conn_start(struct icl_conn *ic)
 {
 	struct icl_soft_conn *isc = (struct icl_soft_conn *)ic;
 	size_t minspace;
 	struct sockopt opt;
 	int error, one = 1;
 
 	ICL_CONN_LOCK(ic);
 
 	/*
 	 * XXX: Ugly hack.
 	 */
 	if (ic->ic_socket == NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return (EINVAL);
 	}
 
 	/* The first thing expected from the peer is a BHS. */
 	isc->receive_state = ICL_CONN_STATE_BHS;
 	isc->receive_len = sizeof(struct iscsi_bhs);
 	ic->ic_disconnecting = false;
 
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * For sendspace, this is required because the current code cannot
 	 * send a PDU in pieces; thus, the minimum buffer size is equal
 	 * to the maximum PDU size.  "+4" is to account for possible padding.
 	 */
 	minspace = sizeof(struct iscsi_bhs) +
 	    ic->ic_max_send_data_segment_length +
 	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
 	if (sendspace < minspace) {
 		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
 		    minspace);
 		sendspace = minspace;
 	}
 	/* The same lower bound, computed from the receive-side limit. */
 	minspace = sizeof(struct iscsi_bhs) +
 	    ic->ic_max_recv_data_segment_length +
 	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
 	if (recvspace < minspace) {
 		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
 		    minspace);
 		recvspace = minspace;
 	}
 
 	error = soreserve(ic->ic_socket, sendspace, recvspace);
 	if (error != 0) {
 		ICL_WARN("soreserve failed with error %d", error);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 	ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
 	ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;
 
 	/*
 	 * Disable Nagle.
 	 */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(ic->ic_socket, &opt);
 	if (error != 0) {
 		/*
 		 * NOTE(review): the option is being set to 1 here, so the
 		 * wording "disabling TCP_NODELAY" looks inverted -- confirm
 		 * before relying on the message.
 		 */
 		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 
 	/*
 	 * Register socket upcall, to get notified about incoming PDUs
 	 * and free space to send outgoing ones.
 	 */
 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
 	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, isc);
 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
 	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, isc);
 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
 
 	/*
 	 * Start threads.
 	 */
 	ICL_CONN_LOCK(ic);
 	isc->send_running = isc->receive_running = true;
 	ICL_CONN_UNLOCK(ic);
 	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
 	    ic->ic_name);
 	if (error != 0) {
 		ICL_WARN("kthread_add(9) failed with error %d", error);
 		/* Neither thread exists; clear both flags before closing. */
 		ICL_CONN_LOCK(ic);
 		isc->send_running = isc->receive_running = false;
 		cv_signal(&isc->send_cv);
 		ICL_CONN_UNLOCK(ic);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
 	    ic->ic_name);
 	if (error != 0) {
 		ICL_WARN("kthread_add(9) failed with error %d", error);
 		/* Only the receive flag is cleared on this path. */
 		ICL_CONN_LOCK(ic);
 		isc->receive_running = false;
 		cv_signal(&isc->send_cv);
 		ICL_CONN_UNLOCK(ic);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Take over the stream socket behind file descriptor 'fd' and start the
  * connection on it.  Must be called without the connection lock held.
  *
  * Returns the fget() error for a bad descriptor, EINVAL when 'fd' is
  * not a stream socket, EBUSY when the connection already has a socket,
  * and otherwise the result of icl_conn_start().
  */
 int
 icl_soft_conn_handoff(struct icl_conn *ic, int fd)
 {
 	cap_rights_t rights;
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 #ifdef ICL_KERNEL_PROXY
 	/*
 	 * We're transitioning to Full Feature phase, and we don't
 	 * really care.
 	 */
 	if (fd == 0) {
 		bool connected;
 
 		ICL_CONN_LOCK(ic);
 		connected = (ic->ic_socket != NULL);
 		ICL_CONN_UNLOCK(ic);
 		if (!connected) {
 			ICL_WARN("proxy handoff without connect");
 			return (EINVAL);
 		}
 		return (0);
 	}
 #endif
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 
 	/* Only stream sockets are acceptable. */
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 
 	/* Move the socket to the connection and neuter the file. */
 	ic->ic_socket = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	return (icl_conn_start(ic));
 }
 
 /*
  * Shut the connection down: mark it as disconnecting, wait for the send
  * and receive threads to exit, detach and close the socket, free a
  * partially received PDU and complete every PDU still on the send queue
  * with ENOTCONN.  Returns early, after the threads are gone, when the
  * connection has no socket.
  */
 void
 icl_soft_conn_close(struct icl_conn *ic)
 {
 	struct icl_soft_conn *isc = (struct icl_soft_conn *)ic;
 	struct icl_pdu *pdu;
 	struct socket *so;
 
 	/*
 	 * Wake up the threads, so they can properly terminate.
 	 * Receive thread sleeps on so->so_rcv lock, send on ic->ic_lock.
 	 */
 	ICL_CONN_LOCK(ic);
 	if (!ic->ic_disconnecting) {
 		/* The flag is flipped with the receive buffer locked. */
 		so = ic->ic_socket;
 		if (so)
 			SOCKBUF_LOCK(&so->so_rcv);
 		ic->ic_disconnecting = true;
 		if (so)
 			SOCKBUF_UNLOCK(&so->so_rcv);
 	}
 	/* Both threads signal send_cv after clearing their flag. */
 	while (isc->receive_running || isc->send_running) {
 		cv_signal(&isc->receive_cv);
 		cv_signal(&isc->send_cv);
 		cv_wait(&isc->send_cv, ic->ic_lock);
 	}
 
 	/* Some other thread could close the connection same time. */
 	so = ic->ic_socket;
 	if (so == NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_socket = NULL;
 
 	/*
 	 * Deregister socket upcalls.
 	 */
 	ICL_CONN_UNLOCK(ic);
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_upcall != NULL)
 		soupcall_clear(so, SO_SND);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_upcall != NULL)
 		soupcall_clear(so, SO_RCV);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	soclose(so);
 	ICL_CONN_LOCK(ic);
 
 	if (isc->receive_pdu != NULL) {
 		//ICL_DEBUG("freeing partially received PDU");
 		icl_soft_conn_pdu_free(ic, isc->receive_pdu);
 		isc->receive_pdu = NULL;
 	}
 
 	/*
 	 * Remove any outstanding PDUs from the send queue.
 	 */
 	while (!STAILQ_EMPTY(&isc->to_send)) {
 		pdu = STAILQ_FIRST(&isc->to_send);
 		STAILQ_REMOVE_HEAD(&isc->to_send, ip_next);
 		icl_soft_pdu_done(pdu, ENOTCONN);
 	}
 
 	KASSERT(STAILQ_EMPTY(&isc->to_send),
 	    ("destroying session with non-empty send queue"));
 	ICL_CONN_UNLOCK(ic);
 }
 
 /*
  * Task setup hook.  This implementation does nothing: no argument is
  * read or written and 0 is returned.
  */
 int
 icl_soft_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
 {
 
 	return (0);
 }
 
 /* Task completion hook; this implementation has nothing to release. */
 void
 icl_soft_conn_task_done(struct icl_conn *ic, void *prv)
 {
 }
 
 /*
  * Transfer setup hook.  This implementation does nothing: no argument
  * is read or written and 0 is returned.
  */
 int
 icl_soft_conn_transfer_setup(struct icl_conn *ic, struct icl_pdu *ip,
     union ctl_io *io, uint32_t *transfer_tag, void **prvp)
 {
 
 	return (0);
 }
 
 /* Transfer completion hook; this implementation has nothing to release. */
 void
 icl_soft_conn_transfer_done(struct icl_conn *ic, void *prv)
 {
 }
 
 /*
  * Fill 'idl' with this driver's limits, taken from the file-level
  * max_data_segment_length, max_burst_length and first_burst_length
  * variables.  'socket' is not used.  Always returns 0.
  */
 static int
 icl_soft_limits(struct icl_drv_limits *idl, int socket)
 {
 
 	/* Burst lengths. */
 	idl->idl_first_burst_length = first_burst_length;
 	idl->idl_max_burst_length = max_burst_length;
 
 	/* The same data segment limit applies in both directions. */
 	idl->idl_max_send_data_segment_length = max_data_segment_length;
 	idl->idl_max_recv_data_segment_length = max_data_segment_length;
 
 	return (0);
 }
 
 #ifdef ICL_KERNEL_PROXY
 /*
  * Kernel-proxy connect entry point: passes all arguments straight to
  * icl_soft_proxy_connect() and returns its result.
  */
 int
 icl_soft_conn_connect(struct icl_conn *ic, int domain, int socktype,
     int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
 {
 
 	return (icl_soft_proxy_connect(ic, domain, socktype, protocol,
 	    from_sa, to_sa));
 }
 
 int
 icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so)
 {
 	int error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return (EBUSY);
 	}
 	ic->ic_socket = so;
 	ICL_CONN_UNLOCK(ic);
 
 	error = icl_conn_start(ic);
 
 	return (error);
 }
 #endif /* ICL_KERNEL_PROXY */
 
 /*
  * Module load handler: create the UMA zone for struct icl_soft_pdu,
  * reset the connection count and register this backend with ICL under
  * the name "none".  Returns the result of icl_register().
  */
 static int
 icl_soft_load(void)
 {
 	int error;
 
 	icl_soft_pdu_zone = uma_zcreate("icl_soft_pdu",
 	    sizeof(struct icl_soft_pdu), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	refcount_init(&icl_ncons, 0);
 
 	/*
 	 * The reason we call this "none" is that to the user,
 	 * it's known as "offload driver"; "offload driver: soft"
 	 * doesn't make much sense.
 	 */
 	error = icl_register("none", false, 0,
 	    icl_soft_limits, icl_soft_new_conn);
 	KASSERT(error == 0, ("failed to register"));
 
 #if defined(ICL_KERNEL_PROXY) && 0
 	/*
 	 * Debugging aid for kernel proxy functionality.
 	 */
 	error = icl_register("proxytest", true, 0,
 	    icl_soft_limits, icl_soft_new_conn);
 	KASSERT(error == 0, ("failed to register"));
 #endif
 
 	return (error);
 }
 
 /*
  * Module unload handler.  Refuses with EBUSY while any connection
  * exists; otherwise unregisters the backend, destroys the PDU zone and
  * returns 0.
  */
 static int
 icl_soft_unload(void)
 {
 
 	if (icl_ncons != 0)
 		return (EBUSY);
 
 	icl_unregister("none", false);
 #if defined(ICL_KERNEL_PROXY) && 0
 	icl_unregister("proxytest", true);
 #endif
 
 	/* The zone goes last, after the backend is unregistered. */
 	uma_zdestroy(icl_soft_pdu_zone);
 
 	return (0);
 }
 
 /*
  * Module event handler: dispatches MOD_LOAD and MOD_UNLOAD to the load
  * and unload routines and answers every other event with EINVAL.
  */
 static int
 icl_soft_modevent(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (icl_soft_load());
 	if (what == MOD_UNLOAD)
 		return (icl_soft_unload());
 
 	return (EINVAL);
 }
 
 /*
  * Module glue: the module is named "icl_soft", its events go to
  * icl_soft_modevent(), and no extra argument is passed to the handler.
  */
 moduledata_t icl_soft_data = {
 	"icl_soft",
 	icl_soft_modevent,
 	0
 };
 
 /* Declared at SI_SUB_DRIVERS; depends on version 1 of the "icl" module. */
 DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 MODULE_DEPEND(icl_soft, icl, 1, 1, 1);
 MODULE_VERSION(icl_soft, 1);
diff --git a/sys/dev/nvmf/nvmf_tcp.c b/sys/dev/nvmf/nvmf_tcp.c
index 22275aaa835b..2e33334b92ee 100644
--- a/sys/dev/nvmf/nvmf_tcp.c
+++ b/sys/dev/nvmf/nvmf_tcp.c
@@ -1,1874 +1,1874 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2022-2024 Chelsio Communications, Inc.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  */
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/file.h>
 #include <sys/gsb_crc32.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <dev/nvme/nvme.h>
 #include <dev/nvmf/nvmf.h>
 #include <dev/nvmf/nvmf_proto.h>
 #include <dev/nvmf/nvmf_tcp.h>
 #include <dev/nvmf/nvmf_transport.h>
 #include <dev/nvmf/nvmf_transport_internal.h>
 
 struct nvmf_tcp_capsule;
 struct nvmf_tcp_qpair;
 
 /*
  * Tracks the data transfer for one command: the I/O request being filled
  * or drained, the window of the command's data it covers, and how far the
  * transfer has progressed.  Reference counted via refs.
  */
 struct nvmf_tcp_command_buffer {
 	struct nvmf_tcp_qpair *qp;	/* Queue pair given at allocation */
 
 	struct nvmf_io_request io;	/* Copy of the caller's I/O request */
 	size_t	data_len;		/* Length of the covered window */
 	size_t	data_xfered;		/* Bytes accounted for so far */
 	uint32_t data_offset;		/* Start of window within the command */
 
 	u_int	refs;			/* Buffer is freed when this hits 0 */
 	int	error;			/* Reported when the I/O completes */
 
 	uint16_t cid;			/* Command identifier */
 	uint16_t ttag;			/* Transfer tag; 0 until one is assigned */
 
 	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
 
 	/* Controller only */
 	struct nvmf_tcp_capsule *tc;
 };
 
 /*
  * A list of command buffers.  The list helpers assert that lock is held
  * before touching head.
  */
 struct nvmf_tcp_command_buffer_list {
 	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
 	struct mtx lock;
 };
 
 /*
  * TCP transport state for one queue pair.  The generic nvmf_qpair must
  * remain the first member because TQP() casts between the two types.
  */
 struct nvmf_tcp_qpair {
 	struct nvmf_qpair qp;
 
 	struct socket *so;
 
 	volatile u_int refs;	/* Every allocated capsule holds a reference */
 	uint8_t	txpda;		/* Data alignment applied to built PDUs */
 	uint8_t rxpda;		/* Passed to PDU header validation */
 	bool header_digests;	/* HDGST is added to / expected in PDUs */
 	bool data_digests;	/* DDGST is added to / expected in PDUs */
 	uint32_t maxr2t;	/* NB: 0's based */
 	uint32_t maxh2cdata;	/* Controller only */
 	uint32_t max_tx_data;	/* Cap on payload per transmitted data PDU */
 	uint32_t max_icd;	/* Host only */
 	uint16_t next_ttag;	/* Controller only */
 	u_int num_ttags;	/* Controller only */
 	u_int active_ttags;	/* Controller only */
 	bool send_success;	/* Controller only */
 
 	/* Receive state. */
 	struct thread *rx_thread;
 	struct cv rx_cv;
 	bool	rx_shutdown;
 
 	/* Transmit state. */
 	struct thread *tx_thread;
 	struct cv tx_cv;
 	bool	tx_shutdown;
 	struct mbufq tx_pdus;	/* Built PDUs waiting to be sent */
 	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
 
 	struct nvmf_tcp_command_buffer_list tx_buffers;
 	struct nvmf_tcp_command_buffer_list rx_buffers;
 
 	/*
 	 * For the controller, an RX command buffer can be in one of
 	 * two locations, all protected by the rx_buffers.lock.  If a
 	 * receive request is waiting for either an R2T slot for its
 	 * command (due to exceeding MAXR2T), or a transfer tag it is
 	 * placed on the rx_buffers list.  When a request is allocated
 	 * an active transfer tag, it moves to the open_ttags[] array
 	 * (indexed by the tag) until it completes.
 	 */
 	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
 };
 
 /* A received PDU together with the results of validating it. */
 struct nvmf_tcp_rxpdu {
 	struct mbuf *m;		/* Chain holding the PDU */
 	const struct nvme_tcp_common_pdu_hdr *hdr;	/* PDU header */
 	uint32_t data_len;	/* Data length set by nvmf_tcp_validate_pdu() */
 	bool data_digest_mismatch;	/* DDGST check failed */
 };
 
 /*
  * TCP-specific capsule.  The generic nvmf_capsule must remain the first
  * member because TCAP() casts between the two types.
  */
 struct nvmf_tcp_capsule {
 	struct nvmf_capsule nc;
 
 	volatile u_int refs;
 
 	struct nvmf_tcp_rxpdu rx_pdu;	/* PDU a received capsule came from */
 
 	uint32_t active_r2ts;		/* Controller only */
 #ifdef INVARIANTS
 	uint32_t tx_data_offset;	/* Controller only */
 	u_int pending_r2ts;		/* Controller only */
 #endif
 
 	STAILQ_ENTRY(nvmf_tcp_capsule) link;
 };
 
 /*
  * Cast a generic capsule / queue pair pointer to the TCP-specific type
  * that embeds it as its first member.
  */
 #define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
 #define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))
 
 /* Forward declarations for functions used before their definitions. */
 static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
 static void	tcp_free_qpair(struct nvmf_qpair *nq);
 
 SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TCP transport");
 /* Read/write tunable; defaults to 256 KiB. */
 static u_int tcp_max_transmit_data = 256 * 1024;
 SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN,
     &tcp_max_transmit_data, 0,
     "Maximum size of data payload in a transmitted PDU");
 
 /* malloc(9) type for allocations made by this transport. */
 static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
 
 /*
  * m_apply() callback: fold one contiguous span into the running CRC32C
  * that arg points to.  Always returns 0.
  */
 static int
 mbuf_crc32c_helper(void *arg, void *data, u_int len)
 {
 	uint32_t *crcp, updated;
 
 	crcp = arg;
 	updated = calculate_crc32c(*crcp, data, len);
 	*crcp = updated;
 	return (0);
 }
 
 /*
  * Compute the CRC32C digest of len bytes starting at offset within the
  * mbuf chain m (seed 0xffffffff, result inverted).
  */
 static uint32_t
 mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
 {
 	uint32_t crc;
 
 	crc = 0xffffffff;
 	m_apply(m, offset, len, mbuf_crc32c_helper, &crc);
 	return (crc ^ 0xffffffff);
 }
 
 /*
  * Compute the CRC32C digest of a contiguous buffer (seed 0xffffffff,
  * result inverted).
  */
 static uint32_t
 compute_digest(const void *buf, size_t len)
 {
 	uint32_t crc;
 
 	crc = calculate_crc32c(0xffffffff, buf, len);
 	return (crc ^ 0xffffffff);
 }
 
 static struct nvmf_tcp_command_buffer *
 tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
     const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
     uint16_t cid)
 {
 	struct nvmf_tcp_command_buffer *cb;
 
 	cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
 	cb->qp = qp;
 	cb->io = *io;
 	cb->data_offset = data_offset;
 	cb->data_len = data_len;
 	cb->data_xfered = 0;
 	refcount_init(&cb->refs, 1);
 	cb->error = 0;
 	cb->cid = cid;
 	cb->ttag = 0;
 	cb->tc = NULL;
 
 	return (cb);
 }
 
 /* Take an additional reference on a command buffer. */
 static void
 tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
 {
 	refcount_acquire(&cb->refs);
 }
 
 /*
  * Tear down a command buffer: complete its I/O request with the number
  * of bytes transferred and the recorded error, drop the capsule
  * reference if one is attached, and free the buffer.
  */
 static void
 tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
 {
 	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
 	if (cb->tc != NULL)
 		tcp_release_capsule(cb->tc);
 	free(cb, M_NVMF_TCP);
 }
 
 /* Drop one reference; the buffer is freed when the last one goes away. */
 static void
 tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
 {
 	if (!refcount_release(&cb->refs))
 		return;
 	tcp_free_command_buffer(cb);
 }
 
 /* Insert cb at the head of list.  The list lock must be held. */
 static void
 tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
     struct nvmf_tcp_command_buffer *cb)
 {
 	mtx_assert(&list->lock, MA_OWNED);
 	TAILQ_INSERT_HEAD(&list->head, cb, link);
 }
 
 /*
  * Return the first buffer on list matching both cid and ttag, or NULL if
  * there is none.  The list lock must be held.
  */
 static struct nvmf_tcp_command_buffer *
 tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
     uint16_t cid, uint16_t ttag)
 {
 	struct nvmf_tcp_command_buffer *it;
 
 	mtx_assert(&list->lock, MA_OWNED);
 	TAILQ_FOREACH(it, &list->head, link) {
 		if (it->cid != cid || it->ttag != ttag)
 			continue;
 		return (it);
 	}
 	return (NULL);
 }
 
 /* Unlink cb from list.  The list lock must be held. */
 static void
 tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
     struct nvmf_tcp_command_buffer *cb)
 {
 	mtx_assert(&list->lock, MA_OWNED);
 	TAILQ_REMOVE(&list->head, cb, link);
 }
 
 /*
  * If list holds a buffer for (cid, ttag), unlink it and drop the
  * reference.  The lookup and unlink happen under the list lock; the
  * release happens after the lock has been dropped.
  */
 static void
 tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
     uint16_t cid, uint16_t ttag)
 {
 	struct nvmf_tcp_command_buffer *victim;
 
 	mtx_lock(&list->lock);
 	victim = tcp_find_command_buffer(list, cid, ttag);
 	if (victim != NULL)
 		tcp_remove_command_buffer(list, victim);
 	mtx_unlock(&list->lock);
 
 	if (victim != NULL)
 		tcp_release_command_buffer(victim);
 }
 
 /*
  * Queue a fully built PDU on the queue pair's transmit queue and, if the
  * socket is currently writeable, signal tx_cv.  The enqueue and the
  * signal are done under the send socket buffer lock.
  */
 static void
 nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
 {
 	struct socket *so = qp->so;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	mbufq_enqueue(&qp->tx_pdus, m);
 	/* XXX: Do we need to handle sb_hiwat being wrong? */
 	if (sowriteable(so))
 		cv_signal(&qp->tx_cv);
 	SOCKBUF_UNLOCK(&so->so_snd);
 }
 
 /*
  * Build and queue a termination request PDU reporting a fatal error.
  *
  * fes and fei are the fatal error status and fatal error information in
  * host byte order.  If hlen is non-zero, up to hlen bytes from the start
  * of rx_pdu are appended as error data, limited to
  * NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE and to the length of rx_pdu.
  * The PDU type is chosen by whether this queue pair is a controller.
  */
 static void
 nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
     struct mbuf *rx_pdu, u_int hlen)
 {
 	struct nvme_tcp_term_req_hdr *hdr;
 	struct mbuf *m;
 
 	if (hlen != 0) {
 		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
 		hlen = min(hlen, m_length(rx_pdu, NULL));
 	}
 
 	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
 	m->m_len = sizeof(*hdr) + hlen;
 	hdr = mtod(m, void *);
 	memset(hdr, 0, sizeof(*hdr));
 	hdr->common.pdu_type = qp->qp.nq_controller ?
 	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
 	hdr->common.hlen = sizeof(*hdr);
 	/*
 	 * plen is little-endian on the wire: every other producer and
 	 * consumer in this file converts it, so convert it here too.
 	 */
 	hdr->common.plen = htole32(sizeof(*hdr) + hlen);
 	hdr->fes = htole16(fes);
 	le32enc(hdr->fei, fei);
 	if (hlen != 0)
 		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
 
 	nvmf_tcp_write_pdu(qp, m);
 }
 
 /*
  * Validate a received PDU whose header is pointed to by pdu->hdr and
  * whose bytes are in the chain pdu->m.
  *
  * Returns 0 and fills in pdu->data_len and pdu->data_digest_mismatch on
  * success.  A header validation failure returns that error, and a
  * termination request is queued first unless the error is ECONNRESET.
  * A header digest mismatch queues a termination request and returns
  * EBADMSG.  A data digest mismatch is not an error here: it is only
  * recorded in pdu->data_digest_mismatch.
  */
 static int
 nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_common_pdu_hdr *ch;
 	struct mbuf *m = pdu->m;
 	uint32_t data_len, fei, plen;
 	uint32_t digest, rx_digest;
 	u_int hlen;
 	int error;
 	uint16_t fes;
 
 	/* Determine how large of a PDU header to return for errors. */
 	ch = pdu->hdr;
 	hlen = ch->hlen;
 	plen = le32toh(ch->plen);
 	if (hlen < sizeof(*ch) || hlen > plen)
 		hlen = sizeof(*ch);
 
 	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
 	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
 	    &fei);
 	if (error != 0) {
 		if (error != ECONNRESET)
 			nvmf_tcp_report_error(qp, fes, fei, m, hlen);
 		return (error);
 	}
 
 	/* Check header digest if present. */
 	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
 		/* The digest covers the header and is stored right after it. */
 		digest = mbuf_crc32c(m, 0, ch->hlen);
 		m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
 		if (digest != rx_digest) {
 			printf("NVMe/TCP: Header digest mismatch\n");
 			nvmf_tcp_report_error(qp,
 			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
 			    hlen);
 			return (EBADMSG);
 		}
 	}
 
 	/* Check data digest if present. */
 	pdu->data_digest_mismatch = false;
 	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
 		/* The digest covers the data and is the PDU's last 4 bytes. */
 		digest = mbuf_crc32c(m, ch->pdo, data_len);
 		m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
 		    (caddr_t)&rx_digest);
 		if (digest != rx_digest) {
 			printf("NVMe/TCP: Data digest mismatch\n");
 			pdu->data_digest_mismatch = true;
 		}
 	}
 
 	pdu->data_len = data_len;
 	return (0);
 }
 
 /* Free the PDU's mbuf chain and clear the now-stale pointers. */
 static void
 nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
 {
 	m_freem(pdu->m);
 	pdu->m = NULL;
 	pdu->hdr = NULL;
 }
 
 /*
  * Handle a received termination request: log the reported status and
  * information, free the PDU and return ECONNRESET.
  */
 static int
 nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_term_req_hdr *treq = (const void *)pdu->hdr;
 
 	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
 	    le16toh(treq->fes), le32dec(treq->fei));
 
 	nvmf_tcp_free_pdu(pdu);
 	return (ECONNRESET);
 }
 
 static int
 nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
     struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_cmd *cmd;
 	struct nvmf_capsule *nc;
 	struct nvmf_tcp_capsule *tc;
 
 	cmd = (const void *)pdu->hdr;
 
 	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
 
 	tc = TCAP(nc);
 	tc->rx_pdu = *pdu;
 
 	nvmf_capsule_received(&qp->qp, nc);
 	return (0);
 }
 
 /*
  * Wrap a received response PDU in a newly allocated response capsule,
  * purge any command buffers still registered for the completed CID, and
  * deliver the capsule.  Always returns 0.
  */
 static int
 nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
     struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_rsp *rsp;
 	struct nvmf_capsule *nc;
 	struct nvmf_tcp_capsule *tc;
 
 	rsp = (const void *)pdu->hdr;
 
 	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
 
 	nc->nc_sqhd_valid = true;
 	tc = TCAP(nc);
 	/* The capsule keeps the PDU (and its mbuf chain) from here on. */
 	tc->rx_pdu = *pdu;
 
 	/*
 	 * Once the CQE has been received, no further transfers to the
 	 * command buffer for the associated CID can occur.
 	 */
 	tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
 	tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
 
 	nvmf_capsule_received(&qp->qp, nc);
 	return (0);
 }
 
 /*
  * Construct a PDU that contains an optional data payload.  This
  * includes dealing with digests and the length fields in the common
  * header.
  *
  * hdr/hlen give the PDU-specific header, which is copied into a fresh
  * mbuf.  data is an mbuf chain of exactly data_len bytes and is linked
  * after the header mbuf, or NULL when data_len is 0.  Returns the head
  * of the resulting chain.
  */
 static struct mbuf *
 nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
     struct mbuf *data, uint32_t data_len)
 {
 	struct nvme_tcp_common_pdu_hdr *ch;
 	struct mbuf *top;
 	uint32_t digest, pad, pdo, plen, mlen;
 
 	/*
 	 * Work out the layout: header, optional HDGST, padding up to the
 	 * data alignment, data, optional DDGST.  mlen is the size of the
 	 * leading mbuf (everything before the data).
 	 */
 	plen = hlen;
 	if (qp->header_digests)
 		plen += sizeof(digest);
 	if (data_len != 0) {
 		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
 		pdo = roundup2(plen, qp->txpda);
 		pad = pdo - plen;
 		plen = pdo + data_len;
 		if (qp->data_digests)
 			plen += sizeof(digest);
 		mlen = pdo;
 	} else {
 		KASSERT(data == NULL, ("payload mbuf with zero length"));
 		pdo = 0;
 		pad = 0;
 		mlen = plen;
 	}
 
 	top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
 	top->m_len = mlen;
 	ch = mtod(top, void *);
 	memcpy(ch, hdr, hlen);
 	/* Fill in the common header fields the caller left unset. */
 	ch->hlen = hlen;
 	if (qp->header_digests)
 		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
 	if (qp->data_digests && data_len != 0)
 		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
 	ch->pdo = pdo;
 	ch->plen = htole32(plen);
 
 	/* HDGST */
 	if (qp->header_digests) {
 		digest = compute_digest(ch, hlen);
 		memcpy((char *)ch + hlen, &digest, sizeof(digest));
 	}
 
 	if (pad != 0) {
 		/* PAD */
 		memset((char *)ch + pdo - pad, 0, pad);
 	}
 
 	if (data_len != 0) {
 		/* DATA */
 		top->m_next = data;
 
 		/* DDGST */
 		if (qp->data_digests) {
 			digest = mbuf_crc32c(data, 0, data_len);
 
 			/* XXX: Can't use m_append as it uses M_NOWAIT. */
 			while (data->m_next != NULL)
 				data = data->m_next;
 
 			/* Append the digest in its own mbuf at the tail. */
 			data->m_next = m_get(M_WAITOK, MT_DATA);
 			data->m_next->m_len = sizeof(digest);
 			memcpy(mtod(data->m_next, void *), &digest,
 			    sizeof(digest));
 		}
 	}
 
 	return (top);
 }
 
 /*
  * Pick the first waiting command buffer whose capsule is still allowed
  * another outstanding R2T, unlink it from rx_buffers and return it.
  * Returns NULL if no buffer is eligible.  rx_buffers.lock must be held.
  */
 static struct nvmf_tcp_command_buffer *
 nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
 {
 	struct nvmf_tcp_command_buffer *cand;
 
 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
 	MPASS(qp->active_ttags < qp->num_ttags);
 
 	TAILQ_FOREACH(cand, &qp->rx_buffers.head, link) {
 		/* NB: maxr2t is 0's based. */
 		if (cand->tc->active_r2ts <= qp->maxr2t)
 			break;
 	}
 	if (cand == NULL)
 		return (NULL);
 
 #ifdef INVARIANTS
 	cand->tc->pending_r2ts--;
 #endif
 	TAILQ_REMOVE(&qp->rx_buffers.head, cand, link);
 	return (cand);
 }
 
 /*
  * Give cb the next unused transfer tag, searching circularly from
  * qp->next_ttag, and record cb in open_ttags[].  rx_buffers.lock must be
  * held and a free tag must exist.
  */
 static void
 nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
     struct nvmf_tcp_command_buffer *cb)
 {
 	uint16_t tag;
 
 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
 
 	/* Scan forward, wrapping at num_ttags, until a free slot turns up. */
 	tag = qp->next_ttag;
 	while (qp->open_ttags[tag] != NULL) {
 		tag = (tag == qp->num_ttags - 1) ? 0 : tag + 1;
 		MPASS(tag != qp->next_ttag);
 	}
 
 	/* The next search starts just past the tag handed out now. */
 	qp->next_ttag = (tag == qp->num_ttags - 1) ? 0 : tag + 1;
 
 	qp->open_ttags[tag] = cb;
 	qp->active_ttags++;
 	cb->tc->active_r2ts++;
 
 	/*
 	 * The tag is an opaque cookie echoed back unchanged by the peer,
 	 * so it is stored without byte-swapping.
 	 */
 	cb->ttag = tag;
 }
 
 /*
  * Build and queue an R2T PDU requesting data_len bytes at data_offset
  * for command cid using transfer tag ttag.
  *
  * NB: cid and ttag are both little-endian already.
  */
 static void
 tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
     uint32_t data_offset, uint32_t data_len)
 {
 	struct nvme_tcp_r2t_hdr r2t;
 	struct mbuf *m;
 
 	memset(&r2t, 0, sizeof(r2t));
 	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
 	r2t.cccid = cid;
 	r2t.ttag = ttag;
 	r2t.r2to = htole32(data_offset);
 	r2t.r2tl = htole32(data_len);
 
 	/* R2T carries no data payload. */
 	m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
 	nvmf_tcp_write_pdu(qp, m);
 }
 
 /*
  * Give back cb's transfer tag and, if another command buffer is waiting
  * and eligible, assign it a tag and send its R2T.
  *
  * NB: Called with rx_buffers.lock held; the lock is dropped before
  * returning.
  */
 static void
 nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
     struct nvmf_tcp_command_buffer *cb)
 {
 	struct nvmf_tcp_command_buffer *next;
 
 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
 	MPASS(qp->open_ttags[cb->ttag] == cb);
 
 	/* Retire the tag held by cb. */
 	qp->open_ttags[cb->ttag] = NULL;
 	qp->active_ttags--;
 	cb->tc->active_r2ts--;
 
 	/* Nothing waiting: just drop the lock. */
 	next = nvmf_tcp_next_r2t(qp);
 	if (next == NULL) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		return;
 	}
 
 	/* Tag the next buffer under the lock, send its R2T without it. */
 	nvmf_tcp_allocate_ttag(qp, next);
 	mtx_unlock(&qp->rx_buffers.lock);
 	tcp_send_r2t(qp, next->cid, next->ttag, next->data_offset,
 	    next->data_len);
 }
 
 /*
  * Copy len bytes starting at offset skip from an mbuf chain into an
  * I/O buffer at destination offset io_offset.
  *
  * NOTE(review): the chain is walked without NULL checks, so the caller
  * is assumed to guarantee it holds at least skip + len bytes.
  */
 static void
 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
     struct nvmf_io_request *io, u_int io_offset)
 {
 	u_int todo;
 
 	/* Advance to the mbuf that contains the first byte to copy. */
 	while (m->m_len <= skip) {
 		skip -= m->m_len;
 		m = m->m_next;
 	}
 	while (len != 0) {
 		MPASS((m->m_flags & M_EXTPG) == 0);
 
 		todo = min(m->m_len - skip, len);
 		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
 		/* Only the first mbuf is entered part-way through. */
 		skip = 0;
 		io_offset += todo;
 		len -= todo;
 		m = m->m_next;
 	}
 }
 
 /*
  * Handle a received H2C_DATA PDU.
  *
  * The PDU is matched to a command buffer through its transfer tag, its
  * length, offset, ordering and LAST_PDU flag are checked, and the
  * payload is copied into the buffer's I/O request.  When the buffer has
  * received all of its data the transfer tag is released and another R2T
  * may be scheduled.
  *
  * Returns 0 on success (including a data digest mismatch, which fails
  * only the I/O request with EINTEGRITY).  Protocol violations queue a
  * termination request and return EBADMSG.  The PDU is freed on every
  * path.
  */
 static int
 nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_h2c_data_hdr *h2c;
 	struct nvmf_tcp_command_buffer *cb;
 	uint32_t data_len, data_offset;
 	uint16_t ttag;
 
 	h2c = (const void *)pdu->hdr;
 	if (le32toh(h2c->datal) > qp->maxh2cdata) {
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
 		    pdu->m, pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/*
 	 * NB: Don't bother byte-swapping ttag as we don't byte-swap
 	 * it when sending.
 	 */
 	ttag = h2c->ttag;
 	if (ttag >= qp->num_ttags) {
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	mtx_lock(&qp->rx_buffers.lock);
 	cb = qp->open_ttags[ttag];
 	if (cb == NULL) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 	MPASS(cb->ttag == ttag);
 
 	/* For a data digest mismatch, fail the I/O request. */
 	if (pdu->data_digest_mismatch) {
 		/* NB: this drops rx_buffers.lock. */
 		nvmf_tcp_send_next_r2t(qp, cb);
 		cb->error = EINTEGRITY;
 		tcp_release_command_buffer(cb);
 		nvmf_tcp_free_pdu(pdu);
 		return (0);
 	}
 
 	data_len = le32toh(h2c->datal);
 	if (data_len != pdu->data_len) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/*
 	 * Both values come from the wire; do the range check in 64 bits
 	 * so that the sums cannot wrap and slip past the bound.
 	 */
 	data_offset = le32toh(h2c->datao);
 	if (data_offset < cb->data_offset ||
 	    (uint64_t)data_offset + data_len >
 	    (uint64_t)cb->data_offset + cb->data_len) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/* Data must arrive strictly in order. */
 	if (data_offset != cb->data_offset + cb->data_xfered) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/* LAST_PDU must be set exactly on the PDU that ends the transfer. */
 	if ((cb->data_xfered + data_len == cb->data_len) !=
 	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	cb->data_xfered += data_len;
 	/* Convert to an offset relative to this buffer's I/O request. */
 	data_offset -= cb->data_offset;
 	if (cb->data_xfered == cb->data_len) {
 		/* Transfer complete; NB: this drops rx_buffers.lock. */
 		nvmf_tcp_send_next_r2t(qp, cb);
 	} else {
 		/* Keep cb alive across the copy done without the lock. */
 		tcp_hold_command_buffer(cb);
 		mtx_unlock(&qp->rx_buffers.lock);
 	}
 
 	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
 
 	tcp_release_command_buffer(cb);
 	nvmf_tcp_free_pdu(pdu);
 	return (0);
 }
 
 /*
  * Handle a received C2H_DATA PDU.
  *
  * The PDU is matched to a command buffer on rx_buffers by its CCCID,
  * its length, offset, ordering and LAST_PDU flag are checked, and the
  * payload is copied into the buffer's I/O request.  If the SUCCESS flag
  * is set, a response capsule with a zeroed completion carrying only the
  * CID is synthesized and delivered.
  *
  * Returns 0 on success (including a data digest mismatch, which fails
  * only the I/O request with EINTEGRITY).  Protocol violations queue a
  * termination request and return EBADMSG.  The PDU is freed on every
  * path.
  */
 static int
 nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_c2h_data_hdr *c2h;
 	struct nvmf_tcp_command_buffer *cb;
 	uint32_t data_len, data_offset;
 
 	c2h = (const void *)pdu->hdr;
 
 	mtx_lock(&qp->rx_buffers.lock);
 	cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
 	if (cb == NULL) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		/*
 		 * XXX: Could be PDU sequence error if cccid is for a
 		 * command that doesn't use a command buffer.
 		 */
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/* For a data digest mismatch, fail the I/O request. */
 	if (pdu->data_digest_mismatch) {
 		cb->error = EINTEGRITY;
 		tcp_remove_command_buffer(&qp->rx_buffers, cb);
 		mtx_unlock(&qp->rx_buffers.lock);
 		tcp_release_command_buffer(cb);
 		nvmf_tcp_free_pdu(pdu);
 		return (0);
 	}
 
 	data_len = le32toh(c2h->datal);
 	if (data_len != pdu->data_len) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/*
 	 * Both values come from the wire; do the range check in 64 bits
 	 * so that the sums cannot wrap and slip past the bound.
 	 */
 	data_offset = le32toh(c2h->datao);
 	if (data_offset < cb->data_offset ||
 	    (uint64_t)data_offset + data_len >
 	    (uint64_t)cb->data_offset + cb->data_len) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
 		    pdu->m, pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/* Data must arrive strictly in order. */
 	if (data_offset != cb->data_offset + cb->data_xfered) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/* LAST_PDU must be set exactly on the PDU that ends the transfer. */
 	if ((cb->data_xfered + data_len == cb->data_len) !=
 	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
 		mtx_unlock(&qp->rx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	cb->data_xfered += data_len;
 	/* Convert to an offset relative to this buffer's I/O request. */
 	data_offset -= cb->data_offset;
 	if (cb->data_xfered == cb->data_len)
 		/* Done: the list's reference becomes ours to release. */
 		tcp_remove_command_buffer(&qp->rx_buffers, cb);
 	else
 		/* Keep cb alive across the copy done without the lock. */
 		tcp_hold_command_buffer(cb);
 	mtx_unlock(&qp->rx_buffers.lock);
 
 	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
 
 	tcp_release_command_buffer(cb);
 
 	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
 		struct nvme_completion cqe;
 		struct nvmf_capsule *nc;
 
 		/* No response PDU was received, so synthesize a completion. */
 		memset(&cqe, 0, sizeof(cqe));
 		cqe.cid = c2h->cccid;
 
 		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
 		nc->nc_sqhd_valid = false;
 
 		nvmf_capsule_received(&qp->qp, nc);
 	}
 
 	nvmf_tcp_free_pdu(pdu);
 	return (0);
 }
 
 /*
  * Called when m_free drops refcount to 0.
  *
  * The refcount in question is cb->refs (see nvmf_tcp_mbuf()), so the
  * command buffer, passed as ext_arg1, is freed here.
  */
 static void
 nvmf_tcp_mbuf_done(struct mbuf *m)
 {
 	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
 
 	tcp_free_command_buffer(cb);
 }
 
 /*
  * Allocation callback handed to memdesc_alloc_ext_mbufs(): return a
  * read-only mbuf that references len bytes at data without copying.
  * The mbuf shares the command buffer's reference count (arg is the
  * command buffer), so the buffer stays alive until the mbuf is freed.
  *
  * Returns NULL if the mbuf cannot be allocated, which can only happen
  * when how is M_NOWAIT.
  */
 static struct mbuf *
 nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
 {
 	struct nvmf_tcp_command_buffer *cb = arg;
 	struct mbuf *m;
 
 	m = m_get(how, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 	m->m_flags |= M_RDONLY;
 	m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
 	m->m_len = len;
 	return (m);
 }
 
 /*
  * Free callback for the unmapped (M_EXTPG) mbufs created by
  * nvmf_tcp_mext_pg(): drop the command buffer reference that mbuf held.
  */
 static void
 nvmf_tcp_free_mext_pg(struct mbuf *m)
 {
 	struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
 
 	M_ASSERTEXTPG(m);
 	tcp_release_command_buffer(cb);
 }
 
 /*
  * Allocation callback handed to memdesc_alloc_ext_mbufs(): allocate an
  * unmapped mbuf that holds its own reference on the command buffer
  * (arg); the reference is dropped in nvmf_tcp_free_mext_pg().
  *
  * NOTE(review): m is dereferenced without a NULL check; the only caller
  * visible here passes M_WAITOK -- confirm no M_NOWAIT users exist.
  */
 static struct mbuf *
 nvmf_tcp_mext_pg(void *arg, int how)
 {
 	struct nvmf_tcp_command_buffer *cb = arg;
 	struct mbuf *m;
 
-	m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg);
+	m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY);
 	m->m_ext.ext_arg1 = cb;
 	tcp_hold_command_buffer(cb);
 	return (m);
 }
 
 /*
  * Return an mbuf chain for a range of data belonging to a command
  * buffer.
  *
  * The mbuf chain uses M_EXT mbufs which hold references on the
  * command buffer so that it remains "alive" until the data has been
  * fully transmitted.  If can_truncate is true, then the returned
  * chain might be shorter than data_len to avoid gratuitously
  * splitting up a page.  If actual_len is not NULL, the length of the
  * returned chain is stored in *actual_len.
  */
 static struct mbuf *
 nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
     uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
     bool can_truncate)
 {
 	struct mbuf *m;
 	size_t len;
 
 	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
 	    nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
 	    can_truncate);
 	if (actual_len != NULL)
 		*actual_len = len;
 	return (m);
 }
 
 /*
  * Build and queue an H2C_DATA PDU carrying the len-byte chain m for
  * command cid and transfer tag ttag at data_offset.  last_pdu sets the
  * LAST_PDU flag.
  *
  * NB: cid and ttag are little-endian already.
  */
 static void
 tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
     uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
 {
 	struct nvme_tcp_h2c_data_hdr h2c;
 	struct mbuf *top;
 
 	memset(&h2c, 0, sizeof(h2c));
 	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
 	if (last_pdu)
 		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
 	h2c.cccid = cid;
 	h2c.ttag = ttag;
 	h2c.datao = htole32(data_offset);
 	h2c.datal = htole32(len);
 
 	top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
 	nvmf_tcp_write_pdu(qp, top);
 }
 
 /*
  * Handle a received R2T PDU.
  *
  * The R2T is matched to a command buffer on tx_buffers by its CCCID.
  * The requested offset must continue exactly where the previous
  * transfer stopped and the requested range must lie within the
  * command's data.  The requested data is then queued as one or more
  * H2C_DATA PDUs of at most max_tx_data bytes each.
  *
  * Returns 0 on success.  Protocol violations queue a termination
  * request and return EBADMSG.  The PDU is freed on every path.
  */
 static int
 nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
 {
 	const struct nvme_tcp_r2t_hdr *r2t;
 	struct nvmf_tcp_command_buffer *cb;
 	uint32_t data_len, data_offset;
 
 	r2t = (const void *)pdu->hdr;
 
 	mtx_lock(&qp->tx_buffers.lock);
 	cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
 	if (cb == NULL) {
 		mtx_unlock(&qp->tx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
 		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	data_offset = le32toh(r2t->r2to);
 	if (data_offset != cb->data_xfered) {
 		mtx_unlock(&qp->tx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
 		    pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	/*
 	 * XXX: The spec does not specify how to handle R2T transfers
 	 * out of range of the original command.
 	 *
 	 * The length comes straight from the wire, so add in 64 bits:
 	 * a 32-bit sum could wrap and let an oversized request through.
 	 */
 	data_len = le32toh(r2t->r2tl);
 	if ((uint64_t)data_offset + data_len > cb->data_len) {
 		mtx_unlock(&qp->tx_buffers.lock);
 		nvmf_tcp_report_error(qp,
 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
 		    pdu->m, pdu->hdr->hlen);
 		nvmf_tcp_free_pdu(pdu);
 		return (EBADMSG);
 	}
 
 	cb->data_xfered += data_len;
 	if (cb->data_xfered == cb->data_len)
 		/* Done: the list's reference becomes ours to release. */
 		tcp_remove_command_buffer(&qp->tx_buffers, cb);
 	else
 		/* Keep cb alive while PDUs are built without the lock. */
 		tcp_hold_command_buffer(cb);
 	mtx_unlock(&qp->tx_buffers.lock);
 
 	/*
 	 * Queue one or more H2C_DATA PDUs containing the requested
 	 * data.
 	 */
 	while (data_len > 0) {
 		struct mbuf *m;
 		uint32_t sent, todo;
 
 		todo = min(data_len, qp->max_tx_data);
 		/* Truncation is only allowed when more PDUs will follow. */
 		m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
 		    todo < data_len);
 		tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
 		    sent, sent == data_len);
 
 		data_offset += sent;
 		data_len -= sent;
 	}
 
 	tcp_release_command_buffer(cb);
 	nvmf_tcp_free_pdu(pdu);
 	return (0);
 }
 
 /*
  * A variant of m_pullup that uses M_WAITOK instead of failing.  It
  * also doesn't do anything if enough bytes are already present in the
  * first mbuf.
  *
  * Returns the head of the resulting chain, whose first mbuf holds at
  * least len contiguous bytes.  When a new head is allocated, the first
  * len bytes are copied into it and removed from the original chain.
  */
 static struct mbuf *
 pullup_pdu_hdr(struct mbuf *m, int len)
 {
 	struct mbuf *n, *p;
 
 	KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
 	if (m->m_len >= len)
 		return (m);
 
 	n = m_get2(len, M_WAITOK, MT_DATA, 0);
 	n->m_len = len;
 	m_copydata(m, 0, len, mtod(n, void *));
 
 	/* Free the leading mbufs that were copied in their entirety. */
 	while (m != NULL && m->m_len <= len) {
 		p = m->m_next;
 		len -= m->m_len;
 		m_free(m);
 		m = p;
 	}
 	/* Trim the copied prefix off a partially consumed mbuf. */
 	if (len > 0) {
 		m->m_data += len;
 		m->m_len -= len;
 	}
 	n->m_next = m;
 	return (n);
 }
 
 /*
  * Make the PDU header contiguous, point pdu->hdr at it, and hand the PDU
  * to the handler for its type.  Returns the handler's result.  Unknown
  * PDU types are asserted to be unreachable.
  */
 static int
 nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
     const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
 {
 	/* Ensure the PDU header is contiguous. */
 	pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
 	pdu->hdr = mtod(pdu->m, const void *);
 
 	switch (ch->pdu_type) {
 	default:
 		__assert_unreachable();
 		break;
 	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
 	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
 		return (nvmf_tcp_handle_term_req(pdu));
 	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
 		return (nvmf_tcp_save_command_capsule(qp, pdu));
 	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
 		return (nvmf_tcp_save_response_capsule(qp, pdu));
 	case NVME_TCP_PDU_TYPE_H2C_DATA:
 		return (nvmf_tcp_handle_h2c_data(qp, pdu));
 	case NVME_TCP_PDU_TYPE_C2H_DATA:
 		return (nvmf_tcp_handle_c2h_data(qp, pdu));
 	case NVME_TCP_PDU_TYPE_R2T:
 		return (nvmf_tcp_handle_r2t(qp, pdu));
 	}
 }
 
 /*
  * Receive loop for a queue pair (arg); ends by calling kthread_exit().
  *
  * Each iteration waits on rx_cv for socket data, peeks at the common
  * header to learn the PDU length, reads the PDU into an mbuf chain
  * (possibly over several passes), then validates and dispatches it.
  * On any error the queue pair is notified via nvmf_qpair_error() and
  * the loop idles until rx_shutdown is set.
  */
 static void
 nvmf_tcp_receive(void *arg)
 {
 	struct nvmf_tcp_qpair *qp = arg;
 	struct socket *so = qp->so;
 	struct nvmf_tcp_rxpdu pdu;
 	struct nvme_tcp_common_pdu_hdr ch;
 	struct uio uio;
 	struct iovec iov[1];
 	struct mbuf *m, *n, *tail;
 	u_int avail, needed;
 	int error, flags, terror;
 	bool have_header;
 
 	m = tail = NULL;
 	have_header = false;
 	SOCKBUF_LOCK(&so->so_rcv);
 	while (!qp->rx_shutdown) {
 		/* Wait until there is enough data for the next step. */
 		if (so->so_error != 0 || so->so_rerror != 0) {
 			if (so->so_error != 0)
 				error = so->so_error;
 			else
 				error = so->so_rerror;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			/*
 			 * Common error exit, entered with the sockbuf
 			 * unlocked: drop any partial PDU, report the
 			 * error and wait to be shut down.
 			 */
 		error:
 			m_freem(m);
 			nvmf_qpair_error(&qp->qp, error);
 			SOCKBUF_LOCK(&so->so_rcv);
 			while (!qp->rx_shutdown)
 				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
 			break;
 		}
 		avail = sbavail(&so->so_rcv);
 		if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
 			/* EOF between PDUs is reported as error 0. */
 			if (!have_header && avail == 0)
 				error = 0;
 			else
 				error = ECONNRESET;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto error;
 		}
 		if (avail == 0 || (!have_header && avail < sizeof(ch))) {
 			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
 			continue;
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		if (!have_header) {
 			KASSERT(m == NULL, ("%s: m != NULL but no header",
 			    __func__));
 			/* Peek at the common header without consuming it. */
 			memset(&uio, 0, sizeof(uio));
 			iov[0].iov_base = &ch;
 			iov[0].iov_len = sizeof(ch);
 			uio.uio_iov = iov;
 			uio.uio_iovcnt = 1;
 			uio.uio_resid = sizeof(ch);
 			uio.uio_segflg = UIO_SYSSPACE;
 			uio.uio_rw = UIO_READ;
 			flags = MSG_DONTWAIT | MSG_PEEK;
 
 			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
 			if (error != 0)
 				goto error;
 			KASSERT(uio.uio_resid == 0, ("%s: short CH read",
 			    __func__));
 
 			have_header = true;
 			needed = le32toh(ch.plen);
 
 			/*
 			 * Malformed PDUs will be reported as errors
 			 * by nvmf_tcp_validate_pdu.  Just pass along
 			 * garbage headers if the lengths mismatch.
 			 */
 			if (needed < sizeof(ch) || ch.hlen > needed)
 				needed = sizeof(ch);
 
 			/* uio_resid now tracks the PDU bytes still to read. */
 			memset(&uio, 0, sizeof(uio));
 			uio.uio_resid = needed;
 		}
 
 		flags = MSG_DONTWAIT;
 		error = soreceive(so, NULL, &uio, &n, NULL, &flags);
 		if (error != 0)
 			goto error;
 
 		/* Append what was read to the PDU being assembled. */
 		if (m == NULL)
 			m = n;
 		else
 			tail->m_next = n;
 
 		if (uio.uio_resid != 0) {
 			/* Partial PDU: remember the tail and wait for more. */
 			tail = n;
 			while (tail->m_next != NULL)
 				tail = tail->m_next;
 
 			SOCKBUF_LOCK(&so->so_rcv);
 			continue;
 		}
 #ifdef INVARIANTS
 		tail = NULL;
 #endif
 
 		/* A complete PDU; ownership of the chain moves to pdu. */
 		pdu.m = m;
 		m = NULL;
 		pdu.hdr = &ch;
 		error = nvmf_tcp_validate_pdu(qp, &pdu);
 		if (error != 0)
 			m_freem(pdu.m);
 		else
 			error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
 		if (error != 0) {
 			/*
 			 * If we received a termination request, close
 			 * the connection immediately.
 			 */
 			if (error == ECONNRESET)
 				goto error;
 
 			/*
 			 * Wait for up to 30 seconds for the socket to
 			 * be closed by the other end.
 			 */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 				terror = cv_timedwait(&qp->rx_cv,
 				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
 				if (terror == ETIMEDOUT)
 					printf("NVMe/TCP: Timed out after sending terminate request\n");
 			}
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto error;
 		}
 
 		have_header = false;
 		SOCKBUF_LOCK(&so->so_rcv);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	kthread_exit();
 }
 
 /*
  * Build the command capsule PDU for tc.
  *
  * If the capsule has data, a command buffer is allocated for it.  Data
  * being sent that fits within max_icd is attached to the PDU as
  * in-capsule data; otherwise the buffer is placed on tx_buffers (data
  * to send) or rx_buffers (data to receive) for later transfer.
  */
 static struct mbuf *
 tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
 {
 	struct nvmf_capsule *nc = &tc->nc;
 	struct nvmf_tcp_command_buffer *cb;
 	struct nvme_sgl_descriptor *sgl;
 	struct nvme_tcp_cmd cmd;
 	struct mbuf *top, *m;
 	bool use_icd;
 
 	use_icd = false;
 	cb = NULL;
 	m = NULL;
 
 	if (nc->nc_data.io_len != 0) {
 		cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
 		    nc->nc_data.io_len, nc->nc_sqe.cid);
 
 		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
 			use_icd = true;
 			m = nvmf_tcp_command_buffer_mbuf(cb, 0,
 			    nc->nc_data.io_len, NULL, false);
 			cb->data_xfered = nc->nc_data.io_len;
 			/*
 			 * Drop the allocation reference; the mbufs built
 			 * above hold their own references on cb.
 			 */
 			tcp_release_command_buffer(cb);
 		} else if (nc->nc_send_data) {
 			mtx_lock(&qp->tx_buffers.lock);
 			tcp_add_command_buffer(&qp->tx_buffers, cb);
 			mtx_unlock(&qp->tx_buffers.lock);
 		} else {
 			mtx_lock(&qp->rx_buffers.lock);
 			tcp_add_command_buffer(&qp->rx_buffers, cb);
 			mtx_unlock(&qp->rx_buffers.lock);
 		}
 	}
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
 	cmd.ccsqe = nc->nc_sqe;
 
 	/* Populate SGL in SQE. */
 	sgl = &cmd.ccsqe.sgl;
 	memset(sgl, 0, sizeof(*sgl));
 	sgl->address = 0;
 	sgl->length = htole32(nc->nc_data.io_len);
 	if (use_icd) {
 		/* Use in-capsule data. */
 		sgl->type = NVME_SGL_TYPE_ICD;
 	} else {
 		/* Use a command buffer. */
 		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
 	}
 
 	/* m is only non-NULL on the in-capsule data path. */
 	top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
 	    nc->nc_data.io_len : 0);
 	return (top);
 }
 
 static struct mbuf *
 tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
 {
 	struct nvmf_capsule *nc = &tc->nc;
 	struct nvme_tcp_rsp rsp;
 
 	memset(&rsp, 0, sizeof(rsp));
 	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
 	rsp.rccqe = nc->nc_cqe;
 
 	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
 }
 
 static struct mbuf *
 capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
 {
 	if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
 		return (tcp_command_pdu(qp, tc));
 	else
 		return (tcp_response_pdu(qp, tc));
 }
 
 /*
  * Body of the transmit kthread started by tcp_allocate_qpair().
  *
  * Loops until qp->tx_shutdown is set, holding the send socket buffer
  * lock at the top of every iteration.  PDUs are taken from qp->tx_pdus;
  * when that queue is empty, capsules from qp->tx_capsules are converted
  * into PDUs.  On a socket error or a failed sosend() the queue pair is
  * reported as failed via nvmf_qpair_error() and the thread then waits
  * for shutdown to be requested.
  */
 static void
 nvmf_tcp_send(void *arg)
 {
 	struct nvmf_tcp_qpair *qp = arg;
 	struct nvmf_tcp_capsule *tc;
 	struct socket *so = qp->so;
 	struct mbuf *m, *n, *p;
 	u_long space, tosend;
 	int error;
 
 	/* 'm' is the PDU (or remainder of a PDU) currently being sent. */
 	m = NULL;
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!qp->tx_shutdown) {
 		if (so->so_error != 0) {
 			error = so->so_error;
 			SOCKBUF_UNLOCK(&so->so_snd);
 		/* Also entered via goto when sosend() fails below. */
 		error:
 			/* Discard whatever is left of the current PDU. */
 			m_freem(m);
 			nvmf_qpair_error(&qp->qp, error);
 			/* Nothing more can be sent; idle until shutdown. */
 			SOCKBUF_LOCK(&so->so_snd);
 			while (!qp->tx_shutdown)
 				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
 			break;
 		}
 
 		if (m == NULL) {
 			/* Next PDU to send. */
 			m = mbufq_dequeue(&qp->tx_pdus);
 		}
 		if (m == NULL) {
 			if (STAILQ_EMPTY(&qp->tx_capsules)) {
 				/* No work queued; sleep until signalled. */
 				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
 				continue;
 			}
 
 			/* Convert a capsule into a PDU. */
 			tc = STAILQ_FIRST(&qp->tx_capsules);
 			STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
 			/* The conversion runs without the socket buffer lock. */
 			SOCKBUF_UNLOCK(&so->so_snd);
 
 			n = capsule_to_pdu(qp, tc);
 			tcp_release_capsule(tc);
 
 			SOCKBUF_LOCK(&so->so_snd);
 			mbufq_enqueue(&qp->tx_pdus, n);
 			continue;
 		}
 
 		/*
 		 * Wait until there is enough room to send some data.
 		 * If the socket buffer is empty, always send at least
 		 * something.
 		 */
 		space = sbspace(&so->so_snd);
 		if (space < m->m_len && sbused(&so->so_snd) != 0) {
 			cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
 			continue;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * If 'm' is too big, then the socket buffer must be
 		 * empty.  Split 'm' to make at least some forward
 		 * progress.
 		 *
 		 * Otherwise, chain up as many pending mbufs from 'm'
 		 * that will fit.
 		 */
 		if (m->m_len > space) {
 			n = m_split(m, space, M_WAITOK);
 		} else {
 			tosend = m->m_len;
 			n = m->m_next;
 			p = m;
 			while (n != NULL && tosend + n->m_len <= space) {
 				tosend += n->m_len;
 				p = n;
 				n = n->m_next;
 			}
 			KASSERT(p->m_next == n, ("%s: p not before n",
 			    __func__));
 			/* Detach the part being sent from the remainder 'n'. */
 			p->m_next = NULL;
 
 			KASSERT(m_length(m, NULL) == tosend,
 			    ("%s: length mismatch", __func__));
 		}
 		error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
 		if (error != 0) {
 			/*
 			 * 'm' was handed to sosend(); clear it so the error
 			 * path does not free it, and free the unsent
 			 * remainder here.
 			 */
 			m = NULL;
 			m_freem(n);
 			goto error;
 		}
 		/* Continue with the unsent remainder, if any. */
 		m = n;
 		SOCKBUF_LOCK(&so->so_snd);
 	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	kthread_exit();
 }
 
 /*
  * Receive-side socket upcall: signal the rx condition variable when
  * the socket is readable.  Always returns SU_OK.
  */
 static int
 nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
 {
 	struct nvmf_tcp_qpair *qp;
 
 	if (!soreadable(so))
 		return (SU_OK);
 
 	qp = arg;
 	cv_signal(&qp->rx_cv);
 	return (SU_OK);
 }
 
 /*
  * Send-side socket upcall: signal the tx condition variable when the
  * socket is writeable.  Always returns SU_OK.
  */
 static int
 nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
 {
 	struct nvmf_tcp_qpair *qp;
 
 	if (!sowriteable(so))
 		return (SU_OK);
 
 	qp = arg;
 	cv_signal(&qp->tx_cv);
 	return (SU_OK);
 }
 
 /*
  * Create a TCP queue pair from the socket referenced by params->tcp.fd.
  *
  * The descriptor must refer to a TCP stream socket.  The socket is
  * taken over from its file (the file is left with badfileops and no
  * data) and belongs to the queue pair from then on.  Receive and send
  * upcalls are installed and the rx and tx kthreads are started.
  *
  * Returns the new queue pair, or NULL if the descriptor is unsuitable
  * or a kthread could not be created.
  */
 static struct nvmf_qpair *
 tcp_allocate_qpair(bool controller,
     const struct nvmf_handoff_qpair_params *params)
 {
 	struct nvmf_tcp_qpair *qp;
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights,
 	    CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (NULL);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (NULL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (NULL);
 	}
 
 	/* Claim socket from file descriptor. */
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 
 	qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
 	qp->so = so;
 	/* Initial reference; dropped at the end of tcp_free_qpair(). */
 	refcount_init(&qp->refs, 1);
 	qp->txpda = params->tcp.txpda;
 	qp->rxpda = params->tcp.rxpda;
 	qp->header_digests = params->tcp.header_digests;
 	qp->data_digests = params->tcp.data_digests;
 	qp->maxr2t = params->tcp.maxr2t;
 	if (controller)
 		qp->maxh2cdata = params->tcp.maxh2cdata;
 	qp->max_tx_data = tcp_max_transmit_data;
 	if (!controller) {
 		/* A host additionally limits transmits to maxh2cdata. */
 		qp->max_tx_data = min(qp->max_tx_data, params->tcp.maxh2cdata);
 		qp->max_icd = params->tcp.max_icd;
 	}
 
 	if (controller) {
 		/* Use the SUCCESS flag if SQ flow control is disabled. */
 		qp->send_success = !params->sq_flow_control;
 
 		/* NB: maxr2t is 0's based. */
 		qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
 		    (uint64_t)params->qsize * ((uint64_t)qp->maxr2t + 1));
 		qp->open_ttags = mallocarray(qp->num_ttags,
 		    sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
 	}
 
 	TAILQ_INIT(&qp->rx_buffers.head);
 	TAILQ_INIT(&qp->tx_buffers.head);
 	mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
 	mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
 
 	cv_init(&qp->rx_cv, "-");
 	cv_init(&qp->tx_cv, "-");
 	mbufq_init(&qp->tx_pdus, 0);
 	STAILQ_INIT(&qp->tx_capsules);
 
 	/* Register socket upcalls. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_LOCK(&so->so_snd);
 	soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
 	 * Spin up kthreads.  tcp_free_qpair() only waits for threads whose
 	 * pointer is non-NULL, so it is used to unwind a partial start.
 	 */
 	error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
 	    "nvmef tcp rx");
 	if (error != 0) {
 		tcp_free_qpair(&qp->qp);
 		return (NULL);
 	}
 	error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
 	    "nvmef tcp tx");
 	if (error != 0) {
 		tcp_free_qpair(&qp->qp);
 		return (NULL);
 	}
 
 	return (&qp->qp);
 }
 
 /*
  * Drop one reference on the queue pair and free its memory when the
  * last reference goes away.
  */
 static void
 tcp_release_qpair(struct nvmf_tcp_qpair *qp)
 {
 	if (!refcount_release(&qp->refs))
 		return;
 
 	free(qp, M_NVMF_TCP);
 }
 
 /*
  * Tear down a queue pair: stop the tx and rx kthreads, remove the
  * socket upcalls, abort queued capsules and outstanding command buffers
  * with ECONNABORTED, close the socket and drop the reference taken in
  * tcp_allocate_qpair().
  */
 static void
 tcp_free_qpair(struct nvmf_qpair *nq)
 {
 	struct nvmf_tcp_qpair *qp = TQP(nq);
 	struct nvmf_tcp_command_buffer *ncb, *cb;
 	struct nvmf_tcp_capsule *ntc, *tc;
 	struct socket *so = qp->so;
 
 	/* Shut down kthreads and clear upcalls */
 	SOCKBUF_LOCK(&so->so_snd);
 	qp->tx_shutdown = true;
 	if (qp->tx_thread != NULL) {
 		cv_signal(&qp->tx_cv);
 		/*
 		 * Sleep on the thread pointer until woken.
 		 * NOTE(review): presumably kthread_exit() wakes sleepers on
 		 * the exiting thread -- confirm.
 		 */
 		mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
 		    "nvtcptx", 0);
 	}
 	soupcall_clear(so, SO_SND);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	qp->rx_shutdown = true;
 	if (qp->rx_thread != NULL) {
 		cv_signal(&qp->rx_cv);
 		mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
 		    "nvtcprx", 0);
 	}
 	soupcall_clear(so, SO_RCV);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Abort capsules that were queued but never converted to PDUs. */
 	STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
 		nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
 		tcp_release_capsule(tc);
 	}
 	/* Free PDUs that were built but not yet sent. */
 	mbufq_drain(&qp->tx_pdus);
 
 	cv_destroy(&qp->tx_cv);
 	cv_destroy(&qp->rx_cv);
 
 	/* open_ttags is only allocated for controller queue pairs. */
 	if (qp->open_ttags != NULL) {
 		for (u_int i = 0; i < qp->num_ttags; i++) {
 			cb = qp->open_ttags[i];
 			if (cb != NULL) {
 				cb->tc->active_r2ts--;
 				cb->error = ECONNABORTED;
 				tcp_release_command_buffer(cb);
 			}
 		}
 		free(qp->open_ttags, M_NVMF_TCP);
 	}
 
 	/*
 	 * Fail the remaining receive buffers.  The list lock is dropped
 	 * around each release and re-taken before the next iteration.
 	 */
 	mtx_lock(&qp->rx_buffers.lock);
 	TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
 		tcp_remove_command_buffer(&qp->rx_buffers, cb);
 		mtx_unlock(&qp->rx_buffers.lock);
 #ifdef INVARIANTS
 		if (cb->tc != NULL)
 			cb->tc->pending_r2ts--;
 #endif
 		cb->error = ECONNABORTED;
 		tcp_release_command_buffer(cb);
 		mtx_lock(&qp->rx_buffers.lock);
 	}
 	mtx_destroy(&qp->rx_buffers.lock);
 
 	/* Same treatment for the remaining transmit buffers. */
 	mtx_lock(&qp->tx_buffers.lock);
 	TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
 		tcp_remove_command_buffer(&qp->tx_buffers, cb);
 		mtx_unlock(&qp->tx_buffers.lock);
 		cb->error = ECONNABORTED;
 		tcp_release_command_buffer(cb);
 		mtx_lock(&qp->tx_buffers.lock);
 	}
 	mtx_destroy(&qp->tx_buffers.lock);
 
 	soclose(so);
 
 	/* The memory is freed once any remaining references are dropped. */
 	tcp_release_qpair(qp);
 }
 
 /*
  * Allocate a zeroed capsule holding one reference.  The capsule also
  * takes a reference on its queue pair.  'how' is passed to malloc(9);
  * NULL is returned if the allocation fails.
  */
 static struct nvmf_capsule *
 tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
 {
 	struct nvmf_tcp_qpair *qp;
 	struct nvmf_tcp_capsule *tc;
 
 	qp = TQP(nq);
 	tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
 	if (tc != NULL) {
 		refcount_init(&tc->refs, 1);
 		refcount_acquire(&qp->refs);
 		return (&tc->nc);
 	}
 	return (NULL);
 }
 
 /*
  * Drop one reference on a capsule.  When the last reference goes away
  * the received PDU and the capsule are freed and the capsule's
  * reference on the queue pair is released.
  */
 static void
 tcp_release_capsule(struct nvmf_tcp_capsule *tc)
 {
 	struct nvmf_tcp_qpair *qp;
 
 	if (refcount_release(&tc->refs)) {
 		MPASS(tc->active_r2ts == 0);
 		MPASS(tc->pending_r2ts == 0);
 
 		/* Fetch the queue pair before the capsule is freed. */
 		qp = TQP(tc->nc.nc_qpair);
 
 		nvmf_tcp_free_pdu(&tc->rx_pdu);
 		free(tc, M_NVMF_TCP);
 		tcp_release_qpair(qp);
 	}
 }
 
 /*
  * Transport free_capsule method: drop a reference on the capsule.
  */
 static void
 tcp_free_capsule(struct nvmf_capsule *nc)
 {
 	tcp_release_capsule(TCAP(nc));
 }
 
 /*
  * Transport transmit_capsule method: queue a capsule for the tx
  * kthread.  The queue holds its own reference on the capsule.  Always
  * returns 0.
  */
 static int
 tcp_transmit_capsule(struct nvmf_capsule *nc)
 {
 	struct nvmf_tcp_capsule *tc = TCAP(nc);
 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
 	struct socket *sock = qp->so;
 
 	/* Reference for the tx_capsules queue. */
 	refcount_acquire(&tc->refs);
 
 	SOCKBUF_LOCK(&sock->so_snd);
 	STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
 	/* Only signal the tx thread if the socket can take data now. */
 	if (sowriteable(sock))
 		cv_signal(&qp->tx_cv);
 	SOCKBUF_UNLOCK(&sock->so_snd);
 
 	return (0);
 }
 
 static uint8_t
 tcp_validate_command_capsule(struct nvmf_capsule *nc)
 {
 	struct nvmf_tcp_capsule *tc = TCAP(nc);
 	struct nvme_sgl_descriptor *sgl;
 
 	KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
 
 	sgl = &nc->nc_sqe.sgl;
 	switch (sgl->type) {
 	case NVME_SGL_TYPE_ICD:
 		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
 			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
 			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
 		}
 		break;
 	case NVME_SGL_TYPE_COMMAND_BUFFER:
 		if (tc->rx_pdu.data_len != 0) {
 			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
 			return (NVME_SC_INVALID_FIELD);
 		}
 		break;
 	default:
 		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
 		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
 	}
 
 	if (sgl->address != 0) {
 		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
 		return (NVME_SC_SGL_OFFSET_INVALID);
 	}
 
 	return (NVME_SC_SUCCESS);
 }
 
 static size_t
 tcp_capsule_data_len(const struct nvmf_capsule *nc)
 {
 	MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
 	return (le32toh(nc->nc_sqe.sgl.length));
 }
 
 /*
  * Arrange to receive io->io_len bytes of data for 'nc' at 'data_offset'
  * via an R2T.
  *
  * A command buffer holding a reference on the capsule is allocated.  If
  * the capsule already has more than maxr2t active R2Ts, or every
  * transfer tag is in use, the buffer is queued on qp->rx_buffers.
  * Otherwise a transfer tag is allocated and tcp_send_r2t() is called
  * immediately.
  */
 static void
 tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
     struct nvmf_io_request *io)
 {
 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
 	struct nvmf_tcp_capsule *tc = TCAP(nc);
 	struct nvmf_tcp_command_buffer *cb;
 
 	cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
 	    nc->nc_sqe.cid);
 
 	/* The command buffer keeps the capsule alive. */
 	cb->tc = tc;
 	refcount_acquire(&tc->refs);
 
 	/*
 	 * If this command has too many active R2Ts or there are no
 	 * available transfer tags, queue the request for later.
 	 *
 	 * NB: maxr2t is 0's based.
 	 */
 	mtx_lock(&qp->rx_buffers.lock);
 	if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
 #ifdef INVARIANTS
 		tc->pending_r2ts++;
 #endif
 		TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
 		mtx_unlock(&qp->rx_buffers.lock);
 		return;
 	}
 
 	nvmf_tcp_allocate_ttag(qp, cb);
 	mtx_unlock(&qp->rx_buffers.lock);
 
 	/* The R2T is sent after the buffer list lock has been dropped. */
 	tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
 }
 
 /*
  * Satisfy an I/O request from the in-capsule data of a received
  * command: copy io->io_len bytes starting 'data_offset' bytes past the
  * PDU's data offset, then complete the request without error.
  */
 static void
 tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
     struct nvmf_io_request *io)
 {
 	struct nvmf_tcp_capsule *tc;
 
 	tc = TCAP(nc);
 	mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
 	    io->io_len, io, 0);
 
 	nvmf_complete_io_request(io, io->io_len, 0);
 }
 
 /*
  * Transport receive_controller_data method.
  *
  * Returns EINVAL unless 'nc' is a command capsule on a controller queue
  * pair, and EFBIG if the requested range extends past the SGL length.
  * Otherwise the data is taken from the capsule (ICD SGL) or requested
  * via R2T (any other SGL type) and 0 is returned.
  */
 static int
 tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
     struct nvmf_io_request *io)
 {
 	struct nvme_sgl_descriptor *sgl;
 	size_t data_len;
 
 	if (nc->nc_qe_len != sizeof(struct nvme_command))
 		return (EINVAL);
 	if (!nc->nc_qpair->nq_controller)
 		return (EINVAL);
 
 	sgl = &nc->nc_sqe.sgl;
 	data_len = le32toh(sgl->length);
 	if (data_offset + io->io_len > data_len)
 		return (EFBIG);
 
 	if (sgl->type != NVME_SGL_TYPE_ICD)
 		tcp_receive_r2t_data(nc, data_offset, io);
 	else
 		tcp_receive_icd_data(nc, data_offset, io);
 	return (0);
 }
 
 /*
  * Build a C2H_DATA PDU carrying 'len' bytes from 'm' and pass it to
  * nvmf_tcp_write_pdu().
  *
  * NB: cid is little-endian already.
  */
 static void
 tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
     struct mbuf *m, size_t len, bool last_pdu, bool success)
 {
 	struct nvme_tcp_c2h_data_hdr hdr;
 
 	memset(&hdr, 0, sizeof(hdr));
 	hdr.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
 	hdr.cccid = cid;
 	hdr.datao = htole32(data_offset);
 	hdr.datal = htole32(len);
 	if (success)
 		hdr.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
 	if (last_pdu)
 		hdr.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
 
 	nvmf_tcp_write_pdu(qp,
 	    nvmf_tcp_construct_pdu(qp, &hdr, sizeof(hdr), m, len));
 }
 
 /*
  * Transport send_controller_data method: send 'len' bytes of
  * controller-to-host data for command capsule 'nc', starting at
  * 'data_offset' within the command's data.
  *
  * The mbuf chain 'm' is consumed in every case: it is freed when the
  * request is rejected and otherwise handed to tcp_send_c2h_pdu() in one
  * or more pieces of at most qp->max_tx_data bytes each.
  *
  * Returns NVME_SC_INVALID_FIELD if 'nc' is not a command capsule on a
  * controller queue pair, if the range extends past the SGL length, or
  * if the SGL is not a command buffer.  Otherwise returns NVMF_MORE when
  * this transfer does not reach the end of the data, NVMF_SUCCESS_SENT
  * when the final PDU carried the SUCCESS flag, and NVME_SC_SUCCESS when
  * the transfer is complete without the SUCCESS flag.
  */
 static u_int
 tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
     struct mbuf *m, size_t len)
 {
 	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
 	struct nvme_sgl_descriptor *sgl;
 	uint32_t data_len;
 	bool last_pdu, last_xfer;
 
 	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
 	    !qp->qp.nq_controller) {
 		m_freem(m);
 		return (NVME_SC_INVALID_FIELD);
 	}
 
 	sgl = &nc->nc_sqe.sgl;
 	data_len = le32toh(sgl->length);
 	if (data_offset + len > data_len) {
 		m_freem(m);
 		return (NVME_SC_INVALID_FIELD);
 	}
 	last_xfer = (data_offset + len == data_len);
 
 	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
 		m_freem(m);
 		return (NVME_SC_INVALID_FIELD);
 	}
 
 	KASSERT(data_offset == TCAP(nc)->tx_data_offset,
 	    ("%s: starting data_offset %u doesn't match end of previous xfer %u",
 	    __func__, data_offset, TCAP(nc)->tx_data_offset));
 
 	/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
 	while (m != NULL) {
 		struct mbuf *n;
 		uint32_t todo;
 
 		if (m->m_len > qp->max_tx_data) {
 			/* First mbuf alone is too big; send a piece of it. */
 			n = m_split(m, qp->max_tx_data, M_WAITOK);
 			todo = m->m_len;
 		} else {
 			struct mbuf *p;
 
 			/* Gather as many whole mbufs as fit in one PDU. */
 			todo = m->m_len;
 			p = m;
 			n = p->m_next;
 			while (n != NULL) {
 				if (todo + n->m_len > qp->max_tx_data) {
 					p->m_next = NULL;
 					break;
 				}
 				todo += n->m_len;
 				p = n;
 				n = p->m_next;
 			}
 			MPASS(m_length(m, NULL) == todo);
 		}
 
 		last_pdu = (n == NULL && last_xfer);
 		tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
 		    last_pdu, last_pdu && qp->send_success);
 
 		data_offset += todo;
 		/*
 		 * Count down the bytes of this call rather than the whole
 		 * SGL length, so the check below also holds for a transfer
 		 * that covers only part of the command's data.
 		 */
 		len -= todo;
 		m = n;
 	}
 	MPASS(len == 0);
 
 #ifdef INVARIANTS
 	TCAP(nc)->tx_data_offset = data_offset;
 #endif
 	if (!last_xfer)
 		return (NVMF_MORE);
 	else if (qp->send_success)
 		return (NVMF_SUCCESS_SENT);
 	else
 		return (NVME_SC_SUCCESS);
 }
 
 /*
  * Method table binding the nvmf transport operations to the TCP
  * implementations in this file.
  */
 struct nvmf_transport_ops tcp_ops = {
 	.allocate_qpair = tcp_allocate_qpair,
 	.free_qpair = tcp_free_qpair,
 	.allocate_capsule = tcp_allocate_capsule,
 	.free_capsule = tcp_free_capsule,
 	.transmit_capsule = tcp_transmit_capsule,
 	.validate_command_capsule = tcp_validate_command_capsule,
 	.capsule_data_len = tcp_capsule_data_len,
 	.receive_controller_data = tcp_receive_controller_data,
 	.send_controller_data = tcp_send_controller_data,
 	.trtype = NVMF_TRTYPE_TCP,
 	.priority = 0,
 };
 
 /* NOTE(review): presumably registers tcp_ops with the nvmf core -- confirm. */
 NVMF_TRANSPORT(tcp, tcp_ops);
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c
index 7540893ce63c..e1c02a71939b 100644
--- a/sys/fs/nfsclient/nfs_clrpcops.c
+++ b/sys/fs/nfsclient/nfs_clrpcops.c
@@ -1,9559 +1,9559 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 /*
  * Rpc op calls, generally called from the vnode op calls or through the
  * buffer cache, for NFS v2, 3 and 4.
  * These do not normally make any changes to vnode arguments or use
  * structures that might change between the VFS variants. The returned
  * arguments are all at the end, after the NFSPROC_T *p one.
  */
 
 #include "opt_inet6.h"
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfs.h>
 #include <sys/extattr.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 /* Read/write knobs published under the vfs.nfs sysctl node. */
 SYSCTL_DECL(_vfs_nfs);
 
 /* vfs.nfs.ignore_eexist; off (0) by default. */
 static int	nfsignore_eexist = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, ignore_eexist, CTLFLAG_RW,
     &nfsignore_eexist, 0, "NFS ignore EEXIST replies for mkdir/symlink");
 
 /* vfs.nfs.dssameconn; off (0) by default. */
 static int	nfscl_dssameconn = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, dssameconn, CTLFLAG_RW,
     &nfscl_dssameconn, 0, "Use same TCP connection to multiple DSs");
 
 /* vfs.nfs.maxcopyrange; defaults to SSIZE_MAX. */
 static uint64_t nfs_maxcopyrange = SSIZE_MAX;
 SYSCTL_U64(_vfs_nfs, OID_AUTO, maxcopyrange, CTLFLAG_RW,
     &nfs_maxcopyrange, 0, "Max size of a Copy so RPC times reasonable");
 
 /*
  * Global variables
  *
  * The extern declarations refer to objects defined in other files; the
  * initialized variables that follow are defined here.
  */
 extern struct nfsstatsv1 nfsstatsv1;
 extern int nfs_numnfscbd;
 extern struct timeval nfsboottime;
 extern u_int32_t newnfs_false, newnfs_true;
 extern nfstype nfsv34_type[9];
 extern int nfsrv_useacl;
 extern char nfsv4_callbackaddr[INET6_ADDRSTRLEN];
 extern int nfscl_debuglevel;
 extern int nfs_pnfsiothreads;
 extern u_long sb_max_adj;
 NFSCLSTATEMUTEX;
 int nfstest_outofseq = 0;
 int nfscl_assumeposixlocks = 1;
 int nfscl_enablecallb = 0;
 short nfsv4_cbport = NFSV4_CBPORT;
 int nfstest_openallsetattr = 0;
 
 /* Offset of d_name within struct dirent, i.e. the size of the fixed part. */
 #define	DIRHDSIZ	offsetof(struct dirent, d_name)
 
 /*
  * nfscl_getsameserver() can return one of three values:
  * NFSDSP_USETHISSESSION - Use this session for the DS.
  * NFSDSP_SEQTHISSESSION - Use the nfsclds_sequence field of this dsp for
  *     a new session.
  * NFSDSP_NOTFOUND - No matching server was found.
  */
 enum nfsclds_state {
 	NFSDSP_USETHISSESSION = 0,
 	NFSDSP_SEQTHISSESSION = 1,
 	NFSDSP_NOTFOUND = 2,
 };
 
 /*
  * Do a write RPC on a DS data file, using this structure for the arguments,
  * so that this function can be executed by a separate kernel process.
  *
  * NOTE(review): the per-field notes below are inferred from field names
  * and types only; confirm against the functions that fill in and
  * consume this structure.
  */
 struct nfsclwritedsdorpc {
 	int			done;		/* presumably completion flag */
 	int			inprog;		/* presumably "RPC in progress" */
 	struct task		tsk;		/* taskqueue(9) task entry */
 	struct vnode		*vp;
 	int			iomode;
 	int			must_commit;
 	nfsv4stateid_t		*stateidp;
 	struct nfsclds		*dsp;
 	uint64_t		off;
 	int			len;
 #ifdef notyet
 	int			advise;
 #endif
 	struct nfsfh		*fhp;
 	struct mbuf		*m;
 	int			vers;
 	int			minorvers;
 	struct ucred		*cred;
 	NFSPROC_T		*p;
 	int			err;		/* presumably the RPC result */
 };
 
 /*
  * Forward declarations for the file-local (static) helpers used in
  * this file, followed by the declaration of nfs_pnfsio().
  */
 static int nfsrpc_setattrrpc(vnode_t , struct vattr *, nfsv4stateid_t *,
     struct ucred *, NFSPROC_T *, struct nfsvattr *, int *);
 static int nfsrpc_readrpc(vnode_t , struct uio *, struct ucred *,
     nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *);
 static int nfsrpc_writerpc(vnode_t , struct uio *, int *, int *,
     struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *,
     int);
 static int nfsrpc_deallocaterpc(vnode_t, off_t, off_t, nfsv4stateid_t *,
     struct nfsvattr *, int *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_createv23(vnode_t , char *, int, struct vattr *,
     nfsquad_t, int, struct ucred *, NFSPROC_T *, struct nfsvattr *,
     struct nfsvattr *, struct nfsfh **, int *, int *);
 static int nfsrpc_createv4(vnode_t , char *, int, struct vattr *,
     nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *,
     NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *,
     int *, int *);
 static bool nfscl_invalidfname(bool, char *, int);
 static int nfsrpc_locku(struct nfsrv_descript *, struct nfsmount *,
     struct nfscllockowner *, u_int64_t, u_int64_t,
     u_int32_t, struct ucred *, NFSPROC_T *, int);
 static int nfsrpc_setaclrpc(vnode_t, struct ucred *, NFSPROC_T *,
     struct acl *, nfsv4stateid_t *);
 static int nfsrpc_layouterror(struct nfsmount *, uint8_t *, int, uint64_t,
     uint64_t, nfsv4stateid_t *, struct ucred *, NFSPROC_T *, uint32_t,
     uint32_t, char *);
 static int nfsrpc_getlayout(struct nfsmount *, vnode_t, struct nfsfh *, int,
     uint32_t, uint32_t *, nfsv4stateid_t *, uint64_t, struct nfscllayout **,
     struct ucred *, NFSPROC_T *);
 static int nfsrpc_fillsa(struct nfsmount *, struct sockaddr_in *,
     struct sockaddr_in6 *, sa_family_t, int, int, struct nfsclds **,
     NFSPROC_T *);
 static void nfscl_initsessionslots(struct nfsclsession *);
 static int nfscl_doflayoutio(vnode_t, struct uio *, int *, int *, int *,
     nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *,
     struct nfsclflayout *, uint64_t, uint64_t, int, struct ucred *,
     NFSPROC_T *);
 static int nfscl_dofflayoutio(vnode_t, struct uio *, int *, int *, int *,
     nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *,
     struct nfsclflayout *, uint64_t, uint64_t, int, int, struct mbuf *,
     struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_readds(vnode_t, struct uio *, nfsv4stateid_t *, int *,
     struct nfsclds *, uint64_t, int, struct nfsfh *, int, int, int,
     struct ucred *, NFSPROC_T *);
 static int nfsrpc_writeds(vnode_t, struct uio *, int *, int *,
     nfsv4stateid_t *, struct nfsclds *, uint64_t, int,
     struct nfsfh *, int, int, int, int, struct ucred *, NFSPROC_T *);
 static int nfsio_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *,
     struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int,
     struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *,
     struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int,
     struct ucred *, NFSPROC_T *);
 static enum nfsclds_state nfscl_getsameserver(struct nfsmount *,
     struct nfsclds *, struct nfsclds **, uint32_t *);
 static int nfsio_commitds(vnode_t, uint64_t, int, struct nfsclds *,
     struct nfsfh *, int, int, struct nfsclwritedsdorpc *, struct ucred *,
     NFSPROC_T *);
 static int nfsrpc_commitds(vnode_t, uint64_t, int, struct nfsclds *,
     struct nfsfh *, int, int, struct ucred *, NFSPROC_T *);
 #ifdef notyet
 static int nfsio_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *,
     struct nfsfh *, int, int, struct nfsclwritedsdorpc *, struct ucred *,
     NFSPROC_T *);
 static int nfsrpc_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *,
     struct nfsfh *, int, int, struct ucred *, NFSPROC_T *);
 #endif
 static int nfsrpc_allocaterpc(vnode_t, off_t, off_t, nfsv4stateid_t *,
     struct nfsvattr *, int *, struct ucred *, NFSPROC_T *);
 static void nfsrv_setuplayoutget(struct nfsrv_descript *, int, uint64_t,
     uint64_t, uint64_t, nfsv4stateid_t *, int, int, int);
 static int nfsrv_parseug(struct nfsrv_descript *, int, uid_t *, gid_t *,
     NFSPROC_T *);
 static int nfsrv_parselayoutget(struct nfsmount *, struct nfsrv_descript *,
     nfsv4stateid_t *, int *, struct nfsclflayouthead *);
 static int nfsrpc_getopenlayout(struct nfsmount *, vnode_t, u_int8_t *,
     int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int,
     struct nfscldeleg **, struct ucred *, NFSPROC_T *);
 static int nfsrpc_getcreatelayout(vnode_t, char *, int, struct vattr *,
     nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **,
     struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *,
     struct nfsfh **, int *, int *, int *);
 static int nfsrpc_openlayoutrpc(struct nfsmount *, vnode_t, u_int8_t *,
     int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int,
     struct nfscldeleg **, nfsv4stateid_t *, int, int, int, int *,
     struct nfsclflayouthead *, int *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_createlayout(vnode_t, char *, int, struct vattr *,
     nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **,
     struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *,
     struct nfsfh **, int *, int *, int *, nfsv4stateid_t *,
     int, int, int, int *, struct nfsclflayouthead *, int *);
 static int nfsrpc_layoutget(struct nfsmount *, uint8_t *, int, int, uint64_t,
     uint64_t, uint64_t, int, int, nfsv4stateid_t *, int *,
     struct nfsclflayouthead *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_layoutgetres(struct nfsmount *, vnode_t, uint8_t *,
     int, nfsv4stateid_t *, int, uint32_t *, struct nfscllayout **,
     struct nfsclflayouthead *, int, int, int *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_copyrpc(vnode_t, off_t, vnode_t, off_t, size_t *,
     nfsv4stateid_t *, nfsv4stateid_t *, struct nfsvattr *, int *,
     struct nfsvattr *, int *, bool, int *, struct ucred *, NFSPROC_T *);
 static int nfsrpc_seekrpc(vnode_t, off_t *, nfsv4stateid_t *, bool *,
     int, struct nfsvattr *, int *, struct ucred *);
 static struct mbuf *nfsm_split(struct mbuf *, uint64_t);
 static void nfscl_statfs(struct vnode *, struct ucred *, NFSPROC_T *);
 
 int nfs_pnfsio(task_fn_t *, void *);
 
 /*
  * Issue the NFS null procedure on behalf of the vfs layer.
  * Returns the request error if there was one, otherwise the reply status.
  */
 int
 nfsrpc_null(vnode_t vp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error;
 
 	NFSCL_REQSTART(nd, NFSPROC_NULL, vp, NULL);
 	error = nfscl_request(nd, vp, p, cred);
 	/* A request failure takes precedence over the reply status. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs access rpc op.
  * Translate the VREAD/VWRITE/VEXEC bits in acmode into NFS access bits,
  * ask the server via nfsrpc_accessrpc() and fail with EACCES unless
  * every requested bit was granted.  For nfs version 3 and 4 the access
  * rpc is used to check accessibility; if file modes are changed on the
  * server, accesses might still fail later.
  */
 int
 nfsrpc_access(vnode_t vp, int acmode, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	u_int32_t mode, rmode;
 	bool isdir;
 	int error;
 
 	isdir = (vp->v_type == VDIR);
 
 	mode = 0;
 	if ((acmode & VREAD) != 0)
 		mode |= NFSACCESS_READ;
 	if ((acmode & VWRITE) != 0) {
 		mode |= NFSACCESS_MODIFY | NFSACCESS_EXTEND;
 		/* Writing to a directory also needs delete access. */
 		if (isdir)
 			mode |= NFSACCESS_DELETE;
 	}
 	if ((acmode & VEXEC) != 0)
 		mode |= isdir ? NFSACCESS_LOOKUP : NFSACCESS_EXECUTE;
 
 	/* The RPC itself is done by nfsrpc_accessrpc(). */
 	error = nfsrpc_accessrpc(vp, mode, cred, p, nap, attrflagp, &rmode);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * The NFS V3 spec does not say whether the returned access bits
 	 * may be a superset of the requested ones, so only require that
 	 * all of the requested bits are present.
 	 */
 	if ((rmode & mode) != mode)
 		return (EACCES);
 	return (0);
 }
 
 /*
  * The actual rpc, separated out for Darwin.
  *
  * Sends an Access request for 'mode' on vp, followed by a Getattr op
  * for NFSv4.  On a successful reply *rmodep is set to the returned
  * access bits masked by the supported bits (the server's value for
  * NFSv4, the requested mode otherwise).  *attrflagp is cleared on entry
  * and, together with nap, is passed to nfscl_postop_attr().
  *
  * Returns 0, EACCES if the real root file handle could not be obtained,
  * an error from the request or reply parsing, or the reply status.
  */
 int
 nfsrpc_accessrpc(vnode_t vp, u_int32_t mode, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, u_int32_t *rmodep)
 {
 	u_int32_t *tl;
 	u_int32_t supported, rmode;
 	int error;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 
 	*attrflagp = 0;
 	/* Default for replies that carry no "supported" word. */
 	supported = mode;
 	nmp = VFSTONFS(vp->v_mount);
 	np = VTONFS(vp);
 	if ((nmp->nm_privflag & NFSMNTP_FAKEROOTFH) != 0 &&
 	    nmp->nm_fhsize == 0) {
 		/* Attempt to get the actual root file handle. */
 		error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp), cred, p);
 		if (error != 0)
 			return (EACCES);
 		if (np->n_fhp->nfh_len == NFSX_FHMAX + 1)
 			nfscl_statfs(vp, cred, p);
 	}
 	NFSCL_REQSTART(nd, NFSPROC_ACCESS, vp, cred);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(mode);
 	if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * And do a Getattr op.
 		 */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, vp, p, cred);
 	if (error)
 		return (error);
 	/* For NFSv3 the attributes are parsed before the status check. */
 	if (nd->nd_flag & ND_NFSV3) {
 		error = nfscl_postop_attr(nd, nap, attrflagp);
 		if (error)
 			goto nfsmout;
 	}
 	if (!nd->nd_repstat) {
 		if (nd->nd_flag & ND_NFSV4) {
 			/* NFSv4 replies with "supported" and then "access". */
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			supported = fxdr_unsigned(u_int32_t, *tl++);
 		} else {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		}
 		rmode = fxdr_unsigned(u_int32_t, *tl);
 		if (nd->nd_flag & ND_NFSV4)
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 
 		/*
 		 * It's not obvious what should be done about
 		 * unsupported access modes. For now, be paranoid
 		 * and clear the unsupported ones.
 		 */
 		rmode &= supported;
 		*rmodep = rmode;
 	} else
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs open rpc
  *
  * Obtain NFSv4 Open state for vp with the access requested by amode
  * (FREAD and/or FWRITE).  Returns 0 without doing anything when vp is not
  * a regular file.
  *
  * nfscl_open() decides, via "ret", whether an Open RPC must be sent to the
  * server (NFSCLOPEN_DOOPEN) or whether only the credentials need to be
  * recorded in the open (NFSCLOPEN_SETCRED).  The whole sequence is retried
  * for as long as the error is one of NFSERR_GRACE, NFSERR_STALECLIENTID,
  * NFSERR_STALEDONTRECOVER, NFSERR_DELAY or NFSERR_BADSESSION, and fewer
  * than 4 times for NFSERR_EXPIRED/NFSERR_BADSTATEID provided that
  * nfscl_hasexpired() returned 0 and a client id revision was available.
  *
  * Returns 0 on success, otherwise an error; once retrycnt has reached 4
  * any remaining error is reported as EIO.
  */
 int
 nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsclopen *op;
 	struct nfscldeleg *dp;
 	struct nfsfh *nfhp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	u_int32_t mode, clidrev;
 	int ret, newone, error, expireret = 0, retrycnt;
 
 	/*
 	 * For NFSv4, Open Ops are only done on Regular Files.
 	 */
 	if (vp->v_type != VREG)
 		return (0);
 	/* Translate the FREAD/FWRITE bits into NFSv4 share access bits. */
 	mode = 0;
 	if (amode & FREAD)
 		mode |= NFSV4OPEN_ACCESSREAD;
 	if (amode & FWRITE)
 		mode |= NFSV4OPEN_ACCESSWRITE;
 	/*
 	 * When NFSHASNFSV4N() is true, also add a "want delegation" hint.
 	 * A delegation is only asked for when pNFS is not in use, callbacks
 	 * are enabled and nfs_numnfscbd > 0; otherwise the server is told
 	 * that no delegation is wanted.
 	 */
 	if (NFSHASNFSV4N(nmp)) {
 		if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 &&
 		    nfs_numnfscbd > 0) {
 			if ((mode & NFSV4OPEN_ACCESSWRITE) != 0)
 				mode |= NFSV4OPEN_WANTWRITEDELEG;
 			else
 				mode |= NFSV4OPEN_WANTANYDELEG;
 		} else
 			mode |= NFSV4OPEN_WANTNODELEG;
 	}
 	nfhp = np->n_fhp;
 
 	retrycnt = 0;
 	do {
 	    dp = NULL;
 	    /*
 	     * Only the access bits are passed to nfscl_open(); the
 	     * delegation hints added above are used by the RPC itself.
 	     */
 	    error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len,
 		(mode & NFSV4OPEN_ACCESSBOTH), 1, cred, p, NULL,
 		&op, &newone, &ret, 1, true);
 	    if (error) {
 		return (error);
 	    }
 	    /*
 	     * Remember the client id revision for the nfscl_hasexpired()
 	     * check below.  A value of 0 disables that check.
 	     */
 	    if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	    else
 		clidrev = 0;
 	    if (ret == NFSCLOPEN_DOOPEN) {
 		if (np->n_v4 != NULL) {
 			/*
 			 * For the first attempt, try and get a layout, if
 			 * pNFS is enabled for the mount.
 			 */
 			/*
 			 * The node has n_v4 data, so the file handle and
 			 * name stored there are passed along with the
 			 * file's own handle (presumably the parent
 			 * directory's handle and the component name).
 			 */
 			if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 ||
 			    nfs_numnfscbd == 0 ||
 			    (np->n_flag & NNOLAYOUT) != 0 || retrycnt > 0)
 				error = nfsrpc_openrpc(nmp, vp,
 				    np->n_v4->n4_data,
 				    np->n_v4->n4_fhlen, np->n_fhp->nfh_fh,
 				    np->n_fhp->nfh_len, mode, op,
 				    NFS4NODENAME(np->n_v4),
 				    np->n_v4->n4_namelen,
 				    &dp, 0, 0x0, cred, p, 0, 0);
 			else
 				error = nfsrpc_getopenlayout(nmp, vp,
 				    np->n_v4->n4_data,
 				    np->n_v4->n4_fhlen, np->n_fhp->nfh_fh,
 				    np->n_fhp->nfh_len, mode, op,
 				    NFS4NODENAME(np->n_v4),
 				    np->n_v4->n4_namelen, &dp, cred, p);
 			if (dp != NULL) {
 				NFSLOCKNODE(np);
 				np->n_flag &= ~NDELEGMOD;
 				/*
 				 * Invalidate the attribute cache, so that
 				 * attributes that pre-date the issue of a
 				 * delegation are not cached, since the
 				 * cached attributes will remain valid while
 				 * the delegation is held.
 				 */
 				NFSINVALATTRCACHE(np);
 				NFSUNLOCKNODE(np);
 				/* Hand the new delegation to nfscl_deleg(). */
 				(void) nfscl_deleg(nmp->nm_mountp,
 				    op->nfso_own->nfsow_clp,
 				    nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp);
 			}
 		} else if (NFSHASNFSV4N(nmp)) {
 			/*
 			 * For the first attempt, try and get a layout, if
 			 * pNFS is enabled for the mount.
 			 */
 			/*
 			 * No n_v4 data: the file's own handle is passed for
 			 * both file handle arguments and no name is given.
 			 */
 			if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 ||
 			    nfs_numnfscbd == 0 ||
 			    (np->n_flag & NNOLAYOUT) != 0 || retrycnt > 0)
 				error = nfsrpc_openrpc(nmp, vp, nfhp->nfh_fh,
 				    nfhp->nfh_len, nfhp->nfh_fh, nfhp->nfh_len,
 				    mode, op, NULL, 0, &dp, 0, 0x0, cred, p, 0,
 				    0);
 			else
 				error = nfsrpc_getopenlayout(nmp, vp,
 				    nfhp->nfh_fh, nfhp->nfh_len, nfhp->nfh_fh,
 				    nfhp->nfh_len, mode, op, NULL, 0, &dp,
 				    cred, p);
 			if (dp != NULL) {
 				NFSLOCKNODE(np);
 				np->n_flag &= ~NDELEGMOD;
 				/*
 				 * Invalidate the attribute cache, so that
 				 * attributes that pre-date the issue of a
 				 * delegation are not cached, since the
 				 * cached attributes will remain valid while
 				 * the delegation is held.
 				 */
 				NFSINVALATTRCACHE(np);
 				NFSUNLOCKNODE(np);
 				(void) nfscl_deleg(nmp->nm_mountp,
 				    op->nfso_own->nfsow_clp,
 				    nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp);
 			}
 		} else {
 			/*
 			 * Neither n_v4 data nor NFSHASNFSV4N(): the Open
 			 * cannot be done, so fail with EIO.
 			 */
 			error = EIO;
 		}
 		/* Record the caller's credentials in the open. */
 		newnfs_copyincred(cred, &op->nfso_cred);
 	    } else if (ret == NFSCLOPEN_SETCRED)
 		/*
 		 * This is a new local open on a delegation. It needs
 		 * to have credentials so that an open can be done
 		 * against the server during recovery.
 		 */
 		newnfs_copyincred(cred, &op->nfso_cred);
 
 	    /*
 	     * nfso_opencnt is the count of how many VOP_OPEN()s have
 	     * been done on this Open successfully and a VOP_CLOSE()
 	     * is expected for each of these.
 	     * If error is non-zero, don't increment it, since the Open
 	     * hasn't succeeded yet.
 	     */
 	    if (!error) {
 		op->nfso_opencnt++;
 		/* Remember this open in the node for the one-open-owner case. */
 		if (NFSHASNFSV4N(nmp) && NFSHASONEOPENOWN(nmp)) {
 		    NFSLOCKNODE(np);
 		    np->n_openstateid = op;
 		    NFSUNLOCKNODE(np);
 		}
 	    }
 	    /* Always paired with the nfscl_open() above, success or not. */
 	    nfscl_openrelease(nmp, op, error, newone);
 	    if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID ||
 		error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		error == NFSERR_BADSESSION) {
 		/* Pause before the retry done by the loop condition. */
 		(void) nfs_catnap(PZERO, error, "nfs_open");
 	    } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID)
 		&& clidrev != 0) {
 		/* Only these retries are counted against the limit of 4. */
 		expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		retrycnt++;
 	    }
 	} while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/* The retry limit was exhausted: report a generic I/O error. */
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * the actual open rpc
  *
  * Build and send an NFSv4 Open (without create) followed by a Getattr of
  * the change and modify time attributes, then parse the reply.
  *
  *  nfhp/fhlen	- file handle given to nfscl_reqstart() for the request
  *  newfhp/newfhlen - file handle copied into a newly allocated delegation
  *		  and passed to nfsrpc_openconfirm()
  *  mode	- share access/deny bits and delegation "want" bits
  *  op		- the open; its stateid is filled in from the reply
  *  name/namelen - name marshalled for the claim types that take one
  *  dpp		- on entry, *dpp is a delegation being claimed or NULL;
  *		  on return, *dpp is a newly allocated delegation when one
  *		  was issued and the call succeeded, otherwise NULL
  *  reclaim	- non-zero to use NFSV4OPEN_CLAIMPREVIOUS with delegtype
  *  syscred	- non-zero sets ND_USEGSSNAME on the request
  *  recursed	- non-zero on the nested call made below, which prevents
  *		  that call from recursing again
  *
  * Returns 0 or an error.  A delegation allocated here is freed again on
  * the error paths.
  */
 int
 nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen,
     u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op,
     u_int8_t *name, int namelen, struct nfscldeleg **dpp,
     int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p,
     int syscred, int recursed)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfscldeleg *dp, *ndp = NULL;
 	struct nfsvattr nfsva;
 	u_int32_t rflags, deleg;
 	nfsattrbit_t attrbits;
 	int error, ret, acesize, limitby;
 	struct nfsclsession *tsep;
 
 	/* Take the caller's delegation (if any) and clear the result. */
 	dp = *dpp;
 	*dpp = NULL;
 	nfscl_reqstart(nd, NFSPROC_OPEN, nmp, nfhp, fhlen, NULL, NULL, 0, 0,
 	    cred);
 	/* seqid, share access, share deny and the two clientid words. */
 	NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	*tl++ = txdr_unsigned(mode & (NFSV4OPEN_ACCESSBOTH |
 	    NFSV4OPEN_WANTDELEGMASK));
 	*tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH);
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	/* The open owner name. */
 	(void) nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	/* Open type (no create) followed by the claim type. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE);
 	if (reclaim) {
 		/* Reclaim: the claim carries the delegation type. */
 		*tl = txdr_unsigned(NFSV4OPEN_CLAIMPREVIOUS);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(delegtype);
 	} else {
 		if (dp != NULL) {
 			/*
 			 * Claim against the delegation in dp.  When
 			 * NFSHASNFSV4N() is true the "by file handle" claim
 			 * is used with a zero stateid seqid and no name;
 			 * otherwise the delegation's seqid and the name are
 			 * sent.
 			 */
 			if (NFSHASNFSV4N(nmp))
 				*tl = txdr_unsigned(
 				    NFSV4OPEN_CLAIMDELEGATECURFH);
 			else
 				*tl = txdr_unsigned(NFSV4OPEN_CLAIMDELEGATECUR);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = dp->nfsdl_stateid.seqid;
 			*tl++ = dp->nfsdl_stateid.other[0];
 			*tl++ = dp->nfsdl_stateid.other[1];
 			*tl = dp->nfsdl_stateid.other[2];
 			if (!NFSHASNFSV4N(nmp))
 				(void)nfsm_strtom(nd, name, namelen);
 		} else if (NFSHASNFSV4N(nmp)) {
 			/* Open by file handle, no name needed. */
 			*tl = txdr_unsigned(NFSV4OPEN_CLAIMFH);
 		} else {
 			/* Open by name. */
 			*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 			(void)nfsm_strtom(nd, name, namelen);
 		}
 	}
 	/* Append a Getattr for the change and modify time attributes. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	if (syscred)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	/*
 	 * Parse the Open reply on success, and also for a reclaim whose
 	 * reply status is NFSERR_DELAY but which still carries data.
 	 */
 	if (nd->nd_repstat == 0 || (nd->nd_repstat == NFSERR_DELAY &&
 	    reclaim != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0)) {
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		op->nfso_stateid.seqid = *tl++;
 		op->nfso_stateid.other[0] = *tl++;
 		op->nfso_stateid.other[1] = *tl++;
 		op->nfso_stateid.other[2] = *tl;
 		/*
 		 * tl still points at the last stateid word, so tl + 6 is
 		 * the final word of the area dissected above (assuming
 		 * NFSX_STATEID is four 32-bit words, as the four stores
 		 * suggest); the words in between are skipped.
 		 */
 		rflags = fxdr_unsigned(u_int32_t, *(tl + 6));
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error)
 			goto nfsmout;
 		/* The delegation type returned by the server. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(u_int32_t, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			/* Note that this server issues delegations. */
 			if (!(op->nfso_own->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				op->nfso_own->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/*
 			 * Allocate the delegation with room for the file
 			 * handle after it.  It is owned by this function
 			 * until it is passed back through *dpp.
 			 */
 			ndp = malloc(
 			    sizeof (struct nfscldeleg) + newfhlen,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&ndp->nfsdl_owner);
 			LIST_INIT(&ndp->nfsdl_lock);
 			ndp->nfsdl_clp = op->nfso_own->nfsow_clp;
 			ndp->nfsdl_fhlen = newfhlen;
 			NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen);
 			newnfs_copyincred(cred, &ndp->nfsdl_cred);
 			nfscl_lockinit(&ndp->nfsdl_rwlock);
 			/* Delegation stateid followed by the recall flag. */
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			ndp->nfsdl_stateid.seqid = *tl++;
 			ndp->nfsdl_stateid.other[0] = *tl++;
 			ndp->nfsdl_stateid.other[1] = *tl++;
 			ndp->nfsdl_stateid.other[2] = *tl++;
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				ndp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					/* A single 64-bit size. */
 					ndp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					/* Product of the two 32-bit words. */
 					ndp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					ndp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 			} else {
 				ndp->nfsdl_flags = NFSCLDL_READ;
 			}
 			/* A non-zero recall flag marks the delegation. */
 			if (ret)
 				ndp->nfsdl_flags |= NFSCLDL_RECALL;
 			error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, false,
 			    &ret, &acesize, p);
 			if (error)
 				goto nfsmout;
 		} else if (deleg == NFSV4OPEN_DELEGATENONEEXT &&
 		    NFSHASNFSV4N(nmp)) {
 			/*
 			 * No delegation, with a reason.  The reason word is
 			 * read and, for the contention and resource
 			 * reasons, one further word is consumed.
 			 */
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			deleg = fxdr_unsigned(uint32_t, *tl);
 			if (deleg == NFSV4OPEN_CONTENTION ||
 			    deleg == NFSV4OPEN_RESOURCE)
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		/* If the 2nd element == NFS_OK, the Getattr succeeded. */
 		if (*++tl == 0) {
 			KASSERT(nd->nd_repstat == 0,
 			    ("nfsrpc_openrpc: Getattr repstat"));
 			error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 			    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 			    NULL, NULL, NULL, p, cred);
 			if (error)
 				goto nfsmout;
 		}
 		if (ndp != NULL) {
 			/*
 			 * Set the change and modify time of the new
 			 * delegation: from the reclaimed delegation when
 			 * there is one, else from the Getattr reply.  If
 			 * neither is available, flag it for recall.
 			 */
 			if (reclaim != 0 && dp != NULL) {
 				ndp->nfsdl_change = dp->nfsdl_change;
 				ndp->nfsdl_modtime = dp->nfsdl_modtime;
 				ndp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 			} else if (nd->nd_repstat == 0) {
 				ndp->nfsdl_change = nfsva.na_filerev;
 				ndp->nfsdl_modtime = nfsva.na_mtime;
 				ndp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 			} else
 				ndp->nfsdl_flags |= NFSCLDL_RECALL;
 		}
 		/* The Open itself worked, so forget any reply status. */
 		nd->nd_repstat = 0;
 		if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM)) {
 		    /* The server wants the Open confirmed. */
 		    do {
 			ret = nfsrpc_openconfirm(vp, newfhp, newfhlen, op,
 			    cred, p);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_open");
 		    } while (ret == NFSERR_DELAY);
 		    error = ret;
 		}
 		if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) ||
 		    nfscl_assumeposixlocks)
 		    op->nfso_posixlock = 1;
 		else
 		    op->nfso_posixlock = 0;
 
 		/*
 		 * If the server is handing out delegations, but we didn't
 		 * get one because an OpenConfirm was required, try the
 		 * Open again, to get a delegation. This is a harmless no-op,
 		 * from a server's point of view.
 		 */
 		if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM) &&
 		    (op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG)
 		    && !error && dp == NULL && ndp == NULL && !recursed) {
 		    do {
 			/* recursed == 1 stops the nested call recursing. */
 			ret = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp,
 			    newfhlen, mode, op, name, namelen, &ndp, 0, 0x0,
 			    cred, p, syscred, 1);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_open2");
 		    } while (ret == NFSERR_DELAY);
 		    if (ret) {
 			if (ndp != NULL) {
 				free(ndp, M_NFSCLDELEG);
 				ndp = NULL;
 			}
 			/*
 			 * Only these errors from the second Open are
 			 * reported; any other failure is ignored since the
 			 * first Open succeeded.
 			 */
 			if (ret == NFSERR_STALECLIENTID ||
 			    ret == NFSERR_STALEDONTRECOVER ||
 			    ret == NFSERR_BADSESSION)
 				error = ret;
 		    }
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	/* Pass the delegation back on success, otherwise free it. */
 	if (!error)
 		*dpp = ndp;
 	else if (ndp != NULL)
 		free(ndp, M_NFSCLDELEG);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Open downgrade RPC.
  *
  * Sends the open's stateid, the open owner's seqid and the new share
  * access/deny bits taken from mode.  On a successful reply the stateid
  * in op is replaced by the one returned.  A reply of NFSERR_STALESTATEID
  * starts recovery for the client.  Returns 0 or an error.
  */
 int
 nfsrpc_opendowngrade(vnode_t vp, u_int32_t mode, struct nfsclopen *op,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	u_int32_t *tl;
 	int error;
 
 	NFSCL_REQSTART(nd, NFSPROC_OPENDOWNGRADE, vp, cred);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 3 * NFSX_UNSIGNED);
 	/* Stateid: the seqid word is sent as zero when NFSHASNFSV4N(). */
 	if (!NFSHASNFSV4N(VFSTONFS(vp->v_mount)))
 		tl[0] = op->nfso_stateid.seqid;
 	else
 		tl[0] = 0;
 	tl[1] = op->nfso_stateid.other[0];
 	tl[2] = op->nfso_stateid.other[1];
 	tl[3] = op->nfso_stateid.other[2];
 	/* Open owner seqid, then the share access and share deny bits. */
 	tl[4] = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	tl[5] = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH);
 	tl[6] = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH);
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		/* Pick up the updated stateid. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		op->nfso_stateid.seqid = tl[0];
 		op->nfso_stateid.other[0] = tl[1];
 		op->nfso_stateid.other[1] = tl[2];
 		op->nfso_stateid.other[2] = tl[3];
 	}
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * V4 Close operation.
  *
  * Nothing is done for anything but a regular file.  With doclose set the
  * work is handed to nfscl_doclose(); otherwise nfscl_getclose() is called
  * and, when it succeeds, the client it returned is released again.
  * Returns 0 or the error from the call that was made.
  */
 int
 nfsrpc_close(vnode_t vp, int doclose, NFSPROC_T *p)
 {
 	struct nfsclclient *owner_clp;
 	int rv;
 
 	if (vp->v_type != VREG)
 		return (0);
 	if (doclose)
 		return (nfscl_doclose(vp, &owner_clp, p));
 
 	rv = nfscl_getclose(vp, &owner_clp);
 	if (rv == 0)
 		nfscl_clientrelease(owner_clp);
 	return (rv);
 }
 
 /*
  * Close the open.
  *
  * Release the byte range locks recorded under op (an unlock RPC per lock,
  * or a single one per lock owner for POSIX style locks), do a
  * ReleaseLockOwner for each lock owner, and then try to close the open
  * via nfscl_tryclose().
  *
  *  loop_on_delayed - passed through to nfscl_tryclose()
  *  freeop	     - when true, op is freed unless the close returned
  *		       NFSERR_DELAY
  *
  * Returns the result of nfscl_tryclose().
  */
 int
 nfsrpc_doclose(struct nfsmount *nmp, struct nfsclopen *op, NFSPROC_T *p,
     bool loop_on_delayed, bool freeop)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfscllockowner *lp, *nlp;
 	struct nfscllock *lop, *nlop;
 	struct ucred *tcred;
 	u_int64_t off = 0, len = 0;
 	u_int32_t type = NFSV4LOCKT_READ;
 	int error, do_unlock, trycnt;
 	bool own_not_null;
 
 	/* Work with a private copy of the credentials stored in the open. */
 	tcred = newnfs_getcred();
 	newnfs_copycred(&op->nfso_cred, tcred);
 	/*
 	 * (Theoretically this could be done in the same
 	 *  compound as the close, but having multiple
 	 *  sequenced Ops in the same compound might be
 	 *  too scary for some servers.)
 	 */
 	if (op->nfso_posixlock) {
 		/* One unlock covering the whole file is enough. */
 		off = 0;
 		len = NFS64BITSSET;
 		type = NFSV4LOCKT_READ;
 	}
 
 	/*
 	 * Since this function is only called from VOP_INACTIVE(), no
 	 * other thread will be manipulating this Open. As such, the
 	 * lock lists are not being changed by other threads, so it should
 	 * be safe to do this without locking.
 	 */
 	LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
 		do_unlock = 1;
 		LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) {
 			if (op->nfso_posixlock == 0) {
 				/* Unlock exactly the range of this lock. */
 				off = lop->nfslo_first;
 				len = lop->nfslo_end - lop->nfslo_first;
 				if (lop->nfslo_type == F_WRLCK)
 					type = NFSV4LOCKT_WRITE;
 				else
 					type = NFSV4LOCKT_READ;
 			}
 			if (do_unlock) {
 				/*
 				 * Retry the unlock a limited number of
 				 * times while the server replies with
 				 * NFSERR_GRACE or NFSERR_DELAY.
 				 */
 				trycnt = 0;
 				do {
 					error = nfsrpc_locku(nd, nmp, lp, off,
 					    len, type, tcred, p, 0);
 					if ((nd->nd_repstat == NFSERR_GRACE ||
 					    nd->nd_repstat == NFSERR_DELAY) &&
 					    error == 0)
 						(void) nfs_catnap(PZERO,
 						    (int)nd->nd_repstat,
 						    "nfs_close");
 				} while ((nd->nd_repstat == NFSERR_GRACE ||
 				    nd->nd_repstat == NFSERR_DELAY) &&
 				    error == 0 && trycnt++ < 5);
 				/*
 				 * For POSIX style locks the whole-file
 				 * unlock is only done once per lock owner.
 				 */
 				if (op->nfso_posixlock)
 					do_unlock = 0;
 			}
 			/* The local lock record is freed in either case. */
 			nfscl_freelock(lop, 0);
 		}
 		/*
 		 * Do a ReleaseLockOwner.
 		 * The lock owner name nfsl_owner may be used by other opens for
 		 * other files but the lock_owner4 name that nfsrpc_rellockown()
 		 * puts on the wire has the file handle for this file appended
 		 * to it, so it can be done now.
 		 */
 		(void)nfsrpc_rellockown(nmp, lp, lp->nfsl_open->nfso_fh,
 		    lp->nfsl_open->nfso_fhlen, tcred, p);
 	}
 
 	/*
 	 * There could be other Opens for different files on the same
 	 * OpenOwner, so locking is required.
 	 */
 	own_not_null = false;
 	if (op->nfso_own != NULL) {
 		own_not_null = true;
 		NFSLOCKCLSTATE();
 		nfscl_lockexcl(&op->nfso_own->nfsow_rwlock, NFSCLSTATEMUTEXPTR);
 		NFSUNLOCKCLSTATE();
 	}
 	/* Keep trying the close for as long as NFSERR_GRACE is returned. */
 	do {
 		error = nfscl_tryclose(op, tcred, nmp, p, loop_on_delayed);
 		if (error == NFSERR_GRACE)
 			(void) nfs_catnap(PZERO, error, "nfs_close");
 	} while (error == NFSERR_GRACE);
 	if (own_not_null) {
 		/*
 		 * NFSLOCKCLSTATE() taken here stays held across the frees
 		 * below and is dropped by the NFSUNLOCKCLSTATE() that
 		 * follows them.
 		 */
 		NFSLOCKCLSTATE();
 		nfscl_lockunlock(&op->nfso_own->nfsow_rwlock);
 	}
 
 	LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp)
 		nfscl_freelockowner(lp, 0);
 	/* The open is kept when the close returned NFSERR_DELAY. */
 	if (freeop && error != NFSERR_DELAY)
 		nfscl_freeopen(op, 0, true);
 	if (own_not_null)
 		NFSUNLOCKCLSTATE();
 	NFSFREECRED(tcred);
 	return (error);
 }
 
 /*
  * The actual Close RPC.
  *
  * The request is built in the caller supplied nd for the file handle kept
  * in op.  When NFSHASNFSV4N() is true, the seqid and the stateid's seqid
  * word are both sent as zero and the open owner is not touched; otherwise
  * the open owner's seqid is sent, updated after the reply, and a reply of
  * NFSERR_STALESTATEID starts recovery.  A non-zero syscred sets
  * ND_USEGSSNAME on the request.  Returns the RPC error or the reply
  * status.
  */
 int
 nfsrpc_closerpc(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p,
     int syscred)
 {
 	int error;
 	u_int32_t *tl;
 
 	nfscl_reqstart(nd, NFSPROC_CLOSE, nmp, op->nfso_fh,
 	    op->nfso_fhlen, NULL, NULL, 0, 0, cred);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
 	/* Word 0 is the seqid, words 1 to 4 are the stateid. */
 	if (!NFSHASNFSV4N(nmp)) {
 		tl[0] = txdr_unsigned(op->nfso_own->nfsow_seqid);
 		tl[1] = op->nfso_stateid.seqid;
 	} else {
 		tl[0] = 0;
 		tl[1] = 0;
 	}
 	tl[2] = op->nfso_stateid.other[0];
 	tl[3] = op->nfso_stateid.other[1];
 	tl[4] = op->nfso_stateid.other[2];
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (!NFSHASNFSV4N(nmp)) {
 		NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	}
 	/* Step over the stateid in a successful reply; it is not used. */
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 	}
 	error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID && !NFSHASNFSV4N(nmp))
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * V4 Open Confirm RPC.
  *
  * Returns 0 at once when NFSHASNFSV4N() is true for the mount, since no
  * confirmation is done in that case.  Otherwise the open's stateid and
  * the open owner's seqid are sent for the file handle nfhp/fhlen, and on
  * a successful reply the stateid in op is replaced by the one returned.
  * A reply of NFSERR_STALESTATEID starts recovery for the client.
  */
 int
 nfsrpc_openconfirm(vnode_t vp, u_int8_t *nfhp, int fhlen,
     struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *mnt = VFSTONFS(vp->v_mount);
 	u_int32_t *tl;
 	int error;
 
 	if (NFSHASNFSV4N(mnt))
 		return (0);		/* No confirmation for NFSv4.1. */
 
 	nfscl_reqstart(nd, NFSPROC_OPENCONFIRM, mnt, nfhp, fhlen, NULL, NULL,
 	    0, 0, NULL);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
 	/* Words 0 to 3 are the stateid, word 4 is the seqid. */
 	tl[0] = op->nfso_stateid.seqid;
 	tl[1] = op->nfso_stateid.other[0];
 	tl[2] = op->nfso_stateid.other[1];
 	tl[3] = op->nfso_stateid.other[2];
 	tl[4] = txdr_unsigned(op->nfso_own->nfsow_seqid);
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		/* Pick up the confirmed stateid. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID);
 		op->nfso_stateid.seqid = tl[0];
 		op->nfso_stateid.other[0] = tl[1];
 		op->nfso_stateid.other[1] = tl[2];
 		op->nfso_stateid.other[2] = tl[3];
 	}
 	error = nd->nd_repstat;
 	if (error == NFSERR_STALESTATEID)
 		nfscl_initiate_recovery(op->nfso_own->nfsow_clp);
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the setclientid and setclientid confirm RPCs. Called from nfs_statfs()
  * when a mount has just occurred and when the server replies NFSERR_EXPIRED.
  *
  * When NFSHASNFSV4N() is true for the mount, a session is set up instead:
  * if retokp is not NULL and a session already exists, a CreateSession with
  * the existing client id is tried first; otherwise, or if that fails, an
  * ExchangeID followed by a CreateSession is done.  The new session is put
  * at the head of nmp->nm_sess and, when reclaim is 0, a ReclaimComplete is
  * done.
  *
  *  retokp - may be NULL.  Set to true when the CreateSession with the
  *	     existing client id works.  If it is already true on entry and
  *	     that CreateSession cannot be done or fails, NFSERR_IO is
  *	     returned rather than a new client id being acquired.
  *
  * Returns 0 or an error.
  */
 int
 nfsrpc_setclient(struct nfsmount *nmp, struct nfsclclient *clp, int reclaim,
     bool *retokp, struct ucred *cred, NFSPROC_T *p)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	u_int8_t *cp = NULL, *cp2, addr[INET6_ADDRSTRLEN + 9];
 	u_short port;
 	int error, isinet6 = 0, callblen;
 	nfsquad_t confirm;
 	/* Incremented each time a new client id is requested. */
 	static u_int32_t rev = 0;
 	struct nfsclds *dsp, *odsp;
 	struct in6_addr a6;
 	struct nfsclsession *tsep;
 	struct rpc_reconupcall recon;
 	struct nfscl_reconarg *rcp;
 
 	if (nfsboottime.tv_sec == 0)
 		NFSSETBOOTTIME(nfsboottime);
 	if (NFSHASNFSV4N(nmp)) {
 		/* Preset so that the ExchangeID path below is the default. */
 		error = NFSERR_BADSESSION;
 		odsp = dsp = NULL;
 		if (retokp != NULL) {
 			NFSLOCKMNT(nmp);
 			odsp = TAILQ_FIRST(&nmp->nm_sess);
 			NFSUNLOCKMNT(nmp);
 		}
 		if (odsp != NULL) {
 			/*
 			 * When a session already exists, first try a
 			 * CreateSession with the extant ClientID.
 			 */
 			/*
 			 * The new structure copies the client id, flags and
 			 * server owner of the old one and uses the old
 			 * sequence id plus one.
 			 */
 			dsp = malloc(sizeof(struct nfsclds) +
 			    odsp->nfsclds_servownlen + 1, M_NFSCLDS,
 			    M_WAITOK | M_ZERO);
 			dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew;
 			dsp->nfsclds_servownlen = odsp->nfsclds_servownlen;
 			dsp->nfsclds_sess.nfsess_clientid =
 			    odsp->nfsclds_sess.nfsess_clientid;
 			dsp->nfsclds_sess.nfsess_sequenceid =
 			    odsp->nfsclds_sess.nfsess_sequenceid + 1;
 			dsp->nfsclds_flags = odsp->nfsclds_flags;
 			if (dsp->nfsclds_servownlen > 0)
 				memcpy(dsp->nfsclds_serverown,
 				    odsp->nfsclds_serverown,
 				    dsp->nfsclds_servownlen + 1);
 			mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 			mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession",
 			    NULL, MTX_DEF);
 			nfscl_initsessionslots(&dsp->nfsclds_sess);
 			error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess,
 			    &nmp->nm_sockreq, NULL,
 			    dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p);
 			NFSCL_DEBUG(1, "create session for extant "
 			    "ClientID=%d\n", error);
 			if (error != 0) {
 				nfscl_freenfsclds(dsp);
 				dsp = NULL;
 				/*
 				 * If *retokp is true, return any error other
 				 * than NFSERR_STALECLIENTID,
 				 * NFSERR_BADSESSION or NFSERR_STALEDONTRECOVER
 				 * so that nfscl_recover() will not loop.
 				 */
 				if (*retokp)
 					return (NFSERR_IO);
 			} else
 				*retokp = true;
 		} else if (retokp != NULL && *retokp)
 			return (NFSERR_IO);
 		if (error != 0) {
 			/*
 			 * Either there was no previous session or the
 			 * CreateSession attempt failed, so...
 			 * do an ExchangeID followed by the CreateSession.
 			 */
 			clp->nfsc_rev = rev++;
 			error = nfsrpc_exchangeid(nmp, clp, &nmp->nm_sockreq, 0,
 			    NFSV4EXCH_USEPNFSMDS | NFSV4EXCH_USENONPNFS, &dsp,
 			    cred, p);
 			NFSCL_DEBUG(1, "aft exch=%d\n", error);
 			if (error == 0)
 				error = nfsrpc_createsession(nmp,
 				    &dsp->nfsclds_sess, &nmp->nm_sockreq, NULL,
 				    dsp->nfsclds_sess.nfsess_sequenceid, 1,
 				    cred, p);
 			NFSCL_DEBUG(1, "aft createsess=%d\n", error);
 		}
 		if (error == 0) {
 			/*
 			 * If the session supports a backchannel, set up
 			 * the BindConnectionToSession call in the krpc
 			 * so that it is done on a reconnection.
 			 */
 			if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0) {
 				/*
 				 * rcp is handed over via CLSET_RECONUPCALL
 				 * and is not freed in this function.
 				 */
 				rcp = mem_alloc(sizeof(*rcp));
 				rcp->minorvers = nmp->nm_minorvers;
 				memcpy(rcp->sessionid,
 				    dsp->nfsclds_sess.nfsess_sessionid,
 				    NFSX_V4SESSIONID);
 				recon.call = nfsrpc_bindconnsess;
 				recon.arg = rcp;
 				CLNT_CONTROL(nmp->nm_client, CLSET_RECONUPCALL,
 				    &recon);
 			}
 
 			NFSLOCKMNT(nmp);
 			/*
 			 * The old sessions cannot be safely free'd
 			 * here, since they may still be used by
 			 * in-progress RPCs.
 			 */
 			tsep = NULL;
 			if (TAILQ_FIRST(&nmp->nm_sess) != NULL) {
 				/*
 				 * Mark the old session defunct.  Needed
 				 * when called from nfscl_hasexpired().
 				 */
 				tsep = NFSMNT_MDSSESSION(nmp);
 				tsep->nfsess_defunct = 1;
 			}
 			/* The new session becomes the first one in the list. */
 			TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp,
 			    nfsclds_list);
 			/*
 			 * Wake up RPCs waiting for a slot on the
 			 * old session. These will then fail with
 			 * NFSERR_BADSESSION and be retried with the
 			 * new session by nfsv4_setsequence().
 			 * Also wakeup() processes waiting for the
 			 * new session.
 			 */
 			if (tsep != NULL)
 				wakeup(&tsep->nfsess_slots);
 			wakeup(&nmp->nm_sess);
 			NFSUNLOCKMNT(nmp);
 		} else if (dsp != NULL)
 			nfscl_freenfsclds(dsp);
 		if (error == 0 && reclaim == 0) {
 			error = nfsrpc_reclaimcomplete(nmp, cred, p);
 			NFSCL_DEBUG(1, "aft reclaimcomp=%d\n", error);
 			if (error == NFSERR_COMPLETEALREADY ||
 			    error == NFSERR_NOTSUPP)
 				/* Ignore this error. */
 				error = 0;
 		}
 		return (error);
 	} else if (retokp != NULL && *retokp)
 		return (NFSERR_IO);
 	/* From here on: the SetClientID/SetClientIDConfirm sequence. */
 	clp->nfsc_rev = rev++;
 
 	/*
 	 * Allocate a single session structure for NFSv4.0, because some of
 	 * the fields are used by NFSv4.0 although it doesn't do a session.
 	 */
 	dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO);
 	mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 	mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF);
 	NFSLOCKMNT(nmp);
 	TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list);
 	tsep = NFSMNT_MDSSESSION(nmp);
 	NFSUNLOCKMNT(nmp);
 
 	nfscl_reqstart(nd, NFSPROC_SETCLIENTID, nmp, NULL, 0, NULL, NULL, 0, 0,
 	    NULL);
 	/* Two words built from the boot time and nfsc_rev, then the id. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec);
 	*tl = txdr_unsigned(clp->nfsc_rev);
 	(void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen);
 
 	/*
 	 * set up the callback address
 	 */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFS_CALLBCKPROG);
 	/*
 	 * Use the configured callback address when there is one, otherwise
 	 * ask nfscl_getmyip() for a local address.
 	 */
 	callblen = strlen(nfsv4_callbackaddr);
 	if (callblen == 0)
 		cp = nfscl_getmyip(nmp, &a6, &isinet6);
 	if (nfscl_enablecallb && nfs_numnfscbd > 0 &&
 	    (callblen > 0 || cp != NULL)) {
 		/*
 		 * cp2 addresses the two bytes of the port in network byte
 		 * order; they are appended to the address as ".hi.lo".
 		 */
 		port = htons(nfsv4_cbport);
 		cp2 = (u_int8_t *)&port;
 #ifdef INET6
 		/* A ':' in the configured address means it is IPv6. */
 		if ((callblen > 0 &&
 		     strchr(nfsv4_callbackaddr, ':')) || isinet6) {
 			char ip6buf[INET6_ADDRSTRLEN], *ip6add;
 
 			(void) nfsm_strtom(nd, "tcp6", 4);
 			if (callblen == 0) {
 				ip6_sprintf(ip6buf, (struct in6_addr *)cp);
 				ip6add = ip6buf;
 			} else {
 				ip6add = nfsv4_callbackaddr;
 			}
 			snprintf(addr, INET6_ADDRSTRLEN + 9, "%s.%d.%d",
 			    ip6add, cp2[0], cp2[1]);
 		} else
 #endif
 		{
 			(void) nfsm_strtom(nd, "tcp", 3);
 			/* callblen == 0 implies cp != NULL in this branch. */
 			if (callblen == 0)
 				snprintf(addr, INET6_ADDRSTRLEN + 9,
 				    "%d.%d.%d.%d.%d.%d", cp[0], cp[1],
 				    cp[2], cp[3], cp2[0], cp2[1]);
 			else
 				snprintf(addr, INET6_ADDRSTRLEN + 9,
 				    "%s.%d.%d", nfsv4_callbackaddr,
 				    cp2[0], cp2[1]);
 		}
 		(void) nfsm_strtom(nd, addr, strlen(addr));
 	} else {
 		/* No usable callback path: send an all zeros address. */
 		(void) nfsm_strtom(nd, "tcp", 3);
 		(void) nfsm_strtom(nd, "0.0.0.0.0.0", 11);
 	}
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(clp->nfsc_cbident);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 		NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 	    /* The reply holds the client id and the confirm verifier. */
 	    NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	    tsep->nfsess_clientid.lval[0] = *tl++;
 	    tsep->nfsess_clientid.lval[1] = *tl++;
 	    confirm.lval[0] = *tl++;
 	    confirm.lval[1] = *tl;
 	    /* Done with the first reply; nd is reused for the confirm. */
 	    m_freem(nd->nd_mrep);
 	    nd->nd_mrep = NULL;
 
 	    /*
 	     * and confirm it.
 	     */
 	    nfscl_reqstart(nd, NFSPROC_SETCLIENTIDCFRM, nmp, NULL, 0, NULL,
 		NULL, 0, 0, NULL);
 	    NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	    *tl++ = tsep->nfsess_clientid.lval[0];
 	    *tl++ = tsep->nfsess_clientid.lval[1];
 	    *tl++ = confirm.lval[0];
 	    *tl = confirm.lval[1];
 	    nd->nd_flag |= ND_USEGSSNAME;
 	    error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
 		cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	    if (error)
 		return (error);
 	    /* Cleared so that the m_freem() at nfsmout is harmless. */
 	    m_freem(nd->nd_mrep);
 	    nd->nd_mrep = NULL;
 	}
 	error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs getattr call.
  *
  * Fetch the attributes of vp into *nap.  If the mount is flagged
  * NFSMNTP_FAKEROOTFH and has no file handle size yet, the real root file
  * handle is looked up first; EACCES is returned when that fails.
  * Returns 0, the RPC error, the reply status or the attribute parsing
  * error.
  */
 int
 nfsrpc_getattr(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *nap)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *mnt = VFSTONFS(vp->v_mount);
 	struct nfsnode *node = VTONFS(vp);
 	nfsattrbit_t wanted;
 	int error;
 
 	if (mnt->nm_fhsize == 0 &&
 	    (mnt->nm_privflag & NFSMNTP_FAKEROOTFH) != 0) {
 		/* Attempt to get the actual root file handle. */
 		if (nfsrpc_getdirpath(mnt, NFSMNT_DIRPATH(mnt), cred, p) != 0)
 			return (EACCES);
 		if (node->n_fhp->nfh_len == NFSX_FHMAX + 1)
 			nfscl_statfs(vp, cred, p);
 	}
 
 	NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp, cred);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* NFSv4 needs the list of attributes being asked for. */
 		NFSGETATTR_ATTRBIT(&wanted);
 		(void) nfsrv_putattrbit(nd, &wanted);
 	}
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else
 		error = nfsm_loadattr(nd, nap);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs getattr call with non-vnode arguments.
  *
  * The file is identified by the file handle fhp/fhlen on mount nmp.  For
  * NFSv4 the lease time attribute is requested as well and handed back
  * through leasep.  A non-zero syscred sets ND_USEGSSNAME on the request;
  * xidp is passed on to newnfs_request().  Returns 0, the RPC error, the
  * reply status or the attribute parsing error.
  */
 int
 nfsrpc_getattrnovp(struct nfsmount *nmp, u_int8_t *fhp, int fhlen, int syscred,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, u_int64_t *xidp,
     uint32_t *leasep)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t wanted;
 	int error, vers;
 
 	nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, fhp, fhlen, NULL, NULL, 0, 0,
 	    cred);
 	/* Choose the RPC version that matches the request's flags. */
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		vers = NFS_VER4;
 		NFSGETATTR_ATTRBIT(&wanted);
 		NFSSETBIT_ATTRBIT(&wanted, NFSATTRBIT_LEASETIME);
 		(void) nfsrv_putattrbit(nd, &wanted);
 	} else if ((nd->nd_flag & ND_NFSV3) != 0) {
 		vers = NFS_VER3;
 	} else {
 		vers = NFS_VER2;
 	}
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, vers, NULL, 1, xidp, NULL);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else if ((nd->nd_flag & ND_NFSV4) == 0)
 		error = nfsm_loadattr(nd, nap);
 	else
 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 		    NULL, NULL, NULL, NULL, NULL, 0, NULL, leasep, NULL,
 		    NULL, NULL);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do an nfs setattr operation.
  *
  * When vap is not NULL the attributes in it are set via
  * nfsrpc_setattrrpc(), with the resulting attributes returned through
  * rnap/attrflagp; when vap is NULL the ACL in aclp is set via
  * nfsrpc_setaclrpc().
  *
  * For NFSv4 a stateid is acquired first.  If none can be found for a
  * regular file, the file is opened temporarily (always when the size is
  * being set, otherwise only if nfstest_openallsetattr is set) and closed
  * again after the RPC.
  *
  * The operation is retried for the errors listed in the loop condition.
  * Returns 0 or an error.
  */
 int
 nfsrpc_setattr(vnode_t vp, struct vattr *vap, NFSACL_T *aclp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp)
 {
 	int error, expireret = 0, openerr, retrycnt;
 	u_int32_t clidrev = 0, mode;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsfh *nfhp;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	/* Setting the size needs a stateid for write access. */
 	if (vap != NULL && NFSATTRISSET(u_quad_t, vap, va_size))
 		mode = NFSV4OPEN_ACCESSWRITE;
 	else
 		mode = NFSV4OPEN_ACCESSREAD;
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		/* Non-zero means no temporary open needs closing below. */
 		openerr = 1;
 		if (NFSHASNFSV4(nmp)) {
 			nfhp = VTONFS(vp)->n_fhp;
 			error = nfscl_getstateid(vp, nfhp->nfh_fh,
 			    nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp);
 			if (error && vp->v_type == VREG &&
 			    (mode == NFSV4OPEN_ACCESSWRITE ||
 			     nfstest_openallsetattr)) {
 				/*
 				 * No Open stateid, so try and open the file
 				 * now.
 				 */
 				if (mode == NFSV4OPEN_ACCESSWRITE)
 					openerr = nfsrpc_open(vp, FWRITE, cred,
 					    p);
 				else
 					openerr = nfsrpc_open(vp, FREAD, cred,
 					    p);
 				/* Fetch the stateid of the new open. */
 				if (!openerr)
 					(void) nfscl_getstateid(vp,
 					    nfhp->nfh_fh, nfhp->nfh_len,
 					    mode, 0, cred, p, &stateid, &lckp);
 			}
 		}
 		if (vap != NULL)
 			error = nfsrpc_setattrrpc(vp, vap, &stateid, cred, p,
 			    rnap, attrflagp);
 		else
 			error = nfsrpc_setaclrpc(vp, cred, p, aclp, &stateid);
 		/*
 		 * Note in the mount state that the server replied with
 		 * NFSERR_OPENMODE when a read access stateid was used.
 		 */
 		if (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD) {
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_OPENMODE;
 			NFSUNLOCKMNT(nmp);
 		}
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		/* Drop the reference returned by nfscl_getstateid(). */
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		/* Undo the temporary open done above. */
 		if (!openerr)
 			(void) nfsrpc_close(vp, 0, p);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			/* Pause before the retry done by the loop condition. */
 			(void) nfs_catnap(PZERO, error, "nfs_setattr");
 		} else if ((error == NFSERR_EXPIRED ||
 		    ((!NFSHASINT(nmp) || !NFSHASNFSV4N(nmp)) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp) &&
 		    NFSHASNFSV4N(nmp)) {
 			/*
 			 * NFSERR_BADSTATEID with both NFSHASINT() and
 			 * NFSHASNFSV4N() true is not retried; it is
 			 * reported as EIO.
 			 */
 			error = EIO;
 		}
 		/*
 		 * NOTE(review): retrycnt counts every pass through the loop,
 		 * including those for errors that are retried without limit,
 		 * so the EIO mapping after the loop applies to any error
 		 * left once four or more passes have been made.
 		 */
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4) ||
 	    (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD &&
 	     retrycnt < 4));
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * Build, send and parse a single Setattr RPC for vp.
  *
  * For NFSv4 the caller's stateid is placed in the request and a Getattr
  * operation is appended; for NFSv3 a single newnfs_false word follows the
  * settable attributes.  *attrflagp is cleared on entry and is otherwise only
  * touched by the reply parsing helpers, which are handed rnap as well.
  * Returns the first local error, else the reply status nd_repstat.
  */
 static int
 nfsrpc_setattrrpc(vnode_t vp, struct vattr *vap,
     nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *rnap, int *attrflagp)
 {
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t getbits;
 	u_int32_t *wp;
 	int error;
 
 	*attrflagp = 0;
 
 	/* Request construction. */
 	NFSCL_REQSTART(nd, NFSPROC_SETATTR, vp, cred);
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	vap->va_type = vp->v_type;
 	nfscl_fillsattr(nd, vap, vp, NFSSATTR_FULL, 0);
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0) {
 		/* Both versions append exactly one more word here. */
 		NFSM_BUILD(wp, u_int32_t *, NFSX_UNSIGNED);
 		if ((nd->nd_flag & ND_NFSV3) != 0) {
 			*wp = newnfs_false;
 		} else {
 			*wp = txdr_unsigned(NFSV4OP_GETATTR);
 			NFSGETATTR_ATTRBIT(&getbits);
 			(void) nfsrv_putattrbit(nd, &getbits);
 		}
 	}
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Reply parsing.  nd_flag is re-tested before each step, at the same
 	 * points as the individual helpers are called.
 	 */
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		error = nfscl_wcc_data(nd, vp, rnap, attrflagp, NULL, NULL);
 	if (error == 0 &&
 	    (nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4)
 		error = nfsrv_getattrbits(nd, &getbits, NULL, NULL);
 	if (error == 0 && nd->nd_repstat == 0 &&
 	    (nd->nd_flag & ND_NFSV3) == 0)
 		error = nfscl_postop_attr(nd, rnap, attrflagp);
 	m_freem(nd->nd_mrep);
 
 	/* With no local error, the server's status is the result. */
 	if (error == 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * nfs lookup rpc
  *
  * Looks up "name" ("len" bytes) in the directory dvp.  When a file handle
  * is obtained it is handed back through *nfhpp as newly allocated M_NFSFH
  * memory.
  * - Returns ENOTDIR when dvp is not a VDIR and ENAMETOOLONG when len exceeds
  *   NFS_MAXNAMLEN, before any RPC is built.
  * - On an NFSv4 mount, "." is answered locally with a copy of dvp's file
  *   handle and ".." is done as NFSPROC_LOOKUPP (which also clears openmode).
  * - A non-zero openmode selects NFSPROC_LOOKUPOPEN: Verify(type == VREG) and
  *   Open(CLAIMFH) operations are appended to the compound and, when the
  *   whole compound succeeds, the returned open stateid is recorded through
  *   nfscl_open().  Any delegation issued by that Open is handed straight to
  *   nfscl_trydelegreturn().
  * *attrflagp and *dattrflagp are cleared on entry.  The NFSv4 paths below
  * set them to 1 after loading nap/dnap themselves; elsewhere they are left
  * to nfscl_postop_attr().
  */
 int
 nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap,
     struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, uint32_t openmode)
 {
 	uint32_t deleg, rflags, *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	struct nfsfh *nfhp;
 	nfsattrbit_t attrbits;
 	int error = 0, lookupp = 0, newone, ret, retop;
 	uint8_t own[NFSV4CL_LOCKNAMELEN];
 	struct nfsclopen *op;
 	struct nfscldeleg *ndp;
 	nfsv4stateid_t stateid;
 
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(dvp->v_mount);
 	if (len > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	if (NFSHASNFSV4(nmp) && len == 1 &&
 		name[0] == '.') {
 		/*
 		 * Just return the current dir's fh.
 		 */
 		np = VTONFS(dvp);
 		nfhp = malloc(sizeof (struct nfsfh) +
 			np->n_fhp->nfh_len, M_NFSFH, M_WAITOK);
 		nfhp->nfh_len = np->n_fhp->nfh_len;
 		NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len);
 		*nfhpp = nfhp;
 		return (0);
 	}
 	/* Pick the procedure: Lookupp for "..", LookupOpen, or plain Lookup. */
 	if (NFSHASNFSV4(nmp) && len == 2 &&
 		name[0] == '.' && name[1] == '.') {
 		lookupp = 1;
 		/* No Open is attempted for "..". */
 		openmode = 0;
 		NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, dvp, cred);
 	} else if (openmode != 0) {
 		NFSCL_REQSTART(nd, NFSPROC_LOOKUPOPEN, dvp, cred);
 		nfsm_strtom(nd, name, len);
 	} else {
 		NFSCL_REQSTART(nd, NFSPROC_LOOKUP, dvp, cred);
 		(void) nfsm_strtom(nd, name, len);
 	}
 	if (nd->nd_flag & ND_NFSV4) {
 		/* Append GetFH and Getattr for the looked up object. */
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		if (openmode != 0) {
 			/* Test for a VREG file. */
 			NFSZERO_ATTRBIT(&attrbits);
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE);
 			NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_VERIFY);
 			nfsrv_putattrbit(nd, &attrbits);
 			/* Attribute data: a length word, then the type. */
 			NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSX_UNSIGNED);
 			*tl = vtonfsv34_type(VREG);
 
 			/* Attempt the Open for VREG. */
 			nfscl_filllockowner(NULL, own, F_POSIX);
 			NFSM_BUILD(tl, uint32_t *, 6 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OP_OPEN);
 			*tl++ = 0;		/* seqid, ignored. */
 			*tl++ = txdr_unsigned(openmode | NFSV4OPEN_WANTNODELEG);
 			*tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE);
 			*tl++ = 0;		/* ClientID, ignored. */
 			*tl = 0;
 			nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN);
 			NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE);
 			*tl = txdr_unsigned(NFSV4OPEN_CLAIMFH);
 		}
 	}
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error)
 		return (error);
 	/* ndp is freed unconditionally at nfsmout, so it must start NULL. */
 	ndp = NULL;
 	if (nd->nd_repstat) {
 		/*
 		 * When an NFSv4 Lookupp returns ENOENT, it means that
 		 * the lookup is at the root of an fs, so return this dir.
 		 */
 		if (nd->nd_repstat == NFSERR_NOENT && lookupp) {
 		    np = VTONFS(dvp);
 		    nfhp = malloc(sizeof (struct nfsfh) +
 			np->n_fhp->nfh_len, M_NFSFH, M_WAITOK);
 		    nfhp->nfh_len = np->n_fhp->nfh_len;
 		    NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len);
 		    *nfhpp = nfhp;
 		    m_freem(nd->nd_mrep);
 		    return (0);
 		}
 		if (nd->nd_flag & ND_NFSV3)
 		    error = nfscl_postop_attr(nd, dnap, dattrflagp);
 		else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 		    ND_NFSV4) {
 			/* Load the directory attributes. */
 			error = nfsm_loadattr(nd, dnap);
 			if (error != 0)
 				goto nfsmout;
 			*dattrflagp = 1;
 		}
 		/*
 		 * For a LookupOpen the failure may lie in a later operation
 		 * of the compound, so the reply is walked one operation at a
 		 * time.  Each step dissects two words and gives up unless
 		 * the second one is zero.
 		 */
 		/* Check Lookup operation reply status. */
 		if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) {
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			if (*++tl != 0)
 				goto nfsmout;
 		}
 		/* Look for GetFH reply. */
 		if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) {
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			if (*++tl != 0)
 				goto nfsmout;
 			error = nfsm_getfh(nd, nfhpp);
 			if (error)
 				goto nfsmout;
 		}
 		/* Look for Getattr reply. */
 		if (openmode != 0 && (nd->nd_flag & ND_NOMOREDATA) == 0) {
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			if (*++tl != 0)
 				goto nfsmout;
 			error = nfsm_loadattr(nd, nap);
 			if (error == 0) {
 				/*
 				 * We have now successfully completed the
 				 * lookup, so set nd_repstat to 0.
 				 */
 				nd->nd_repstat = 0;
 				*attrflagp = 1;
 			}
 		}
 		goto nfsmout;
 	}
 	/* nd_repstat == 0 from here on. */
 	if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 		/* Load the directory attributes. */
 		error = nfsm_loadattr(nd, dnap);
 		if (error != 0)
 			goto nfsmout;
 		*dattrflagp = 1;
 		/* Skip over the Lookup and GetFH operation status values. */
 		NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 	}
 	error = nfsm_getfh(nd, nfhpp);
 	if (error)
 		goto nfsmout;
 
 	error = nfscl_postop_attr(nd, nap, attrflagp);
 	if (openmode != 0 && error == 0) {
 		/* Parse the Verify and Open results of the LookupOpen. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID +
 		    10 * NFSX_UNSIGNED);
 		tl += 4;	/* Skip over Verify+Open status. */
 		stateid.seqid = *tl++;
 		stateid.other[0] = *tl++;
 		stateid.other[1] = *tl++;
 		stateid.other[2] = *tl;
 		/* tl still points at other[2]; rflags is six words on. */
 		rflags = fxdr_unsigned(uint32_t, *(tl + 6));
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error != 0)
 			goto nfsmout;
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(uint32_t, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			/*
 			 * Just need to fill in the fields used by
 			 * nfscl_trydelegreturn().
 			 * Mark the mount point as acquiring
 			 * delegations, so NFSPROC_LOOKUPOPEN will
 			 * no longer be done.
 			 */
 			NFSLOCKMNT(nmp);
 			nmp->nm_privflag |= NFSMNTP_DELEGISSUED;
 			NFSUNLOCKMNT(nmp);
 			ndp = malloc(sizeof(struct nfscldeleg) +
 			    (*nfhpp)->nfh_len, M_NFSCLDELEG, M_WAITOK);
 			ndp->nfsdl_fhlen = (*nfhpp)->nfh_len;
 			NFSBCOPY((*nfhpp)->nfh_fh, ndp->nfsdl_fh,
 			    ndp->nfsdl_fhlen);
 			newnfs_copyincred(cred, &ndp->nfsdl_cred);
 			NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID);
 			ndp->nfsdl_stateid.seqid = *tl++;
 			ndp->nfsdl_stateid.other[0] = *tl++;
 			ndp->nfsdl_stateid.other[1] = *tl++;
 			ndp->nfsdl_stateid.other[2] = *tl++;
 		} else if (deleg == NFSV4OPEN_DELEGATENONEEXT &&
 		    NFSHASNFSV4N(nmp)) {
 			/*
 			 * A reason word follows; for CONTENTION and
 			 * RESOURCE one further word is consumed.
 			 */
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			deleg = fxdr_unsigned(uint32_t, *tl);
 			if (deleg == NFSV4OPEN_CONTENTION ||
 			    deleg == NFSV4OPEN_RESOURCE)
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/*
 		 * Get the open structure for this file handle and store
 		 * the stateid returned by the server in it.
 		 * NOTE(review): when nfscl_open() fails, "error" stays 0 and
 		 * the lookup itself still succeeds - confirm this is the
 		 * intent.
 		 */
 		ret = nfscl_open(dvp, (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len,
 		    openmode, 0, cred, p, NULL, &op, &newone, &retop, 1, true);
 		if (ret != 0)
 			goto nfsmout;
 		if (newone != 0) {
 			op->nfso_stateid.seqid = stateid.seqid;
 			op->nfso_stateid.other[0] = stateid.other[0];
 			op->nfso_stateid.other[1] = stateid.other[1];
 			op->nfso_stateid.other[2] = stateid.other[2];
 			op->nfso_mode = openmode;
 		} else {
 			/* Existing open: only the seqid is refreshed. */
 			op->nfso_stateid.seqid = stateid.seqid;
 			if (retop == NFSCLOPEN_DOOPEN)
 				op->nfso_mode |= openmode;
 		}
 		if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) != 0 ||
 		    nfscl_assumeposixlocks)
 			op->nfso_posixlock = 1;
 		else
 			op->nfso_posixlock = 0;
 		nfscl_openrelease(nmp, op, 0, 0);
 		if (ndp != NULL) {
 			/*
 			 * Since we do not have the vnode, we
 			 * cannot invalidate cached attributes.
 			 * Just return the delegation.
 			 */
 			nfscl_trydelegreturn(ndp, cred, nmp, p);
 		}
 	}
 	if ((nd->nd_flag & ND_NFSV3) && !error)
 		error = nfscl_postop_attr(nd, dnap, dattrflagp);
 nfsmout:
 	/* Common exit: drop the reply and the temporary delegation. */
 	m_freem(nd->nd_mrep);
 	if (!error && nd->nd_repstat)
 		error = nd->nd_repstat;
 	free(ndp, M_NFSCLDELEG);
 	return (error);
 }
 
 /*
  * Do a readlink rpc.
  *
  * Copies the link text from the reply into uiop.  For NFSv4 a Getattr
  * operation is appended to the request.  *attrflagp is cleared on entry;
  * attribute parsing is delegated to nfscl_postop_attr() with nap.
  * Returns 0, a request/parse error, or the reply status nd_repstat.
  */
 int
 nfsrpc_readlink(vnode_t vp, struct uio *uiop, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsnode *np = VTONFS(vp);
 	nfsattrbit_t attrbits;
 	int error, len, cangetattr = 1;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_READLINK, vp, cred);
 	if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * And do a Getattr op.
 		 */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	error = nfscl_request(nd, vp, p, cred);
 	if (error)
 		return (error);
 	/* For NFSv3 the attributes are parsed before the link data. */
 	if (nd->nd_flag & ND_NFSV3)
 		error = nfscl_postop_attr(nd, nap, attrflagp);
 	if (!nd->nd_repstat && !error) {
 		/* Fetch the link length, bounded by NFS_MAXPATHLEN. */
 		NFSM_STRSIZ(len, NFS_MAXPATHLEN);
 		/*
 		 * This seems weird to me, but must have been added to
 		 * FreeBSD for some reason. The only thing I can think of
 		 * is that there was/is some server that replies with
 		 * more link data than it should?
 		 */
 		if (len == NFS_MAXPATHLEN) {
 			NFSLOCKNODE(np);
 			if (np->n_size > 0 && np->n_size < NFS_MAXPATHLEN) {
 				len = np->n_size;
 				/*
 				 * Fewer bytes than the reply holds will be
 				 * copied, so the trailing NFSv4 Getattr
 				 * reply is not parsed below.
 				 */
 				cangetattr = 0;
 			}
 			NFSUNLOCKNODE(np);
 		}
 		error = nfsm_mbufuio(nd, uiop, len);
 		if ((nd->nd_flag & ND_NFSV4) && !error && cangetattr)
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 	}
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Read operation.
  *
  * Wrapper around nfsrpc_readrpc() that, for NFSv4 mounts, obtains a stateid
  * for read access before each attempt and retries the RPC while the error
  * is one of the recoverable ones tested in the loop condition below.
  * Once the loop ends with an error after four or more attempts, that error
  * is reported as EIO.
  */
 int
 nfsrpc_read(vnode_t vp, struct uio *uiop, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	int error, expireret = 0, retrycnt;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *newcred;
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	/* Remember the clientid revision for nfscl_hasexpired() below. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	newcred = cred;
 	if (NFSHASNFSV4(nmp)) {
 		nfhp = np->n_fhp;
 		/* Released with NFSFREECRED() before returning. */
 		newcred = NFSNEWCRED(cred);
 	}
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		/* The return value is deliberately ignored here. */
 		if (NFSHASNFSV4(nmp))
 			(void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 			    NFSV4OPEN_ACCESSREAD, 0, newcred, p, &stateid,
 			    &lckp);
 		error = nfsrpc_readrpc(vp, uiop, newcred, &stateid, p, nap,
 		    attrflagp);
 		if (error == NFSERR_OPENMODE) {
 			/* Record the failure in the mount's state flags. */
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_OPENMODE;
 			NFSUNLOCKMNT(nmp);
 		}
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			/* Presumably a short sleep before the retry. */
 			(void) nfs_catnap(PZERO, error, "nfs_read");
 		} else if ((error == NFSERR_EXPIRED ||
 		    ((!NFSHASINT(nmp) || !NFSHASNFSV4N(nmp)) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp) &&
 		    NFSHASNFSV4N(nmp)) {
 			/* EIO is not a retry condition; the loop ends. */
 			error = EIO;
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4) ||
 	    (error == NFSERR_OPENMODE && retrycnt < 4));
 	if (error && retrycnt >= 4)
 		error = EIO;
 	if (NFSHASNFSV4(nmp))
 		NFSFREECRED(newcred);
 	return (error);
 }
 
 /*
  * The actual read RPC.
  *
  * Reads uiop->uio_resid bytes starting at uiop->uio_offset, issuing one
  * Read RPC per chunk of at most nm_rsize bytes and copying each reply's
  * data into uiop.  Returns EFBIG, without doing any RPC, when the end
  * offset exceeds nm_maxfilesize or wraps.  The loop stops early at EOF or
  * on a zero length reply (NFSv3/4), or on a short reply (NFSv2).
  * *attrflagp is cleared for every chunk; nap is filled in for NFSv2/NFSv3
  * only.
  */
 static int
 nfsrpc_readrpc(vnode_t vp, struct uio *uiop, struct ucred *cred,
     nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap,
     int *attrflagp)
 {
 	u_int32_t *tl;
 	int error = 0, len, retlen, tsiz, eof = 0;
 	struct nfsrv_descript nfsd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsrv_descript *nd = &nfsd;
 	int rsize;
 	off_t tmp_off;
 
 	*attrflagp = 0;
 	tsiz = uiop->uio_resid;
 	tmp_off = uiop->uio_offset + tsiz;
 	NFSLOCKMNT(nmp);
 	/* The second test catches the end offset wrapping around. */
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) {
 		NFSUNLOCKMNT(nmp);
 		return (EFBIG);
 	}
 	rsize = nmp->nm_rsize;
 	NFSUNLOCKMNT(nmp);
 	/* Lets nfsmout tell whether a reply is still held. */
 	nd->nd_mrep = NULL;
 	while (tsiz > 0) {
 		*attrflagp = 0;
 		len = (tsiz > rsize) ? rsize : tsiz;
 		NFSCL_REQSTART(nd, NFSPROC_READ, vp, cred);
 		if (nd->nd_flag & ND_NFSV4)
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 		/* Three words: 32-bit offset/count/0, or hyper offset/count. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED * 3);
 		if (nd->nd_flag & ND_NFSV2) {
 			*tl++ = txdr_unsigned(uiop->uio_offset);
 			*tl++ = txdr_unsigned(len);
 			*tl = 0;
 		} else {
 			txdr_hyper(uiop->uio_offset, tl);
 			*(tl + 2) = txdr_unsigned(len);
 		}
 		/*
 		 * Since I can't do a Getattr for NFSv4 for Write, there
 		 * doesn't seem any point in doing one here, either.
 		 * (See the comment in nfsrpc_writerpc() for more info.)
 		 */
 		error = nfscl_request(nd, vp, p, cred);
 		if (error)
 			return (error);
 		if (nd->nd_flag & ND_NFSV3) {
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 		} else if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV2)) {
 			error = nfsm_loadattr(nd, nap);
 			if (!error)
 				*attrflagp = 1;
 		}
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		/* NFSv3 has one word before eof; NFSv4 has eof alone. */
 		if (nd->nd_flag & ND_NFSV3) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *(tl + 1));
 		} else if (nd->nd_flag & ND_NFSV4) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 		}
 		/* Length of the returned data, bounded by what was asked. */
 		NFSM_STRSIZ(retlen, len);
 		error = nfsm_mbufuio(nd, uiop, retlen);
 		if (error)
 			goto nfsmout;
 		m_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 		tsiz -= retlen;
 		if (!(nd->nd_flag & ND_NFSV2)) {
 			if (eof || retlen == 0)
 				tsiz = 0;
 		} else if (retlen < len)
 			tsiz = 0;
 	}
 	return (0);
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs write operation
  * When called_from_strategy != 0, it should return EIO for an error that
  * indicates recovery is in progress, so that the buffer will be left
  * dirty and be written back to the server later. If it loops around,
  * the recovery thread could get stuck waiting for the buffer and recovery
  * will then deadlock.
  *
  * Wrapper around nfsrpc_writerpc().  For NFSv4 mounts a stateid for write
  * access is looked up before every attempt; if that stateid is all zeros
  * the write is skipped and 0 is returned.  *must_commit must be in the
  * range 0..2 on entry (asserted).
  */
 int
 nfsrpc_write(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     int called_from_strategy, int ioflag)
 {
 	int error, expireret = 0, retrycnt, nostateid;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np = VTONFS(vp);
 	struct ucred *newcred;
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	KASSERT(*must_commit >= 0 && *must_commit <= 2,
 	    ("nfsrpc_write: must_commit out of range=%d", *must_commit));
 	/* Remember the clientid revision for nfscl_hasexpired() below. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	newcred = cred;
 	if (NFSHASNFSV4(nmp)) {
 		/* Released with NFSFREECRED() before returning. */
 		newcred = NFSNEWCRED(cred);
 		nfhp = np->n_fhp;
 	}
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		nostateid = 0;
 		if (NFSHASNFSV4(nmp)) {
 			(void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 			    NFSV4OPEN_ACCESSWRITE, 0, newcred, p, &stateid,
 			    &lckp);
 			/* An all zeros "other" field means no stateid. */
 			if (stateid.other[0] == 0 && stateid.other[1] == 0 &&
 			    stateid.other[2] == 0) {
 				nostateid = 1;
 				NFSCL_DEBUG(1, "stateid0 in write\n");
 			}
 		}
 
 		/*
 		 * If there is no stateid for NFSv4, it means this is an
 		 * extraneous write after close. Basically a poorly
 		 * implemented buffer cache. Just don't do the write.
 		 */
 		if (nostateid)
 			error = 0;
 		else
 			error = nfsrpc_writerpc(vp, uiop, iomode, must_commit,
 			    newcred, &stateid, p, nap, attrflagp, ioflag);
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			/* Presumably a short sleep before the retry. */
 			(void) nfs_catnap(PZERO, error, "nfs_write");
 		} else if ((error == NFSERR_EXPIRED ||
 		    ((!NFSHASINT(nmp) || !NFSHASNFSV4N(nmp)) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp) &&
 		    NFSHASNFSV4N(nmp)) {
 			/* EIO is not a retry condition; the loop ends. */
 			error = EIO;
 		}
 		retrycnt++;
 	/*
 	 * Unlike nfsrpc_read(), the errors that indicate recovery
 	 * (STALESTATEID, BADSESSION, STALEDONTRECOVER) are only retried when
 	 * not called from strategy; see the comment above the function.
 	 */
 	} while (error == NFSERR_GRACE || error == NFSERR_DELAY ||
 	    ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER) && called_from_strategy == 0) ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/* Map exhausted retries and strategy-time recovery errors to EIO. */
 	if (error != 0 && (retrycnt >= 4 ||
 	    ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER) && called_from_strategy != 0)))
 		error = EIO;
 	if (NFSHASNFSV4(nmp))
 		NFSFREECRED(newcred);
 	return (error);
 }
 
 /*
  * The actual write RPC.
  *
  * Writes uiop->uio_resid bytes starting at uiop->uio_offset, issuing one
  * Write RPC per chunk of at most nm_wsize bytes.  uiop must describe a
  * single iovec (asserted).  Returns EFBIG, without doing any RPC, when the
  * end offset exceeds nm_maxfilesize or wraps.
  *
  * For an IO_APPEND write on an NFSv4 mount without pNFS, the first RPC is
  * done as NFSPROC_APPENDWRITE, which verifies the file size against
  * uio_offset ahead of the Write.  If that verify fails with NFSERR_NOTSAME,
  * the write is redone at the size reported in the wcc attributes.
  *
  * On return through nfsmout, *iomode holds the lowest commitment level
  * reported by any of the RPCs.  *must_commit is set to 1 when the server's
  * write verifier differs from the one saved in the mount, unless it is
  * already 2.  *attrflagp is cleared for every chunk and set to
  * NFS_LATTR_NOSHRINK when nap has been loaded for NFSv2/NFSv4.
  *
  * A reply byte count that is zero or negative is treated as NFSERR_IO.
  */
 static int
 nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode,
     int *must_commit, struct ucred *cred, nfsv4stateid_t *stateidp,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, int ioflag)
 {
 	u_int32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0, len, rlen, commit, committed = NFSWRITE_FILESYNC;
 	int wccflag = 0;
 	int32_t backup;
 	struct nfsrv_descript *nd;
 	nfsattrbit_t attrbits;
 	uint64_t tmp_off;
 	ssize_t tsiz, wsize;
 	bool do_append;
 
 	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
 	*attrflagp = 0;
 	tsiz = uiop->uio_resid;
 	tmp_off = uiop->uio_offset + tsiz;
 	NFSLOCKMNT(nmp);
 	/* The second test catches the end offset wrapping around. */
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) {
 		NFSUNLOCKMNT(nmp);
 		return (EFBIG);
 	}
 	wsize = nmp->nm_wsize;
 	do_append = false;
 	if ((ioflag & IO_APPEND) != 0 && NFSHASNFSV4(nmp) && !NFSHASPNFS(nmp))
 		do_append = true;
 	NFSUNLOCKMNT(nmp);
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK);
 	nd->nd_mrep = NULL;	/* NFSv2 sometimes does a write with */
 	nd->nd_repstat = 0;	/* uio_resid == 0, so the while is not done */
 	while (tsiz > 0) {
 		*attrflagp = 0;
 		len = (tsiz > wsize) ? wsize : tsiz;
 		if (do_append)
 			NFSCL_REQSTART(nd, NFSPROC_APPENDWRITE, vp, cred);
 		else
 			NFSCL_REQSTART(nd, NFSPROC_WRITE, vp, cred);
 		if (nd->nd_flag & ND_NFSV4) {
 			if (do_append) {
 				/*
 				 * Size attribute to be verified against
 				 * uio_offset, followed by the Write op.
 				 */
 				NFSZERO_ATTRBIT(&attrbits);
 				NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
 				nfsrv_putattrbit(nd, &attrbits);
 				NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED +
 				    NFSX_HYPER);
 				*tl++ = txdr_unsigned(NFSX_HYPER);
 				txdr_hyper(uiop->uio_offset, tl); tl += 2;
 				*tl = txdr_unsigned(NFSV4OP_WRITE);
 			}
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+2*NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else if (nd->nd_flag & ND_NFSV3) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+3*NFSX_UNSIGNED);
 			txdr_hyper(uiop->uio_offset, tl);
 			tl += 2;
 			*tl++ = txdr_unsigned(len);
 			*tl++ = txdr_unsigned(*iomode);
 			*tl = txdr_unsigned(len);
 		} else {
 			u_int32_t x;
 
 			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			/*
 			 * Not sure why someone changed this, since the
 			 * RFC clearly states that "beginoffset" and
 			 * "totalcount" are ignored, but it wouldn't
 			 * surprise me if there's a busted server out there.
 			 */
 			/* Set both "begin" and "current" to non-garbage. */
 			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
 			*tl++ = x;      /* "begin offset" */
 			*tl++ = x;      /* "current offset" */
 			x = txdr_unsigned(len);
 			*tl++ = x;      /* total to this offset */
 			*tl = x;        /* size of this write */
 		}
 		/* Append the data; this advances the uio by len. */
 		error = nfsm_uiombuf(nd, uiop, len);
 		if (error != 0) {
 			/* The request was never sent, so free it here. */
 			m_freem(nd->nd_mreq);
 			free(nd, M_TEMP);
 			return (error);
 		}
 		/*
 		 * Although it is tempting to do a normal Getattr Op in the
 		 * NFSv4 compound, the result can be a nearly hung client
 		 * system if the Getattr asks for Owner and/or OwnerGroup.
 		 * It occurs when the client can't map either the Owner or
 		 * Owner_group name in the Getattr reply to a uid/gid. When
 		 * there is a cache miss, the kernel does an upcall to the
 		 * nfsuserd. Then, it can try and read the local /etc/passwd
 		 * or /etc/group file. It can then block in getnewbuf(),
 		 * waiting for dirty writes to be pushed to the NFS server.
 		 * The only reason this doesn't result in a complete
 		 * deadlock, is that the upcall times out and allows
 		 * the write to complete. However, progress is so slow
 		 * that it might just as well be deadlocked.
 		 * As such, we get the rest of the attributes, but not
 		 * Owner or Owner_group.
 		 * nb: nfscl_loadattrcache() needs to be told that these
 		 *     partial attributes from a write rpc are being
 		 *     passed in, via a argument flag.
 		 */
 		if (nd->nd_flag & ND_NFSV4) {
 			NFSWRITEGETATTR_ATTRBIT(&attrbits);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 		}
 		error = nfscl_request(nd, vp, p, cred);
 		if (error) {
 			free(nd, M_TEMP);
 			return (error);
 		}
 		if (nd->nd_repstat) {
 			/*
 			 * In case the rpc gets retried, roll
 			 * the uio fields changed by nfsm_uiombuf()
 			 * back.
 			 */
 			uiop->uio_offset -= len;
 			uiop->uio_resid += len;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base - len;
 			uiop->uio_iov->iov_len += len;
 		}
 		if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 			/* tmp_off is reused for the verify failure below. */
 			error = nfscl_wcc_data(nd, vp, nap, attrflagp,
 			    &wccflag, &tmp_off);
 			if (error)
 				goto nfsmout;
 		}
 		if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 		    (ND_NFSV4 | ND_NOMOREDATA) &&
 		    nd->nd_repstat == NFSERR_NOTSAME && do_append) {
 			/*
 			 * Verify of the file's size failed, so redo the
 			 * write using the file's size as returned in
 			 * the wcc attributes.
 			 */
 			if (tmp_off + tsiz <= nmp->nm_maxfilesize) {
 				do_append = false;
 				uiop->uio_offset = tmp_off;
 				m_freem(nd->nd_mrep);
 				nd->nd_mrep = NULL;
 				continue;
 			} else
 				nd->nd_repstat = EFBIG;
 		}
 		if (!nd->nd_repstat) {
 			if (do_append) {
 				/* Strip off the Write reply status. */
 				do_append = false;
 				NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			}
 			if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 				NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED
 					+ NFSX_VERF);
 				rlen = fxdr_unsigned(int, *tl++);
 				if (rlen <= 0) {
 					/*
 					 * Zero means no progress was made.
 					 * A negative value (a wire count
 					 * above INT_MAX) is bogus and would
 					 * make the partial write rollback
 					 * below move the uio back by more
 					 * than this chunk advanced it.
 					 */
 					error = NFSERR_IO;
 					goto nfsmout;
 				} else if (rlen < len) {
 					/*
 					 * Partial write: give back the
 					 * bytes the server did not take.
 					 */
 					backup = len - rlen;
 					uiop->uio_iov->iov_base =
 					    (char *)uiop->uio_iov->iov_base -
 					    backup;
 					uiop->uio_iov->iov_len += backup;
 					uiop->uio_offset -= backup;
 					uiop->uio_resid += backup;
 					len = rlen;
 				}
 				commit = fxdr_unsigned(int, *tl++);
 
 				/*
 				 * Return the lowest commitment level
 				 * obtained by any of the RPCs.
 				 */
 				if (committed == NFSWRITE_FILESYNC)
 					committed = commit;
 				else if (committed == NFSWRITE_DATASYNC &&
 					commit == NFSWRITE_UNSTABLE)
 					committed = commit;
 				/*
 				 * tl now points at the write verifier.  Save
 				 * the first one seen; flag a later change.
 				 */
 				NFSLOCKMNT(nmp);
 				if (!NFSHASWRITEVERF(nmp)) {
 					NFSBCOPY((caddr_t)tl,
 					    (caddr_t)&nmp->nm_verf[0],
 					    NFSX_VERF);
 					NFSSETWRITEVERF(nmp);
 				} else if (NFSBCMP(tl, nmp->nm_verf,
 				    NFSX_VERF) && *must_commit != 2) {
 					*must_commit = 1;
 					NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 				}
 				NFSUNLOCKMNT(nmp);
 			}
 			/* Two words precede the NFSv4 Getattr attributes. */
 			if (nd->nd_flag & ND_NFSV4)
 				NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (nd->nd_flag & (ND_NFSV2 | ND_NFSV4)) {
 				error = nfsm_loadattr(nd, nap);
 				if (!error)
 					*attrflagp = NFS_LATTR_NOSHRINK;
 			}
 		} else {
 			error = nd->nd_repstat;
 		}
 		if (error)
 			goto nfsmout;
 		NFSWRITERPC_SETTIME(wccflag, np, nap, (nd->nd_flag & ND_NFSV4));
 		m_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 		tsiz -= len;
 	}
 nfsmout:
 	/* Also reached by falling out of the loop on success. */
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	*iomode = committed;
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 	free(nd, M_TEMP);
 	return (error);
 }
 
 /*
  * Do an nfs deallocate operation.
  *
  * Wrapper around nfsrpc_deallocaterpc() for the range offs/len of vp.
  * A stateid for write access is looked up before every attempt; when none
  * is found the file is opened for writing for the duration of the RPC and
  * closed again afterwards.  The RPC is retried while the error is one of
  * the recoverable ones tested in the loop condition below.  Once the loop
  * ends with an error after four or more attempts, that error is reported
  * as EIO.
  */
 int
 nfsrpc_deallocate(vnode_t vp, off_t offs, off_t len, struct nfsvattr *nap,
     int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	int error, expireret = 0, openerr, retrycnt;
 	uint32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsfh *nfhp;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	/* Remember the clientid revision for nfscl_hasexpired() below. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		/* Non-zero means "no temporary open to close". */
 		openerr = 1;
 		nfhp = VTONFS(vp)->n_fhp;
 		error = nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 		    NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp);
 		if (error != 0) {
 			/*
 			 * No Open stateid, so try and open the file
 			 * now.
 			 */
 			openerr = nfsrpc_open(vp, FWRITE, cred, p);
 			if (openerr == 0)
 				nfscl_getstateid(vp, nfhp->nfh_fh,
 				    nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0,
 				    cred, p, &stateid, &lckp);
 		}
 		/* The RPC is attempted even if no stateid was obtained. */
 		error = nfsrpc_deallocaterpc(vp, offs, len, &stateid, nap,
 		    attrflagp, cred, p);
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		/* Undo the temporary open done above. */
 		if (openerr == 0)
 			nfsrpc_close(vp, 0, p);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			/* Presumably a short sleep before the retry. */
 			(void) nfs_catnap(PZERO, error, "nfs_deallocate");
 		} else if ((error == NFSERR_EXPIRED || (!NFSHASINT(nmp) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp)) {
 			/* EIO is not a retry condition; the loop ends. */
 			error = EIO;
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The actual deallocate RPC.
  *
  * Sends one Deallocate for the range offs/len of vp using the supplied
  * stateid, followed by a Getattr.  *attrflagp is cleared on entry and set
  * to NFS_LATTR_NOSHRINK once nap has been loaded from a successful reply.
  * Returns the first local error, else the reply status nd_repstat.
  */
 static int
 nfsrpc_deallocaterpc(vnode_t vp, off_t offs, off_t len,
     nfsv4stateid_t *stateidp, struct nfsvattr *nap, int *attrflagp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsnode *np = VTONFS(vp);
 	nfsattrbit_t wrbits;
 	uint32_t *wp;
 	int error, gotwcc;
 
 	*attrflagp = 0;
 
 	/* Request: stateid, offset, length, then the Getattr op. */
 	NFSCL_REQSTART(nd, NFSPROC_DEALLOCATE, vp, cred);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	NFSM_BUILD(wp, uint32_t *, 2 * NFSX_HYPER);
 	txdr_hyper(offs, &wp[0]);
 	txdr_hyper(len, &wp[2]);
 	NFSWRITEGETATTR_ATTRBIT(&wrbits);
 	NFSM_BUILD(wp, uint32_t *, NFSX_UNSIGNED);
 	*wp = txdr_unsigned(NFSV4OP_GETATTR);
 	nfsrv_putattrbit(nd, &wrbits);
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	/* Reply: wcc data first, attributes only when the op succeeded. */
 	gotwcc = 0;
 	error = nfscl_wcc_data(nd, vp, nap, attrflagp, &gotwcc, NULL);
 	if (error == 0 && nd->nd_repstat == 0) {
 		NFSM_DISSECT(wp, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = NFS_LATTR_NOSHRINK;
 	}
 	if (error == 0) {
 		NFSWRITERPC_SETTIME(gotwcc, np, nap, 1);
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	/* With no local error, the server's status is the result. */
 	if (error == 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  *
  * Creates "name" ("namelen" bytes) of type vtyp in directory dvp.
  * Returns ENAMETOOLONG, before any RPC is built, when namelen exceeds
  * NFS_MAXNAMLEN.  *nfhpp is set to NULL and *attrflagp/*dattrflagp are
  * cleared on entry; the new object's file handle and attributes (nnap) are
  * parsed by nfscl_mtofh() and the directory's (dnap) by nfscl_wcc_data().
  */
 int
 nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     u_int32_t rdev, __enum_uint8(vtype) vtyp, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp)
 {
 	u_int32_t *tl;
 	int error = 0;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*nfhpp = NULL;
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	NFSCL_REQSTART(nd, NFSPROC_MKNOD, dvp, cred);
 	/* NFSv4: the type (plus major/minor for devices) precedes the name. */
 	if (nd->nd_flag & ND_NFSV4) {
 		if (vtyp == VBLK || vtyp == VCHR) {
 			NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			*tl++ = vtonfsv34_type(vtyp);
 			*tl++ = txdr_unsigned(NFSMAJOR(rdev));
 			*tl = txdr_unsigned(NFSMINOR(rdev));
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = vtonfsv34_type(vtyp);
 		}
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	/* NFSv3: the type follows the name. */
 	if (nd->nd_flag & ND_NFSV3) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = vtonfsv34_type(vtyp);
 	}
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4))
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0);
 	/* NFSv3: major/minor come after the attributes. */
 	if ((nd->nd_flag & ND_NFSV3) &&
 	    (vtyp == VCHR || vtyp == VBLK)) {
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSMAJOR(rdev));
 		*tl = txdr_unsigned(NFSMINOR(rdev));
 	}
 	/* NFSv4: append GetFH and Getattr for the new object. */
 	if (nd->nd_flag & ND_NFSV4) {
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 	/* NFSv2: rdev is passed through the attributes (see above). */
 	if (nd->nd_flag & ND_NFSV2)
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZERDEV, rdev);
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error)
 		return (error);
 	/*
 	 * NFSv4 has the directory wcc data ahead of the new object's
 	 * results; NFSv3 has it after them (see below).
 	 * NOTE(review): on success the error from this call is overwritten
 	 * by the parsing that follows - confirm that is intended.
 	 */
 	if (nd->nd_flag & ND_NFSV4)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	if (!nd->nd_repstat) {
 		if (nd->nd_flag & ND_NFSV4) {
 			/* Skip five words, then the returned attribute bits. */
 			NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 			if (error)
 				goto nfsmout;
 		}
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error)
 			goto nfsmout;
 	}
 	if (nd->nd_flag & ND_NFSV3)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	if (!error && nd->nd_repstat)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs file create call
  * Mostly just call the appropriate routine for the mount's protocol
  * version.  NFSv2/3 is a single RPC done by nfsrpc_createv23().  NFSv4 is
  * kept apart so that its error recovery (get an open owner, do the
  * Open/Create, release the owner and retry for the recoverable errors)
  * is not tangled up with the v2/3 case.
  * Returns 0 or an error; EIO is returned if the expired/bad stateid
  * retry count reaches 4 and the last attempt still failed.
  */
 int
 nfsrpc_create(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp)
 {
 	struct nfsmount *nmp = VFSTONFS(dvp->v_mount);
 	struct nfsclowner *owp;
 	struct nfscldeleg *dp;
 	u_int32_t clidrev;
 	int error = 0, expireret = 0, newone, retrycnt = 0, unlocked;
 	bool staterr, transient, trylayout;
 
 	/* No open state is needed for NFSv2/3, so just do the RPC. */
 	if (!NFSHASNFSV4(nmp))
 		return (nfsrpc_createv23(dvp, name, namelen, vap, cverf,
 		    fmode, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp));
 
 	for (;;) {
 		dp = NULL;
 		/* Get the open owner to be used for this attempt. */
 		error = nfscl_open(dvp, NULL, 0, (NFSV4OPEN_ACCESSWRITE |
 		    NFSV4OPEN_ACCESSREAD), 0, cred, p, &owp, NULL, &newone,
 		    NULL, 1, true);
 		if (error != 0)
 			return (error);
 		clidrev = 0;
 		if (nmp->nm_clp != NULL)
 			clidrev = nmp->nm_clp->nfsc_clientidrev;
 
 		/*
 		 * The variant that also gets a layout is only tried on the
 		 * first attempt, for a pNFS mount with nfscl_enablecallb
 		 * and nfs_numnfscbd both non-zero.
 		 */
 		trylayout = NFSHASPNFS(nmp) && nfscl_enablecallb != 0 &&
 		    nfs_numnfscbd != 0 && retrycnt <= 0;
 		if (trylayout)
 			error = nfsrpc_getcreatelayout(dvp, name, namelen, vap,
 			    cverf, fmode, owp, &dp, cred, p, dnap, nnap, nfhpp,
 			    attrflagp, dattrflagp, &unlocked);
 		else
 			error = nfsrpc_createv4(dvp, name, namelen, vap, cverf,
 			    fmode, owp, &dp, cred, p, dnap, nnap, nfhpp,
 			    attrflagp, dattrflagp, &unlocked);
 
 		/*
 		 * Cached attributes need not be invalidated when a
 		 * delegation is recorded, since nfsrpc_createv4() always
 		 * returns attributes acquired after the delegation was
 		 * issued and those update the attribute cache.
 		 */
 		if (dp != NULL)
 			(void) nfscl_deleg(nmp->nm_mountp, owp->nfsow_clp,
 			    (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, &dp);
 		nfscl_ownerrelease(nmp, owp, error, newone, unlocked);
 
 		/* Errors that are always retried, after a short sleep. */
 		transient = (error == NFSERR_GRACE ||
 		    error == NFSERR_STALECLIENTID ||
 		    error == NFSERR_STALEDONTRECOVER ||
 		    error == NFSERR_DELAY || error == NFSERR_BADSESSION);
 		if (transient) {
 			(void) nfs_catnap(PZERO, error, "nfs_open");
 			continue;
 		}
 
 		/* Errors that are retried a limited number of times. */
 		staterr = (error == NFSERR_EXPIRED ||
 		    error == NFSERR_BADSTATEID);
 		if (staterr && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 			retrycnt++;
 		}
 		if (!(staterr && expireret == 0 && clidrev != 0 &&
 		    retrycnt < 4))
 			break;
 	}
 	if (error != 0 && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The create rpc for v2 and 3.
  * Builds and sends one Create RPC for "name" in directory dvp.
  * For NFSv3 the create mode is EXCLUSIVE (with cverf as the verifier)
  * when O_EXCL is set in fmode and UNCHECKED (with the attributes from
  * vap) otherwise.  For NFSv2 only the attributes are sent.
  * On return, *nfhpp/nnap/*attrflagp are as set by nfscl_mtofh() when the
  * server replied with success and, for NFSv3, dnap/*dattrflagp are as set
  * by nfscl_wcc_data().
  */
 static int
 nfsrpc_createv23(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p,
     struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp,
     int *attrflagp, int *dattrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	/* Build the request. */
 	NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp, cred);
 	(void) nfsm_strtom(nd, name, namelen);
 	if ((nd->nd_flag & ND_NFSV3) == 0) {
 		/* Not NFSv3: the name is followed by the attributes. */
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZE0, 0);
 	} else if ((fmode & O_EXCL) != 0) {
 		/* Exclusive create carries a verifier, not attributes. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 		tl[0] = cverf.lval[0];
 		tl[1] = cverf.lval[1];
 	} else {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSCREATE_UNCHECKED);
 		nfscl_fillsattr(nd, vap, dvp, 0, 0);
 	}
 
 	/* Do the RPC and parse the reply. */
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error != 0)
 			goto nfsmout;
 	}
 	if ((nd->nd_flag & ND_NFSV3) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	/* A parse error takes precedence over the server's status. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The create rpc for NFSv4, done as an Open with the create flag set.
  * The compound built here is the Open/Create for "name" in dvp, followed
  * by GetFH and Getattr for the new file and then PutFH and Getattr for
  * the directory, so the reply carries the new file handle, the new file's
  * attributes and the directory's post-op attributes.
  * owp is the open owner to use; its seqid goes in the request.
  * On success, *nfhpp is the new file handle, nnap and dnap are filled in
  * (*attrflagp as set by nfscl_mtofh(), *dattrflagp set to 1) and *dpp is
  * the delegation issued by the server, or NULL if there was none.
  * *unlockedp is set to 1 once nfscl_openrelease() has been called and is
  * 0 otherwise.
  * Returns 0 or an error.  A delegation structure allocated here is freed
  * when an error is returned.
  */
 static int
 nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, int *unlockedp)
 {
 	u_int32_t *tl;
 	int error = 0, deleg, newone, ret, acesize, limitby;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclopen *op;
 	struct nfscldeleg *dp = NULL;
 	struct nfsnode *np;
 	struct nfsfh *nfhp;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t stateid;
 	u_int32_t rflags;
 	struct nfsmount *nmp;
 	struct nfsclsession *tsep;
 
 	nmp = VFSTONFS(dvp->v_mount);
 	np = VTONFS(dvp);
 	/* Set all of the return arguments to their "nothing" values. */
 	*unlockedp = 0;
 	*nfhpp = NULL;
 	*dpp = NULL;
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp, cred);
 	/*
 	 * For V4, this is actually an Open op.
 	 * The first five words are the owner seqid, the share access (plus
 	 * any delegation "want" bits), the share deny and the two words of
 	 * the clientid.
 	 */
 	NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(owp->nfsow_seqid);
 	if (NFSHASNFSV4N(nmp)) {
 		/*
 		 * Ask for a write delegation only when this is not a pNFS
 		 * mount and nfscl_enablecallb and nfs_numnfscbd are both
 		 * non-zero. Otherwise, ask for no delegation.
 		 */
 		if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 &&
 		    nfs_numnfscbd > 0)
 			*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 			    NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG);
 		else
 			*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 			    NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG);
 	} else
 		*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 		    NFSV4OPEN_ACCESSREAD);
 	*tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE);
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	(void) nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	/* Next comes the open type (create) and the create mode. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_CREATE);
 	if (fmode & O_EXCL) {
 		if (NFSHASNFSV4N(nmp)) {
 			if (NFSHASSESSPERSIST(nmp)) {
 				/* Use GUARDED for persistent sessions. */
 				*tl = txdr_unsigned(NFSCREATE_GUARDED);
 				nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE,
 				    0);
 			} else {
 				/*
 				 * Otherwise, use EXCLUSIVE4_1, which takes
 				 * both the verifier and the attributes.
 				 */
 				*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 				*tl++ = cverf.lval[0];
 				*tl = cverf.lval[1];
 				nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE,
 				    0);
 			}
 		} else {
 			/* NFSv4.0: the verifier only, no attributes. */
 			*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 			*tl++ = cverf.lval[0];
 			*tl = cverf.lval[1];
 		}
 	} else {
 		*tl = txdr_unsigned(NFSCREATE_UNCHECKED);
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0);
 	}
 	/* The claim is by name (CLAIM_NULL), relative to the directory. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 	(void) nfsm_strtom(nd, name, namelen);
 	/* Get the new file's handle and attributes. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	/* Get the directory's post-op attributes. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	(void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error)
 		return (error);
 	/*
 	 * NOTE(review): NFSCL_INCRSEQID() is defined elsewhere; presumably it
 	 * advances the open owner's seqid when the reply calls for it.
 	 */
 	NFSCL_INCRSEQID(owp->nfsow_seqid, nd);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * The Open reply starts with the open stateid followed by
 		 * six more words. Only the last of those, the result flags,
 		 * is used here; the words in between are skipped
 		 * (presumably the directory change info - confirm against
 		 * the OPEN4resok definition).
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		stateid.seqid = *tl++;
 		stateid.other[0] = *tl++;
 		stateid.other[1] = *tl++;
 		stateid.other[2] = *tl;
 		/*
 		 * tl was left at the last stateid word, so tl + 6 is the
 		 * sixth word after the stateid.
 		 */
 		rflags = fxdr_unsigned(u_int32_t, *(tl + 6));
 		/* The attribute bitmap in the reply is parsed and discarded. */
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error)
 			goto nfsmout;
 		/* Now the delegation type, if any. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(int, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			/* Note that this server does issue delegations. */
 			if (!(owp->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				owp->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/*
 			 * Allocate the delegation with room after it for a
 			 * file handle of up to NFSX_V4FHMAX bytes, which is
 			 * copied in once the file handle has been parsed.
 			 */
 			dp = malloc(
 			    sizeof (struct nfscldeleg) + NFSX_V4FHMAX,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&dp->nfsdl_owner);
 			LIST_INIT(&dp->nfsdl_lock);
 			dp->nfsdl_clp = owp->nfsow_clp;
 			newnfs_copyincred(cred, &dp->nfsdl_cred);
 			nfscl_lockinit(&dp->nfsdl_rwlock);
 			/* The delegation stateid and the recall flag. */
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			dp->nfsdl_stateid.seqid = *tl++;
 			dp->nfsdl_stateid.other[0] = *tl++;
 			dp->nfsdl_stateid.other[1] = *tl++;
 			dp->nfsdl_stateid.other[2] = *tl++;
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				dp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 * It is either a 64bit size or two 32bit
 				 * values that are multiplied together.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					dp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					dp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					dp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 			} else {
 				dp->nfsdl_flags = NFSCLDL_READ;
 			}
 			/* ret still holds the recall flag parsed above. */
 			if (ret)
 				dp->nfsdl_flags |= NFSCLDL_RECALL;
 			error = nfsrv_dissectace(nd, &dp->nfsdl_ace, false,
 			    &ret, &acesize, p);
 			if (error)
 				goto nfsmout;
 		} else if (deleg == NFSV4OPEN_DELEGATENONEEXT &&
 		    NFSHASNFSV4N(nmp)) {
 			/*
 			 * No delegation, with a reason for it. The reason
 			 * is parsed, along with one more word for the
 			 * contention and resource cases, but not used.
 			 */
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			deleg = fxdr_unsigned(uint32_t, *tl);
 			if (deleg == NFSV4OPEN_CONTENTION ||
 			    deleg == NFSV4OPEN_RESOURCE)
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/* The new file's handle and attributes. */
 		error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error)
 			goto nfsmout;
 		/* Get rid of the PutFH and Getattr status values. */
 		NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 		/* Load the directory attributes. */
 		error = nfsm_loadattr(nd, dnap);
 		if (error)
 			goto nfsmout;
 		*dattrflagp = 1;
 		/* Record the new file's change and mtime in the delegation. */
 		if (dp != NULL && *attrflagp) {
 			dp->nfsdl_change = nnap->na_filerev;
 			dp->nfsdl_modtime = nnap->na_mtime;
 			dp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 		}
 		/*
 		 * We can now complete the Open state.
 		 */
 		nfhp = *nfhpp;
 		if (dp != NULL) {
 			dp->nfsdl_fhlen = nfhp->nfh_len;
 			NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len);
 		}
 		/*
 		 * Get an Open structure that will be
 		 * attached to the OpenOwner, acquired already.
 		 */
 		error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, 
 		    (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0,
 		    cred, p, NULL, &op, &newone, NULL, 0, false);
 		if (error)
 			goto nfsmout;
 		op->nfso_stateid = stateid;
 		newnfs_copyincred(cred, &op->nfso_cred);
 		/*
 		 * If the server asked for the Open to be confirmed, do so,
 		 * retrying for as long as the server replies NFSERR_DELAY.
 		 */
 		if ((rflags & NFSV4OPEN_RESULTCONFIRM)) {
 		    do {
 			ret = nfsrpc_openconfirm(dvp, nfhp->nfh_fh,
 			    nfhp->nfh_len, op, cred, p);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_create");
 		    } while (ret == NFSERR_DELAY);
 		    error = ret;
 		}
 
 		/*
 		 * If the server is handing out delegations, but we didn't
 		 * get one because an OpenConfirm was required, try the
 		 * Open again, to get a delegation. This is a harmless no-op,
 		 * from a server's point of view.
 		 */
 		if ((rflags & NFSV4OPEN_RESULTCONFIRM) &&
 		    (owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) &&
 		    !error && dp == NULL) {
 		    /* A confirm request is not expected for NFSHASNFSV4N(). */
 		    KASSERT(!NFSHASNFSV4N(nmp),
 			("nfsrpc_createv4: result confirm"));
 		    do {
 			ret = nfsrpc_openrpc(VFSTONFS(dvp->v_mount), dvp,
 			    np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
 			    nfhp->nfh_fh, nfhp->nfh_len,
 			    (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), op,
 			    name, namelen, &dp, 0, 0x0, cred, p, 0, 1);
 			if (ret == NFSERR_DELAY)
 			    (void) nfs_catnap(PZERO, ret, "nfs_crt2");
 		    } while (ret == NFSERR_DELAY);
 		    /*
 		     * A failure of this second Open only fails the create
 		     * for the three errors checked for below. For any other
 		     * error, the create succeeds without a delegation.
 		     */
 		    if (ret) {
 			if (dp != NULL) {
 				free(dp, M_NFSCLDELEG);
 				dp = NULL;
 			}
 			if (ret == NFSERR_STALECLIENTID ||
 			    ret == NFSERR_STALEDONTRECOVER ||
 			    ret == NFSERR_BADSESSION)
 				error = ret;
 		    }
 		}
 		nfscl_openrelease(nmp, op, error, newone);
 		/* Tell the caller that nfscl_openrelease() has been done. */
 		*unlockedp = 1;
 	}
 	/* A local error takes precedence over the server's status. */
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(owp->nfsow_clp);
 nfsmout:
 	/* Only hand the delegation back to the caller upon success. */
 	if (!error)
 		*dpp = dp;
 	else if (dp != NULL)
 		free(dp, M_NFSCLDELEG);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Nfs remove rpc
  * Removes "name" from directory dvp; vp is the vnode being removed.
  * For NFSv4, nfscl_removedeleg() is first asked whether a delegation for
  * vp should be returned. When it returns 1, the delegation is returned
  * in the same compound, ahead of the Remove. If that compound fails with
  * ND_NOMOREDATA set, the RPC is redone as a plain Remove.
  * For NFSv3 and NFSv4, dnap and *dattrflagp are as set by
  * nfscl_wcc_data() for the directory.
  */
 int
 nfsrpc_remove(vnode_t dvp, char *name, int namelen, vnode_t vp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	nfsv4stateid_t dstateid;
 	int error, ret = 0, i;
 
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	nmp = VFSTONFS(dvp->v_mount);
 tryagain:
 	/*
 	 * ret is 0 the first time through. When coming back here via
 	 * tryagain, ret is non-zero, so the else case is taken and a
 	 * plain Remove is done.
 	 */
 	if (NFSHASNFSV4(nmp) && ret == 0) {
 		ret = nfscl_removedeleg(vp, p, &dstateid);
 		if (ret == 1) {
 			/*
 			 * Build the delegation stateid, followed by the
 			 * PutFH of the directory and the Remove op.
 			 * For NFSHASNFSV4N(), the stateid seqid is sent
 			 * as 0.
 			 */
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGREMOVE, vp, cred);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = dstateid.seqid;
 			*tl++ = dstateid.other[0];
 			*tl++ = dstateid.other[1];
 			*tl++ = dstateid.other[2];
 			*tl = txdr_unsigned(NFSV4OP_PUTFH);
 			np = VTONFS(dvp);
 			(void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh,
 			    np->n_fhp->nfh_len, 0);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_REMOVE);
 		}
 	} else {
 		ret = 0;
 	}
 	/* No delegation is being returned, so start a plain Remove. */
 	if (ret == 0)
 		NFSCL_REQSTART(nd, NFSPROC_REMOVE, dvp, cred);
 	(void) nfsm_strtom(nd, name, namelen);
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error)
 		return (error);
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 		/* For NFSv4, parse out any Delegreturn replies. */
 		if (ret > 0 && nd->nd_repstat != 0 &&
 		    (nd->nd_flag & ND_NOMOREDATA)) {
 			/*
 			 * If the Delegreturn failed, try again without
 			 * it. The server will Recall, as required.
 			 */
 			m_freem(nd->nd_mrep);
 			goto tryagain;
 		}
 		/*
 		 * Skip over two op/status pairs for each delegation that
 		 * was returned. A non-zero status word means that there
 		 * is nothing more to parse in the reply.
 		 */
 		for (i = 0; i < (ret * 2); i++) {
 			if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 			    ND_NFSV4) {
 			    NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			    if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 			}
 		}
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	}
 	/* A parse error takes precedence over the server's status. */
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do an nfs rename rpc.
  * Renames fnameptr in directory fdvp to tnameptr in directory tdvp.
  * fvp is the vnode being renamed and tvp is the vnode for the
  * destination name.
  * For NFSv4, nfscl_renamedeleg() is asked for the delegation stateids of
  * fvp and tvp. Any that it supplies (gotfd/gottd) are returned in the
  * same compound, ahead of the Rename. If the reply shows that this part
  * of the compound failed, the RPC is redone as a plain Rename.
  * For NFSv3 and NFSv4, fnap/*fattrflagp and tnap/*tattrflagp are as set
  * by nfscl_wcc_data() for the from and to directories, respectively.
  */
 int
 nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen,
     vnode_t tdvp, vnode_t tvp, char *tnameptr, int tnamelen, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *fnap, struct nfsvattr *tnap,
     int *fattrflagp, int *tattrflagp)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t fdstateid, tdstateid;
 	int error = 0, ret = 0, gottd = 0, gotfd = 0, i;
 
 	*fattrflagp = 0;
 	*tattrflagp = 0;
 	nmp = VFSTONFS(fdvp->v_mount);
 	if (fnamelen > NFS_MAXNAMLEN || tnamelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 tryagain:
 	/*
 	 * ret is 0 the first time through. When coming back here via
 	 * tryagain, ret is non-zero, so the else case is taken and a
 	 * plain Rename is done.
 	 */
 	if (NFSHASNFSV4(nmp) && ret == 0) {
 		ret = nfscl_renamedeleg(fvp, &fdstateid, &gotfd, tvp,
 		    &tdstateid, &gottd, p);
 		/*
 		 * Start the compound with the file handle of the first
 		 * vnode whose delegation is being returned.
 		 */
 		if (gotfd && gottd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME2, fvp, cred);
 		} else if (gotfd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, fvp, cred);
 		} else if (gottd) {
 			NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, tvp, cred);
 		}
 		if (gotfd) {
 			/*
 			 * The delegation stateid for fvp.
 			 * For NFSHASNFSV4N(), the seqid is sent as 0.
 			 */
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = fdstateid.seqid;
 			*tl++ = fdstateid.other[0];
 			*tl++ = fdstateid.other[1];
 			*tl = fdstateid.other[2];
 			if (gottd) {
 				/*
 				 * Both are being returned, so add a PutFH
 				 * and DelegReturn for tvp.
 				 */
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_PUTFH);
 				np = VTONFS(tvp);
 				(void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh,
 				    np->n_fhp->nfh_len, 0);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_DELEGRETURN);
 			}
 		}
 		if (gottd) {
 			/* The delegation stateid for tvp. */
 			NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID);
 			if (NFSHASNFSV4N(nmp))
 				*tl++ = 0;
 			else
 				*tl++ = tdstateid.seqid;
 			*tl++ = tdstateid.other[0];
 			*tl++ = tdstateid.other[1];
 			*tl = tdstateid.other[2];
 		}
 		if (ret > 0) {
 			/*
 			 * After the DelegReturn(s), set the current file
 			 * handle to the from directory and save it.
 			 */
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_PUTFH);
 			np = VTONFS(fdvp);
 			(void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh,
 			    np->n_fhp->nfh_len, 0);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_SAVEFH);
 		}
 	} else {
 		ret = 0;
 	}
 	/* No delegations are being returned, so start a plain Rename. */
 	if (ret == 0)
 		NFSCL_REQSTART(nd, NFSPROC_RENAME, fdvp, cred);
 	if (nd->nd_flag & ND_NFSV4) {
 		/*
 		 * Get the attributes of the from directory, then do a
 		 * PutFH of the to directory and get its attributes,
 		 * followed by the Rename op.
 		 */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSWCCATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 		(void)nfsm_fhtom(nmp, nd, VTONFS(tdvp)->n_fhp->nfh_fh,
 		    VTONFS(tdvp)->n_fhp->nfh_len, 0);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_V4WCCATTR;
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_RENAME);
 	}
 	/*
 	 * The from name, then the to name. For NFSv2 and NFSv3, the file
 	 * handle of the to directory goes between them.
 	 */
 	(void) nfsm_strtom(nd, fnameptr, fnamelen);
 	if (!(nd->nd_flag & ND_NFSV4))
 		(void)nfsm_fhtom(nmp, nd, VTONFS(tdvp)->n_fhp->nfh_fh,
 			VTONFS(tdvp)->n_fhp->nfh_len, 0);
 	(void) nfsm_strtom(nd, tnameptr, tnamelen);
 	error = nfscl_request(nd, fdvp, p, cred);
 	if (error)
 		return (error);
 	if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) {
 		/* For NFSv4, parse out any Delegreturn replies. */
 		if (ret > 0 && nd->nd_repstat != 0 &&
 		    (nd->nd_flag & ND_NOMOREDATA)) {
 			/*
 			 * If the Delegreturn failed, try again without
 			 * it. The server will Recall, as required.
 			 */
 			m_freem(nd->nd_mrep);
 			goto tryagain;
 		}
 		/*
 		 * Skip over two op/status pairs for each delegation that
 		 * was returned. A non-zero status word means that there
 		 * is nothing more to parse in the reply.
 		 */
 		for (i = 0; i < (ret * 2); i++) {
 			if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) ==
 			    ND_NFSV4) {
 			    NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			    if (*(tl + 1)) {
 				if (i == 1 && ret > 1) {
 				    /*
 				     * If the Delegreturn failed, try again
 				     * without it. The server will Recall, as
 				     * required.
 				     * If ret > 1, the second iteration of this
 				     * loop is the second DelegReturn result.
 				     */
 				    m_freem(nd->nd_mrep);
 				    goto tryagain;
 				} else {
 				    nd->nd_flag |= ND_NOMOREDATA;
 				}
 			    }
 			}
 		}
 		/* Now, the first wcc attribute reply. */
 		if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 		}
 		error = nfscl_wcc_data(nd, fdvp, fnap, fattrflagp, NULL, NULL);
 		/* and the second wcc attribute reply. */
 		if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 &&
 		    !error) {
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1))
 				nd->nd_flag |= ND_NOMOREDATA;
 		}
 		if (!error)
 			error = nfscl_wcc_data(nd, tdvp, tnap, tattrflagp,
 			    NULL, NULL);
 	}
 	/* A parse error takes precedence over the server's status. */
 	if (nd->nd_repstat && !error)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs hard link create rpc
  * Makes "name" in directory dvp a link to vp.
  * For NFSv4, the compound started for vp is continued with a PutFH of
  * the directory, a Getattr for its pre-op attributes and the Link op.
  * For NFSv3, nap and *attrflagp are as set by nfscl_postop_attr() for vp.
  * For NFSv3 and NFSv4, dnap and *dattrflagp are as set by
  * nfscl_wcc_data() for the directory.
  */
 int
 nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nap, int *attrflagp, int *dattrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsfh *dfhp;
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	/* Build the request. */
 	NFSCL_REQSTART(nd, NFSPROC_LINK, vp, cred);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	}
 	dfhp = VTONFS(dvp)->n_fhp;
 	(void)nfsm_fhtom(VFSTONFS(dvp->v_mount), nd, dfhp->nfh_fh,
 	    dfhp->nfh_len, 0);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSWCCATTR_ATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_V4WCCATTR;
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_LINK);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 
 	/* Do the RPC and parse the reply. */
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	if ((nd->nd_flag & ND_NFSV3) != 0) {
 		error = nfscl_postop_attr(nd, nap, attrflagp);
 		if (error == 0)
 			error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp,
 			    NULL, NULL);
 	} else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) {
 		/*
 		 * Step over the PutFH result and, if its status was ok,
 		 * the Getattr result as well. A non-zero status in the
 		 * last pair parsed means the reply ends there.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (tl[1] == 0)
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (tl[1] != 0)
 			nd->nd_flag |= ND_NOMOREDATA;
 		/* Now get the directory's pre-op attributes. */
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	}
 	/* A parse error takes precedence over the server's status. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs symbolic link create rpc
  * Creates the symbolic link "name" in directory dvp with the contents
  * "target". The order of the arguments in the request depends on the
  * protocol version: NFSv4 puts the type and target ahead of the name,
  * whereas NFSv2 and NFSv3 put the target after the name.
  * For NFSv3, *nfhpp/nnap/*attrflagp are as set by nfscl_mtofh() when the
  * server replied with success. For NFSv3 and NFSv4, dnap/*dattrflagp are
  * as set by nfscl_wcc_data().
  */
 int
 nfsrpc_symlink(vnode_t dvp, char *name, int namelen, const char *target,
     struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(dvp->v_mount);
 	u_int32_t *tl;
 	int error = 0, slen;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	slen = strlen(target);
 	if (namelen > NFS_MAXNAMLEN || slen > NFS_MAXPATHLEN)
 		return (ENAMETOOLONG);
 
 	/* Build the request. */
 	NFSCL_REQSTART(nd, NFSPROC_SYMLINK, dvp, cred);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFLNK);
 		(void) nfsm_strtom(nd, target, slen);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		nfscl_fillsattr(nd, vap, dvp, 0, 0);
 	if ((nd->nd_flag & ND_NFSV4) == 0)
 		(void) nfsm_strtom(nd, target, slen);
 	if ((nd->nd_flag & ND_NFSV2) != 0)
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0);
 
 	/* Do the RPC and parse the reply. */
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error != 0)
 		return (error);
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	if (error == 0 && (nd->nd_flag & ND_NFSV3) != 0) {
 		if (nd->nd_repstat == 0)
 			error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if (error == 0)
 			error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp,
 			    NULL, NULL);
 	}
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
 	 * This is only done when vfs.nfs.ignore_eexist is set and never
 	 * for NFSv4.1 or later minor versions, since sessions should
 	 * guarantee "exactly once" RPC semantics.
 	 */
 	if (error != EEXIST || nfsignore_eexist == 0)
 		return (error);
 	if (NFSHASNFSV4(nmp) && nmp->nm_minorvers != 0)
 		return (error);
 	return (0);
 }
 
 /*
  * nfs make dir rpc
  * Creates the directory "name" in directory dvp with the attributes in
  * vap. For NFSv4, the compound also gets the new directory's file handle
  * and attributes, followed by a PutFH and Getattr for dvp so that its
  * post-op attributes are returned as well.
  * When the server replies with success, *nfhpp/nnap/*attrflagp are as
  * set by nfscl_mtofh(). dnap/*dattrflagp are filled in for NFSv3 and
  * NFSv4.
  */
 int
 nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(dvp->v_mount);
 	struct nfsfh *fhp = VTONFS(dvp)->n_fhp;
 	nfsattrbit_t attrbits;
 	u_int32_t *tl;
 	int error = 0;
 
 	*dattrflagp = 0;
 	*attrflagp = 0;
 	*nfhpp = NULL;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	/* Build the request. */
 	NFSCL_REQSTART(nd, NFSPROC_MKDIR, dvp, cred);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFDIR);
 	}
 	(void) nfsm_strtom(nd, name, namelen);
 	nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1 | NFSSATTR_NEWFILE, 0);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* The new directory's file handle and attributes. */
 		NFSGETATTR_ATTRBIT(&attrbits);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		tl[0] = txdr_unsigned(NFSV4OP_GETFH);
 		tl[1] = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		/* And then the post-op attributes of dvp. */
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_PUTFH);
 		(void)nfsm_fhtom(nmp, nd, fhp->nfh_fh, fhp->nfh_len, 0);
 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_GETATTR);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 	}
 
 	/* Do the RPC and parse the reply. */
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error != 0)
 		return (error);
 	if ((nd->nd_flag & ND_NFSV4) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	if (error == 0 && nd->nd_repstat == 0) {
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			/* Skip five words and the attribute bitmap. */
 			NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		}
 		if (error == 0)
 			error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 		if ((nd->nd_flag & ND_NFSV4) != 0 && error == 0) {
 			/* Get rid of the PutFH and Getattr status values. */
 			NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			/* Load the directory attributes. */
 			error = nfsm_loadattr(nd, dnap);
 			if (error == 0)
 				*dattrflagp = 1;
 		}
 	}
 	if (error == 0 && (nd->nd_flag & ND_NFSV3) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
 	 * This is only done when vfs.nfs.ignore_eexist is set and never
 	 * for NFSv4.1 or later minor versions, since sessions should
 	 * guarantee "exactly once" RPC semantics.
 	 */
 	if (error != EEXIST || nfsignore_eexist == 0)
 		return (error);
 	if (NFSHASNFSV4(nmp) && nmp->nm_minorvers != 0)
 		return (error);
 	return (0);
 }
 
 /*
  * nfs remove directory call
  * Removes the directory "name" from directory dvp.
  * For NFSv3 and NFSv4, dnap and *dattrflagp are as set by
  * nfscl_wcc_data() for dvp.
  * An ENOENT result is returned as success (see the kludge below).
  */
 int
 nfsrpc_rmdir(vnode_t dvp, char *name, int namelen, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error = 0;
 
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 
 	/* The only argument after the directory is the name. */
 	NFSCL_REQSTART(nd, NFSPROC_RMDIR, dvp, cred);
 	(void) nfsm_strtom(nd, name, namelen);
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error != 0)
 		return (error);
 
 	if ((nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) != 0)
 		error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, NULL);
 	/* A parse error takes precedence over the server's status. */
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	return (error == ENOENT ? 0 : error);
 }
 
 /*
  * Check to make sure the file name in a Readdir reply is valid.
  * A name is rejected if any of its first "len" bytes is a '/' or a nul
  * and, when is_v4 is true, if it is exactly "." or "..".
  * Returns true for an invalid name (after logging why via printf()) and
  * false for a name that is ok. "name" need not be nul terminated.
  */
 static bool
 nfscl_invalidfname(bool is_v4, char *name, int len)
 {
 	char *cp;
 	int left;
 
 	/* The length is checked first, so no byte past "len" is read. */
 	if (is_v4 && len >= 1 && len <= 2 && name[0] == '.' &&
 	    (len == 1 || name[1] == '.')) {
 		printf("Readdir NFSv4 reply has dot or dotdot in it\n");
 		return (true);
 	}
 
 	for (cp = name, left = len; left > 0; left--, cp++) {
 		switch (*cp) {
 		case '/':
 		case '\0':
 			printf("Readdir reply file name had imbedded / or nul"
 			    " byte\n");
 			return (true);
 		default:
 			break;
 		}
 	}
 	return (false);
 }
 
 /*
  * Readdir rpc.
  * Always returns with either uio_resid unchanged, if you are at the
  * end of the directory, or uio_resid == 0, with all DIRBLKSIZ chunks
  * filled in.
  * I felt this would allow caching of directory blocks more easily
  * than returning a partially filled block.
  * Directory offset cookies:
  * Oh my, what to do with them...
  * I can think of three ways to deal with them:
  * 1 - have the layer above these RPCs maintain a map between logical
  *     directory byte offsets and the NFS directory offset cookies
  * 2 - pass the opaque directory offset cookies up into userland
  *     and let the libc functions deal with them, via the system call
  * 3 - return them to userland in the "struct dirent", so future versions
  *     of libc can use them and do whatever is necessary to make things work
  *     above these rpc calls, in the meantime
  * For now, I do #3 by "hiding" the directory offset cookies after the
  * d_name field in struct dirent. This is space inside d_reclen that
  * will be ignored by anything that doesn't know about them.
  * The directory offset cookies are filled in as the last 8 bytes of
  * each directory entry, after d_name. Someday, the userland libc
  * functions may be able to use these. In the meantime, it satisfies
  * OpenBSD's requirements for cookies being returned.
  * It expects the directory offset cookie for the read to be in uio_offset
  * and returns the one for the next entry after this directory block in
  * there, as well.
  */
 int
 nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     int *eofp)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	nfsquad_t cookie, ncookie;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int reqsize, tryformoredirs = 1, readsize, eof = 0, gotmnton = 0;
 	u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX;
 	char *cp;
 	nfsattrbit_t attrbits, dattrbits;
 	u_int32_t rderr, *tl2 = NULL;
 	size_t tresid;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 	KASSERT(uiop->uio_segflg == UIO_SYSSPACE,
 	    ("nfsrpc_readdir: uio userspace"));
 	ncookie.lval[0] = ncookie.lval[1] = 0;
 	/*
 	 * There is no point in reading a lot more than uio_resid, however
 	 * adding one additional DIRBLKSIZ makes sense. Since uio_resid
 	 * and nm_readdirsize are both exact multiples of DIRBLKSIZ, this
 	 * will never make readsize > nm_readdirsize.
 	 */
 	readsize = nmp->nm_readdirsize;
 	if (readsize > uiop->uio_resid)
 		readsize = uiop->uio_resid + DIRBLKSIZ;
 
 	*attrflagp = 0;
 	if (eofp)
 		*eofp = 0;
 	tresid = uiop->uio_resid;
 	cookie.lval[0] = cookiep->nfsuquad[0];
 	cookie.lval[1] = cookiep->nfsuquad[1];
 	nd->nd_mrep = NULL;
 
 	/*
 	 * For NFSv4, first create the "." and ".." entries.
 	 */
 	if (NFSHASNFSV4(nmp)) {
 		reqsize = 6 * NFSX_UNSIGNED;
 		NFSGETATTR_ATTRBIT(&dattrbits);
 		NFSZERO_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE);
 		if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
 		    NFSATTRBIT_MOUNTEDONFILEID)) {
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 			gotmnton = 1;
 		} else {
 			/*
 			 * Must fake it. Use the fileno, except when the
 			 * fsid is != to that of the directory. For that
 			 * case, generate a fake fileno that is not the same.
 			 */
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID);
 			gotmnton = 0;
 		}
 
 		/*
 		 * Joy, oh joy. For V4 we get to hand craft '.' and '..'.
 		 */
 		if (uiop->uio_offset == 0) {
 			NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp, cred);
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			error = nfscl_request(nd, vp, p, cred);
 			if (error)
 			    return (error);
 			dotfileid = 0;	/* Fake out the compiler. */
 			if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			    error = nfsm_loadattr(nd, &nfsva);
 			    if (error != 0)
 				goto nfsmout;
 			    dotfileid = nfsva.na_fileid;
 			}
 			if (nd->nd_repstat == 0) {
 			    NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			    len = fxdr_unsigned(int, *(tl + 4));
 			    if (len > 0 && len <= NFSX_V4FHMAX)
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 			    else
 				error = EPERM;
 			    if (!error) {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				nfsva.na_mntonfileno = UINT64_MAX;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, NULL, p, cred);
 				if (error) {
 				    dotdotfileid = dotfileid;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != UINT64_MAX)
 					dotdotfileid = nfsva.na_mntonfileno;
 				    else
 					dotdotfileid = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dotdotfileid = nfsva.na_fileid;
 				} else {
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dotdotfileid = fakefileno;
 				}
 			    }
 			} else if (nd->nd_repstat == NFSERR_NOENT) {
 			    /*
 			     * Lookupp returns NFSERR_NOENT when we are
 			     * at the root, so just use the current dir.
 			     */
 			    nd->nd_repstat = 0;
 			    dotdotfileid = dotfileid;
 			} else {
 			    error = nd->nd_repstat;
 			}
 			m_freem(nd->nd_mrep);
 			if (error)
 			    return (error);
 			nd->nd_mrep = NULL;
 			dp = (struct dirent *)uiop->uio_iov->iov_base;
 			dp->d_pad0 = dp->d_pad1 = 0;
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotfileid;
 			dp->d_namlen = 1;
 			*((uint64_t *)dp->d_name) = 0;	/* Zero pad it. */
 			dp->d_name[0] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uiop->uio_resid -= dp->d_reclen;
 			uiop->uio_offset += dp->d_reclen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base + dp->d_reclen;
 			uiop->uio_iov->iov_len -= dp->d_reclen;
 			dp = (struct dirent *)uiop->uio_iov->iov_base;
 			dp->d_pad0 = dp->d_pad1 = 0;
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotdotfileid;
 			dp->d_namlen = 2;
 			*((uint64_t *)dp->d_name) = 0;
 			dp->d_name[0] = '.';
 			dp->d_name[1] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uiop->uio_resid -= dp->d_reclen;
 			uiop->uio_offset += dp->d_reclen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base + dp->d_reclen;
 			uiop->uio_iov->iov_len -= dp->d_reclen;
 		}
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR);
 	} else {
 		reqsize = 5 * NFSX_UNSIGNED;
 	}
 
 	/*
 	 * Loop around doing readdir rpc's of size readsize.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		*attrflagp = 0;
 		NFSCL_REQSTART(nd, NFSPROC_READDIR, vp, cred);
 		if (nd->nd_flag & ND_NFSV2) {
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = cookie.lval[1];
 			*tl = txdr_unsigned(readsize);
 		} else {
 			NFSM_BUILD(tl, u_int32_t *, reqsize);
 			*tl++ = cookie.lval[0];
 			*tl++ = cookie.lval[1];
 			if (cookie.qval == 0) {
 				*tl++ = 0;
 				*tl++ = 0;
 			} else {
 				NFSLOCKNODE(dnp);
 				*tl++ = dnp->n_cookieverf.nfsuquad[0];
 				*tl++ = dnp->n_cookieverf.nfsuquad[1];
 				NFSUNLOCKNODE(dnp);
 			}
 			if (nd->nd_flag & ND_NFSV4) {
 				*tl++ = txdr_unsigned(readsize);
 				*tl = txdr_unsigned(readsize);
 				(void) nfsrv_putattrbit(nd, &attrbits);
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(NFSV4OP_GETATTR);
 				(void) nfsrv_putattrbit(nd, &dattrbits);
 			} else {
 				*tl = txdr_unsigned(readsize);
 			}
 		}
 		error = nfscl_request(nd, vp, p, cred);
 		if (error)
 			return (error);
 		if (!(nd->nd_flag & ND_NFSV2)) {
 			if (nd->nd_flag & ND_NFSV3)
 				error = nfscl_postop_attr(nd, nap, attrflagp);
 			if (!nd->nd_repstat && !error) {
 				NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
 				NFSLOCKNODE(dnp);
 				dnp->n_cookieverf.nfsuquad[0] = *tl++;
 				dnp->n_cookieverf.nfsuquad[1] = *tl;
 				NFSUNLOCKNODE(dnp);
 			}
 		}
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		more_dirs = fxdr_unsigned(int, *tl);
 		if (!more_dirs)
 			tryformoredirs = 0;
 
 		/* loop through the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			if (nd->nd_flag & ND_NFSV4) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 				len = fxdr_unsigned(int, *tl);
 			} else if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				nfsva.na_fileid = fxdr_hyper(tl);
 				tl += 2;
 				len = fxdr_unsigned(int, *tl);
 			} else {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				nfsva.na_fileid = fxdr_unsigned(uint64_t,
 				    *tl++);
 				len = fxdr_unsigned(int, *tl);
 			}
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			tlen = roundup2(len, 8);
 			if (tlen == len)
 				tlen += 8;  /* To ensure null termination. */
 			left = DIRBLKSIZ - blksiz;
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) {
 				NFSBZERO(uiop->uio_iov->iov_base, left);
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_resid -= left;
 				uiop->uio_offset += left;
 				blksiz = 0;
 			}
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER >
 			    uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				struct iovec saviov;
 				off_t savoff;
 				ssize_t savresid;
 				int savblksiz;
 
 				saviov.iov_base = uiop->uio_iov->iov_base;
 				saviov.iov_len = uiop->uio_iov->iov_len;
 				savoff = uiop->uio_offset;
 				savresid = uiop->uio_resid;
 				savblksiz = blksiz;
 
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				dp->d_pad0 = dp->d_pad1 = 0;
 				dp->d_off = 0;
 				dp->d_namlen = len;
 				dp->d_reclen = _GENERIC_DIRLEN(len) +
 				    NFSX_HYPER;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				cp = uiop->uio_iov->iov_base;
 				error = nfsm_mbufuio(nd, uiop, len);
 				if (error)
 					goto nfsmout;
 				/* Check for an invalid file name. */
 				if (nfscl_invalidfname(
 				    (nd->nd_flag & ND_NFSV4) != 0, cp, len)) {
 					/* Skip over this entry. */
 					uiop->uio_iov->iov_base =
 					    saviov.iov_base;
 					uiop->uio_iov->iov_len =
 					    saviov.iov_len;
 					uiop->uio_offset = savoff;
 					uiop->uio_resid = savresid;
 					blksiz = savblksiz;
 				} else {
 					cp = uiop->uio_iov->iov_base;
 					tlen -= len;
 					NFSBZERO(cp, tlen);
 					cp += tlen; /* points to cookie store */
 					tl2 = (u_int32_t *)cp;
 					uiop->uio_iov->iov_base =
 					    (char *)uiop->uio_iov->iov_base +
 					    tlen + NFSX_HYPER;
 					uiop->uio_iov->iov_len -= tlen +
 					    NFSX_HYPER;
 					uiop->uio_resid -= tlen + NFSX_HYPER;
 					uiop->uio_offset += (tlen + NFSX_HYPER);
 				}
 			} else {
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 				if (error)
 					goto nfsmout;
 			}
 			if (nd->nd_flag & ND_NFSV4) {
 				rderr = 0;
 				nfsva.na_mntonfileno = UINT64_MAX;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, &rderr, p, cred);
 				if (error)
 					goto nfsmout;
 				NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			} else if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 			} else {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				ncookie.lval[0] = 0;
 				ncookie.lval[1] = *tl++;
 			}
 			if (bigenough) {
 			    if (nd->nd_flag & ND_NFSV4) {
 				if (rderr) {
 				    dp->d_fileno = 0;
 				} else {
 				    if (gotmnton) {
 					if (nfsva.na_mntonfileno != UINT64_MAX)
 					    dp->d_fileno = nfsva.na_mntonfileno;
 					else
 					    dp->d_fileno = nfsva.na_fileid;
 				    } else if (nfsva.na_filesid[0] ==
 					dnp->n_vattr.na_filesid[0] &&
 					nfsva.na_filesid[1] ==
 					dnp->n_vattr.na_filesid[1]) {
 					dp->d_fileno = nfsva.na_fileid;
 				    } else {
 					do {
 					    fakefileno--;
 					} while (fakefileno ==
 					    nfsva.na_fileid);
 					dp->d_fileno = fakefileno;
 				    }
 				    dp->d_type = vtonfs_dtype(nfsva.na_type);
 				}
 			    } else {
 				dp->d_fileno = nfsva.na_fileid;
 			    }
 			    *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] =
 				ncookie.lval[0];
 			    *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] =
 				ncookie.lval[1];
 			}
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 			if (tryformoredirs)
 				more_dirs = !eof;
 			if (nd->nd_flag & ND_NFSV4) {
 				error = nfscl_postop_attr(nd, nap, attrflagp);
 				if (error)
 					goto nfsmout;
 			}
 		}
 		m_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		NFSBZERO(uiop->uio_iov->iov_base, left);
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base +
 		    left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_resid -= left;
 		uiop->uio_offset += left;
 	}
 
 	/*
 	 * If returning no data, assume end of file.
 	 * If not bigenough, return not end of file, since you aren't
 	 *    returning all the data
 	 * Otherwise, return the eof flag from the server.
 	 */
 	if (eofp) {
 		if (tresid == ((size_t)(uiop->uio_resid)))
 			*eofp = 1;
 		else if (!bigenough)
 			*eofp = 0;
 		else
 			*eofp = eof;
 	}
 
 	/*
 	 * Add extra empty records to any remaining DIRBLKSIZ chunks.
 	 */
 	while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) {
 		dp = (struct dirent *)uiop->uio_iov->iov_base;
 		NFSBZERO(dp, DIRBLKSIZ);
 		dp->d_type = DT_UNKNOWN;
 		tl = (u_int32_t *)&dp->d_name[4];
 		*tl++ = cookie.lval[0];
 		*tl = cookie.lval[1];
 		dp->d_reclen = DIRBLKSIZ;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base +
 		    DIRBLKSIZ;
 		uiop->uio_iov->iov_len -= DIRBLKSIZ;
 		uiop->uio_resid -= DIRBLKSIZ;
 		uiop->uio_offset += DIRBLKSIZ;
 	}
 
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of nfsrpc_readdir().
  * (Also used for NFS V4 when mount flag set.)
  * (ditto above w.r.t. multiple of DIRBLKSIZ, etc.)
  *
  * Fills the buffer described by uiop with struct dirent records, each
  * followed by an NFSX_HYPER sized slot that holds the entry's directory
  * offset cookie.  The last record of every DIRBLKSIZ block is stretched
  * so that the block is completely covered.  For entries that come back
  * with a file handle, the attributes are handed to nfscl_loadattrcache()
  * for the entry's vnode and, when the checks below allow it, the name is
  * entered via cache_enter_time_flags().
  *
  *	vp		directory vnode being read
  *	uiop		single iovec, UIO_SYSSPACE, uio_resid a multiple of
  *			DIRBLKSIZ (all asserted below)
  *	cookiep		in: cookie to start reading at; out: cookie of the
  *			most recently processed entry
  *	cred, p		credentials and thread handed to the RPC layer
  *	nap, attrflagp	filled in by nfscl_postop_attr() for the directory;
  *			*attrflagp is cleared before each RPC
  *	eofp		optional; set to 1 when no data was returned, 0 when
  *			the buffer filled up, else the server's eof flag
  *
  * Returns 0 or an error number (a local errno or the RPC's nd_repstat).
  */
 int
 nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp,
     int *eofp)
 {
 	int len, left;
 	struct dirent *dp = NULL;
 	u_int32_t *tl;
 	vnode_t newvp = NULLVP;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nameidata nami, *ndp = &nami;
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *dnp = VTONFS(vp), *np;
 	struct nfsvattr nfsva;
 	struct nfsfh *nfhp;
 	nfsquad_t cookie, ncookie;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
 	int attrflag, tryformoredirs = 1, eof = 0, gotmnton = 0;
 	int isdotdot = 0, unlocknewvp = 0;
 	u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX;
 	u_int64_t fileno = 0;
 	char *cp;
 	nfsattrbit_t attrbits, dattrbits;
 	size_t tresid;
 	u_int32_t *tl2 = NULL, rderr;
 	struct timespec dctime, ts;
 	bool attr_ok;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 	KASSERT(uiop->uio_segflg == UIO_SYSSPACE,
 	    ("nfsrpc_readdirplus: uio userspace"));
 	ncookie.lval[0] = ncookie.lval[1] = 0;
 	timespecclear(&dctime);
 	*attrflagp = 0;
 	if (eofp != NULL)
 		*eofp = 0;
 	ndp->ni_dvp = vp;
 	nd->nd_mrep = NULL;
 	cookie.lval[0] = cookiep->nfsuquad[0];
 	cookie.lval[1] = cookiep->nfsuquad[1];
 	/* Remember the starting resid so "nothing stored" can be detected. */
 	tresid = uiop->uio_resid;
 
 	/*
 	 * For NFSv4, first create the "." and ".." entries.
 	 */
 	if (NFSHASNFSV4(nmp)) {
 		NFSGETATTR_ATTRBIT(&dattrbits);
 		NFSZERO_ATTRBIT(&attrbits);
 		NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID);
 		if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
 		    NFSATTRBIT_MOUNTEDONFILEID)) {
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 			gotmnton = 1;
 		} else {
 			/*
 			 * Must fake it. Use the fileno, except when the
 			 * fsid is != to that of the directory. For that
 			 * case, generate a fake fileno that is not the same.
 			 */
 			NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID);
 			gotmnton = 0;
 		}
 
 		/*
 		 * Joy, oh joy. For V4 we get to hand craft '.' and '..'.
 		 */
 		if (uiop->uio_offset == 0) {
 			/* Lookupp + Getfh + Getattr to learn about "..". */
 			NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp, cred);
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			error = nfscl_request(nd, vp, p, cred);
 			if (error)
 			    return (error);
 			dotfileid = 0;	/* Fake out the compiler. */
 			if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			    /* Attributes of the directory itself ("."). */
 			    error = nfsm_loadattr(nd, &nfsva);
 			    if (error != 0)
 				goto nfsmout;
 			    dctime = nfsva.na_ctime;
 			    dotfileid = nfsva.na_fileid;
 			}
 			if (nd->nd_repstat == 0) {
 			    /* Step over the file handle in the reply. */
 			    NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 			    len = fxdr_unsigned(int, *(tl + 4));
 			    if (len > 0 && len <= NFSX_V4FHMAX)
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 			    else
 				error = EPERM;
 			    if (!error) {
 				NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED);
 				/*
 				 * UINT64_MAX marks "mounted_on_fileid was not
 				 * filled in by nfsv4_loadattr()".
 				 */
 				nfsva.na_mntonfileno = UINT64_MAX;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, NULL, p, cred);
 				if (error) {
 				    dotdotfileid = dotfileid;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != UINT64_MAX)
 					dotdotfileid = nfsva.na_mntonfileno;
 				    else
 					dotdotfileid = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dotdotfileid = nfsva.na_fileid;
 				} else {
 				    /* Different fsid: invent a fileno. */
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dotdotfileid = fakefileno;
 				}
 			    }
 			} else if (nd->nd_repstat == NFSERR_NOENT) {
 			    /*
 			     * Lookupp returns NFSERR_NOENT when we are
 			     * at the root, so just use the current dir.
 			     */
 			    nd->nd_repstat = 0;
 			    dotdotfileid = dotfileid;
 			} else {
 			    error = nd->nd_repstat;
 			}
 			m_freem(nd->nd_mrep);
 			if (error)
 			    return (error);
 			nd->nd_mrep = NULL;
 			/* Hand craft the "." record. */
 			dp = (struct dirent *)uiop->uio_iov->iov_base;
 			dp->d_pad0 = dp->d_pad1 = 0;
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotfileid;
 			dp->d_namlen = 1;
 			*((uint64_t *)dp->d_name) = 0;	/* Zero pad it. */
 			dp->d_name[0] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uiop->uio_resid -= dp->d_reclen;
 			uiop->uio_offset += dp->d_reclen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base + dp->d_reclen;
 			uiop->uio_iov->iov_len -= dp->d_reclen;
 			/* Hand craft the ".." record. */
 			dp = (struct dirent *)uiop->uio_iov->iov_base;
 			dp->d_pad0 = dp->d_pad1 = 0;
 			dp->d_off = 0;
 			dp->d_type = DT_DIR;
 			dp->d_fileno = dotdotfileid;
 			dp->d_namlen = 2;
 			*((uint64_t *)dp->d_name) = 0;
 			dp->d_name[0] = '.';
 			dp->d_name[1] = '.';
 			dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER;
 			/*
 			 * Just make these offset cookie 0.
 			 */
 			tl = (u_int32_t *)&dp->d_name[8];
 			*tl++ = 0;
 			*tl = 0;
 			blksiz += dp->d_reclen;
 			uiop->uio_resid -= dp->d_reclen;
 			uiop->uio_offset += dp->d_reclen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base + dp->d_reclen;
 			uiop->uio_iov->iov_len -= dp->d_reclen;
 		}
 		/* Attributes requested for every entry of the Readdir. */
 		NFSREADDIRPLUS_ATTRBIT(&attrbits);
 		if (gotmnton)
 			NFSSETBIT_ATTRBIT(&attrbits,
 			    NFSATTRBIT_MOUNTEDONFILEID);
 		if (!NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr,
 		    NFSATTRBIT_TIMECREATE))
 			NFSCLRBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE);
 	}
 
 	/*
 	 * Loop around doing readdir rpc's of size nm_readdirsize.
 	 * The stopping criteria is EOF or buffer full.
 	 */
 	while (more_dirs && bigenough) {
 		*attrflagp = 0;
 		NFSCL_REQSTART(nd, NFSPROC_READDIRPLUS, vp, cred);
 		NFSM_BUILD(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 		*tl++ = cookie.lval[0];
 		*tl++ = cookie.lval[1];
 		/* A zero cookie is sent with a zero cookie verifier. */
 		if (cookie.qval == 0) {
 			*tl++ = 0;
 			*tl++ = 0;
 		} else {
 			NFSLOCKNODE(dnp);
 			*tl++ = dnp->n_cookieverf.nfsuquad[0];
 			*tl++ = dnp->n_cookieverf.nfsuquad[1];
 			NFSUNLOCKNODE(dnp);
 		}
 		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
 		*tl = txdr_unsigned(nmp->nm_readdirsize);
 		if (nd->nd_flag & ND_NFSV4) {
 			(void) nfsrv_putattrbit(nd, &attrbits);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_GETATTR);
 			(void) nfsrv_putattrbit(nd, &dattrbits);
 		}
 		/*
 		 * Time just before the RPC; compared against n_localmodtime
 		 * below to spot attributes that may already be stale.
 		 */
 		nanouptime(&ts);
 		error = nfscl_request(nd, vp, p, cred);
 		if (error)
 			return (error);
 		if (nd->nd_flag & ND_NFSV3)
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 		if (nd->nd_repstat || error) {
 			if (!error)
 				error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		if ((nd->nd_flag & ND_NFSV3) != 0 && *attrflagp != 0)
 			dctime = nap->na_ctime;
 		/* Cookie verifier followed by the first "entry follows". */
 		NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 		NFSLOCKNODE(dnp);
 		dnp->n_cookieverf.nfsuquad[0] = *tl++;
 		dnp->n_cookieverf.nfsuquad[1] = *tl++;
 		NFSUNLOCKNODE(dnp);
 		more_dirs = fxdr_unsigned(int, *tl);
 		if (!more_dirs)
 			tryformoredirs = 0;
 
 		/* loop through the dir entries, doctoring them to 4bsd form */
 		while (more_dirs && bigenough) {
 			NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 			if (nd->nd_flag & ND_NFSV4) {
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 			} else {
 				fileno = fxdr_hyper(tl);
 				tl += 2;
 			}
 			len = fxdr_unsigned(int, *tl);
 			if (len <= 0 || len > NFS_MAXNAMLEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			tlen = roundup2(len, 8);
 			if (tlen == len)
 				tlen += 8;  /* To ensure null termination. */
 			left = DIRBLKSIZ - blksiz;
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) {
 				/*
 				 * The entry does not fit in what is left of
 				 * this DIRBLKSIZ block, so stretch the
 				 * previous record over the remainder.
 				 */
 				NFSBZERO(uiop->uio_iov->iov_base, left);
 				dp->d_reclen += left;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + left;
 				uiop->uio_iov->iov_len -= left;
 				uiop->uio_resid -= left;
 				uiop->uio_offset += left;
 				blksiz = 0;
 			}
 			if (_GENERIC_DIRLEN(len) + NFSX_HYPER >
 			    uiop->uio_resid)
 				bigenough = 0;
 			if (bigenough) {
 				struct iovec saviov;
 				off_t savoff;
 				ssize_t savresid;
 				int savblksiz;
 
 				/* Saved so an invalid name can be undone. */
 				saviov.iov_base = uiop->uio_iov->iov_base;
 				saviov.iov_len = uiop->uio_iov->iov_len;
 				savoff = uiop->uio_offset;
 				savresid = uiop->uio_resid;
 				savblksiz = blksiz;
 
 				dp = (struct dirent *)uiop->uio_iov->iov_base;
 				dp->d_pad0 = dp->d_pad1 = 0;
 				dp->d_off = 0;
 				dp->d_namlen = len;
 				dp->d_reclen = _GENERIC_DIRLEN(len) +
 				    NFSX_HYPER;
 				dp->d_type = DT_UNKNOWN;
 				blksiz += dp->d_reclen;
 				if (blksiz == DIRBLKSIZ)
 					blksiz = 0;
 				uiop->uio_resid -= DIRHDSIZ;
 				uiop->uio_offset += DIRHDSIZ;
 				uiop->uio_iov->iov_base =
 				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
 				uiop->uio_iov->iov_len -= DIRHDSIZ;
 				cnp->cn_nameptr = uiop->uio_iov->iov_base;
 				cnp->cn_namelen = len;
 				NFSCNHASHZERO(cnp);
 				cp = uiop->uio_iov->iov_base;
 				/* Copy the name out of the reply. */
 				error = nfsm_mbufuio(nd, uiop, len);
 				if (error)
 					goto nfsmout;
 				/* Check for an invalid file name. */
 				if (nfscl_invalidfname(
 				    (nd->nd_flag & ND_NFSV4) != 0, cp, len)) {
 					/*
 					 * Skip over this entry by rewinding
 					 * the uio to where it started.
 					 * NOTE(review): dp and tl2 are not
 					 * rewound here, yet the bigenough
 					 * block further down still stores
 					 * through them (tl2 stays NULL until
 					 * a valid entry has been stored) -
 					 * confirm this path is safe.
 					 */
 					uiop->uio_iov->iov_base =
 					    saviov.iov_base;
 					uiop->uio_iov->iov_len =
 					    saviov.iov_len;
 					uiop->uio_offset = savoff;
 					uiop->uio_resid = savresid;
 					blksiz = savblksiz;
 				} else {
 					cp = uiop->uio_iov->iov_base;
 					tlen -= len;
 					NFSBZERO(cp, tlen);
 					cp += tlen; /* points to cookie store */
 					tl2 = (u_int32_t *)cp;
 					if (len == 2 &&
 					    cnp->cn_nameptr[0] == '.' &&
 					    cnp->cn_nameptr[1] == '.')
 						isdotdot = 1;
 					else
 						isdotdot = 0;
 					uiop->uio_iov->iov_base =
 					    (char *)uiop->uio_iov->iov_base +
 					    tlen + NFSX_HYPER;
 					uiop->uio_iov->iov_len -= tlen +
 					    NFSX_HYPER;
 					uiop->uio_resid -= tlen + NFSX_HYPER;
 					uiop->uio_offset += (tlen + NFSX_HYPER);
 				}
 			} else {
 				/* No room: just step over the name. */
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 				if (error)
 					goto nfsmout;
 			}
 			nfhp = NULL;
 			if (nd->nd_flag & ND_NFSV3) {
 				NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED);
 				ncookie.lval[0] = *tl++;
 				ncookie.lval[1] = *tl++;
 				attrflag = fxdr_unsigned(int, *tl);
 				if (attrflag) {
 				  error = nfsm_loadattr(nd, &nfsva);
 				  if (error)
 					goto nfsmout;
 				}
 				NFSM_DISSECT(tl,u_int32_t *,NFSX_UNSIGNED);
 				if (*tl) {
 					error = nfsm_getfh(nd, &nfhp);
 					if (error)
 					    goto nfsmout;
 				}
 				/* A handle without attributes is not used. */
 				if (!attrflag && nfhp != NULL) {
 					free(nfhp, M_NFSFH);
 					nfhp = NULL;
 				}
 			} else {
 				rderr = 0;
 				/*
 				 * Same "not filled in" marker as used for
 				 * ".." above and in nfsrpc_readdir().  It
 				 * must be UINT64_MAX and not 0xffffffff,
 				 * since the latter would make a real
 				 * mounted_on_fileid of 0xffffffff look as
 				 * if the attribute had not been returned.
 				 */
 				nfsva.na_mntonfileno = UINT64_MAX;
 				error = nfsv4_loadattr(nd, NULL, &nfsva, &nfhp,
 				    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 				    NULL, NULL, &rderr, p, cred);
 				if (error)
 					goto nfsmout;
 			}
 
 			if (bigenough) {
 			    if (nd->nd_flag & ND_NFSV4) {
 				if (rderr) {
 				    dp->d_fileno = 0;
 				} else if (gotmnton) {
 				    if (nfsva.na_mntonfileno != UINT64_MAX)
 					dp->d_fileno = nfsva.na_mntonfileno;
 				    else
 					dp->d_fileno = nfsva.na_fileid;
 				} else if (nfsva.na_filesid[0] ==
 				    dnp->n_vattr.na_filesid[0] &&
 				    nfsva.na_filesid[1] ==
 				    dnp->n_vattr.na_filesid[1]) {
 				    dp->d_fileno = nfsva.na_fileid;
 				} else {
 				    /* Different fsid: invent a fileno. */
 				    do {
 					fakefileno--;
 				    } while (fakefileno ==
 					nfsva.na_fileid);
 				    dp->d_fileno = fakefileno;
 				}
 			    } else {
 				dp->d_fileno = fileno;
 			    }
 			    /*
 			     * Store the entry's cookie after its name and
 			     * make it the resume point for the next RPC
 			     * and for the caller.
 			     */
 			    *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] =
 				ncookie.lval[0];
 			    *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] =
 				ncookie.lval[1];
 
 			    if (nfhp != NULL) {
 				attr_ok = true;
 				if (NFSRV_CMPFH(nfhp->nfh_fh, nfhp->nfh_len,
 				    dnp->n_fhp->nfh_fh, dnp->n_fhp->nfh_len)) {
 				    /* The entry is the directory itself. */
 				    VREF(vp);
 				    newvp = vp;
 				    unlocknewvp = 0;
 				    free(nfhp, M_NFSFH);
 				    np = dnp;
 				} else if (isdotdot != 0) {
 				    /*
 				     * Skip doing a nfscl_nget() call for "..".
 				     * There's a race between acquiring the nfs
 				     * node here and lookups that look for the
 				     * directory being read (in the parent).
 				     * It would try to get a lock on ".." here,
 				     * owning the lock on the directory being
 				     * read. Lookup will hold the lock on ".."
 				     * and try to acquire the lock on the
 				     * directory being read.
 				     * If the directory is unlocked/relocked,
 				     * then there is a LOR with the buflock
 				     * vp is relocked.
 				     */
 				    free(nfhp, M_NFSFH);
 				} else {
 				    error = nfscl_nget(vp->v_mount, vp,
 				      nfhp, cnp, p, &np, LK_EXCLUSIVE);
 				    if (!error) {
 					newvp = NFSTOV(np);
 					unlocknewvp = 1;
 					/*
 					 * If n_localmodtime >= time before RPC,
 					 * then a file modification operation,
 					 * such as VOP_SETATTR() of size, has
 					 * occurred while the Lookup RPC and
 					 * acquisition of the vnode happened. As
 					 * such, the attributes might be stale,
 					 * with possibly an incorrect size.
 					 */
 					NFSLOCKNODE(np);
 					if (timespecisset(
 					    &np->n_localmodtime) &&
 					    timespeccmp(&np->n_localmodtime,
 					    &ts, >=)) {
 					    NFSCL_DEBUG(4, "nfsrpc_readdirplus:"
 						" localmod stale attributes\n");
 					    attr_ok = false;
 					}
 					NFSUNLOCKNODE(np);
 				    }
 				}
 				/* nfhp was freed or handed to nfscl_nget(). */
 				nfhp = NULL;
 				if (newvp != NULLVP) {
 				    if (attr_ok)
 					error = nfscl_loadattrcache(&newvp,
 					    &nfsva, NULL, 0, 0);
 				    if (error) {
 					if (unlocknewvp)
 					    vput(newvp);
 					else
 					    vrele(newvp);
 					goto nfsmout;
 				    }
 				    dp->d_type =
 					vtonfs_dtype(np->n_vattr.na_type);
 				    ndp->ni_vp = newvp;
 				    NFSCNHASH(cnp, HASHINIT);
 				    /*
 				     * Only short names are entered, never the
 				     * directory itself, and a directory only
 				     * when dctime has been set.
 				     */
 				    if (cnp->cn_namelen <= NCHNAMLEN &&
 					ndp->ni_dvp != ndp->ni_vp &&
 					(newvp->v_type != VDIR ||
 					 dctime.tv_sec != 0)) {
 					cache_enter_time_flags(ndp->ni_dvp,
 					    ndp->ni_vp, cnp,
 					    &nfsva.na_ctime,
 					    newvp->v_type != VDIR ? NULL :
 					    &dctime, VFS_CACHE_DROPOLD);
 				    }
 				    if (unlocknewvp)
 					vput(newvp);
 				    else
 					vrele(newvp);
 				    newvp = NULLVP;
 				}
 			    }
 			} else if (nfhp != NULL) {
 			    free(nfhp, M_NFSFH);
 			}
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			more_dirs = fxdr_unsigned(int, *tl);
 		}
 		/*
 		 * If at end of rpc data, get the eof boolean
 		 */
 		if (!more_dirs) {
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			eof = fxdr_unsigned(int, *tl);
 			if (tryformoredirs)
 				more_dirs = !eof;
 			if (nd->nd_flag & ND_NFSV4) {
 				error = nfscl_postop_attr(nd, nap, attrflagp);
 				if (error)
 					goto nfsmout;
 			}
 		}
 		m_freem(nd->nd_mrep);
 		nd->nd_mrep = NULL;
 	}
 	/*
 	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
 	 * by increasing d_reclen for the last record.
 	 */
 	if (blksiz > 0) {
 		left = DIRBLKSIZ - blksiz;
 		NFSBZERO(uiop->uio_iov->iov_base, left);
 		dp->d_reclen += left;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base +
 		    left;
 		uiop->uio_iov->iov_len -= left;
 		uiop->uio_resid -= left;
 		uiop->uio_offset += left;
 	}
 
 	/*
 	 * If returning no data, assume end of file.
 	 * If not bigenough, return not end of file, since you aren't
 	 *    returning all the data
 	 * Otherwise, return the eof flag from the server.
 	 */
 	if (eofp != NULL) {
 		if (tresid == uiop->uio_resid)
 			*eofp = 1;
 		else if (!bigenough)
 			*eofp = 0;
 		else
 			*eofp = eof;
 	}
 
 	/*
 	 * Add extra empty records to any remaining DIRBLKSIZ chunks.
 	 */
 	while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) {
 		dp = (struct dirent *)uiop->uio_iov->iov_base;
 		NFSBZERO(dp, DIRBLKSIZ);
 		dp->d_type = DT_UNKNOWN;
 		/* The empty record carries the last cookie seen. */
 		tl = (u_int32_t *)&dp->d_name[4];
 		*tl++ = cookie.lval[0];
 		*tl = cookie.lval[1];
 		dp->d_reclen = DIRBLKSIZ;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base +
 		    DIRBLKSIZ;
 		uiop->uio_iov->iov_len -= DIRBLKSIZ;
 		uiop->uio_resid -= DIRBLKSIZ;
 		uiop->uio_offset += DIRBLKSIZ;
 	}
 
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * NFS Commit RPC.
  *
  * Builds a Commit request for cnt bytes starting at offset on vp (with a
  * trailing Getattr op when the request is NFSv4), sends it, and parses the
  * reply with nfscl_wcc_data() and, for NFSv4, nfscl_postop_attr(), which
  * fill in nap and *attrflagp.  The verifier in the reply is compared with
  * the one saved in the mount structure; if they differ, the saved copy is
  * replaced and NFSERR_STALEWRITEVERF is reported.
  *
  * Returns 0, a local error, or the reply status from nd_repstat.
  */
 int
 nfsrpc_commit(vnode_t vp, u_quad_t offset, int cnt, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	struct nfsmount *mnt = VFSTONFS(vp->v_mount);
 	struct nfsrv_descript reqdesc, *nd = &reqdesc;
 	nfsattrbit_t getattrbits;
 	u_int32_t *wp;
 	int error;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_COMMIT, vp, cred);
 
 	/* Arguments: the 64 bit offset followed by the 32 bit count. */
 	NFSM_BUILD(wp, u_int32_t *, 3 * NFSX_UNSIGNED);
 	txdr_hyper(offset, wp);
 	wp[2] = txdr_unsigned(cnt);
 	if ((nd->nd_flag & ND_NFSV4) != 0) {
 		/* For NFSv4, follow the Commit with a Getattr op. */
 		NFSM_BUILD(wp, u_int32_t *, NFSX_UNSIGNED);
 		*wp = txdr_unsigned(NFSV4OP_GETATTR);
 		NFSGETATTR_ATTRBIT(&getattrbits);
 		(void) nfsrv_putattrbit(nd, &getattrbits);
 	}
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	error = nfscl_wcc_data(nd, vp, nap, attrflagp, NULL, NULL);
 	if (error == 0 && nd->nd_repstat == 0) {
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_VERF);
 		/* nm_verf is compared and updated under the mount lock. */
 		NFSLOCKMNT(mnt);
 		if (NFSBCMP(mnt->nm_verf, wp, NFSX_VERF) != 0) {
 			NFSBCOPY(wp, mnt->nm_verf, NFSX_VERF);
 			nd->nd_repstat = NFSERR_STALEWRITEVERF;
 		}
 		NFSUNLOCKMNT(mnt);
 		if ((nd->nd_flag & ND_NFSV4) != 0)
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 	}
 nfsmout:
 	/* With no local error, the reply status becomes the result. */
 	if (error == 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * NFS byte range lock rpc.
  * (Mostly just calls one of the three lower level RPC routines.)
  */
 int
 nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl,
     int reclaim, struct ucred *cred, NFSPROC_T *p, void *id, int flags)
 {
 	struct nfscllockowner *lp;
 	struct nfsclclient *clp;
 	struct nfsfh *nfhp;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	u_int64_t off, len;
 	off_t start, end;
 	u_int32_t clidrev = 0;
 	int error = 0, newone = 0, expireret = 0, retrycnt, donelocally;
 	int callcnt, dorpc;
 
 	/*
 	 * Convert the flock structure into a start and end and do POSIX
 	 * bounds checking.
 	 */
 	switch (fl->l_whence) {
 	case SEEK_SET:
 	case SEEK_CUR:
 		/*
 		 * Caller is responsible for adding any necessary offset
 		 * when SEEK_CUR is used.
 		 */
 		start = fl->l_start;
 		off = fl->l_start;
 		break;
 	case SEEK_END:
 		start = size + fl->l_start;
 		off = size + fl->l_start;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (start < 0)
 		return (EINVAL);
 	if (fl->l_len != 0) {
 		end = start + fl->l_len - 1;
 		if (end < start)
 			return (EINVAL);
 	}
 
 	len = fl->l_len;
 	if (len == 0)
 		len = NFS64BITSSET;
 	retrycnt = 0;
 	do {
 	    nd->nd_repstat = 0;
 	    if (op == F_GETLK) {
 		error = nfscl_getcl(vp->v_mount, cred, p, false, true, &clp);
 		if (error)
 			return (error);
 		error = nfscl_lockt(vp, clp, off, len, fl, p, id, flags);
 		if (!error) {
 			clidrev = clp->nfsc_clientidrev;
 			error = nfsrpc_lockt(nd, vp, clp, off, len, fl, cred,
 			    p, id, flags);
 		} else if (error == -1) {
 			error = 0;
 		}
 		nfscl_clientrelease(clp);
 	    } else if (op == F_UNLCK && fl->l_type == F_UNLCK) {
 		/*
 		 * We must loop around for all lockowner cases.
 		 */
 		callcnt = 0;
 		error = nfscl_getcl(vp->v_mount, cred, p, false, true, &clp);
 		if (error)
 			return (error);
 		do {
 		    error = nfscl_relbytelock(vp, off, len, cred, p, callcnt,
 			clp, id, flags, &lp, &dorpc);
 		    /*
 		     * If it returns a NULL lp, we're done.
 		     */
 		    if (lp == NULL) {
 			if (callcnt == 0)
 			    nfscl_clientrelease(clp);
 			else
 			    nfscl_releasealllocks(clp, vp, p, id, flags);
 			return (error);
 		    }
 		    if (nmp->nm_clp != NULL)
 			clidrev = nmp->nm_clp->nfsc_clientidrev;
 		    else
 			clidrev = 0;
 		    /*
 		     * If the server doesn't support Posix lock semantics,
 		     * only allow locks on the entire file, since it won't
 		     * handle overlapping byte ranges.
 		     * There might still be a problem when a lock
 		     * upgrade/downgrade (read<->write) occurs, since the
 		     * server "might" expect an unlock first?
 		     */
 		    if (dorpc && (lp->nfsl_open->nfso_posixlock ||
 			(off == 0 && len == NFS64BITSSET))) {
 			/*
 			 * Since the lock records will go away, we must
 			 * wait for grace and delay here.
 			 */
 			do {
 			    error = nfsrpc_locku(nd, nmp, lp, off, len,
 				NFSV4LOCKT_READ, cred, p, 0);
 			    if ((nd->nd_repstat == NFSERR_GRACE ||
 				 nd->nd_repstat == NFSERR_DELAY) &&
 				error == 0)
 				(void) nfs_catnap(PZERO, (int)nd->nd_repstat,
 				    "nfs_advlock");
 			} while ((nd->nd_repstat == NFSERR_GRACE ||
 			    nd->nd_repstat == NFSERR_DELAY) && error == 0);
 		    }
 		    callcnt++;
 		} while (error == 0 && nd->nd_repstat == 0);
 		nfscl_releasealllocks(clp, vp, p, id, flags);
 	    } else if (op == F_SETLK) {
 		error = nfscl_getbytelock(vp, off, len, fl->l_type, cred, p,
 		    NULL, 0, id, flags, NULL, NULL, &lp, &newone, &donelocally);
 		if (error || donelocally) {
 			return (error);
 		}
 		if (nmp->nm_clp != NULL)
 			clidrev = nmp->nm_clp->nfsc_clientidrev;
 		else
 			clidrev = 0;
 		nfhp = VTONFS(vp)->n_fhp;
 		if (!lp->nfsl_open->nfso_posixlock &&
 		    (off != 0 || len != NFS64BITSSET)) {
 			error = EINVAL;
 		} else {
 			error = nfsrpc_lock(nd, nmp, vp, nfhp->nfh_fh,
 			    nfhp->nfh_len, lp, newone, reclaim, off,
 			    len, fl->l_type, cred, p, 0);
 		}
 		if (!error)
 			error = nd->nd_repstat;
 		nfscl_lockrelease(lp, error, newone);
 	    } else {
 		error = EINVAL;
 	    }
 	    if (!error)
 	        error = nd->nd_repstat;
 	    if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		error == NFSERR_STALEDONTRECOVER ||
 		error == NFSERR_STALECLIENTID || error == NFSERR_DELAY ||
 		error == NFSERR_BADSESSION) {
 		(void) nfs_catnap(PZERO, error, "nfs_advlock");
 	    } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID)
 		&& clidrev != 0) {
 		expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		retrycnt++;
 	    }
 	} while (error == NFSERR_GRACE ||
 	    error == NFSERR_STALECLIENTID || error == NFSERR_DELAY ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_BADSESSION ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * Do the LockT (test for a conflicting lock) RPC for the byte range
  * off/len on vp.
  *
  * The lock owner sent is the name built by nfscl_filllockowner() for
  * id/flags followed by the file handle of vp.
  * - A reply status of 0 sets fl->l_type to F_UNLCK.
  * - A reply status of NFSERR_DENIED is cleared to 0 and fl is filled in
  *   from the reply (SEEK_SET relative range and lock type, l_pid == 0).
  * - A reply status of NFSERR_STALECLIENTID starts recovery for clp.
  * Returns the error from the request or from parsing the reply.  Any other
  * server status is left in nd->nd_repstat for the caller.
  */
 int
 nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp,
     struct nfsclclient *clp, u_int64_t off, u_int64_t len, struct flock *fl,
     struct ucred *cred, NFSPROC_T *p, void *id, int flags)
 {
 	struct nfsclsession *sessp;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	u_int32_t *wp;
 	u_int64_t heldlen;
 	int error, heldtype, ownerlen;
 
 	nmp = VFSTONFS(vp->v_mount);
 	NFSCL_REQSTART(nd, NFSPROC_LOCKT, vp, cred);
 
 	/* Lock type, offset, length and the two clientid words. */
 	NFSM_BUILD(wp, u_int32_t *, 7 * NFSX_UNSIGNED);
 	wp[0] = txdr_unsigned(fl->l_type == F_RDLCK ? NFSV4LOCKT_READ :
 	    NFSV4LOCKT_WRITE);
 	txdr_hyper(off, wp + 1);
 	txdr_hyper(len, wp + 3);
 	sessp = nfsmnt_mdssession(nmp);
 	wp[5] = sessp->nfsess_clientid.lval[0];
 	wp[6] = sessp->nfsess_clientid.lval[1];
 
 	/* The owner string is the lock owner name plus the file handle. */
 	nfscl_filllockowner(id, own, flags);
 	np = VTONFS(vp);
 	NFSBCOPY(np->n_fhp->nfh_fh, &own[NFSV4CL_LOCKNAMELEN],
 	    np->n_fhp->nfh_len);
 	(void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + np->n_fhp->nfh_len);
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		fl->l_type = F_UNLCK;
 	} else if (nd->nd_repstat == NFSERR_DENIED) {
 		nd->nd_repstat = 0;
 		fl->l_whence = SEEK_SET;
 		NFSM_DISSECT(wp, u_int32_t *, 8 * NFSX_UNSIGNED);
 		fl->l_start = fxdr_hyper(wp);
 		heldlen = fxdr_hyper(wp + 2);
 		/* A length of all ones is reported as l_len == 0. */
 		if (heldlen != NFS64BITSSET)
 			fl->l_len = heldlen;
 		else
 			fl->l_len = 0;
 		heldtype = fxdr_unsigned(int, wp[4]);
 		if (heldtype != NFSV4LOCKT_WRITE)
 			fl->l_type = F_RDLCK;
 		else
 			fl->l_type = F_WRLCK;
 		/*
 		 * XXX The conflicting lock_owner is not interpreted, so
 		 * the pid is set to 0, the two words at wp[5] and wp[6]
 		 * are ignored and the owner string is skipped over.
 		 */
 		fl->l_pid = (pid_t)0;
 		ownerlen = fxdr_unsigned(int, wp[7]);
 		if (ownerlen < 0 || ownerlen > NFSV4_OPAQUELIMIT)
 			error = EBADRPC;
 		else
 			error = nfsm_advance(nd, NFSM_RNDUP(ownerlen), -1);
 	} else if (nd->nd_repstat == NFSERR_STALECLIENTID) {
 		nfscl_initiate_recovery(clp);
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Lower level function that performs the LockU (unlock) RPC for the
  * range off/len, using the lock owner's current seqid and stateid.
  *
  * lp->nfsl_seqid is passed to NFSCL_INCRSEQID() after the request, whether
  * or not the request failed.  On a reply status of 0 the stateid in lp is
  * replaced by the one in the reply.  NFSERR_STALESTATEID starts recovery.
  * Returns the error from the request or from parsing the reply; the
  * server's status is left in nd->nd_repstat.
  */
 static int
 nfsrpc_locku(struct nfsrv_descript *nd, struct nfsmount *nmp,
     struct nfscllockowner *lp, u_int64_t off, u_int64_t len,
     u_int32_t type, struct ucred *cred, NFSPROC_T *p, int syscred)
 {
 	u_int32_t *wp, seqid;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_LOCKU, nmp, lp->nfsl_open->nfso_fh,
 	    lp->nfsl_open->nfso_fhlen, NULL, NULL, 0, 0, cred);
 
 	/* Lock type, seqid, stateid (4 words), offset and length. */
 	NFSM_BUILD(wp, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED);
 	wp[0] = txdr_unsigned(type);
 	seqid = lp->nfsl_seqid;
 	/* Testing knob: randomly send a seqid that is one too large. */
 	if (nfstest_outofseq != 0 && (arc4random() % nfstest_outofseq) == 0)
 		seqid++;
 	wp[1] = txdr_unsigned(seqid);
 	/* The stateid's seqid is sent as 0 when NFSHASNFSV4N() is true. */
 	wp[2] = NFSHASNFSV4N(nmp) ? 0 : lp->nfsl_stateid.seqid;
 	wp[3] = lp->nfsl_stateid.other[0];
 	wp[4] = lp->nfsl_stateid.other[1];
 	wp[5] = lp->nfsl_stateid.other[2];
 	txdr_hyper(off, wp + 6);
 	txdr_hyper(len, wp + 8);
 
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	NFSCL_INCRSEQID(lp->nfsl_seqid, nd);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == NFSERR_STALESTATEID) {
 		nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp);
 	} else if (nd->nd_repstat == 0) {
 		/* Remember the updated lock stateid. */
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_STATEID);
 		lp->nfsl_stateid.seqid = wp[0];
 		lp->nfsl_stateid.other[0] = wp[1];
 		lp->nfsl_stateid.other[1] = wp[2];
 		lp->nfsl_stateid.other[2] = wp[3];
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The actual Lock RPC.
  *
  * When newone is non-zero the request carries the open owner's seqid and
  * open stateid plus a new lock owner (clientid, owner name and file
  * handle); otherwise it carries the existing lock stateid and seqid.
  * After a request that did not fail, the seqid(s) used are passed to
  * NFSCL_INCRSEQID().  On a reply status of 0 the lock stateid in lp is
  * updated, on NFSERR_DENIED the conflicting owner in the reply is skipped
  * and on NFSERR_STALESTATEID recovery is started.
  * Returns the error from the request or from parsing the reply; the
  * server's status is left in nd->nd_repstat.
  */
 int
 nfsrpc_lock(struct nfsrv_descript *nd, struct nfsmount *nmp, vnode_t vp,
     u_int8_t *nfhp, int fhlen, struct nfscllockowner *lp, int newone,
     int reclaim, u_int64_t off, u_int64_t len, short type, struct ucred *cred,
     NFSPROC_T *p, int syscred)
 {
 	struct nfsclsession *sessp;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	u_int32_t *wp, seqid;
 	int error, ownerlen;
 
 	nfscl_reqstart(nd, NFSPROC_LOCK, nmp, nfhp, fhlen, NULL, NULL, 0, 0,
 	    cred);
 
 	/* Lock type, reclaim flag, offset, length and new_lock_owner. */
 	NFSM_BUILD(wp, u_int32_t *, 7 * NFSX_UNSIGNED);
 	wp[0] = txdr_unsigned(type == F_RDLCK ? NFSV4LOCKT_READ :
 	    NFSV4LOCKT_WRITE);
 	wp[1] = txdr_unsigned(reclaim);
 	txdr_hyper(off, wp + 2);
 	txdr_hyper(len, wp + 4);
 	wp[6] = (newone != 0) ? newnfs_true : newnfs_false;
 
 	if (newone != 0) {
 		/*
 		 * Open seqid, open stateid, lock seqid and clientid,
 		 * followed by the lock owner string.
 		 */
 		NFSM_BUILD(wp, u_int32_t *, NFSX_STATEID +
 		    2 * NFSX_UNSIGNED + NFSX_HYPER);
 		wp[0] = txdr_unsigned(lp->nfsl_open->nfso_own->nfsow_seqid);
 		wp[1] = NFSHASNFSV4N(nmp) ? 0 :
 		    lp->nfsl_open->nfso_stateid.seqid;
 		wp[2] = lp->nfsl_open->nfso_stateid.other[0];
 		wp[3] = lp->nfsl_open->nfso_stateid.other[1];
 		wp[4] = lp->nfsl_open->nfso_stateid.other[2];
 		wp[5] = txdr_unsigned(lp->nfsl_seqid);
 		sessp = nfsmnt_mdssession(nmp);
 		wp[6] = sessp->nfsess_clientid.lval[0];
 		wp[7] = sessp->nfsess_clientid.lval[1];
 		NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN);
 		NFSBCOPY(nfhp, &own[NFSV4CL_LOCKNAMELEN], fhlen);
 		(void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen);
 	} else {
 		/* Existing lock owner: lock stateid and lock seqid. */
 		NFSM_BUILD(wp, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED);
 		wp[0] = NFSHASNFSV4N(nmp) ? 0 : lp->nfsl_stateid.seqid;
 		wp[1] = lp->nfsl_stateid.other[0];
 		wp[2] = lp->nfsl_stateid.other[1];
 		wp[3] = lp->nfsl_stateid.other[2];
 		seqid = lp->nfsl_seqid;
 		/* Testing knob: randomly send a seqid one too large. */
 		if (nfstest_outofseq != 0 &&
 		    (arc4random() % nfstest_outofseq) == 0)
 			seqid++;
 		wp[4] = txdr_unsigned(seqid);
 	}
 
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (newone != 0) {
 		NFSCL_INCRSEQID(lp->nfsl_open->nfso_own->nfsow_seqid, nd);
 	}
 	NFSCL_INCRSEQID(lp->nfsl_seqid, nd);
 
 	if (nd->nd_repstat == 0) {
 		/* Remember the lock stateid returned by the server. */
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_STATEID);
 		lp->nfsl_stateid.seqid = wp[0];
 		lp->nfsl_stateid.other[0] = wp[1];
 		lp->nfsl_stateid.other[1] = wp[2];
 		lp->nfsl_stateid.other[2] = wp[3];
 	} else if (nd->nd_repstat == NFSERR_DENIED) {
 		/* Only the owner length is used, to skip over the owner. */
 		NFSM_DISSECT(wp, u_int32_t *, 8 * NFSX_UNSIGNED);
 		ownerlen = fxdr_unsigned(int, wp[7]);
 		if (ownerlen < 0 || ownerlen > NFSV4_OPAQUELIMIT)
 			error = EBADRPC;
 		else
 			error = nfsm_advance(nd, NFSM_RNDUP(ownerlen), -1);
 	} else if (nd->nd_repstat == NFSERR_STALESTATEID) {
 		nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp);
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs statfs rpc
  * (always called with the vp for the mount point)
  *
  * For an NFSv4 mount this is a Getattr; the attributes asked for depend
  * on whether leasep is NULL.  On success the results are loaded into
  * sbp/fsp/nap (and *leasep), nm_fsid is set from the file system id in
  * nap and *attrflagp is set to 1.
  * Otherwise an FSSTAT request is done and the reply is decoded into sbp
  * using the NFSv3 layout or the older 32 bit layout.
  * Returns the request error, a parsing error or the server's status.
  */
 int
 nfsrpc_statfs(vnode_t vp, struct nfsstatfs *sbp, struct nfsfsinfo *fsp,
     uint32_t *leasep, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap,
     int *attrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	u_int32_t *wp = NULL;
 	int error;
 
 	*attrflagp = 0;
 	nmp = VFSTONFS(vp->v_mount);
 	if (!NFSHASNFSV4(nmp)) {
 		NFSCL_REQSTART(nd, NFSPROC_FSSTAT, vp, NULL);
 		error = nfscl_request(nd, vp, p, cred);
 		if (error != 0)
 			return (error);
 		if ((nd->nd_flag & ND_NFSV3) != 0) {
 			error = nfscl_postop_attr(nd, nap, attrflagp);
 			if (error != 0)
 				goto nfsmout;
 		}
 		if (nd->nd_repstat != 0) {
 			error = nd->nd_repstat;
 			goto nfsmout;
 		}
 		NFSM_DISSECT(wp, u_int32_t *,
 		    NFSX_STATFS(nd->nd_flag & ND_NFSV3));
 	} else {
 		/*
 		 * For V4, you actually do a getattr.
 		 */
 		NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp, cred);
 		if (leasep == NULL) {
 			NFSSTATFS_GETATTRBIT(&attrbits);
 		} else {
 			NFSROOTFS_GETATTRBIT(&attrbits);
 		}
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_USEGSSNAME;
 		error = nfscl_request(nd, vp, p, cred);
 		if (error != 0)
 			return (error);
 		if (nd->nd_repstat != 0) {
 			error = nd->nd_repstat;
 		} else {
 			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 			    NULL, NULL, sbp, fsp, NULL, 0, NULL, leasep, NULL,
 			    p, cred);
 			if (error == 0) {
 				/* Record the fsid of the mount point. */
 				nmp->nm_fsid[0] = nap->na_filesid[0];
 				nmp->nm_fsid[1] = nap->na_filesid[1];
 				NFSSETHASSETFSID(nmp);
 				*attrflagp = 1;
 			}
 		}
 		if (error != 0)
 			goto nfsmout;
 	}
 	if (NFSHASNFSV3(nmp)) {
 		/* Six 64 bit counts followed by invarsec. */
 		sbp->sf_tbytes = fxdr_hyper(wp);
 		sbp->sf_fbytes = fxdr_hyper(wp + 2);
 		sbp->sf_abytes = fxdr_hyper(wp + 4);
 		sbp->sf_tfiles = fxdr_hyper(wp + 6);
 		sbp->sf_ffiles = fxdr_hyper(wp + 8);
 		sbp->sf_afiles = fxdr_hyper(wp + 10);
 		sbp->sf_invarsec = fxdr_unsigned(u_int32_t, wp[12]);
 	} else if (!NFSHASNFSV4(nmp)) {
 		/* Five 32 bit values. */
 		sbp->sf_tsize = fxdr_unsigned(u_int32_t, wp[0]);
 		sbp->sf_bsize = fxdr_unsigned(u_int32_t, wp[1]);
 		sbp->sf_blocks = fxdr_unsigned(u_int32_t, wp[2]);
 		sbp->sf_bfree = fxdr_unsigned(u_int32_t, wp[3]);
 		sbp->sf_bavail = fxdr_unsigned(u_int32_t, wp[4]);
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs pathconf rpc
  *
  * For an NFSv4 mount this is a Getattr of the pathconf attributes, loaded
  * into pc and nap.  If the mount is marked NFSMNTP_FAKEROOTFH and has no
  * root file handle yet (nm_fhsize == 0), nfsrpc_getdirpath() is tried
  * first and EACCES is returned if that fails.
  * Otherwise a PATHCONF request is done and the six reply words are decoded
  * into pc.
  * *attrflagp is set non-zero when nap has been filled in.
  * Returns the request error, a parsing error or the server's status.
  */
 int
 nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	nfsattrbit_t attrbits;
 	u_int32_t *wp;
 	int error;
 
 	*attrflagp = 0;
 	nmp = VFSTONFS(vp->v_mount);
 	if (!NFSHASNFSV4(nmp)) {
 		NFSCL_REQSTART(nd, NFSPROC_PATHCONF, vp, NULL);
 		error = nfscl_request(nd, vp, p, cred);
 		if (error != 0)
 			return (error);
 		error = nfscl_postop_attr(nd, nap, attrflagp);
 		if (error == 0 && nd->nd_repstat != 0)
 			error = nd->nd_repstat;
 		if (error == 0) {
 			NFSM_DISSECT(wp, u_int32_t *, NFSX_V3PATHCONF);
 			pc->pc_linkmax = fxdr_unsigned(u_int32_t, wp[0]);
 			pc->pc_namemax = fxdr_unsigned(u_int32_t, wp[1]);
 			pc->pc_notrunc = fxdr_unsigned(u_int32_t, wp[2]);
 			pc->pc_chownrestricted =
 			    fxdr_unsigned(u_int32_t, wp[3]);
 			pc->pc_caseinsensitive =
 			    fxdr_unsigned(u_int32_t, wp[4]);
 			pc->pc_casepreserving =
 			    fxdr_unsigned(u_int32_t, wp[5]);
 		}
 	} else {
 		np = VTONFS(vp);
 		if (nmp->nm_fhsize == 0 &&
 		    (nmp->nm_privflag & NFSMNTP_FAKEROOTFH) != 0) {
 			/* Attempt to get the actual root file handle. */
 			error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp),
 			    cred, p);
 			if (error != 0)
 				return (EACCES);
 			if (np->n_fhp->nfh_len == NFSX_FHMAX + 1)
 				nfscl_statfs(vp, cred, p);
 		}
 		/*
 		 * For V4, you actually do a getattr.
 		 */
 		NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp, cred);
 		NFSPATHCONF_GETATTRBIT(&attrbits);
 		(void) nfsrv_putattrbit(nd, &attrbits);
 		nd->nd_flag |= ND_USEGSSNAME;
 		error = nfscl_request(nd, vp, p, cred);
 		if (error != 0)
 			return (error);
 		if (nd->nd_repstat != 0) {
 			error = nd->nd_repstat;
 		} else {
 			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 			    pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p,
 			    cred);
 			if (error == 0)
 				*attrflagp = 1;
 		}
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs version 3 fsinfo rpc call
  *
  * The post-op attributes are loaded into nap (with *attrflagp set by
  * nfscl_postop_attr()) and, when there is no error, the NFSX_V3FSINFO
  * bytes of the reply are decoded into fsp.
  * Returns the request error, a parsing error or the server's status.
  */
 int
 nfsrpc_fsinfo(vnode_t vp, struct nfsfsinfo *fsp, struct ucred *cred,
     NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	u_int32_t *wp;
 	int error;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_FSINFO, vp, NULL);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	error = nfscl_postop_attr(nd, nap, attrflagp);
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	if (error != 0)
 		goto nfsmout;
 
 	NFSM_DISSECT(wp, u_int32_t *, NFSX_V3FSINFO);
 	/* Seven 32 bit transfer size values. */
 	fsp->fs_rtmax = fxdr_unsigned(u_int32_t, wp[0]);
 	fsp->fs_rtpref = fxdr_unsigned(u_int32_t, wp[1]);
 	fsp->fs_rtmult = fxdr_unsigned(u_int32_t, wp[2]);
 	fsp->fs_wtmax = fxdr_unsigned(u_int32_t, wp[3]);
 	fsp->fs_wtpref = fxdr_unsigned(u_int32_t, wp[4]);
 	fsp->fs_wtmult = fxdr_unsigned(u_int32_t, wp[5]);
 	fsp->fs_dtpref = fxdr_unsigned(u_int32_t, wp[6]);
 	/* Then a 64 bit size, a two word time and the properties word. */
 	fsp->fs_maxfilesize = fxdr_hyper(wp + 7);
 	fxdr_nfsv3time(wp + 9, &fsp->fs_timedelta);
 	fsp->fs_properties = fxdr_unsigned(u_int32_t, wp[11]);
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Renew RPC.
  *
  * When dsp is NULL the request is done against the MDS using cred.  When
  * dsp is not NULL the request is set up with the DS's session and no cred
  * and is sent on the DS's socket, if it has one; an ENXIO failure then
  * results in a call to nfscl_cancelreqs() for the DS.
  * Returns 0 right away if the client has no mount (nfsc_nmp == NULL),
  * otherwise the request error or the server's status.
  */
 int
 nfsrpc_renew(struct nfsclclient *clp, struct nfsclds *dsp, struct ucred *cred,
     NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclsession *dssessp, *mdssessp;
 	struct nfssockreq *nrp;
 	struct nfsmount *nmp;
 	u_int32_t *wp;
 	int error;
 
 	nmp = clp->nfsc_nmp;
 	if (nmp == NULL)
 		return (0);
 
 	/* The DS session is only used when a DS was given. */
 	dssessp = (dsp != NULL) ? &dsp->nfsclds_sess : NULL;
 	nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, dssessp, 0, 0,
 	    (dsp != NULL) ? NULL : cred);
 	if (!NFSHASNFSV4N(nmp)) {
 		/* NFSv4.1 just uses a Sequence Op and not a Renew. */
 		NFSM_BUILD(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 		mdssessp = nfsmnt_mdssession(nmp);
 		wp[0] = mdssessp->nfsess_clientid.lval[0];
 		wp[1] = mdssessp->nfsess_clientid.lval[1];
 	}
 
 	/* If there is no DS socket, use the MDS socket. */
 	nrp = (dsp != NULL) ? dsp->nfsclds_sockp : NULL;
 	if (nrp == NULL)
 		nrp = &nmp->nm_sockreq;
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, dssessp);
 	if (error != 0) {
 		if (dsp != NULL && error == ENXIO)
 			nfscl_cancelreqs(dsp);
 		return (error);
 	}
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Releaselockowner RPC.
  *
  * When NFSHASNFSV4N() is true for the mount, a FreeStateID of the lock
  * owner's stateid is done instead.  Otherwise the request carries the
  * clientid and an owner string made of the lock owner name followed by
  * the fhlen bytes of file handle at fh.
  * Returns the request error or the server's status.
  */
 int
 nfsrpc_rellockown(struct nfsmount *nmp, struct nfscllockowner *lp,
     uint8_t *fh, int fhlen, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclsession *sessp;
 	uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX];
 	u_int32_t *wp;
 	int error;
 
 	if (!NFSHASNFSV4N(nmp)) {
 		nfscl_reqstart(nd, NFSPROC_RELEASELCKOWN, nmp, NULL, 0, NULL,
 		    NULL, 0, 0, NULL);
 		NFSM_BUILD(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 		sessp = nfsmnt_mdssession(nmp);
 		wp[0] = sessp->nfsess_clientid.lval[0];
 		wp[1] = sessp->nfsess_clientid.lval[1];
 		/* Owner string: lock owner name, then the file handle. */
 		NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN);
 		NFSBCOPY(fh, &own[NFSV4CL_LOCKNAMELEN], fhlen);
 		(void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen);
 	} else {
 		/* For NFSv4.1, do a FreeStateID. */
 		nfscl_reqstart(nd, NFSPROC_FREESTATEID, nmp, NULL, 0, NULL,
 		    NULL, 0, 0, cred);
 		nfsm_stateidtom(nd, &lp->nfsl_stateid, NFSSTATEID_PUTSTATEID);
 	}
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Compound to get the mount pt FH.
  *
  * The request is started as NFSPROC_PUTROOTFH, then one Lookup operation
  * is appended for each non-empty '/' separated component of dirpath and
  * finally a GetFH operation is appended.
  * dirpath is written to while it is being scanned (each separating '/' is
  * replaced by a nul and then put back), so it must be writable, but its
  * contents are unchanged when the scan is done.
  * On success the returned file handle is copied to nmp->nm_fh and its
  * length to nmp->nm_fhsize, unless nm_fhsize is already non-zero.
  * Returns the error from the request or else the reply status, which is
  * NFSERR_BADXDR if the file handle length in the reply is out of range.
  */
 int
 nfsrpc_getdirpath(struct nfsmount *nmp, u_char *dirpath, struct ucred *cred,
     NFSPROC_T *p)
 {
 	u_int32_t *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	u_char *cp, *cp2, *fhp;
 	int error, cnt, len, setnil;
 	u_int32_t *opcntp;
 
 	/* opcntp is set to where the operation count gets filled in below. */
 	nfscl_reqstart(nd, NFSPROC_PUTROOTFH, nmp, NULL, 0, &opcntp, NULL, 0,
 	    0, NULL);
 	cp = dirpath;
 	cnt = 0;		/* Number of Lookup operations added. */
 	do {
 		setnil = 0;
 		/* Skip over any leading or repeated '/'. */
 		while (*cp == '/')
 			cp++;
 		/* Find the end of this component. */
 		cp2 = cp;
 		while (*cp2 != '\0' && *cp2 != '/')
 			cp2++;
 		/* Temporarily nul terminate the component for strlen(). */
 		if (*cp2 == '/') {
 			setnil = 1;
 			*cp2 = '\0';
 		}
 		if (cp2 != cp) {
 			NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 			*tl = txdr_unsigned(NFSV4OP_LOOKUP);
 			nfsm_strtom(nd, cp, strlen(cp));
 			cnt++;
 		}
 		/* Put the '/' back and step past it. */
 		if (setnil)
 			*cp2++ = '/';
 		cp = cp2;
 	} while (*cp != '\0');
 	/* Now that the number of Lookups is known, set the op count. */
 	if (NFSHASNFSV4N(nmp))
 		/* Has a Sequence Op done by nfscl_reqstart(). */
 		*opcntp = txdr_unsigned(3 + cnt);
 	else
 		*opcntp = txdr_unsigned(2 + cnt);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETFH);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 		NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * Skip two words for each Lookup plus two more words and
 		 * use the word after those as the file handle length.
 		 * NOTE(review): presumably each skipped pair is an
 		 * operation number and its status - confirm against the
 		 * reply handling in newnfs_request().
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, (3 + 2 * cnt) * NFSX_UNSIGNED);
 		tl += (2 + 2 * cnt);
 		if ((len = fxdr_unsigned(int, *tl)) <= 0 ||
 			len > NFSX_FHMAX) {
 			nd->nd_repstat = NFSERR_BADXDR;
 		} else {
 			/* Copy the file handle out via a temporary buffer. */
 			fhp = malloc(len + 1, M_TEMP, M_WAITOK);
 			nd->nd_repstat = nfsrv_mtostr(nd, fhp, len);
 			if (nd->nd_repstat == 0) {
 				NFSLOCKMNT(nmp);
 				/* Do not replace a file handle already set. */
 				if (nmp->nm_fhsize == 0) {
 					NFSBCOPY(fhp, nmp->nm_fh, len);
 					nmp->nm_fhsize = len;
 				}
 				NFSUNLOCKMNT(nmp);
 			}
 			free(fhp, M_TEMP);
 		}
 	}
 	error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * This function performs the Delegreturn RPC.
  *
  * The request is for the file handle in dp and carries the delegation's
  * stateid, with the stateid's seqid sent as 0 when NFSHASNFSV4N() is true
  * for the mount.  ND_USEGSSNAME is set only when syscred is non-zero.
  * Returns the request error or the server's status.
  */
 int
 nfsrpc_delegreturn(struct nfscldeleg *dp, struct ucred *cred,
     struct nfsmount *nmp, NFSPROC_T *p, int syscred)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	u_int32_t *wp;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_DELEGRETURN, nmp, dp->nfsdl_fh,
 	    dp->nfsdl_fhlen, NULL, NULL, 0, 0, cred);
 
 	NFSM_BUILD(wp, u_int32_t *, NFSX_STATEID);
 	wp[0] = NFSHASNFSV4N(nmp) ? 0 : dp->nfsdl_stateid.seqid;
 	wp[1] = dp->nfsdl_stateid.other[0];
 	wp[2] = dp->nfsdl_stateid.other[1];
 	wp[3] = dp->nfsdl_stateid.other[2];
 
 	if (syscred != 0)
 		nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs getacl call.
  *
  * Does a Getattr asking for only the ACL attribute and loads the reply
  * into aclp.
  * Returns EOPNOTSUPP if nfsrv_useacl is 0 or the mount is not NFSv4,
  * otherwise the request error, a parsing error or the server's status.
  */
 int
 nfsrpc_getacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	int error;
 
 	nmp = VFSTONFS(vp->v_mount);
 	if (!(nfsrv_useacl != 0 && NFSHASNFSV4(nmp)))
 		return (EOPNOTSUPP);
 
 	NFSCL_REQSTART(nd, NFSPROC_GETACL, vp, cred);
 	/* The only attribute requested is the ACL. */
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else
 		error = nfsv4_loadattr(nd, vp, NULL, NULL, NULL, 0, NULL,
 		    NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, p, cred);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * nfs setacl call.
  *
  * Sets the ACL by calling nfsrpc_setattr() with aclp and no other
  * attributes.
  * Returns EOPNOTSUPP if nfsrv_useacl is 0 or the mount is not NFSv4,
  * otherwise whatever nfsrpc_setattr() returns.
  */
 int
 nfsrpc_setacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp)
 {
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vp->v_mount);
 	if (!(nfsrv_useacl != 0 && NFSHASNFSV4(nmp)))
 		return (EOPNOTSUPP);
 	return (nfsrpc_setattr(vp, NULL, aclp, cred, p, NULL, NULL));
 }
 
 /*
  * Do the NFSPROC_SETACL RPC that sets the ACL attribute of vp to aclp,
  * using the stateid at stateidp.
  *
  * Nothing in the reply is parsed.
  * Returns EOPNOTSUPP if the mount is not NFSv4, otherwise the request
  * error or the server's status.
  */
 static int
 nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p,
     struct acl *aclp, nfsv4stateid_t *stateidp)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	int error;
 
 	nmp = VFSTONFS(vp->v_mount);
 	if (!NFSHASNFSV4(nmp))
 		return (EOPNOTSUPP);
 
 	NFSCL_REQSTART(nd, NFSPROC_SETACL, vp, cred);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	/* The only attribute being set is the ACL. */
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
 	(void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0,
 	    &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	/* Don't care about the pre/postop attributes */
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Exchange ID.
  *
  * The request is sent on nrp for the given minor version, or for
  * nmp->nm_minorvers if minorvers is 0, with exchflags as the flags word.
  * On success a struct nfsclds is allocated, filled in from the reply
  * (clientid, sequenceid, flags and an opaque string of up to
  * NFSV4_OPAQUELIMIT bytes), has its mutexes and session slots initialized
  * and is handed back via *dspp.  *dspp is NULL for all failure cases.
  * If the reply has NFSV4EXCH_USEPNFSMDS set and the mount has the pNFS
  * option, NFSSTA_PNFS is also set in nmp->nm_state.
  * Returns the request error, NFSERR_BADXDR for a bad opaque string length
  * or else the reply status.
  */
 int
 nfsrpc_exchangeid(struct nfsmount *nmp, struct nfsclclient *clp,
     struct nfssockreq *nrp, int minorvers, uint32_t exchflags,
     struct nfsclds **dspp, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl, v41flags;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfsclds *dsp;
 	struct timespec verstime;
 	int error, len;
 
 	*dspp = NULL;
 	if (minorvers == 0)
 		minorvers = nmp->nm_minorvers;
 	nfscl_reqstart(nd, NFSPROC_EXCHANGEID, nmp, NULL, 0, NULL, NULL,
 	    NFS_VER4, minorvers, NULL);
 	/* Two words (boot time and nfsc_rev), then the client's id string. */
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec);	/* Client owner */
 	*tl = txdr_unsigned(clp->nfsc_rev);
 	(void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen);
 
 	/* The flags, NFSV4EXCH_SP4NONE and a count of one implementation id. */
 	NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(exchflags);
 	*tl++ = txdr_unsigned(NFSV4EXCH_SP4NONE);
 
 	/* Set the implementation id4 */
 	*tl = txdr_unsigned(1);
 	(void) nfsm_strtom(nd, "freebsd.org", strlen("freebsd.org"));
 	(void) nfsm_strtom(nd, version, strlen(version));
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4TIME);
 	verstime.tv_sec = 1293840000;		/* Jan 1, 2011 */
 	verstime.tv_nsec = 0;
 	txdr_nfsv4time(&verstime, tl);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	NFSCL_DEBUG(1, "exchangeid err=%d reps=%d\n", error,
 	    (int)nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * The last of the eight words dissected here is the length
 		 * of the opaque string that follows.  It is checked before
 		 * anything is allocated, since it sizes the allocation.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 6 * NFSX_UNSIGNED + NFSX_HYPER);
 		len = fxdr_unsigned(int, *(tl + 7));
 		if (len < 0 || len > NFSV4_OPAQUELIMIT) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/* Allocated with room for the string plus one extra byte. */
 		dsp = malloc(sizeof(struct nfsclds) + len + 1, M_NFSCLDS,
 		    M_WAITOK | M_ZERO);
 		dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew;
 		dsp->nfsclds_servownlen = len;
 		dsp->nfsclds_sess.nfsess_clientid.lval[0] = *tl++;
 		dsp->nfsclds_sess.nfsess_clientid.lval[1] = *tl++;
 		dsp->nfsclds_sess.nfsess_sequenceid =
 		    fxdr_unsigned(uint32_t, *tl++);
 		v41flags = fxdr_unsigned(uint32_t, *tl);
 		if ((v41flags & NFSV4EXCH_USEPNFSMDS) != 0 &&
 		    NFSHASPNFSOPT(nmp)) {
 			NFSCL_DEBUG(1, "set PNFS\n");
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_PNFS;
 			NFSUNLOCKMNT(nmp);
 			dsp->nfsclds_flags |= NFSCLDS_MDS;
 		}
 		if ((v41flags & NFSV4EXCH_USEPNFSDS) != 0)
 			dsp->nfsclds_flags |= NFSCLDS_DS;
 		if (minorvers == NFSV42_MINORVERSION)
 			dsp->nfsclds_flags |= NFSCLDS_MINORV2;
 		if (len > 0)
 			nd->nd_repstat = nfsrv_mtostr(nd,
 			    dsp->nfsclds_serverown, len);
 		/* Only a completely set up structure is handed back. */
 		if (nd->nd_repstat == 0) {
 			mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 			mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession",
 			    NULL, MTX_DEF);
 			nfscl_initsessionslots(&dsp->nfsclds_sess);
 			*dspp = dsp;
 		} else
 			free(dsp, M_NFSCLDS);
 	}
 	error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Create Session.
  *
  * The request is sent on nrp using the clientid in sep and the given
  * sequenceid.  The minor version is nmp->nm_minorvers when dsp is NULL,
  * otherwise it is chosen from the NFSCLDS_MINORV2 flag of dsp.
  * nm_rsize and nm_wsize are clipped to NFS_MAXBSIZE before the request and
  * may be halved afterwards to fit the maximum sizes in the reply.
  * On success sep is filled in with the sessionid, sequenceid, maximum
  * request/response/cached sizes and the fore and back channel slot counts.
  * If the reply has NFSV4CRSESS_PERSIST set and mds is non-zero,
  * NFSSTA_SESSPERSIST is set in nmp->nm_state.
  * Returns the request error, NFSERR_BADXDR for a bad rdma ird count or
  * else the reply status.
  */
 int
 nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep,
     struct nfssockreq *nrp, struct nfsclds *dsp, uint32_t sequenceid, int mds,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t crflags, maxval, *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	int error, irdcnt, minorvers;
 
 	/* Make sure nm_rsize, nm_wsize is set. */
 	if (nmp->nm_rsize > NFS_MAXBSIZE || nmp->nm_rsize == 0)
 		nmp->nm_rsize = NFS_MAXBSIZE;
 	if (nmp->nm_wsize > NFS_MAXBSIZE || nmp->nm_wsize == 0)
 		nmp->nm_wsize = NFS_MAXBSIZE;
 	if (dsp == NULL)
 		minorvers = nmp->nm_minorvers;
 	else if ((dsp->nfsclds_flags & NFSCLDS_MINORV2) != 0)
 		minorvers = NFSV42_MINORVERSION;
 	else
 		minorvers = NFSV41_MINORVERSION;
 	nfscl_reqstart(nd, NFSPROC_CREATESESSION, nmp, NULL, 0, NULL, NULL,
 	    NFS_VER4, minorvers, NULL);
 	/* The clientid, sequenceid and create session flags. */
 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
 	*tl++ = sep->nfsess_clientid.lval[0];
 	*tl++ = sep->nfsess_clientid.lval[1];
 	*tl++ = txdr_unsigned(sequenceid);
 	/* NFSV4CRSESS_PERSIST is not requested for a read-only mount. */
 	crflags = (NFSMNT_RDONLY(nmp->nm_mountp) ? 0 : NFSV4CRSESS_PERSIST);
 	/*
 	 * NFSV4CRSESS_CONNBACKCHAN is requested only when callbacks are
 	 * enabled, nfs_numnfscbd is greater than zero and mds is non-zero.
 	 */
 	if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0 && mds != 0)
 		crflags |= NFSV4CRSESS_CONNBACKCHAN;
 	*tl = txdr_unsigned(crflags);
 
 	/* Fill in fore channel attributes. */
 	NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 	*tl++ = 0;				/* Header pad size */
 	if ((nd->nd_flag & ND_NFSV42) != 0 && mds != 0 && sb_max_adj >=
 	    nmp->nm_wsize && sb_max_adj >= nmp->nm_rsize) {
 		/*
 		 * NFSv4.2 Extended Attribute operations may want to do
 		 * requests/replies that are larger than nm_rsize/nm_wsize.
 		 */
 		*tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR);
 		*tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR);
 	} else {
 		*tl++ = txdr_unsigned(nmp->nm_wsize + NFS_MAXXDR);
 		*tl++ = txdr_unsigned(nmp->nm_rsize + NFS_MAXXDR);
 	}
 	*tl++ = txdr_unsigned(4096);		/* Max response size cached */
 	*tl++ = txdr_unsigned(20);		/* Max operations */
 	*tl++ = txdr_unsigned(64);		/* Max slots */
 	*tl = 0;				/* No rdma ird */
 
 	/* Fill in back channel attributes. */
 	NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 	*tl++ = 0;				/* Header pad size */
 	*tl++ = txdr_unsigned(10000);		/* Max request size */
 	*tl++ = txdr_unsigned(10000);		/* Max response size */
 	*tl++ = txdr_unsigned(4096);		/* Max response size cached */
 	*tl++ = txdr_unsigned(4);		/* Max operations */
 	*tl++ = txdr_unsigned(NFSV4_CBSLOTS);	/* Max slots */
 	*tl = 0;				/* No rdma ird */
 
 	NFSM_BUILD(tl, uint32_t *, 8 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFS_CALLBCKPROG);	/* Call back prog # */
 
 	/* Allow AUTH_SYS callbacks as uid, gid == 0. */
 	*tl++ = txdr_unsigned(1);		/* Auth_sys only */
 	*tl++ = txdr_unsigned(AUTH_SYS);	/* AUTH_SYS type */
 	*tl++ = txdr_unsigned(nfsboottime.tv_sec); /* time stamp */
 	*tl++ = 0;				/* Null machine name */
 	*tl++ = 0;				/* Uid == 0 */
 	*tl++ = 0;				/* Gid == 0 */
 	*tl = 0;				/* No additional gids */
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG,
 	    NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* The sessionid, sequenceid and flags. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
 		    2 * NFSX_UNSIGNED);
 		bcopy(tl, sep->nfsess_sessionid, NFSX_V4SESSIONID);
 		tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 		sep->nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++);
 		crflags = fxdr_unsigned(uint32_t, *tl);
 		if ((crflags & NFSV4CRSESS_PERSIST) != 0 && mds != 0) {
 			NFSLOCKMNT(nmp);
 			nmp->nm_state |= NFSSTA_SESSPERSIST;
 			NFSUNLOCKMNT(nmp);
 		}
 
 		/* Get the fore channel slot count. */
 		NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 		tl++;			/* Skip the header pad size. */
 
 		/*
 		 * Make sure nm_wsize is small enough.
 		 * nm_wsize is halved until it fits, but halving stops once
 		 * it is no longer greater than 8096.
 		 * NOTE(review): 8096 might have been meant to be 8192 -
 		 * confirm before changing, since it affects the I/O size.
 		 */
 		maxval = fxdr_unsigned(uint32_t, *tl++);
 		while (maxval < nmp->nm_wsize + NFS_MAXXDR) {
 			if (nmp->nm_wsize > 8096)
 				nmp->nm_wsize /= 2;
 			else
 				break;
 		}
 		sep->nfsess_maxreq = maxval;
 
 		/* Make sure nm_rsize is small enough. */
 		maxval = fxdr_unsigned(uint32_t, *tl++);
 		while (maxval < nmp->nm_rsize + NFS_MAXXDR) {
 			if (nmp->nm_rsize > 8096)
 				nmp->nm_rsize /= 2;
 			else
 				break;
 		}
 		sep->nfsess_maxresp = maxval;
 
 		sep->nfsess_maxcache = fxdr_unsigned(int, *tl++);
 		tl++;			/* Skip the word after maxcache. */
 		sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++);
 		NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots);
 		/* Zero or one rdma ird words may follow; skip over one. */
 		irdcnt = fxdr_unsigned(int, *tl);
 		if (irdcnt < 0 || irdcnt > 1) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		if (irdcnt > 0)
 			NFSM_DISSECT(tl, uint32_t *, irdcnt * NFSX_UNSIGNED);
 
 		/* and the back channel slot count. */
 		NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED);
 		tl += 5;		/* Only the sixth word is used. */
 		sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl);
 		NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots);
 	}
 	error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Destroy Client.
  *
  * The request carries the clientid of the mount's MDS session.
  * clp is not used by this function.
  * Returns the request error or the server's status.
  */
 int
 nfsrpc_destroyclient(struct nfsmount *nmp, struct nfsclclient *clp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclsession *sessp;
 	uint32_t *wp;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_DESTROYCLIENT, nmp, NULL, 0, NULL, NULL, 0,
 	    0, NULL);
 	NFSM_BUILD(wp, uint32_t *, 2 * NFSX_UNSIGNED);
 	sessp = nfsmnt_mdssession(nmp);
 	wp[0] = sessp->nfsess_clientid.lval[0];
 	wp[1] = sessp->nfsess_clientid.lval[1];
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutGet.
  *
  * The arguments are put in the request by nfsrv_setuplayoutget() and,
  * when the reply status is 0, the reply is parsed by
  * nfsrv_parselayoutget() using stateidp, retonclosep and flhp.
  * Returns the request error, the parsing error or the server's status.
  */
 static int
 nfsrpc_layoutget(struct nfsmount *nmp, uint8_t *fhp, int fhlen, int iomode,
     uint64_t offset, uint64_t len, uint64_t minlen, int layouttype,
     int layoutlen, nfsv4stateid_t *stateidp, int *retonclosep,
     struct nfsclflayouthead *flhp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTGET, nmp, fhp, fhlen, NULL, NULL, 0,
 	    0, cred);
 	nfsrv_setuplayoutget(nd, iomode, offset, len, minlen, stateidp,
 	    layouttype, layoutlen, 0);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	NFSCL_DEBUG(4, "layget err=%d st=%d\n", error, nd->nd_repstat);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat == 0) {
 		error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep,
 		    flhp);
 	}
 	/* With no parsing error, the result is the server's status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Get Device Info.
  *
  * deviceid	- device ID to query (NFSX_V4DEVICEID bytes are copied).
  * layouttype	- NFSLAYOUT_NFSV4_1_FILES or NFSLAYOUT_FLEXFILE; the reply is
  *		  parsed according to this value.
  * notifybitsp	- optional; when non-NULL and non-zero, the word is sent as
  *		  the notification bitmap and is overwritten with the first
  *		  bitmap word of the reply, if the reply has one.
  * ndip	- set to the newly allocated devinfo structure on success and
  *		  to NULL on any failure.
  *
  * The server addresses in the reply are parsed first and a non-UDP one is
  * remembered, preferring the address family of the mount (nm_nam).  Only
  * after parsing is complete is nfsrpc_fillsa() called to connect to it.
  * Returns 0 on success, EPERM if no usable address was found, NFSERR_BADXDR
  * for a malformed reply, or the RPC/reply status otherwise.
  */
 int
 nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype,
     uint32_t *notifybitsp, struct nfscldevinfo **ndip, struct ucred *cred,
     NFSPROC_T *p)
 {
 	uint32_t cnt, *tl, vers, minorvers;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct sockaddr_in sin, ssin;
 	struct sockaddr_in6 sin6, ssin6;
 	struct nfsclds *dsp = NULL, **dspp, **gotdspp;
 	struct nfscldevinfo *ndi;
 	int addrcnt = 0, bitcnt, error, gotminor, gotvers, i, isudp, j;
 	int stripecnt;
 	uint8_t stripeindex;
 	sa_family_t af, safilled;
 
 	ssin.sin_port = 0;		/* To shut up compiler. */
 	ssin.sin_addr.s_addr = 0;	/* ditto */
 	*ndip = NULL;
 	ndi = NULL;
 	gotdspp = NULL;
 	nfscl_reqstart(nd, NFSPROC_GETDEVICEINFO, nmp, NULL, 0, NULL, NULL, 0,
 	    0, cred);
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4DEVICEID + 3 * NFSX_UNSIGNED);
 	NFSBCOPY(deviceid, tl, NFSX_V4DEVICEID);
 	tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(layouttype);
 	*tl++ = txdr_unsigned(100000);
 	if (notifybitsp != NULL && *notifybitsp != 0) {
 		*tl = txdr_unsigned(1);		/* One word of bits. */
 		NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(*notifybitsp);
 	} else
 		*tl = txdr_unsigned(0);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		if (layouttype != fxdr_unsigned(int, *tl))
 			printf("EEK! devinfo layout type not same!\n");
 		if (layouttype == NFSLAYOUT_NFSV4_1_FILES) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			stripecnt = fxdr_unsigned(int, *tl);
 			NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt);
 			if (stripecnt < 1 || stripecnt > 4096) {
 				printf("pNFS File layout devinfo stripecnt %d:"
 				    " out of range\n", stripecnt);
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/*
 			 * Get the stripe indices plus the word that follows
 			 * them, which is the address count.
 			 */
 			NFSM_DISSECT(tl, uint32_t *, (stripecnt + 1) *
 			    NFSX_UNSIGNED);
 			addrcnt = fxdr_unsigned(int, *(tl + stripecnt));
 			NFSCL_DEBUG(4, "addrcnt=%d\n", addrcnt);
 			if (addrcnt < 1 || addrcnt > 128) {
 				printf("NFS devinfo addrcnt %d: out of range\n",
 				    addrcnt);
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 
 			/*
 			 * Now we know how many stripe indices and addresses, so
 			 * we can allocate the structure the correct size.
 			 */
 			i = (stripecnt * sizeof(uint8_t)) /
 			    sizeof(struct nfsclds *) + 1;
 			NFSCL_DEBUG(4, "stripeindices=%d\n", i);
 			ndi = malloc(sizeof(*ndi) + (addrcnt + i) *
 			    sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK |
 			    M_ZERO);
 			NFSBCOPY(deviceid, ndi->nfsdi_deviceid,
 			    NFSX_V4DEVICEID);
 			ndi->nfsdi_refcnt = 0;
 			ndi->nfsdi_flags = NFSDI_FILELAYOUT;
 			ndi->nfsdi_stripecnt = stripecnt;
 			ndi->nfsdi_addrcnt = addrcnt;
 			/* Fill in the stripe indices. */
 			for (i = 0; i < stripecnt; i++) {
 				stripeindex = fxdr_unsigned(uint8_t, *tl++);
 				NFSCL_DEBUG(4, "stripeind=%d\n", stripeindex);
 				if (stripeindex >= addrcnt) {
 					printf("pNFS File Layout devinfo"
 					    " stripeindex %d: too big\n",
 					    (int)stripeindex);
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				nfsfldi_setstripeindex(ndi, i, stripeindex);
 			}
 		} else if (layouttype == NFSLAYOUT_FLEXFILE) {
 			/* For Flex File, we only get one address list. */
 			ndi = malloc(sizeof(*ndi) + sizeof(struct nfsclds *),
 			    M_NFSDEVINFO, M_WAITOK | M_ZERO);
 			NFSBCOPY(deviceid, ndi->nfsdi_deviceid,
 			    NFSX_V4DEVICEID);
 			ndi->nfsdi_refcnt = 0;
 			ndi->nfsdi_flags = NFSDI_FLEXFILE;
 			addrcnt = ndi->nfsdi_addrcnt = 1;
 		}
 
 		/* Now, dissect the server address(es). */
 		safilled = AF_UNSPEC;
 		for (i = 0; i < addrcnt; i++) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			cnt = fxdr_unsigned(uint32_t, *tl);
 			if (cnt == 0) {
 				printf("NFS devinfo 0 len addrlist\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			dspp = nfsfldi_addr(ndi, i);
 			safilled = AF_UNSPEC;
 			for (j = 0; j < cnt; j++) {
 				/* EPERM is tolerated: that address is skipped. */
 				error = nfsv4_getipaddr(nd, &sin, &sin6, &af,
 				    &isudp);
 				if (error != 0 && error != EPERM) {
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				if (error == 0 && isudp == 0) {
 					/*
 					 * The priority is:
 					 * - Same address family.
 					 * Save the address and dspp, so that
 					 * the connection can be done after
 					 * parsing is complete.
 					 */
 					if (safilled == AF_UNSPEC ||
 					    (af == nmp->nm_nam->sa_family &&
 					     safilled != nmp->nm_nam->sa_family)
 					   ) {
 						if (af == AF_INET)
 							ssin = sin;
 						else
 							ssin6 = sin6;
 						safilled = af;
 						gotdspp = dspp;
 					}
 				}
 			}
 		}
 
 		gotvers = NFS_VER4;	/* Default NFSv4.1 for File Layout. */
 		gotminor = NFSV41_MINORVERSION;
 		/* For Flex File, we will take one of the versions to use. */
 		if (layouttype == NFSLAYOUT_FLEXFILE) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			j = fxdr_unsigned(int, *tl);
 			if (j < 1 || j > NFSDEV_MAXVERS) {
 				printf("pNFS: too many versions\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			gotvers = 0;
 			gotminor = 0;
 			for (i = 0; i < j; i++) {
 				NFSM_DISSECT(tl, uint32_t *, 5 * NFSX_UNSIGNED);
 				vers = fxdr_unsigned(uint32_t, *tl++);
 				minorvers = fxdr_unsigned(uint32_t, *tl++);
 				if (vers == NFS_VER3)
 					minorvers = 0;
 				/*
 				 * NFSv4.2 always replaces an earlier choice,
 				 * NFSv4.1 is taken while no minor version has
 				 * been chosen and NFSv3 only when nothing has
 				 * been chosen yet.
 				 */
 				if ((vers == NFS_VER4 && ((minorvers ==
 				    NFSV41_MINORVERSION && gotminor == 0) ||
 				    minorvers == NFSV42_MINORVERSION)) ||
 				    (vers == NFS_VER3 && gotvers == 0)) {
 					gotvers = vers;
 					gotminor = minorvers;
 					/* We'll take this one. */
 					ndi->nfsdi_versindex = i;
 					ndi->nfsdi_vers = vers;
 					ndi->nfsdi_minorvers = minorvers;
 					ndi->nfsdi_rsize = fxdr_unsigned(
 					    uint32_t, *tl++);
 					ndi->nfsdi_wsize = fxdr_unsigned(
 					    uint32_t, *tl++);
 					if (*tl == newnfs_true)
 						ndi->nfsdi_flags |=
 						    NFSDI_TIGHTCOUPLED;
 					else
 						ndi->nfsdi_flags &=
 						    ~NFSDI_TIGHTCOUPLED;
 				}
 			}
 			if (gotvers == 0) {
 				printf("pNFS: no NFSv3, NFSv4.1 or NFSv4.2\n");
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 		}
 
 		/* And the notify bits. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		bitcnt = fxdr_unsigned(int, *tl);
 		if (bitcnt > 0) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			if (notifybitsp != NULL)
 				*notifybitsp =
 				    fxdr_unsigned(uint32_t, *tl);
 		}
 		if (safilled != AF_UNSPEC) {
 			KASSERT(ndi != NULL, ("ndi is NULL"));
 			*ndip = ndi;
 		} else
 			error = EPERM;
 		if (error == 0) {
 			/*
 			 * Now we can do a TCP connection for the correct
 			 * NFS version and IP address.
 			 */
 			error = nfsrpc_fillsa(nmp, &ssin, &ssin6, safilled,
 			    gotvers, gotminor, &dsp, p);
 		}
 		if (error == 0) {
 			KASSERT(gotdspp != NULL, ("gotdspp is NULL"));
 			*gotdspp = dsp;
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	if (error != 0 && ndi != NULL) {
 		nfscl_freedevinfo(ndi);
 		/*
 		 * *ndip may already have been set to ndi above (when the
 		 * connection attempt failed afterwards), so clear it to
 		 * avoid handing the caller a pointer to the released
 		 * structure.
 		 */
 		*ndip = NULL;
 	}
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutCommit.
  *
  * off, len	- byte range being committed.
  * reclaim	- non-zero sets the reclaim flag in the request.
  * lastbyte	- last byte written; it is clamped into the range
  *		  [off, off + len - 1] before being sent.
  * stateidp	- layout stateid; seqid is XDR encoded, the other words are
  *		  copied as stored.
  * layouttype	- layout type for the (empty) layout update body.
  *
  * Returns the error from newnfs_request() if the RPC could not be done,
  * otherwise the status carried in the reply.
  */
 int
 nfsrpc_layoutcommit(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim,
     uint64_t off, uint64_t len, uint64_t lastbyte, nfsv4stateid_t *stateidp,
     int layouttype, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTCOMMIT, nmp, fh, fhlen, NULL, NULL,
 	    0, 0, cred);
 	NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED + 3 * NFSX_HYPER +
 	    NFSX_STATEID);
 	txdr_hyper(off, tl);
 	tl += 2;
 	txdr_hyper(len, tl);
 	tl += 2;
 	if (reclaim != 0)
 		*tl++ = newnfs_true;
 	else
 		*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(stateidp->seqid);
 	*tl++ = stateidp->other[0];
 	*tl++ = stateidp->other[1];
 	*tl++ = stateidp->other[2];
 	*tl++ = newnfs_true;		/* A last byte written follows. */
 	/*
 	 * Clamp lastbyte to the committed range.  The upper bound is tested
 	 * with a subtraction, which cannot wrap since lastbyte >= off at
 	 * that point.  Testing against off + len would wrap for a large len
 	 * and could move lastbyte below off.  The result is the same as the
 	 * additive test whenever off + len does not wrap.
 	 */
 	if (lastbyte < off)
 		lastbyte = off;
 	else if (lastbyte - off >= len)
 		lastbyte = off + len - 1;
 	txdr_hyper(lastbyte, tl);
 	tl += 2;
 	*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(layouttype);
 	/* All supported layouts are 0 length. */
 	*tl = txdr_unsigned(0);
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 LayoutReturn.
  *
  * reclaim	- non-zero sets the reclaim flag in the request.
  * layoutreturn - kind of return; only NFSLAYOUTRETURN_FILE adds the
  *		  offset/length/stateid/body arguments.
  * stateidp	- layout stateid; it is updated from the reply when the
  *		  reply indicates that a stateid is present.
  * stat, op, devid - for the Flex File layout with stat != 0, one I/O error
  *		  entry is sent reporting this status, operation and
  *		  device ID; otherwise an empty error list is sent.
  *
  * Returns the error from newnfs_request() if the RPC could not be done,
  * otherwise the status carried in the reply.
  */
 int
 nfsrpc_layoutreturn(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim,
     int layouttype, uint32_t iomode, int layoutreturn, uint64_t offset,
     uint64_t len, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p,
     uint32_t stat, uint32_t op, char *devid)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	uint64_t tu64;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTRETURN, nmp, fh, fhlen, NULL, NULL,
 	    0, 0, cred);
 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
 	if (reclaim != 0)
 		*tl++ = newnfs_true;
 	else
 		*tl++ = newnfs_false;
 	*tl++ = txdr_unsigned(layouttype);
 	*tl++ = txdr_unsigned(iomode);
 	*tl = txdr_unsigned(layoutreturn);
 	if (layoutreturn == NFSLAYOUTRETURN_FILE) {
 		NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID +
 		    NFSX_UNSIGNED);
 		txdr_hyper(offset, tl);
 		tl += 2;
 		txdr_hyper(len, tl);
 		tl += 2;
 		NFSCL_DEBUG(4, "layoutret stseq=%d\n", (int)stateidp->seqid);
 		*tl++ = txdr_unsigned(stateidp->seqid);
 		*tl++ = stateidp->other[0];
 		*tl++ = stateidp->other[1];
 		*tl++ = stateidp->other[2];
 		/* *tl is now the byte length of the layout specific body. */
 		if (layouttype == NFSLAYOUT_NFSV4_1_FILES)
 			*tl = txdr_unsigned(0);
 		else if (layouttype == NFSLAYOUT_FLEXFILE) {
 			if (stat != 0) {
 				*tl = txdr_unsigned(2 * NFSX_HYPER +
 				    NFSX_STATEID + NFSX_V4DEVICEID + 5 *
 				    NFSX_UNSIGNED);
 				NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER +
 				    NFSX_STATEID + NFSX_V4DEVICEID + 5 *
 				    NFSX_UNSIGNED);
 				*tl++ = txdr_unsigned(1);	/* One error. */
 				tu64 = 0;			/* Offset. */
 				txdr_hyper(tu64, tl); tl += 2;
 				tu64 = UINT64_MAX;		/* Length. */
 				txdr_hyper(tu64, tl); tl += 2;
 				/*
 				 * Encode the stateid word by word, exactly as
 				 * done for the stateid argument above, so that
 				 * the seqid goes out XDR encoded instead of in
 				 * host byte order.
 				 */
 				*tl++ = txdr_unsigned(stateidp->seqid);
 				*tl++ = stateidp->other[0];
 				*tl++ = stateidp->other[1];
 				*tl++ = stateidp->other[2];
 				*tl++ = txdr_unsigned(1);	/* One error. */
 				NFSBCOPY(devid, tl, NFSX_V4DEVICEID);
 				tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
 				*tl++ = txdr_unsigned(stat);
 				*tl++ = txdr_unsigned(op);
 			} else {
 				*tl = txdr_unsigned(2 * NFSX_UNSIGNED);
 				NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 				/* No ioerrs. */
 				*tl++ = 0;
 			}
 			*tl = 0;	/* No stats yet. */
 		}
 	}
 	nd->nd_flag |= ND_USEGSSNAME;
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		/* A non-zero word means an updated stateid follows. */
 		if (*tl != 0) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID);
 			stateidp->seqid = fxdr_unsigned(uint32_t, *tl++);
 			stateidp->other[0] = *tl++;
 			stateidp->other[1] = *tl++;
 			stateidp->other[2] = *tl;
 		}
 	} else
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Do the NFSv4.2 LayoutError.
  * Reports a single error (stat, for operation op, on device devid) for
  * the byte range offset/len of the layout identified by stateidp.
  * Returns the error from newnfs_request() if the RPC could not be done,
  * otherwise the status carried in the reply.
  */
 static int
 nfsrpc_layouterror(struct nfsmount *nmp, uint8_t *fh, int fhlen, uint64_t offset,
     uint64_t len, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p,
     uint32_t stat, uint32_t op, char *devid)
 {
 	struct nfsrv_descript reqdesc, *nd = &reqdesc;
 	uint32_t *wp;
 	int rv;
 
 	nfscl_reqstart(nd, NFSPROC_LAYOUTERROR, nmp, fh, fhlen, NULL, NULL,
 	    0, 0, cred);
 	NFSM_BUILD(wp, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID +
 	    NFSX_V4DEVICEID + 3 * NFSX_UNSIGNED);
 
 	/* The byte range. */
 	txdr_hyper(offset, wp);
 	wp += 2;
 	txdr_hyper(len, wp);
 	wp += 2;
 
 	/* The layout stateid, followed by the count of errors (one). */
 	wp[0] = txdr_unsigned(stateidp->seqid);
 	wp[1] = stateidp->other[0];
 	wp[2] = stateidp->other[1];
 	wp[3] = stateidp->other[2];
 	wp[4] = txdr_unsigned(1);
 	wp += 5;
 
 	/* The error itself: device ID, status and operation. */
 	NFSBCOPY(devid, wp, NFSX_V4DEVICEID);
 	wp += NFSX_V4DEVICEID / NFSX_UNSIGNED;
 	wp[0] = txdr_unsigned(stat);
 	wp[1] = txdr_unsigned(op);
 
 	nd->nd_flag |= ND_USEGSSNAME;
 	rv = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (rv == 0) {
 		rv = nd->nd_repstat;
 		m_freem(nd->nd_mrep);
 	}
 	return (rv);
 }
 
 /*
  * Acquire a layout and devinfo, if possible. The caller must have acquired
  * a reference count on the nfsclclient structure before calling this.
  * Return the layout in lypp with a reference count on it, if successful.
  *
  * If nfscl_getlayout() already finds a layout with a matching file layout,
  * it is returned without any RPC.  Otherwise a LayoutGet is done (for the
  * whole file when there is no layout yet, or starting at "off" using the
  * existing layout's stateid) and the outcome is handed to
  * nfsrpc_layoutgetres().  EIO is returned if the layout has been recalled.
  */
 static int
 nfsrpc_getlayout(struct nfsmount *nmp, vnode_t vp, struct nfsfh *nfhp,
     int iomode, uint32_t rw, uint32_t *notifybitsp, nfsv4stateid_t *stateidp,
     uint64_t off, struct nfscllayout **lypp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfscllayout *lyp;
 	struct nfsclflayout *flp;
 	struct nfsclflayouthead flh;
 	int error = 0, islocked, layoutlen, layouttype, recalled;
 	/*
 	 * Initialized, since it is passed by value to nfsrpc_layoutgetres()
 	 * even when the LayoutGet failed and never filled it in.
 	 */
 	int retonclose = 0;
 	nfsv4stateid_t stateid;
 	struct nfsclsession *tsep;
 
 	*lypp = NULL;
 	if (NFSHASFLEXFILE(nmp))
 		layouttype = NFSLAYOUT_FLEXFILE;
 	else
 		layouttype = NFSLAYOUT_NFSV4_1_FILES;
 	/*
 	 * If lyp is returned non-NULL, there will be a refcnt (shared lock)
 	 * on it, iff flp != NULL or a lock (exclusive lock) on it iff
 	 * flp == NULL.
 	 */
 	lyp = nfscl_getlayout(nmp->nm_clp, nfhp->nfh_fh, nfhp->nfh_len,
 	    off, rw, &flp, &recalled);
 	islocked = 0;
 	if (lyp == NULL || flp == NULL) {
 		if (recalled != 0)
 			return (EIO);
 		LIST_INIT(&flh);
 		tsep = nfsmnt_mdssession(nmp);
 		layoutlen = tsep->nfsess_maxcache -
 		    (NFSX_STATEID + 3 * NFSX_UNSIGNED);
 		if (lyp == NULL) {
 			/* No layout yet: use the caller's stateid, seqid 0. */
 			stateid.seqid = 0;
 			stateid.other[0] = stateidp->other[0];
 			stateid.other[1] = stateidp->other[1];
 			stateid.other[2] = stateidp->other[2];
 			error = nfsrpc_layoutget(nmp, nfhp->nfh_fh,
 			    nfhp->nfh_len, iomode, (uint64_t)0, UINT64_MAX,
 			    (uint64_t)0, layouttype, layoutlen, &stateid,
 			    &retonclose, &flh, cred, p);
 		} else {
 			/* A layout exists (held locked): use its stateid. */
 			islocked = 1;
 			stateid.seqid = lyp->nfsly_stateid.seqid;
 			stateid.other[0] = lyp->nfsly_stateid.other[0];
 			stateid.other[1] = lyp->nfsly_stateid.other[1];
 			stateid.other[2] = lyp->nfsly_stateid.other[2];
 			error = nfsrpc_layoutget(nmp, nfhp->nfh_fh,
 			    nfhp->nfh_len, iomode, off, UINT64_MAX,
 			    (uint64_t)0, layouttype, layoutlen, &stateid,
 			    &retonclose, &flh, cred, p);
 		}
 		/* The LayoutGet error, if any, is passed along as well. */
 		error = nfsrpc_layoutgetres(nmp, vp, nfhp->nfh_fh,
 		    nfhp->nfh_len, &stateid, retonclose, notifybitsp, &lyp,
 		    &flh, layouttype, error, NULL, cred, p);
 		if (error == 0)
 			*lypp = lyp;
 		else if (islocked != 0)
 			nfscl_rellayout(lyp, 1);
 	} else
 		*lypp = lyp;
 	return (error);
 }
 
 /*
  * Do a TCP connection plus exchange id and create session.
  * If successful, a "struct nfsclds" is linked into the list for the
  * mount point and a pointer to it is returned.
  *
  * sin, sin6	- address to connect to; which one is used is selected by af.
  * af		- AF_INET or AF_INET6; anything else returns EPERM.
  * vers, minorvers - NFS version to use; for NFS_VER4 an ExchangeID and
  *		  CreateSession are done, otherwise a bare nfsclds is
  *		  allocated without a session.
  * dspp	- set to the nfsclds to use, which may be an existing one from
  *		  the mount's nm_sess list instead of a new one.
  *
  * Returns 0 with *dspp set, EPERM if the mount has no nfsclclient or af is
  * unsupported, or the error from the connect, ExchangeID or CreateSession.
  */
 static int
 nfsrpc_fillsa(struct nfsmount *nmp, struct sockaddr_in *sin,
     struct sockaddr_in6 *sin6, sa_family_t af, int vers, int minorvers,
     struct nfsclds **dspp, NFSPROC_T *p)
 {
 	struct sockaddr_in *msad, *sad;
 	struct sockaddr_in6 *msad6, *sad6;
 	struct nfsclclient *clp;
 	struct nfssockreq *nrp;
 	struct nfsclds *dsp, *tdsp;
 	int error, firsttry;
 	enum nfsclds_state retv;
 	uint32_t sequenceid = 0;
 
 	KASSERT(nmp->nm_sockreq.nr_cred != NULL,
 	    ("nfsrpc_fillsa: NULL nr_cred"));
 	NFSLOCKCLSTATE();
 	clp = nmp->nm_clp;
 	NFSUNLOCKCLSTATE();
 	if (clp == NULL)
 		return (EPERM);
 	if (af == AF_INET) {
 		NFSLOCKMNT(nmp);
 		/*
 		 * Check to see if we already have a session for this
 		 * address that is usable for a DS.
 		 * Note that the MDS's address is in a different place
 		 * than the sessions already acquired for DS's.
 		 */
 		msad = (struct sockaddr_in *)nmp->nm_sockreq.nr_nam;
 		tdsp = TAILQ_FIRST(&nmp->nm_sess);
 		while (tdsp != NULL) {
 			if (msad != NULL && msad->sin_family == AF_INET &&
 			    sin->sin_addr.s_addr == msad->sin_addr.s_addr &&
 			    sin->sin_port == msad->sin_port &&
 			    (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 &&
 			    tdsp->nfsclds_sess.nfsess_defunct == 0) {
 				*dspp = tdsp;
 				NFSUNLOCKMNT(nmp);
 				NFSCL_DEBUG(4, "fnd same addr\n");
 				return (0);
 			}
 			/*
 			 * For the entries after the first, the address
 			 * comes from the entry's own nfsclds_sockp.
 			 */
 			tdsp = TAILQ_NEXT(tdsp, nfsclds_list);
 			if (tdsp != NULL && tdsp->nfsclds_sockp != NULL)
 				msad = (struct sockaddr_in *)
 				    tdsp->nfsclds_sockp->nr_nam;
 			else
 				msad = NULL;
 		}
 		NFSUNLOCKMNT(nmp);
 
 		/* No IP address match, so look for new/trunked one. */
 		sad = malloc(sizeof(*sad), M_SONAME, M_WAITOK | M_ZERO);
 		sad->sin_len = sizeof(*sad);
 		sad->sin_family = AF_INET;
 		sad->sin_port = sin->sin_port;
 		sad->sin_addr.s_addr = sin->sin_addr.s_addr;
 		if (NFSHASPNFS(nmp) && NFSHASKERB(nmp)) {
 			/* For pNFS, a separate server principal is needed. */
 			nrp = malloc(sizeof(*nrp) + NI_MAXSERV + NI_MAXHOST,
 			    M_NFSSOCKREQ, M_WAITOK | M_ZERO);
 			/*
 			 * Use the latter part of nr_srvprinc as a temporary
 			 * buffer for the IP address.
 			 */
 			inet_ntoa_r(sad->sin_addr,
 			    &nrp->nr_srvprinc[NI_MAXSERV]);
 			NFSCL_DEBUG(1, "nfsrpc_fillsa: DS IP=%s\n",
 			    &nrp->nr_srvprinc[NI_MAXSERV]);
 			/* Without a principal, leave the name empty. */
 			if (!rpc_gss_ip_to_srv_principal_call(
 			    &nrp->nr_srvprinc[NI_MAXSERV], "nfs",
 			    nrp->nr_srvprinc))
 				nrp->nr_srvprinc[0] = '\0';
 			NFSCL_DEBUG(1, "nfsrpc_fillsa: srv principal=%s\n",
 			    nrp->nr_srvprinc);
 		} else
 			nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ,
 			    M_WAITOK | M_ZERO);
 		nrp->nr_nam = (struct sockaddr *)sad;
 	} else if (af == AF_INET6) {
 		NFSLOCKMNT(nmp);
 		/*
 		 * Check to see if we already have a session for this
 		 * address that is usable for a DS.
 		 * Note that the MDS's address is in a different place
 		 * than the sessions already acquired for DS's.
 		 */
 		msad6 = (struct sockaddr_in6 *)nmp->nm_sockreq.nr_nam;
 		tdsp = TAILQ_FIRST(&nmp->nm_sess);
 		while (tdsp != NULL) {
 			if (msad6 != NULL && msad6->sin6_family == AF_INET6 &&
 			    IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
 			    &msad6->sin6_addr) &&
 			    sin6->sin6_port == msad6->sin6_port &&
 			    (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 &&
 			    tdsp->nfsclds_sess.nfsess_defunct == 0) {
 				*dspp = tdsp;
 				NFSUNLOCKMNT(nmp);
 				return (0);
 			}
 			/*
 			 * For the entries after the first, the address
 			 * comes from the entry's own nfsclds_sockp.
 			 */
 			tdsp = TAILQ_NEXT(tdsp, nfsclds_list);
 			if (tdsp != NULL && tdsp->nfsclds_sockp != NULL)
 				msad6 = (struct sockaddr_in6 *)
 				    tdsp->nfsclds_sockp->nr_nam;
 			else
 				msad6 = NULL;
 		}
 		NFSUNLOCKMNT(nmp);
 
 		/* No IP address match, so look for new/trunked one. */
 		sad6 = malloc(sizeof(*sad6), M_SONAME, M_WAITOK | M_ZERO);
 		sad6->sin6_len = sizeof(*sad6);
 		sad6->sin6_family = AF_INET6;
 		sad6->sin6_port = sin6->sin6_port;
 		NFSBCOPY(&sin6->sin6_addr, &sad6->sin6_addr,
 		    sizeof(struct in6_addr));
 		if (NFSHASPNFS(nmp) && NFSHASKERB(nmp)) {
 			/* For pNFS, a separate server principal is needed. */
 			nrp = malloc(sizeof(*nrp) + NI_MAXSERV + NI_MAXHOST,
 			    M_NFSSOCKREQ, M_WAITOK | M_ZERO);
 			/*
 			 * Use the latter part of nr_srvprinc as a temporary
 			 * buffer for the IP address.
 			 */
 			inet_ntop(AF_INET6, &sad6->sin6_addr,
 			    &nrp->nr_srvprinc[NI_MAXSERV], NI_MAXHOST);
 			NFSCL_DEBUG(1, "nfsrpc_fillsa: DS IP=%s\n",
 			    &nrp->nr_srvprinc[NI_MAXSERV]);
 			/* Without a principal, leave the name empty. */
 			if (!rpc_gss_ip_to_srv_principal_call(
 			    &nrp->nr_srvprinc[NI_MAXSERV], "nfs",
 			    nrp->nr_srvprinc))
 				nrp->nr_srvprinc[0] = '\0';
 			NFSCL_DEBUG(1, "nfsrpc_fillsa: srv principal=%s\n",
 			    nrp->nr_srvprinc);
 		} else
 			nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ,
 			    M_WAITOK | M_ZERO);
 		nrp->nr_nam = (struct sockaddr *)sad6;
 	} else
 		return (EPERM);
 
 	/* Common setup of the new socket request structure. */
 	nrp->nr_sotype = SOCK_STREAM;
 	mtx_init(&nrp->nr_mtx, "nfssock", NULL, MTX_DEF);
 	nrp->nr_prog = NFS_PROG;
 	nrp->nr_vers = vers;
 
 	/*
 	 * Use the credentials that were used for the mount, which are
 	 * in nmp->nm_sockreq.nr_cred for newnfs_connect() etc.
 	 * Ref. counting the credentials with crhold() is probably not
 	 * necessary, since nm_sockreq.nr_cred won't be crfree()'d until
 	 * unmount, but I did it anyhow.
 	 */
 	nrp->nr_cred = crhold(nmp->nm_sockreq.nr_cred);
 	error = newnfs_connect(nmp, nrp, NULL, p, 0, false, &nrp->nr_client);
 	NFSCL_DEBUG(3, "DS connect=%d\n", error);
 
 	dsp = NULL;
 	/* Now, do the exchangeid and create session. */
 	if (error == 0) {
 		if (vers == NFS_VER4) {
 			/*
 			 * If the ExchangeID fails with a minor version
 			 * mismatch, retry it once using NFSv4.2.
 			 */
 			firsttry = 0;
 			do {
 				error = nfsrpc_exchangeid(nmp, clp, nrp, 
 				    minorvers, NFSV4EXCH_USEPNFSDS, &dsp,
 				    nrp->nr_cred, p);
 				NFSCL_DEBUG(3, "DS exchangeid=%d\n", error);
 				if (error == NFSERR_MINORVERMISMATCH)
 					minorvers = NFSV42_MINORVERSION;
 			} while (error == NFSERR_MINORVERMISMATCH &&
 			    firsttry++ == 0);
 			if (error != 0)
 				newnfs_disconnect(NULL, nrp);
 		} else {
 			/* Not NFSv4: no session, just set up the nfsclds. */
 			dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS,
 			    M_WAITOK | M_ZERO);
 			dsp->nfsclds_flags |= NFSCLDS_DS;
 			dsp->nfsclds_expire = INT32_MAX; /* No renews needed. */
 			mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF);
 			mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession",
 			    NULL, MTX_DEF);
 		}
 	}
 	if (error == 0) {
 		dsp->nfsclds_sockp = nrp;
 		if (vers == NFS_VER4) {
 			NFSLOCKMNT(nmp);
 			retv = nfscl_getsameserver(nmp, dsp, &tdsp,
 			    &sequenceid);
 			NFSCL_DEBUG(3, "getsame ret=%d\n", retv);
 			if (retv == NFSDSP_USETHISSESSION &&
 			    nfscl_dssameconn != 0) {
 				NFSLOCKDS(tdsp);
 				tdsp->nfsclds_flags |= NFSCLDS_SAMECONN;
 				NFSUNLOCKDS(tdsp);
 				NFSUNLOCKMNT(nmp);
 				/*
 				 * If there is already a session for this
 				 * server, use it.
 				 */
 				newnfs_disconnect(NULL, nrp);
 				nfscl_freenfsclds(dsp);
 				*dspp = tdsp;
 				return (0);
 			}
 			if (retv == NFSDSP_NOTFOUND)
 				sequenceid =
 				    dsp->nfsclds_sess.nfsess_sequenceid;
 			NFSUNLOCKMNT(nmp);
 			error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess,
 			    nrp, dsp, sequenceid, 0, nrp->nr_cred, p);
 			NFSCL_DEBUG(3, "DS createsess=%d\n", error);
 		}
 	} else {
 		/*
 		 * The connect or ExchangeID failed, so nrp was never
 		 * attached to a nfsclds and must be released here.
 		 */
 		NFSFREECRED(nrp->nr_cred);
 		NFSFREEMUTEX(&nrp->nr_mtx);
 		free(nrp->nr_nam, M_SONAME);
 		free(nrp, M_NFSSOCKREQ);
 	}
 	if (error == 0) {
 		NFSCL_DEBUG(3, "add DS session\n");
 		/*
 		 * Put it at the end of the list. That way the list
 		 * is ordered by when the entry was added. This matters
 		 * since the one done first is the one that should be
 		 * used for sequenceid'ing any subsequent create sessions.
 		 */
 		NFSLOCKMNT(nmp);
 		TAILQ_INSERT_TAIL(&nmp->nm_sess, dsp, nfsclds_list);
 		NFSUNLOCKMNT(nmp);
 		*dspp = dsp;
 	} else if (dsp != NULL) {
 		/* CreateSession failed after nrp was attached to dsp. */
 		newnfs_disconnect(NULL, nrp);
 		nfscl_freenfsclds(dsp);
 	}
 	return (error);
 }
 
 /*
  * Do the NFSv4.1 Reclaim Complete.
  * The single boolean argument is sent as false.
  * Returns the error from newnfs_request() if the RPC could not be done,
  * otherwise the status carried in the reply.
  */
 int
 nfsrpc_reclaimcomplete(struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript reqdesc, *nd = &reqdesc;
 	uint32_t *argp;
 	int rv;
 
 	nfscl_reqstart(nd, NFSPROC_RECLAIMCOMPL, nmp, NULL, 0, NULL, NULL, 0,
 	    0, cred);
 	NFSM_BUILD(argp, uint32_t *, NFSX_UNSIGNED);
 	argp[0] = newnfs_false;
 	nd->nd_flag |= ND_USEGSSNAME;
 	rv = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (rv == 0) {
 		/* The RPC completed: report the reply status, drop the reply. */
 		rv = nd->nd_repstat;
 		m_freem(nd->nd_mrep);
 	}
 	return (rv);
 }
 
 /*
  * Initialize the slot tables for a session.
  * Any reply still saved in a callback slot is freed before the slot is
  * zeroed, then the slot sequence numbers and slot bitmaps are cleared.
  */
 static void
 nfscl_initsessionslots(struct nfsclsession *sep)
 {
 	int slot;
 
 	/* Callback slots. */
 	slot = 0;
 	while (slot < NFSV4_CBSLOTS) {
 		if (sep->nfsess_cbslots[slot].nfssl_reply != NULL)
 			m_freem(sep->nfsess_cbslots[slot].nfssl_reply);
 		NFSBZERO(&sep->nfsess_cbslots[slot], sizeof(struct nfsslot));
 		slot++;
 	}
 
 	/* Fore channel slot sequence numbers. */
 	slot = 0;
 	while (slot < 64) {
 		sep->nfsess_slotseq[slot] = 0;
 		slot++;
 	}
 
 	sep->nfsess_slots = 0;
 	sep->nfsess_badslots = 0;
 }
 
 /*
  * Called to try and do an I/O operation via an NFSv4.1 Data Server (DS).
  *
  * vp, uiop	- file and I/O request; uiop is advanced as I/O completes.
  * iomode, must_commit - passed through to the layout I/O functions;
  *		  *must_commit is also set to 1 when a mirrored write
  *		  reports that a commit is required.
  * rwaccess	- NFSV4OPEN_ACCESSREAD or NFSV4OPEN_ACCESSWRITE.
  * docommit	- non-zero when a commit, not a read/write, is to be done.
  *
  * A stateid and a layout are acquired first (doing a LayoutGet if none is
  * cached) and then the I/O is done one layout segment at a time.  For the
  * Flex File layout with mirrors, reads use one mirror picked at random
  * and writes/commits go to all of the mirrors.
  * Returns 0 on success.  Failures before the I/O loop (no stateid, no
  * layout) return that error as is; once the I/O loop is entered, any
  * failure other than NFSERR_OPENMODE on a read is returned as EIO, and a
  * failure of nfsm_uiombuflist() is returned as EFAULT.
  */
 int
 nfscl_doiods(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     uint32_t rwaccess, int docommit, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfscllayout *layp;
 	struct nfscldevinfo *dip;
 	struct nfsclflayout *rflp;
 	struct mbuf *m, *m2;
 	struct nfsclwritedsdorpc *drpc, *tdrpc;
 	nfsv4stateid_t stateid;
 	struct ucred *newcred;
 	uint64_t lastbyte, len, off, oresid, xfer;
 	int eof, error, firstmirror, i, iolaymode, mirrorcnt, recalled, timo;
 	void *lckp;
 	uint8_t *dev;
 	void *iovbase = NULL;
 	size_t iovlen = 0;
 	off_t offs = 0;
 	ssize_t resid = 0;
 	uint32_t op;
 
 	if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 ||
 	    (np->n_flag & NNOLAYOUT) != 0)
 		return (EIO);
 	/* Now, get a reference cnt on the clientid for this mount. */
 	if (nfscl_getref(nmp) == 0)
 		return (EIO);
 
 	/* Find an appropriate stateid. */
 	newcred = NFSNEWCRED(cred);
 	error = nfscl_getstateid(vp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len,
 	    rwaccess, 1, newcred, p, &stateid, &lckp);
 	if (error != 0) {
 		NFSFREECRED(newcred);
 		nfscl_relref(nmp);
 		return (error);
 	}
 	/* Search for a layout for this file. */
 	off = uiop->uio_offset;
 	layp = nfscl_getlayout(nmp->nm_clp, np->n_fhp->nfh_fh,
 	    np->n_fhp->nfh_len, off, rwaccess, &rflp, &recalled);
 	if (layp == NULL || rflp == NULL) {
 		if (recalled != 0) {
 			NFSFREECRED(newcred);
 			if (lckp != NULL)
 				nfscl_lockderef(lckp);
 			nfscl_relref(nmp);
 			return (EIO);
 		}
 		if (layp != NULL) {
 			nfscl_rellayout(layp, (rflp == NULL) ? 1 : 0);
 			layp = NULL;
 		}
 		/* Try and get a Layout, if it is supported. */
 		if (rwaccess == NFSV4OPEN_ACCESSWRITE ||
 		    (np->n_flag & NWRITEOPENED) != 0)
 			iolaymode = NFSLAYOUTIOMODE_RW;
 		else
 			iolaymode = NFSLAYOUTIOMODE_READ;
 		error = nfsrpc_getlayout(nmp, vp, np->n_fhp, iolaymode,
 		    rwaccess, NULL, &stateid, off, &layp, newcred, p);
 		if (error != 0) {
 			/* Remember the failure, so it is not tried again. */
 			NFSLOCKNODE(np);
 			np->n_flag |= NNOLAYOUT;
 			NFSUNLOCKNODE(np);
 			if (lckp != NULL)
 				nfscl_lockderef(lckp);
 			NFSFREECRED(newcred);
 			if (layp != NULL)
 				nfscl_rellayout(layp, 0);
 			nfscl_relref(nmp);
 			return (error);
 		}
 	}
 
 	/*
 	 * Loop around finding a layout that works for the first part of
 	 * this I/O operation, and then call the function that actually
 	 * does the RPC.
 	 */
 	eof = 0;
 	len = (uint64_t)uiop->uio_resid;
 	while (len > 0 && error == 0 && eof == 0) {
 		off = uiop->uio_offset;
 		error = nfscl_findlayoutforio(layp, off, rwaccess, &rflp);
 		if (error == 0) {
 			oresid = xfer = (uint64_t)uiop->uio_resid;
 			if (xfer > (rflp->nfsfl_end - rflp->nfsfl_off))
 				xfer = rflp->nfsfl_end - rflp->nfsfl_off;
 			/*
 			 * For Flex File layout with mirrored DSs, select one
 			 * of them at random for reads. For writes and commits,
 			 * do all mirrors.
 			 */
 			m = NULL;
 			tdrpc = drpc = NULL;
 			firstmirror = 0;
 			mirrorcnt = 1;
 			if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0 &&
 			    (mirrorcnt = rflp->nfsfl_mirrorcnt) > 1) {
 				if (rwaccess == NFSV4OPEN_ACCESSREAD) {
 					firstmirror = arc4random() % mirrorcnt;
 					mirrorcnt = firstmirror + 1;
 				} else {
 					if (docommit == 0) {
 						/*
 						 * Save values, so uiop can be
 						 * rolled back upon a write
 						 * error.
 						 */
 						offs = uiop->uio_offset;
 						resid = uiop->uio_resid;
 						iovbase =
 						    uiop->uio_iov->iov_base;
 						iovlen = uiop->uio_iov->iov_len;
 						m = nfsm_uiombuflist(uiop, len,
 						    0);
 						if (m == NULL) {
 							error = EFAULT;
 							break;
 						}
 					}
 					/* One entry per mirror except the last. */
 					tdrpc = drpc = malloc(sizeof(*drpc) *
 					    (mirrorcnt - 1), M_TEMP, M_WAITOK |
 					    M_ZERO);
 				}
 			}
 			for (i = firstmirror; i < mirrorcnt && error == 0; i++){
 				/*
 				 * Every mirror but the last gets a copy of
 				 * the data; the last one takes the original.
 				 */
 				m2 = NULL;
 				if (m != NULL && i < mirrorcnt - 1)
 					m2 = m_copym(m, 0, M_COPYALL, M_WAITOK);
 				else {
 					m2 = m;
 					m = NULL;
 				}
 				if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0) {
 					dev = rflp->nfsfl_ffm[i].dev;
 					dip = nfscl_getdevinfo(nmp->nm_clp, dev,
 					    rflp->nfsfl_ffm[i].devp);
 				} else {
 					dev = rflp->nfsfl_dev;
 					dip = nfscl_getdevinfo(nmp->nm_clp, dev,
 					    rflp->nfsfl_devp);
 				}
 				if (dip != NULL) {
 					if ((rflp->nfsfl_flags & NFSFL_FLEXFILE)
 					    != 0)
 						error = nfscl_dofflayoutio(vp,
 						    uiop, iomode, must_commit,
 						    &eof, &stateid, rwaccess,
 						    dip, layp, rflp, off, xfer,
 						    i, docommit, m2, tdrpc,
 						    newcred, p);
 					else
 						error = nfscl_doflayoutio(vp,
 						    uiop, iomode, must_commit,
 						    &eof, &stateid, rwaccess,
 						    dip, layp, rflp, off, xfer,
 						    docommit, newcred, p);
 					nfscl_reldevinfo(dip);
 				} else {
 					if (m2 != NULL)
 						m_freem(m2);
 					error = EIO;
 				}
 				tdrpc++;
 			}
 			if (m != NULL)
 				m_freem(m);
 			tdrpc = drpc;
 			timo = hz / 50;		/* Wait for 20msec. */
 			if (timo < 1)
 				timo = 1;
 			for (i = firstmirror; i < mirrorcnt - 1 &&
 			    tdrpc != NULL; i++, tdrpc++) {
 				/*
 				 * For the unused drpc entries, both inprog and
 				 * err == 0, so this loop won't break.
 				 */
 				while (tdrpc->inprog != 0 && tdrpc->done == 0)
 					tsleep(&tdrpc->tsk, PVFS, "clrpcio",
 					    timo);
 				if (error == 0 && tdrpc->err != 0)
 					error = tdrpc->err;
 				if (rwaccess != NFSV4OPEN_ACCESSREAD &&
 				    docommit == 0 && *must_commit == 0 &&
 				    tdrpc->must_commit == 1)
 					*must_commit = 1;
 			}
 			free(drpc, M_TEMP);
 			if (error == 0) {
 				if (mirrorcnt > 1 && rwaccess ==
 				    NFSV4OPEN_ACCESSWRITE && docommit == 0) {
 					NFSLOCKCLSTATE();
 					layp->nfsly_flags |= NFSLY_WRITTEN;
 					NFSUNLOCKCLSTATE();
 				}
 				lastbyte = off + xfer - 1;
 				NFSLOCKCLSTATE();
 				if (lastbyte > layp->nfsly_lastbyte)
 					layp->nfsly_lastbyte = lastbyte;
 				NFSUNLOCKCLSTATE();
 			} else if (error == NFSERR_OPENMODE &&
 			    rwaccess == NFSV4OPEN_ACCESSREAD) {
 				NFSLOCKMNT(nmp);
 				nmp->nm_state |= NFSSTA_OPENMODE;
 				NFSUNLOCKMNT(nmp);
 			} else if ((error == NFSERR_NOSPC ||
 			    error == NFSERR_IO || error == NFSERR_NXIO) &&
 			    nmp->nm_minorvers == NFSV42_MINORVERSION) {
 				if (docommit != 0)
 					op = NFSV4OP_COMMIT;
 				else if (rwaccess == NFSV4OPEN_ACCESSREAD)
 					op = NFSV4OP_READ;
 				else
 					op = NFSV4OP_WRITE;
 				/*
 				 * Report the error against "dev", the device
 				 * ID used for the devinfo lookup above.  The
 				 * devinfo itself must not be used here, since
 				 * its reference has already been released and
 				 * it is NULL when the lookup failed.
 				 */
 				nfsrpc_layouterror(nmp, np->n_fhp->nfh_fh,
 				    np->n_fhp->nfh_len, off, xfer,
 				    &layp->nfsly_stateid, newcred, p, error, op,
 				    (char *)dev);
 				error = EIO;
 			} else
 				error = EIO;
 			if (error == 0)
 				len -= (oresid - (uint64_t)uiop->uio_resid);
 			else if (mirrorcnt > 1 && rwaccess ==
 			    NFSV4OPEN_ACCESSWRITE && docommit == 0) {
 				/*
 				 * In case the rpc gets retried, roll the
 				 * uio fields changed by nfsm_uiombuflist()
 				 * back.
 				 */
 				uiop->uio_offset = offs;
 				uiop->uio_resid = resid;
 				uiop->uio_iov->iov_base = iovbase;
 				uiop->uio_iov->iov_len = iovlen;
 			}
 		}
 	}
 	if (lckp != NULL)
 		nfscl_lockderef(lckp);
 	NFSFREECRED(newcred);
 	nfscl_rellayout(layp, 0);
 	nfscl_relref(nmp);
 	return (error);
 }
 
 /*
  * Find a file layout that will handle the first bytes of the requested
  * range and return the information from it needed to the I/O operation.
  * Of the file layouts that cover "off", the one that ends furthest out is
  * returned in retflpp.  Returns 0 if one was found, EIO otherwise.
  */
 int
 nfscl_findlayoutforio(struct nfscllayout *lyp, uint64_t off, uint32_t rwaccess,
     struct nfsclflayout **retflpp)
 {
 	struct nfsclflayout *best, *cur;
 	uint32_t pass;
 
 	best = NULL;
 	pass = rwaccess;
 	/* For reading, do the Read list first and then the Write list. */
 	do {
 		cur = (pass == NFSV4OPEN_ACCESSREAD) ?
 		    LIST_FIRST(&lyp->nfsly_flayread) :
 		    LIST_FIRST(&lyp->nfsly_flayrw);
 		/* The scan stops at the first entry starting beyond off. */
 		for (; cur != NULL && cur->nfsfl_off <= off;
 		    cur = LIST_NEXT(cur, nfsfl_list)) {
 			if (cur->nfsfl_end <= off)
 				continue;
 			if (best == NULL || best->nfsfl_end < cur->nfsfl_end)
 				best = cur;
 		}
 		pass = (pass == NFSV4OPEN_ACCESSREAD) ?
 		    NFSV4OPEN_ACCESSWRITE : 0;
 	} while (pass != 0);
 
 	if (best == NULL)
 		return (EIO);
 	/* This one covers the most bytes starting at off. */
 	*retflpp = best;
 	return (0);
 }
 
 /*
  * Do I/O using an NFSv4.1 or NFSv4.2 file layout.
  * The byte range starting at "off" for "len" bytes is split on stripe
  * unit boundaries and each piece is sent to the DS selected by the
  * layout's stripe pattern.  rwflag selects read vs. write; a non-zero
  * docommit issues a single Commit for the whole range instead of doing
  * data I/O.
  * Returns 0 on success, otherwise the error that stopped the loop.  EIO is
  * returned when the layout does not supply a file handle for the selected
  * stripe or when a commit is requested for a layout that commits through
  * the MDS.
  */
 static int
 nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp,
     struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off,
     uint64_t len, int docommit, struct ucred *cred, NFSPROC_T *p)
 {
 	uint64_t io_off, rel_off, stripe_unit_size, transfer, xfer;
 	int commit_thru_mds, error, stripe_index, stripe_pos, minorvers;
 	struct nfsnode *np;
 	struct nfsfh *fhp;
 	struct nfsclds **dspp;
 
 	np = VTONFS(vp);
 	rel_off = off - flp->nfsfl_patoff;
 	stripe_unit_size = flp->nfsfl_util & NFSFLAYUTIL_STRIPE_MASK;
 	stripe_pos = (rel_off / stripe_unit_size + flp->nfsfl_stripe1) %
 	    dp->nfsdi_stripecnt;
 	/* The first transfer only runs to the end of its stripe unit. */
 	transfer = stripe_unit_size - (rel_off % stripe_unit_size);
 	error = 0;
 
 	/* Loop around, doing I/O for each stripe unit. */
 	while (len > 0 && error == 0) {
 		/*
 		 * Refresh the pattern relative offset for this stripe unit.
 		 * The dense layout calculation below depends on it, so it
 		 * must track "off" as the loop advances instead of staying
 		 * at the offset of the first stripe unit.
 		 */
 		rel_off = off - flp->nfsfl_patoff;
 		stripe_index = nfsfldi_stripeindex(dp, stripe_pos);
 		dspp = nfsfldi_addr(dp, stripe_index);
 		if (((*dspp)->nfsclds_flags & NFSCLDS_MINORV2) != 0)
 			minorvers = NFSV42_MINORVERSION;
 		else
 			minorvers = NFSV41_MINORVERSION;
 		/* A commit is never split; data I/O is clipped to the unit. */
 		if (len > transfer && docommit == 0)
 			xfer = transfer;
 		else
 			xfer = len;
 		if ((flp->nfsfl_util & NFSFLAYUTIL_DENSE) != 0) {
 			/* Dense layout. */
 			if (stripe_pos >= flp->nfsfl_fhcnt)
 				return (EIO);
 			fhp = flp->nfsfl_fh[stripe_pos];
 			io_off = (rel_off / (stripe_unit_size *
 			    dp->nfsdi_stripecnt)) * stripe_unit_size +
 			    rel_off % stripe_unit_size;
 		} else {
 			/* Sparse layout. */
 			if (flp->nfsfl_fhcnt > 1) {
 				if (stripe_index >= flp->nfsfl_fhcnt)
 					return (EIO);
 				fhp = flp->nfsfl_fh[stripe_index];
 			} else if (flp->nfsfl_fhcnt == 1)
 				fhp = flp->nfsfl_fh[0];
 			else
 				fhp = np->n_fhp;
 			io_off = off;
 		}
 		if ((flp->nfsfl_util & NFSFLAYUTIL_COMMIT_THRU_MDS) != 0) {
 			commit_thru_mds = 1;
 			/* A DS commit is not allowed for this layout. */
 			if (docommit != 0)
 				error = EIO;
 		} else {
 			commit_thru_mds = 0;
 			NFSLOCKNODE(np);
 			np->n_flag |= NDSCOMMIT;
 			NFSUNLOCKNODE(np);
 		}
 		if (docommit != 0) {
 			if (error == 0)
 				error = nfsrpc_commitds(vp, io_off, xfer,
 				    *dspp, fhp, NFS_VER4, minorvers, cred, p);
 			if (error == 0) {
 				/*
 				 * Set both eof and uio_resid = 0 to end any
 				 * loops.
 				 */
 				*eofp = 1;
 				uiop->uio_resid = 0;
 			} else {
 				NFSLOCKNODE(np);
 				np->n_flag &= ~NDSCOMMIT;
 				NFSUNLOCKNODE(np);
 			}
 		} else if (rwflag == NFSV4OPEN_ACCESSREAD)
 			error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp,
 			    io_off, xfer, fhp, 0, NFS_VER4, minorvers, cred, p);
 		else {
 			error = nfsrpc_writeds(vp, uiop, iomode, must_commit,
 			    stateidp, *dspp, io_off, xfer, fhp, commit_thru_mds,
 			    0, NFS_VER4, minorvers, cred, p);
 			if (error == 0) {
 				NFSLOCKCLSTATE();
 				lyp->nfsly_flags |= NFSLY_WRITTEN;
 				NFSUNLOCKCLSTATE();
 			}
 		}
 		if (error == 0) {
 			/* Later transfers start on a stripe unit boundary. */
 			transfer = stripe_unit_size;
 			stripe_pos = (stripe_pos + 1) % dp->nfsdi_stripecnt;
 			len -= xfer;
 			off += xfer;
 		}
 	}
 	return (error);
 }
 
 /*
  * Do I/O using an NFSv4.1 flex file layout.
  * The request for "len" bytes at "off" is issued against the DS for the
  * given "mirror" in pieces of at most the device's rsize/wsize; a commit
  * (docommit != 0) is issued as a single request.  For a mirrored write,
  * "mp" is the mbuf chain holding the data and "drpc" is the argument
  * structure handed to nfsio_writedsmir()/nfsio_commitds() for every mirror
  * except the last one.
  * Returns 0 on success, otherwise the error that stopped the loop.
  */
 static int
 nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp,
     struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off,
     uint64_t len, int mirror, int docommit, struct mbuf *mp,
     struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p)
 {
 	uint64_t xfer;
 	int error;
 	struct nfsnode *np;
 	struct nfsfh *fhp;
 	struct nfsclds **dspp;
 	struct ucred *tcred;
 	struct mbuf *m, *m2;
 	uint32_t copylen;
 
 	np = VTONFS(vp);
 	error = 0;
 	NFSCL_DEBUG(4, "nfscl_dofflayoutio: off=%ju len=%ju\n", (uintmax_t)off,
 	    (uintmax_t)len);
 	/* Loop around, doing I/O for each piece of at most copylen bytes. */
 	while (len > 0 && error == 0) {
 		/* Entry 0 of the device info is always the one used here. */
 		dspp = nfsfldi_addr(dp, 0);
 		fhp = flp->nfsfl_ffm[mirror].fh[dp->nfsdi_versindex];
 		/* The caller's stateid is replaced by this mirror's one. */
 		stateidp = &flp->nfsfl_ffm[mirror].st;
 		NFSCL_DEBUG(4, "mirror=%d vind=%d fhlen=%d st.seqid=0x%x\n",
 		    mirror, dp->nfsdi_versindex, fhp->nfh_len, stateidp->seqid);
 		if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) {
 			/*
 			 * Not tightly coupled: do the RPC with a copy of the
 			 * credentials that carries the uid/gid stored in the
 			 * layout for this mirror.  It is freed at the bottom
 			 * of the loop.
 			 */
 			tcred = NFSNEWCRED(cred);
 			tcred->cr_uid = flp->nfsfl_ffm[mirror].user;
 			tcred->cr_groups[0] = flp->nfsfl_ffm[mirror].group;
 			tcred->cr_ngroups = 1;
 		} else
 			tcred = cred;
 		if (rwflag == NFSV4OPEN_ACCESSREAD)
 			copylen = dp->nfsdi_rsize;
 		else {
 			copylen = dp->nfsdi_wsize;
 			if (len > copylen && mp != NULL) {
 				/*
 				 * When a mirrored configuration needs to do
 				 * multiple writes to each mirror, all writes
 				 * except the last one must be a multiple of
 				 * 4 bytes.  This is required so that the XDR
 				 * does not need padding.
 				 * If possible, clip the size to an exact
 				 * multiple of the mbuf length, so that the
 				 * split will be on an mbuf boundary.
 				 */
 				copylen &= 0xfffffffc;
 				if (copylen > mp->m_len)
 					copylen = copylen / mp->m_len *
 					    mp->m_len;
 			}
 		}
 		NFSLOCKNODE(np);
 		np->n_flag |= NDSCOMMIT;
 		NFSUNLOCKNODE(np);
 		/* A commit is never split; data I/O is clipped to copylen. */
 		if (len > copylen && docommit == 0)
 			xfer = copylen;
 		else
 			xfer = len;
 		if (docommit != 0) {
 			if (error == 0) {
 				/*
 				 * Do last mirrored DS commit with this thread.
 				 */
 				if (mirror < flp->nfsfl_mirrorcnt - 1)
 					error = nfsio_commitds(vp, off, xfer,
 					    *dspp, fhp, dp->nfsdi_vers,
 					    dp->nfsdi_minorvers, drpc, tcred,
 					    p);
 				else
 					error = nfsrpc_commitds(vp, off, xfer,
 					    *dspp, fhp, dp->nfsdi_vers,
 					    dp->nfsdi_minorvers, tcred, p);
 				NFSCL_DEBUG(4, "commitds=%d\n", error);
 				/* EACCES and ESTALE are not reported. */
 				if (error != 0 && error != EACCES && error !=
 				    ESTALE) {
 					NFSCL_DEBUG(4,
 					    "DS layreterr for commit\n");
 					nfscl_dserr(NFSV4OP_COMMIT, error, dp,
 					    lyp, *dspp);
 				}
 			}
 			NFSCL_DEBUG(4, "aft nfsio_commitds=%d\n", error);
 			if (error == 0) {
 				/*
 				 * Set both eof and uio_resid = 0 to end any
 				 * loops.
 				 */
 				*eofp = 1;
 				uiop->uio_resid = 0;
 			} else {
 				/* Undo the NDSCOMMIT set above. */
 				NFSLOCKNODE(np);
 				np->n_flag &= ~NDSCOMMIT;
 				NFSUNLOCKNODE(np);
 			}
 		} else if (rwflag == NFSV4OPEN_ACCESSREAD) {
 			error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp,
 			    off, xfer, fhp, 1, dp->nfsdi_vers,
 			    dp->nfsdi_minorvers, tcred, p);
 			NFSCL_DEBUG(4, "readds=%d\n", error);
 			if (error != 0 && error != EACCES && error != ESTALE) {
 				NFSCL_DEBUG(4, "DS layreterr for read\n");
 				nfscl_dserr(NFSV4OP_READ, error, dp, lyp,
 				    *dspp);
 			}
 		} else {
 			if (flp->nfsfl_mirrorcnt == 1) {
 				/* Not mirrored: write straight from the uio. */
 				error = nfsrpc_writeds(vp, uiop, iomode,
 				    must_commit, stateidp, *dspp, off, xfer,
 				    fhp, 0, 1, dp->nfsdi_vers,
 				    dp->nfsdi_minorvers, tcred, p);
 				if (error == 0) {
 					NFSLOCKCLSTATE();
 					lyp->nfsly_flags |= NFSLY_WRITTEN;
 					NFSUNLOCKCLSTATE();
 				}
 			} else {
 				/*
 				 * Mirrored: the data comes from the mbuf
 				 * chain.  "m" is the part written by this
 				 * iteration and "mp" is advanced to whatever
 				 * is left after the split.
 				 */
 				m = mp;
 				if (xfer < len) {
 					/* The mbuf list must be split. */
 					m2 = nfsm_split(mp, xfer);
 					if (m2 != NULL)
 						mp = m2;
 					else {
 						m_freem(mp);
 						error = EIO;
 					}
 				}
 				NFSCL_DEBUG(4, "mcopy len=%jd xfer=%jd\n",
 				    (uintmax_t)len, (uintmax_t)xfer);
 				/*
 				 * Do last write to a mirrored DS with this
 				 * thread.
 				 */
 				if (error == 0) {
 					if (mirror < flp->nfsfl_mirrorcnt - 1)
 						error = nfsio_writedsmir(vp,
 						    iomode, must_commit,
 						    stateidp, *dspp, off,
 						    xfer, fhp, m,
 						    dp->nfsdi_vers,
 						    dp->nfsdi_minorvers, drpc,
 						    tcred, p);
 					else
 						error = nfsrpc_writedsmir(vp,
 						    iomode, must_commit,
 						    stateidp, *dspp, off,
 						    xfer, fhp, m,
 						    dp->nfsdi_vers,
 						    dp->nfsdi_minorvers, tcred,
 						    p);
 				}
 				NFSCL_DEBUG(4, "nfsio_writedsmir=%d\n", error);
 				if (error != 0 && error != EACCES && error !=
 				    ESTALE) {
 					NFSCL_DEBUG(4,
 					    "DS layreterr for write\n");
 					nfscl_dserr(NFSV4OP_WRITE, error, dp,
 					    lyp, *dspp);
 				}
 			}
 		}
 		NFSCL_DEBUG(4, "aft read/writeds=%d\n", error);
 		if (error == 0) {
 			len -= xfer;
 			off += xfer;
 		}
 		/* Release the per-iteration credential copy, if one was made. */
 		if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0)
 			NFSFREECRED(tcred);
 	}
 	NFSCL_DEBUG(4, "eo nfscl_dofflayoutio=%d\n", error);
 	return (error);
 }
 
 /*
  * The actual read RPC done to a DS.
  * Builds a read of "len" bytes at "io_off" for file handle "fhp" and sends
  * it on the DS's socket, or on the MDS socket when the DS has none.
  * A "vers" of 0 or NFS_VER4 builds an NFSPROC_READDS request with the
  * stateid; any other value builds an NFSPROC_READ request without one.
  * "flex" selects how the stateid is marshalled (NFSSTATEID_PUTSTATEID when
  * non-zero, NFSSTATEID_PUTSEQIDZERO otherwise).
  * On success the returned data is copied into "uiop" and *eofp is set from
  * the reply.  Returns 0 or an error (transport, parse or server status).
  */
 static int
 nfsrpc_readds(vnode_t vp, struct uio *uiop, nfsv4stateid_t *stateidp, int *eofp,
     struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, int flex,
     int vers, int minorvers, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int attrflag, error, retlen;
 	struct nfsrv_descript nfsd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfssockreq *nrp;
 	struct nfsvattr na;
 
 	/* So that the nfsmout path can tell whether a reply exists. */
 	nd->nd_mrep = NULL;
 	if (vers == 0 || vers == NFS_VER4) {
 		nfscl_reqstart(nd, NFSPROC_READDS, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		vers = NFS_VER4;
 		NFSCL_DEBUG(4, "nfsrpc_readds: vers4 minvers=%d\n", minorvers);
 		if (flex != 0)
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 		else
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO);
 	} else {
 		nfscl_reqstart(nd, NFSPROC_READ, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		/* Move the count from the READ to the READDS statistic. */
 		NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READ]);
 		NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READDS]);
 		NFSCL_DEBUG(4, "nfsrpc_readds: vers3\n");
 	}
 	/* Arguments: 64bit offset followed by a 32bit count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3);
 	txdr_hyper(io_off, tl);
 	*(tl + 2) = txdr_unsigned(len);
 	nrp = dsp->nfsclds_sockp;
 	NFSCL_DEBUG(4, "nfsrpc_readds: nrp=%p\n", nrp);
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess);
 	NFSCL_DEBUG(4, "nfsrpc_readds: stat=%d err=%d\n", nd->nd_repstat,
 	    error);
 	if (error != 0)
 		return (error);
 	if (vers == NFS_VER3) {
 		/* The attributes are parsed before the status is checked. */
 		error = nfscl_postop_attr(nd, &na, &attrflag);
 		NFSCL_DEBUG(4, "nfsrpc_readds: postop=%d\n", error);
 		if (error != 0)
 			goto nfsmout;
 	}
 	if (nd->nd_repstat != 0) {
 		error = nd->nd_repstat;
 		goto nfsmout;
 	}
 	if (vers == NFS_VER3) {
 		/* Two words are dissected; eof is taken from the second. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		*eofp = fxdr_unsigned(int, *(tl + 1));
 	} else {
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		*eofp = fxdr_unsigned(int, *tl);
 	}
 	/* Get the returned data length, using len as the limit argument. */
 	NFSM_STRSIZ(retlen, len);
 	NFSCL_DEBUG(4, "nfsrpc_readds: retlen=%d eof=%d\n", retlen, *eofp);
 	error = nfsm_mbufuio(nd, uiop, retlen);
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The actual write RPC done to a DS.
  * Writes "len" bytes taken from "uiop" at "io_off" using file handle "fhp",
  * on the DS's socket or on the MDS socket when the DS has none.
  * A "vers" of 0 or NFS_VER4 builds an NFSPROC_WRITEDS request with the
  * stateid; any other value builds an NFSPROC_WRITE request without one.
  * *iomode supplies the requested stability and is set to the commitment
  * level on return.  The write verifier from the reply is recorded in the
  * mount (commit_thru_mds != 0) or in the DS; when it differs from the one
  * already recorded, *must_commit is set to 1 unless it is already 2.
  * If the server returns an error or a short write, the uio is rolled back
  * by the number of bytes that were not written.
  */
 static int
 nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit,
     nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len,
     struct nfsfh *fhp, int commit_thru_mds, int flex, int vers, int minorvers,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int attrflag, error, rlen, commit, committed = NFSWRITE_FILESYNC;
 	int32_t backup;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfssockreq *nrp;
 	struct nfsvattr na;
 
 	/* The rollback code below only adjusts the first iovec. */
 	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
 	nd->nd_mrep = NULL;
 	if (vers == 0 || vers == NFS_VER4) {
 		nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		NFSCL_DEBUG(4, "nfsrpc_writeds: vers4 minvers=%d\n", minorvers);
 		vers = NFS_VER4;
 		if (flex != 0)
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 		else
 			nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO);
 		NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	} else {
 		nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		/* Move the count from the WRITE to the WRITEDS statistic. */
 		NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]);
 		NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]);
 		NFSCL_DEBUG(4, "nfsrpc_writeds: vers3\n");
 		/* The v3 arguments carry one more 32bit word (see below). */
 		NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED);
 	}
 	txdr_hyper(io_off, tl);
 	tl += 2;
 	if (vers == NFS_VER3)
 		*tl++ = txdr_unsigned(len);
 	*tl++ = txdr_unsigned(*iomode);
 	*tl = txdr_unsigned(len);
 	/* Copy the data into the request; this advances the uio by len. */
 	error = nfsm_uiombuf(nd, uiop, len);
 	if (error != 0) {
 		m_freem(nd->nd_mreq);
 		return (error);
 	}
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess);
 	NFSCL_DEBUG(4, "nfsrpc_writeds: err=%d stat=%d\n", error,
 	    nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat != 0) {
 		/*
 		 * In case the rpc gets retried, roll
 		 * the uio fields changed by nfsm_uiombuf()
 		 * back.
 		 */
 		uiop->uio_offset -= len;
 		uiop->uio_resid += len;
 		uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - len;
 		uiop->uio_iov->iov_len += len;
 		error = nd->nd_repstat;
 	} else {
 		if (vers == NFS_VER3) {
 			error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL,
 			    NULL);
 			NFSCL_DEBUG(4, "nfsrpc_writeds: wcc_data=%d\n", error);
 			if (error != 0)
 				goto nfsmout;
 		}
 		/* Reply: byte count, commitment level and write verifier. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
 		rlen = fxdr_unsigned(int, *tl++);
 		NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen);
 		if (rlen == 0) {
 			error = NFSERR_IO;
 			goto nfsmout;
 		} else if (rlen < len) {
 			/* Short write: give the unwritten bytes back. */
 			backup = len - rlen;
 			uiop->uio_iov->iov_base =
 			    (char *)uiop->uio_iov->iov_base - backup;
 			uiop->uio_iov->iov_len += backup;
 			uiop->uio_offset -= backup;
 			uiop->uio_resid += backup;
 			len = rlen;
 		}
 		commit = fxdr_unsigned(int, *tl++);
 
 		/*
 		 * Return the lowest commitment level
 		 * obtained by any of the RPCs.
 		 * (Only one RPC is done here and committed starts out as
 		 * NFSWRITE_FILESYNC, so the first test always applies.)
 		 */
 		if (committed == NFSWRITE_FILESYNC)
 			committed = commit;
 		else if (committed == NFSWRITE_DATASYNC &&
 		    commit == NFSWRITE_UNSTABLE)
 			committed = commit;
 		/* tl now points at the write verifier. */
 		if (commit_thru_mds != 0) {
 			NFSLOCKMNT(nmp);
 			if (!NFSHASWRITEVERF(nmp)) {
 				NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 				NFSSETWRITEVERF(nmp);
 			} else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF) &&
 			    *must_commit != 2) {
 				*must_commit = 1;
 				NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 			}
 			NFSUNLOCKMNT(nmp);
 		} else {
 			NFSLOCKDS(dsp);
 			if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) {
 				NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 				dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF;
 			} else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF) &&
 			    *must_commit != 2) {
 				*must_commit = 1;
 				NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 			}
 			NFSUNLOCKDS(dsp);
 		}
 	}
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	/* Stays NFSWRITE_FILESYNC unless a commitment level was parsed. */
 	*iomode = committed;
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * The actual write RPC done to a DS.
  * This variant is called from a separate kernel process for mirrors.
  * Any short write is considered an IO error.
  * Unlike nfsrpc_writeds(), the data is supplied as the mbuf chain "m",
  * which is linked onto the end of the request, and no uio is involved.
  * *iomode supplies the requested stability and is set to the commitment
  * level on return.  The write verifier from the reply is recorded in the
  * DS; when it differs from the one already recorded, *must_commit is set
  * to 1 unless it is already 2.
  */
 static int
 nfsrpc_writedsmir(vnode_t vp, int *iomode, int *must_commit,
     nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len,
     struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int attrflag, error, commit, committed = NFSWRITE_FILESYNC, rlen;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfssockreq *nrp;
 	struct nfsvattr na;
 
 	nd->nd_mrep = NULL;
 	if (vers == 0 || vers == NFS_VER4) {
 		nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		vers = NFS_VER4;
 		NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers4 minvers=%d\n",
 		    minorvers);
 		nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 		NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	} else {
 		nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		/* Move the count from the WRITE to the WRITEDS statistic. */
 		NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]);
 		NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]);
 		NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers3\n");
 		/* The v3 arguments carry one more 32bit word (see below). */
 		NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED);
 	}
 	txdr_hyper(io_off, tl);
 	tl += 2;
 	if (vers == NFS_VER3)
 		*tl++ = txdr_unsigned(len);
 	*tl++ = txdr_unsigned(*iomode);
 	*tl = txdr_unsigned(len);
 	if (len > 0) {
 		/* Put data in mbuf chain. */
 		nd->nd_mb->m_next = m;
 	}
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess);
 	NFSCL_DEBUG(4, "nfsrpc_writedsmir: err=%d stat=%d\n", error,
 	    nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else {
 		if (vers == NFS_VER3) {
 			error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL,
 			    NULL);
 			NFSCL_DEBUG(4, "nfsrpc_writedsmir: wcc_data=%d\n",
 			    error);
 			if (error != 0)
 				goto nfsmout;
 		}
 		/* Reply: byte count, commitment level and write verifier. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
 		rlen = fxdr_unsigned(int, *tl++);
 		NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n", len,
 		    rlen);
 		/* No partial write recovery is done here, so it must match. */
 		if (rlen != len) {
 			error = NFSERR_IO;
 			NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n",
 			    len, rlen);
 			goto nfsmout;
 		}
 		commit = fxdr_unsigned(int, *tl++);
 
 		/*
 		 * Return the lowest commitment level
 		 * obtained by any of the RPCs.
 		 * (Only one RPC is done here and committed starts out as
 		 * NFSWRITE_FILESYNC, so the first test always applies.)
 		 */
 		if (committed == NFSWRITE_FILESYNC)
 			committed = commit;
 		else if (committed == NFSWRITE_DATASYNC &&
 		    commit == NFSWRITE_UNSTABLE)
 			committed = commit;
 		/* tl now points at the write verifier. */
 		NFSLOCKDS(dsp);
 		if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) {
 			NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 			dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF;
 		} else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF) &&
 		    *must_commit != 2) {
 			*must_commit = 1;
 			NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 		}
 		NFSUNLOCKDS(dsp);
 	}
 nfsmout:
 	if (nd->nd_mrep != NULL)
 		m_freem(nd->nd_mrep);
 	/* Stays NFSWRITE_FILESYNC unless a commitment level was parsed. */
 	*iomode = committed;
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	return (error);
 }
 
 /*
  * Handler given to nfs_pnfsio() by nfsio_writedsmir().
  * Runs nfsrpc_writedsmir() with the arguments saved in the
  * nfsclwritedsdorpc structure passed as "arg", stores the result in its
  * err field, sets done and drops the credential reference that was taken
  * when the structure was filled in.  "pending" is not used.
  */
 static void
 start_writedsmir(void *arg, int pending)
 {
 	struct nfsclwritedsdorpc *rpc = arg;
 
 	rpc->err = nfsrpc_writedsmir(rpc->vp, &rpc->iomode, &rpc->must_commit,
 	    rpc->stateidp, rpc->dsp, rpc->off, rpc->len, rpc->fhp, rpc->m,
 	    rpc->vers, rpc->minorvers, rpc->cred, rpc->p);
 	rpc->done = 1;
 	crfree(rpc->cred);
 	NFSCL_DEBUG(4, "start_writedsmir: err=%d\n", rpc->err);
 }
 
 /*
  * Hand a mirrored DS write to the pNFS I/O threads.
  * The arguments are saved in "drpc" (with an extra reference on "cred")
  * and start_writedsmir() is passed to nfs_pnfsio().  When there are no
  * pNFS I/O threads, or nfs_pnfsio() returns non-zero, the write is done
  * by this thread instead and its result is returned.  Otherwise 0 is
  * returned and the outcome is reported through drpc->done and drpc->err.
  */
 static int
 nfsio_writedsmir(vnode_t vp, int *iomode, int *must_commit,
     nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t off, int len,
     struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers,
     struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p)
 {
 	int error, ret;
 
 	/* Save everything start_writedsmir() needs. */
 	drpc->vp = vp;
 	drpc->p = p;
 	drpc->cred = crhold(cred);
 	drpc->dsp = dsp;
 	drpc->fhp = fhp;
 	drpc->stateidp = stateidp;
 	drpc->vers = vers;
 	drpc->minorvers = minorvers;
 	drpc->off = off;
 	drpc->len = len;
 	drpc->m = m;
 	drpc->iomode = *iomode;
 	drpc->must_commit = *must_commit;
 	drpc->inprog = 0;
 	drpc->done = 0;
 
 	if (nfs_pnfsiothreads == 0)
 		ret = EIO;
 	else {
 		ret = nfs_pnfsio(start_writedsmir, drpc);
 		NFSCL_DEBUG(4, "nfsio_writedsmir: nfs_pnfsio=%d\n", ret);
 	}
 	if (ret == 0)
 		error = 0;
 	else {
 		/* Not handed off, so do the write here. */
 		error = nfsrpc_writedsmir(vp, iomode, &drpc->must_commit,
 		    stateidp, dsp, off, len, fhp, m, vers, minorvers, cred, p);
 		crfree(drpc->cred);
 	}
 	NFSCL_DEBUG(4, "nfsio_writedsmir: error=%d\n", error);
 	return (error);
 }
 
 /*
  * Free up the nfsclds structure.
  */
 void
 nfscl_freenfsclds(struct nfsclds *dsp)
 {
 	int i;
 
 	if (dsp == NULL)
 		return;
 	if (dsp->nfsclds_sockp != NULL) {
 		NFSFREECRED(dsp->nfsclds_sockp->nr_cred);
 		NFSFREEMUTEX(&dsp->nfsclds_sockp->nr_mtx);
 		free(dsp->nfsclds_sockp->nr_nam, M_SONAME);
 		free(dsp->nfsclds_sockp, M_NFSSOCKREQ);
 	}
 	NFSFREEMUTEX(&dsp->nfsclds_mtx);
 	NFSFREEMUTEX(&dsp->nfsclds_sess.nfsess_mtx);
 	for (i = 0; i < NFSV4_CBSLOTS; i++) {
 		if (dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply != NULL)
 			m_freem(
 			    dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply);
 	}
 	free(dsp, M_NFSCLDS);
 }
 
 /*
  * Search the mount's list of nfsclds structures for non-defunct sessions
  * whose server owner matches the one in "newdsp".
  * - If a matching entry has NFSCLDS_DS set, it is returned in *retdspp and
  *   the result is NFSDSP_USETHISSESSION.
  * - Otherwise, if any entry matched, the result is NFSDSP_SEQTHISSESSION.
  * - Otherwise the result is NFSDSP_NOTFOUND.
  * Whenever an entry matched, *sequencep is set to the sequenceid of the
  * first matching entry.
  */
 static enum nfsclds_state
 nfscl_getsameserver(struct nfsmount *nmp, struct nfsclds *newdsp,
     struct nfsclds **retdspp, uint32_t *sequencep)
 {
 	struct nfsclds *cand;
 	int haveseq;
 
 	haveseq = 0;
 	TAILQ_FOREACH(cand, &nmp->nm_sess, nfsclds_list) {
 		/* Skip entries for other servers and defunct sessions. */
 		if (cand->nfsclds_servownlen != newdsp->nfsclds_servownlen ||
 		    cand->nfsclds_servownlen == 0 ||
 		    NFSBCMP(cand->nfsclds_serverown, newdsp->nfsclds_serverown,
 		    cand->nfsclds_servownlen) ||
 		    cand->nfsclds_sess.nfsess_defunct != 0)
 			continue;
 		NFSCL_DEBUG(4, "fnd same fdsp=%p dsp=%p flg=0x%x\n",
 		    TAILQ_FIRST(&nmp->nm_sess), cand, cand->nfsclds_flags);
 		if (haveseq == 0) {
 			/* Get sequenceid# from first entry. */
 			*sequencep = cand->nfsclds_sess.nfsess_sequenceid;
 			haveseq = 1;
 		}
 		/* Server major id matches. */
 		if ((cand->nfsclds_flags & NFSCLDS_DS) != 0) {
 			*retdspp = cand;
 			return (NFSDSP_USETHISSESSION);
 		}
 	}
 	if (haveseq == 0)
 		return (NFSDSP_NOTFOUND);
 	return (NFSDSP_SEQTHISSESSION);
 }
 
 /*
  * NFS commit rpc to a NFSv4.1 DS.
  * Commits "cnt" bytes starting at "offset" for file handle "fhp", on the
  * DS's socket or on the MDS socket when the DS has none.  A "vers" of 0 or
  * NFS_VER4 builds an NFSPROC_COMMITDS request; any other value builds an
  * NFSPROC_COMMIT request.
  * Returns NFSERR_STALEWRITEVERF when the verifier in the reply differs
  * from the one recorded in the DS (which is then updated), 0 on success,
  * or the transport/parse/server error.
  */
 static int
 nfsrpc_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp,
     struct nfsfh *fhp, int vers, int minorvers, struct ucred *cred,
     NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfssockreq *nrp;
 	struct nfsvattr na;
 	int attrflag, error;
 
 	nd->nd_mrep = NULL;
 	if (vers == 0 || vers == NFS_VER4) {
 		nfscl_reqstart(nd, NFSPROC_COMMITDS, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		vers = NFS_VER4;
 	} else {
 		nfscl_reqstart(nd, NFSPROC_COMMIT, nmp, fhp->nfh_fh,
 		    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers,
 		    NULL);
 		/* Move the count from the COMMIT to the COMMITDS statistic. */
 		NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMIT]);
 		NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMITDS]);
 	}
 	NFSCL_DEBUG(4, "nfsrpc_commitds: vers=%d minvers=%d\n", vers,
 	    minorvers);
 	/* Arguments: 64bit offset followed by a 32bit count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	*tl = txdr_unsigned(cnt);
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess);
 	NFSCL_DEBUG(4, "nfsrpc_commitds: err=%d stat=%d\n", error,
 	    nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		if (vers == NFS_VER3) {
 			error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL,
 			    NULL);
 			NFSCL_DEBUG(4, "nfsrpc_commitds: wccdata=%d\n", error);
 			if (error != 0)
 				goto nfsmout;
 		}
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF);
 		NFSLOCKDS(dsp);
 		/* A different verifier is recorded and reported as stale. */
 		if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) {
 			NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF);
 			error = NFSERR_STALEWRITEVERF;
 		}
 		NFSUNLOCKDS(dsp);
 	}
 nfsmout:
 	if (error == 0 && nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Handler given to nfs_pnfsio() by nfsio_commitds().
  * Runs nfsrpc_commitds() with the arguments saved in the
  * nfsclwritedsdorpc structure passed as "arg", stores the result in its
  * err field, sets done and drops the credential reference that was taken
  * when the structure was filled in.  "pending" is not used.
  */
 static void
 start_commitds(void *arg, int pending)
 {
 	struct nfsclwritedsdorpc *rpc = arg;
 
 	rpc->err = nfsrpc_commitds(rpc->vp, rpc->off, rpc->len, rpc->dsp,
 	    rpc->fhp, rpc->vers, rpc->minorvers, rpc->cred, rpc->p);
 	rpc->done = 1;
 	crfree(rpc->cred);
 	NFSCL_DEBUG(4, "start_commitds: err=%d\n", rpc->err);
 }
 
 /*
  * Hand a mirrored DS commit to the pNFS I/O threads.
  * The arguments are saved in "drpc" (with an extra reference on "cred")
  * and start_commitds() is passed to nfs_pnfsio().  When there are no pNFS
  * I/O threads, or nfs_pnfsio() returns non-zero, the commit is done by
  * this thread instead and its result is returned.  Otherwise 0 is
  * returned and the outcome is reported through drpc->done and drpc->err.
  */
 static int
 nfsio_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp,
     struct nfsfh *fhp, int vers, int minorvers,
     struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p)
 {
 	int error, ret;
 
 	/* Save everything start_commitds() needs. */
 	drpc->vp = vp;
 	drpc->p = p;
 	drpc->cred = crhold(cred);
 	drpc->dsp = dsp;
 	drpc->fhp = fhp;
 	drpc->vers = vers;
 	drpc->minorvers = minorvers;
 	drpc->off = offset;
 	drpc->len = cnt;
 	drpc->inprog = 0;
 	drpc->done = 0;
 
 	if (nfs_pnfsiothreads == 0)
 		ret = EIO;
 	else {
 		ret = nfs_pnfsio(start_commitds, drpc);
 		NFSCL_DEBUG(4, "nfsio_commitds: nfs_pnfsio=%d\n", ret);
 	}
 	if (ret == 0)
 		error = 0;
 	else {
 		/* Not handed off, so do the commit here. */
 		error = nfsrpc_commitds(vp, offset, cnt, dsp, fhp, vers,
 		    minorvers, cred, p);
 		crfree(drpc->cred);
 	}
 	NFSCL_DEBUG(4, "nfsio_commitds: error=%d\n", error);
 	return (error);
 }
 
 /*
  * NFS Advise rpc
  * Sends an NFSPROC_IOADVISE request for "cnt" bytes at "offset" with a
  * hint derived from "advise": POSIX_FADV_WILLNEED and POSIX_FADV_DONTNEED
  * are passed on, any other advice returns 0 without doing an RPC.
  * Returns 0, the request error or the status from the reply.
  */
 int
 nfsrpc_advise(vnode_t vp, off_t offset, uint64_t cnt, int advise,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t hints;
 	u_int32_t *tl;
 	int error;
 
 	NFSZERO_ATTRBIT(&hints);
 	switch (advise) {
 	case POSIX_FADV_WILLNEED:
 		NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED);
 		break;
 	case POSIX_FADV_DONTNEED:
 		NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED);
 		break;
 	default:
 		/* No hint exists for any other advice. */
 		return (0);
 	}
 	NFSCL_REQSTART(nd, NFSPROC_IOADVISE, vp, cred);
 	nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO);
 	/* Arguments: 64bit offset and 64bit count, then the hint bitmap. */
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	txdr_hyper(cnt, tl);
 	nfsrv_putattrbit(nd, &hints);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	/* error is 0 here, so the reply status is the result. */
 	error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 #ifdef notyet
 /*
  * NFS advise rpc to a NFSv4.2 DS.
  * Sends an NFSPROC_IOADVISEDS request for "cnt" bytes at "offset" on the
  * DS's socket, or on the MDS socket when the DS has none.  The hint is
  * derived from "advise": POSIX_FADV_WILLNEED and POSIX_FADV_DONTNEED are
  * passed on, any other advice returns 0 without doing an RPC.  Nothing is
  * sent either when the DS is NFSv3 or an NFSv4 minor version before 4.2.
  * Returns 0, the request error or the status from the reply.
  */
 static int
 nfsrpc_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise,
     struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfssockreq *nrp;
 	nfsattrbit_t hints;
 	int error;
 
 	/* For NFS DSs prior to NFSv4.2, just return OK. */
 	if (vers == NFS_VER3 || minorvers < NFSV42_MINORVERSION)
 		return (0);
 	NFSZERO_ATTRBIT(&hints);
 	if (advise == POSIX_FADV_WILLNEED)
 		NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED);
 	else if (advise == POSIX_FADV_DONTNEED)
 		NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED);
 	else
 		return (0);
 	nd->nd_mrep = NULL;
 	nfscl_reqstart(nd, NFSPROC_IOADVISEDS, nmp, fhp->nfh_fh,
 	    fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers, NULL);
 	vers = NFS_VER4;
 	NFSCL_DEBUG(4, "nfsrpc_adviseds: vers=%d minvers=%d\n", vers,
 	    minorvers);
 	nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO);
 	/*
 	 * Both the offset and the count are 64bit on the wire, as is done
 	 * by nfsrpc_advise().  cnt is an int, so widen it through uint32_t
 	 * to keep the unsigned value that a 32bit encoding would carry.
 	 */
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER);
 	txdr_hyper(offset, tl);
 	tl += 2;
 	txdr_hyper((uint64_t)(uint32_t)cnt, tl);
 	nfsrv_putattrbit(nd, &hints);
 	nrp = dsp->nfsclds_sockp;
 	if (nrp == NULL)
 		/* If NULL, use the MDS socket. */
 		nrp = &nmp->nm_sockreq;
 	error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess);
 	NFSCL_DEBUG(4, "nfsrpc_adviseds: err=%d stat=%d\n", error,
 	    nd->nd_repstat);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Handler given to nfs_pnfsio() by nfsio_adviseds().
  * Runs nfsrpc_adviseds() with the arguments saved in the
  * nfsclwritedsdorpc structure passed as "arg", stores the result in its
  * err field, sets done and drops the credential reference that was taken
  * when the structure was filled in.  "pending" is not used.
  */
 static void
 start_adviseds(void *arg, int pending)
 {
 	struct nfsclwritedsdorpc *rpc = arg;
 
 	rpc->err = nfsrpc_adviseds(rpc->vp, rpc->off, rpc->len, rpc->advise,
 	    rpc->dsp, rpc->fhp, rpc->vers, rpc->minorvers, rpc->cred, rpc->p);
 	rpc->done = 1;
 	crfree(rpc->cred);
 	NFSCL_DEBUG(4, "start_adviseds: err=%d\n", rpc->err);
 }
 
 /*
  * Hand a mirrored DS advise to the pNFS I/O threads.
  * The arguments are saved in "drpc" (with an extra reference on "cred")
  * and start_adviseds() is passed to nfs_pnfsio().  When there are no pNFS
  * I/O threads, or nfs_pnfsio() returns non-zero, the RPC is done by this
  * thread instead and its result is returned.  Otherwise 0 is returned and
  * the outcome is reported through drpc->done and drpc->err.
  */
 static int
 nfsio_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise,
     struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers,
     struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p)
 {
 	int error, ret;
 
 	/* Save everything start_adviseds() needs. */
 	drpc->vp = vp;
 	drpc->p = p;
 	drpc->cred = crhold(cred);
 	drpc->dsp = dsp;
 	drpc->fhp = fhp;
 	drpc->vers = vers;
 	drpc->minorvers = minorvers;
 	drpc->off = offset;
 	drpc->len = cnt;
 	drpc->advise = advise;
 	drpc->inprog = 0;
 	drpc->done = 0;
 
 	if (nfs_pnfsiothreads == 0)
 		ret = EIO;
 	else {
 		ret = nfs_pnfsio(start_adviseds, drpc);
 		NFSCL_DEBUG(4, "nfsio_adviseds: nfs_pnfsio=%d\n", ret);
 	}
 	if (ret == 0)
 		error = 0;
 	else {
 		/* Not handed off, so do the RPC here. */
 		error = nfsrpc_adviseds(vp, offset, cnt, advise, dsp, fhp, vers,
 		    minorvers, cred, p);
 		crfree(drpc->cred);
 	}
 	NFSCL_DEBUG(4, "nfsio_adviseds: error=%d\n", error);
 	return (error);
 }
 #endif	/* notyet */
 
 /*
  * Do the Allocate operation, retrying for recovery.
  * Returns EINVAL for a negative len, 0 without doing an RPC when len is 0
  * and EFBIG when off + len wraps or exceeds the mount's maximum file size.
  * Otherwise a stateid for write access is acquired and
  * nfsrpc_allocaterpc() is called, in a loop that retries for the
  * recoverable errors tested at the bottom of the loop.  "nap" and
  * "attrflagp" are passed through to nfsrpc_allocaterpc().
  */
 int
 nfsrpc_allocate(vnode_t vp, off_t off, off_t len, struct nfsvattr *nap,
     int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	int error, expireret = 0, retrycnt, nostateid;
 	uint32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	off_t tmp_off;
 	void *lckp;
 
 	if (len < 0)
 		return (EINVAL);
 	if (len == 0)
 		return (0);
 	tmp_off = off + len;
 	NFSLOCKMNT(nmp);
 	/* tmp_off < off catches the sum having wrapped. */
 	if (tmp_off > nmp->nm_maxfilesize || tmp_off < off) {
 		NFSUNLOCKMNT(nmp);
 		return (EFBIG);
 	}
 	/* Remember the clientid revision for the nfscl_hasexpired() call. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	NFSUNLOCKMNT(nmp);
 	nfhp = VTONFS(vp)->n_fhp;
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		nostateid = 0;
 		nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 		    NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp);
 		/* An all zeros "other" field means no stateid was found. */
 		if (stateid.other[0] == 0 && stateid.other[1] == 0 &&
 		    stateid.other[2] == 0) {
 			nostateid = 1;
 			NFSCL_DEBUG(1, "stateid0 in allocate\n");
 		}
 
 		/*
 		 * Not finding a stateid should probably never happen,
 		 * but just return an error for this case.
 		 */
 		if (nostateid != 0)
 			error = EIO;
 		else
 			error = nfsrpc_allocaterpc(vp, off, len, &stateid,
 			    nap, attrflagp, cred, p);
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		/* Drop the reference nfscl_getstateid() returned, if any. */
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			/* Pause before the retry. */
 			(void) nfs_catnap(PZERO, error, "nfs_allocate");
 		} else if ((error == NFSERR_EXPIRED || (!NFSHASINT(nmp) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp)) {
 			/* This is mapped to EIO, which ends the loop. */
 			error = EIO;
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_DELAY ||
 	    error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	    error == NFSERR_STALEDONTRECOVER ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	/* Any error left after four or more attempts is reported as EIO. */
 	if (error != 0 && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * Issue one Allocate RPC, with a trailing Getattr, for the given stateid.
  * On success the attributes are loaded into *nap and *attrflagp is set to
  * NFS_LATTR_NOSHRINK; otherwise *attrflagp stays 0 and the error is returned.
  */
 static int
 nfsrpc_allocaterpc(vnode_t vp, off_t off, off_t len, nfsv4stateid_t *stateidp,
     struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	uint32_t *wp;
 	int error;
 
 	*attrflagp = 0;
 
 	/* Request: stateid, offset, length, then the Getattr operation. */
 	NFSCL_REQSTART(nd, NFSPROC_ALLOCATE, vp, cred);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	NFSM_BUILD(wp, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED);
 	txdr_hyper(off, &wp[0]);
 	txdr_hyper(len, &wp[2]);
 	wp[4] = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 
 	if (nd->nd_repstat != 0) {
 		error = nd->nd_repstat;
 		goto nfsmout;
 	}
 
 	/*
 	 * Step over two 32-bit words ahead of the attributes (presumably
 	 * the Getattr operation number and status -- confirm).
 	 */
 	NFSM_DISSECT(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 	error = nfsm_loadattr(nd, nap);
 	if (error == 0)
 		*attrflagp = NFS_LATTR_NOSHRINK;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Append the XDR arguments of a LayoutGet operation to the request in nd:
  * a false "signal availability" flag, layout type, iomode, offset, length,
  * minimum length, a stateid and the maximum reply length.  When
  * usecurstateid is non-zero the special "current stateid" (seqid 1, all
  * other words zero) is sent instead of *stateidp.
  */
 static void
 nfsrv_setuplayoutget(struct nfsrv_descript *nd, int iomode, uint64_t offset,
     uint64_t len, uint64_t minlen, nfsv4stateid_t *stateidp, int layouttype,
     int layoutlen, int usecurstateid)
 {
 	uint32_t *wp;
 
 	NFSM_BUILD(wp, uint32_t *, 4 * NFSX_UNSIGNED + 3 * NFSX_HYPER +
 	    NFSX_STATEID);
 	wp[0] = newnfs_false;		/* Don't signal availability. */
 	wp[1] = txdr_unsigned(layouttype);
 	wp[2] = txdr_unsigned(iomode);
 	txdr_hyper(offset, &wp[3]);
 	txdr_hyper(len, &wp[5]);
 	txdr_hyper(minlen, &wp[7]);
 	if (usecurstateid == 0) {
 		NFSCL_DEBUG(4, "layget seq=%d\n", (int)stateidp->seqid);
 		wp[9] = txdr_unsigned(stateidp->seqid);
 		wp[10] = stateidp->other[0];
 		wp[11] = stateidp->other[1];
 		wp[12] = stateidp->other[2];
 	} else {
 		/* Special stateid for Current stateid. */
 		wp[9] = txdr_unsigned(1);
 		wp[10] = 0;
 		wp[11] = 0;
 		wp[12] = 0;
 	}
 	wp[13] = txdr_unsigned(layoutlen);
 }
 
 /*
  * Parse the reply for a successful LayoutGet operation.
  *
  * The return-on-close flag is stored in *retonclosep and the layout stateid
  * in *stateidp.  Each layout segment in the reply (between 1 and 10000 of
  * them) is then parsed into a newly allocated struct nfsclflayout:
  *  - File layouts (NFSLAYOUT_NFSV4_1_FILES) carry 0..100 file handles.
  *  - Flexible File layouts (NFSLAYOUT_FLEXFILE) carry 1..NFSDEV_MAXMIRRORS
  *    mirrors, each with 1..128 servers; only the first server of each
  *    mirror is recorded, the others are dissected and discarded.
  * Any other layout type is rejected with NFSERR_BADXDR.
  *
  * Segments whose iomode matches that of the first segment are inserted
  * into flhp in increasing offset order; segments with a different iomode
  * are freed.  On error, the segment being built is freed and the error is
  * returned; segments already inserted into flhp are left there.
  */
 static int
 nfsrv_parselayoutget(struct nfsmount *nmp, struct nfsrv_descript *nd,
     nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp)
 {
 	uint32_t *tl;
 	struct nfsclflayout *flp, *prevflp, *tflp;
 	int cnt, error, fhcnt, gotiomode, i, iomode, j, k, l, laytype, nfhlen;
 	int m, mirrorcnt;
 	uint64_t retlen, off;
 	struct nfsfh *nfhp;
 	uint8_t *cp;
 	uid_t user;
 	gid_t grp;
 
 	NFSCL_DEBUG(4, "in nfsrv_parselayoutget\n");
 	error = 0;
 	flp = NULL;
 	gotiomode = -1;
 	NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_STATEID);
 	if (*tl++ != 0)
 		*retonclosep = 1;
 	else
 		*retonclosep = 0;
 	stateidp->seqid = fxdr_unsigned(uint32_t, *tl++);
 	NFSCL_DEBUG(4, "retoncls=%d stseq=%d\n", *retonclosep,
 	    (int)stateidp->seqid);
 	stateidp->other[0] = *tl++;
 	stateidp->other[1] = *tl++;
 	stateidp->other[2] = *tl++;
 	cnt = fxdr_unsigned(int, *tl);
 	NFSCL_DEBUG(4, "layg cnt=%d\n", cnt);
 	if (cnt <= 0 || cnt > 10000) {
 		/* Don't accept more than 10000 layouts in reply. */
 		error = NFSERR_BADXDR;
 		goto nfsmout;
 	}
 	for (i = 0; i < cnt; i++) {
 		/* Dissect to the layout type. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER +
 		    3 * NFSX_UNSIGNED);
 		off = fxdr_hyper(tl); tl += 2;
 		retlen = fxdr_hyper(tl); tl += 2;
 		iomode = fxdr_unsigned(int, *tl++);
 		laytype = fxdr_unsigned(int, *tl);
 		NFSCL_DEBUG(4, "layt=%d off=%ju len=%ju iom=%d\n", laytype,
 		    (uintmax_t)off, (uintmax_t)retlen, iomode);
 		/* Ignore length of layout body for now. */
 		if (laytype == NFSLAYOUT_NFSV4_1_FILES) {
 			/* Parse the File layout up to fhcnt. */
 			NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED +
 			    NFSX_HYPER + NFSX_V4DEVICEID);
 			fhcnt = fxdr_unsigned(int, *(tl + 4 +
 			    NFSX_V4DEVICEID / NFSX_UNSIGNED));
 			NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt);
 			if (fhcnt < 0 || fhcnt > 100) {
 				/* Don't accept more than 100 file handles. */
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			if (fhcnt > 0)
 				flp = malloc(sizeof(*flp) + fhcnt *
 				    sizeof(struct nfsfh *), M_NFSFLAYOUT,
 				    M_WAITOK);
 			else
 				flp = malloc(sizeof(*flp), M_NFSFLAYOUT,
 				    M_WAITOK);
 			flp->nfsfl_flags = NFSFL_FILE;
 			/* Counted up as each file handle is stored below. */
 			flp->nfsfl_fhcnt = 0;
 			flp->nfsfl_devp = NULL;
 			flp->nfsfl_off = off;
 			/*
 			 * NOTE(review): when off + retlen wraps, the end is
 			 * set to UINT64_MAX - off rather than UINT64_MAX;
 			 * confirm that this is the intended clamp.
 			 */
 			if (flp->nfsfl_off + retlen < flp->nfsfl_off)
 				flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off;
 			else
 				flp->nfsfl_end = flp->nfsfl_off + retlen;
 			flp->nfsfl_iomode = iomode;
 			/* The first segment decides the accepted iomode. */
 			if (gotiomode == -1)
 				gotiomode = flp->nfsfl_iomode;
 			/* Ignore layout body length for now. */
 			NFSBCOPY(tl, flp->nfsfl_dev, NFSX_V4DEVICEID);
 			tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
 			flp->nfsfl_util = fxdr_unsigned(uint32_t, *tl++);
 			NFSCL_DEBUG(4, "flutil=0x%x\n", flp->nfsfl_util);
 			mtx_lock(&nmp->nm_mtx);
 			if (nmp->nm_minorvers > 1 && (flp->nfsfl_util &
 			    NFSFLAYUTIL_IOADVISE_THRU_MDS) != 0)
 				nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS;
 			mtx_unlock(&nmp->nm_mtx);
 			flp->nfsfl_stripe1 = fxdr_unsigned(uint32_t, *tl++);
 			flp->nfsfl_patoff = fxdr_hyper(tl); tl += 2;
 			NFSCL_DEBUG(4, "stripe1=%u poff=%ju\n",
 			    flp->nfsfl_stripe1, (uintmax_t)flp->nfsfl_patoff);
 			for (j = 0; j < fhcnt; j++) {
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 				nfhlen = fxdr_unsigned(int, *tl);
 				if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) {
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				nfhp = malloc(sizeof(*nfhp) + nfhlen - 1,
 				    M_NFSFH, M_WAITOK);
 				flp->nfsfl_fh[j] = nfhp;
 				flp->nfsfl_fhcnt++;
 				nfhp->nfh_len = nfhlen;
 				NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen));
 				NFSBCOPY(cp, nfhp->nfh_fh, nfhlen);
 			}
 		} else if (laytype == NFSLAYOUT_FLEXFILE) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED +
 			    NFSX_HYPER);
 			mirrorcnt = fxdr_unsigned(int, *(tl + 2));
 			NFSCL_DEBUG(4, "mirrorcnt=%d\n", mirrorcnt);
 			if (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS) {
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			flp = malloc(sizeof(*flp) + mirrorcnt *
 			    sizeof(struct nfsffm), M_NFSFLAYOUT, M_WAITOK);
 			flp->nfsfl_flags = NFSFL_FLEXFILE;
 			flp->nfsfl_mirrorcnt = mirrorcnt;
 			for (j = 0; j < mirrorcnt; j++) {
 				flp->nfsfl_ffm[j].devp = NULL;
 				/*
 				 * The allocation is not zeroed, so start each
 				 * mirror with no file handles.  Otherwise an
 				 * error exit would hand nfscl_freeflayout() a
 				 * layout whose per-mirror counts are
 				 * indeterminate.
 				 */
 				flp->nfsfl_ffm[j].fhcnt = 0;
 			}
 			flp->nfsfl_off = off;
 			/*
 			 * NOTE(review): same wraparound clamp as for the
 			 * File layout case; confirm that it is intended.
 			 */
 			if (flp->nfsfl_off + retlen < flp->nfsfl_off)
 				flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off;
 			else
 				flp->nfsfl_end = flp->nfsfl_off + retlen;
 			flp->nfsfl_iomode = iomode;
 			if (gotiomode == -1)
 				gotiomode = flp->nfsfl_iomode;
 			flp->nfsfl_stripeunit = fxdr_hyper(tl);
 			NFSCL_DEBUG(4, "stripeunit=%ju\n",
 			    (uintmax_t)flp->nfsfl_stripeunit);
 			for (j = 0; j < mirrorcnt; j++) {
 				/* k is the number of servers for mirror j. */
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 				k = fxdr_unsigned(int, *tl);
 				if (k < 1 || k > 128) {
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				}
 				NFSCL_DEBUG(4, "servercnt=%d\n", k);
 				for (l = 0; l < k; l++) {
 					NFSM_DISSECT(tl, uint32_t *,
 					    NFSX_V4DEVICEID + NFSX_STATEID +
 					    2 * NFSX_UNSIGNED);
 					if (l == 0) {
 						/* Just use the first server. */
 						NFSBCOPY(tl,
 						    flp->nfsfl_ffm[j].dev,
 						    NFSX_V4DEVICEID);
 						tl += (NFSX_V4DEVICEID /
 						    NFSX_UNSIGNED);
 						tl++;
 						flp->nfsfl_ffm[j].st.seqid =
 						    *tl++;
 						flp->nfsfl_ffm[j].st.other[0] =
 						    *tl++;
 						flp->nfsfl_ffm[j].st.other[1] =
 						    *tl++;
 						flp->nfsfl_ffm[j].st.other[2] =
 						    *tl++;
 						NFSCL_DEBUG(4, "st.seqid=%u "
 						 "st.o0=0x%x st.o1=0x%x "
 						 "st.o2=0x%x\n",
 						 flp->nfsfl_ffm[j].st.seqid,
 						 flp->nfsfl_ffm[j].st.other[0],
 						 flp->nfsfl_ffm[j].st.other[1],
 						 flp->nfsfl_ffm[j].st.other[2]);
 					} else
 						tl += ((NFSX_V4DEVICEID +
 						    NFSX_STATEID +
 						    NFSX_UNSIGNED) /
 						    NFSX_UNSIGNED);
 					fhcnt = fxdr_unsigned(int, *tl);
 					NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt);
 					if (fhcnt < 1 ||
 					    fhcnt > NFSDEV_MAXVERS) {
 						error = NFSERR_BADXDR;
 						goto nfsmout;
 					}
 					for (m = 0; m < fhcnt; m++) {
 						NFSM_DISSECT(tl, uint32_t *,
 						    NFSX_UNSIGNED);
 						nfhlen = fxdr_unsigned(int,
 						    *tl);
 						NFSCL_DEBUG(4, "nfhlen=%d\n",
 						    nfhlen);
 						if (nfhlen <= 0 || nfhlen >
 						    NFSX_V4FHMAX) {
 							error = NFSERR_BADXDR;
 							goto nfsmout;
 						}
 						NFSM_DISSECT(cp, uint8_t *,
 						    NFSM_RNDUP(nfhlen));
 						if (l == 0) {
 							nfhp = malloc(
 							    sizeof(*nfhp) +
 							    nfhlen - 1, M_NFSFH,
 							    M_WAITOK);
 							flp->nfsfl_ffm[j].fh[m]
 							    = nfhp;
 							/*
 							 * Count only the file
 							 * handles stored so
 							 * far, so the count
 							 * never covers an
 							 * unset fh[] slot if
 							 * parsing fails later.
 							 */
 							flp->nfsfl_ffm[j].fhcnt
 							    = m + 1;
 							nfhp->nfh_len = nfhlen;
 							NFSBCOPY(cp,
 							    nfhp->nfh_fh,
 							    nfhlen);
 							NFSCL_DEBUG(4,
 							    "got fh\n");
 						}
 					}
 					/* Now, get the ffsd_user/ffds_group. */
 					error = nfsrv_parseug(nd, 0, &user,
 					    &grp, curthread);
 					NFSCL_DEBUG(4, "after parseu=%d\n",
 					    error);
 					if (error == 0)
 						error = nfsrv_parseug(nd, 1,
 						    &user, &grp, curthread);
 					/*
 					 * Report the status here; grp is not
 					 * set if either parse failed.
 					 */
 					NFSCL_DEBUG(4, "aft parseg=%d\n",
 					    error);
 					if (error != 0)
 						goto nfsmout;
 					NFSCL_DEBUG(4, "user=%d group=%d\n",
 					    user, grp);
 					if (l == 0) {
 						flp->nfsfl_ffm[j].user = user;
 						flp->nfsfl_ffm[j].group = grp;
 						NFSCL_DEBUG(4,
 						    "usr=%d grp=%d\n", user,
 						    grp);
 					}
 				}
 			}
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			flp->nfsfl_fflags = fxdr_unsigned(uint32_t, *tl++);
 #ifdef notnow
 			/*
 			 * At this time, there is no flag.
 			 * NFSFLEXFLAG_IOADVISE_THRU_MDS might need to be
 			 * added, or it may never exist?
 			 */
 			mtx_lock(&nmp->nm_mtx);
 			if (nmp->nm_minorvers > 1 && (flp->nfsfl_fflags &
 			    NFSFLEXFLAG_IOADVISE_THRU_MDS) != 0)
 				nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS;
 			mtx_unlock(&nmp->nm_mtx);
 #endif
 			flp->nfsfl_statshint = fxdr_unsigned(uint32_t, *tl);
 			NFSCL_DEBUG(4, "fflags=0x%x statshint=%d\n",
 			    flp->nfsfl_fflags, flp->nfsfl_statshint);
 		} else {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		if (flp->nfsfl_iomode == gotiomode) {
 			/* Keep the list in increasing offset order. */
 			tflp = LIST_FIRST(flhp);
 			prevflp = NULL;
 			while (tflp != NULL &&
 			    tflp->nfsfl_off < flp->nfsfl_off) {
 				prevflp = tflp;
 				tflp = LIST_NEXT(tflp, nfsfl_list);
 			}
 			if (prevflp == NULL)
 				LIST_INSERT_HEAD(flhp, flp, nfsfl_list);
 			else
 				LIST_INSERT_AFTER(prevflp, flp,
 				    nfsfl_list);
 			NFSCL_DEBUG(4, "flp inserted\n");
 		} else {
 			printf("nfscl_layoutget(): got wrong iomode\n");
 			nfscl_freeflayout(flp);
 		}
 		/* Ownership has passed to the list (or flp was freed). */
 		flp = NULL;
 	}
 nfsmout:
 	NFSCL_DEBUG(4, "eo nfsrv_parselayoutget=%d\n", error);
 	if (error != 0 && flp != NULL)
 		nfscl_freeflayout(flp);
 	return (error);
 }
 
 /*
  * Dissect an XDR opaque user or group string from the reply and convert it
  * to an id.  With dogrp == 0 the result is stored in *uidp, otherwise in
  * *gidp.  A zero-length string yields UID_NOBODY or GID_NOGROUP.  Strings
  * longer than NFSV4_OPAQUELIMIT are rejected with NFSERR_BADXDR.
  */
 static int
 nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp,
     NFSPROC_T *p)
 {
 	char smallbuf[NFSV4_SMALLSTR + 1];
 	char *src, *str;
 	uint32_t *lenp;
 	uint32_t len = 0;
 	int error = 0;
 
 	NFSM_DISSECT(lenp, uint32_t *, NFSX_UNSIGNED);
 	len = fxdr_unsigned(uint32_t, *lenp);
 	str = NULL;
 	if (len > NFSV4_OPAQUELIMIT) {
 		error = NFSERR_BADXDR;
 		goto nfsmout;
 	}
 	NFSCL_DEBUG(4, "nfsrv_parseug: len=%d\n", len);
 	if (len == 0) {
 		if (dogrp == 0)
 			*uidp = UID_NOBODY;
 		else
 			*gidp = GID_NOGROUP;
 		return (0);
 	}
 
 	/* Short strings use the on-stack buffer, long ones a temporary. */
 	if (len <= NFSV4_SMALLSTR)
 		str = smallbuf;
 	else
 		str = malloc(len + 1, M_TEMP, M_WAITOK);
 	NFSM_DISSECT(src, char *, NFSM_RNDUP(len));
 	NFSBCOPY(src, str, len);
 	str[len] = '\0';
 	NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str);
 	if (dogrp == 0)
 		error = nfsv4_strtouid(nd, str, len, uidp);
 	else
 		error = nfsv4_strtogid(nd, str, len, gidp);
 nfsmout:
 	/* Only strings longer than NFSV4_SMALLSTR were heap allocated. */
 	if (len > NFSV4_SMALLSTR)
 		free(str, M_TEMP);
 	NFSCL_DEBUG(4, "eo nfsrv_parseug=%d\n", error);
 	return (error);
 }
 
 /*
  * Open a file and, when possible, acquire a layout for it in the same RPC.
  * If no usable layout is cached for the file handle and the layout is not
  * being recalled, nfsrpc_openlayoutrpc() does an Open plus LayoutGet and
  * the result is handed to nfsrpc_layoutgetres().  Otherwise a plain
  * nfsrpc_openrpc() is done.  The returned value is the status of the Open.
  */
 static int
 nfsrpc_getopenlayout(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp,
     int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode,
     struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsclflayouthead layouts;
 	struct nfscllayout *lyp;
 	struct nfsclflayout *flp;
 	struct nfsclsession *sep;
 	nfsv4stateid_t stateid;
 	int error, islocked, layoutlen, layouttype, laystat;
 	int recalled, retonclose, usecurstateid;
 
 	error = 0;
 	layouttype = NFSHASFLEXFILE(nmp) ? NFSLAYOUT_FLEXFILE :
 	    NFSLAYOUT_NFSV4_1_FILES;
 
 	/*
 	 * If lyp is returned non-NULL, there will be a refcnt (shared lock)
 	 * on it, iff flp != NULL or a lock (exclusive lock) on it iff
 	 * flp == NULL.  islocked records which: 0 none, 1 shared,
 	 * 2 exclusive.
 	 */
 	lyp = nfscl_getlayout(nmp->nm_clp, newfhp, newfhlen, 0, mode, &flp,
 	    &recalled);
 	NFSCL_DEBUG(4, "nfsrpc_getopenlayout nfscl_getlayout lyp=%p\n", lyp);
 	islocked = 0;
 	if (lyp != NULL)
 		islocked = (flp != NULL) ? 1 : 2;
 
 	if (islocked == 1 || recalled != 0) {
 		/* A layout segment exists or a recall is pending: just Open. */
 		error = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen,
 		    mode, op, name, namelen, dpp, 0, 0, cred, p, 0, 0);
 	} else {
 		LIST_INIT(&layouts);
 		sep = nfsmnt_mdssession(nmp);
 		layoutlen = sep->nfsess_maxcache - (NFSX_STATEID +
 		    3 * NFSX_UNSIGNED);
 		usecurstateid = (lyp == NULL);
 		if (usecurstateid == 0) {
 			/* Reuse the stateid of the existing layout. */
 			stateid.seqid = lyp->nfsly_stateid.seqid;
 			stateid.other[0] = lyp->nfsly_stateid.other[0];
 			stateid.other[1] = lyp->nfsly_stateid.other[1];
 			stateid.other[2] = lyp->nfsly_stateid.other[2];
 		}
 		error = nfsrpc_openlayoutrpc(nmp, vp, nfhp, fhlen,
 		    newfhp, newfhlen, mode, op, name, namelen,
 		    dpp, &stateid, usecurstateid, layouttype, layoutlen,
 		    &retonclose, &layouts, &laystat, cred, p);
 		NFSCL_DEBUG(4, "aft nfsrpc_openlayoutrpc laystat=%d err=%d\n",
 		    laystat, error);
 		laystat = nfsrpc_layoutgetres(nmp, vp, newfhp, newfhlen,
 		    &stateid, retonclose, NULL, &lyp, &layouts, layouttype,
 		    laystat, &islocked, cred, p);
 	}
 
 	/* Drop whichever hold on the layout is still recorded. */
 	switch (islocked) {
 	case 2:
 		nfscl_rellayout(lyp, 1);
 		break;
 	case 1:
 		nfscl_rellayout(lyp, 0);
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 /*
  * This function does an Open+LayoutGet for an NFSv4.1 mount with pNFS
  * enabled, only for the CLAIM_NULL case (CLAIM_FH is sent instead when
  * NFSHASNFSV4N(nmp) is true).  All other NFSv4 Opens are handled by
  * nfsrpc_openrpc().
  * op must not be NULL: it is dereferenced unconditionally to build the
  * request and to record the open stateid from the reply.
  *
  * On return, *dpp is the delegation if one was granted and the Getattr in
  * the compound succeeded, otherwise NULL.  *laystatp starts out as ENXIO
  * and is replaced by a non-zero nd_repstat, by the LayoutGet status from
  * the reply, or by the error from nfsrv_parselayoutget().  When the Open
  * and Getattr succeed but a later operation fails, nd_repstat is cleared so
  * that the Open itself is reported as successful.
  */
 static int
 nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp,
     int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode,
     struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp,
     nfsv4stateid_t *stateidp, int usecurstateid, int layouttype,
     int layoutlen, int *retonclosep, struct nfsclflayouthead *flhp,
     int *laystatp, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfscldeleg *ndp = NULL;
 	struct nfsvattr nfsva;
 	struct nfsclsession *tsep;
 	uint32_t rflags, deleg;
 	nfsattrbit_t attrbits;
 	int error, ret, acesize, limitby, iomode;
 
 	*dpp = NULL;
 	*laystatp = ENXIO;
 	nfscl_reqstart(nd, NFSPROC_OPENLAYGET, nmp, nfhp, fhlen, NULL, NULL,
 	    0, 0, cred);
 	/* Open arguments: seqid, share access, share deny and clientid. */
 	NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid);
 	*tl++ = txdr_unsigned(mode & (NFSV4OPEN_ACCESSBOTH |
 	    NFSV4OPEN_WANTDELEGMASK));
 	*tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH);
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE);
 	if (NFSHASNFSV4N(nmp)) {
 		*tl = txdr_unsigned(NFSV4OPEN_CLAIMFH);
 	} else {
 		*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 		nfsm_strtom(nd, name, namelen);
 	}
 	/* Getattr for the change and modify time attributes. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
 	nfsrv_putattrbit(nd, &attrbits);
 	/* LayoutGet for the whole file; RW iomode if opening for write. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_LAYOUTGET);
 	if ((mode & NFSV4OPEN_ACCESSWRITE) != 0)
 		iomode = NFSLAYOUTIOMODE_RW;
 	else
 		iomode = NFSLAYOUTIOMODE_READ;
 	nfsrv_setuplayoutget(nd, iomode, 0, UINT64_MAX, 0, stateidp,
 	    layouttype, layoutlen, usecurstateid);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0)
 		return (error);
 	NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd);
 	if (nd->nd_repstat != 0)
 		*laystatp = nd->nd_repstat;
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 		/* ND_NOMOREDATA will be set if the Open operation failed. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		/* Record the open stateid returned by the server. */
 		op->nfso_stateid.seqid = *tl++;
 		op->nfso_stateid.other[0] = *tl++;
 		op->nfso_stateid.other[1] = *tl++;
 		op->nfso_stateid.other[2] = *tl;
 		rflags = fxdr_unsigned(u_int32_t, *(tl + 6));
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error != 0)
 			goto nfsmout;
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(u_int32_t, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			if (!(op->nfso_own->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				op->nfso_own->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/*
 			 * Build the delegation structure; it is freed at
 			 * nfsmout unless it gets handed back via *dpp.
 			 */
 			ndp = malloc(sizeof(struct nfscldeleg) + newfhlen,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&ndp->nfsdl_owner);
 			LIST_INIT(&ndp->nfsdl_lock);
 			ndp->nfsdl_clp = op->nfso_own->nfsow_clp;
 			ndp->nfsdl_fhlen = newfhlen;
 			NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen);
 			newnfs_copyincred(cred, &ndp->nfsdl_cred);
 			nfscl_lockinit(&ndp->nfsdl_rwlock);
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			ndp->nfsdl_stateid.seqid = *tl++;
 			ndp->nfsdl_stateid.other[0] = *tl++;
 			ndp->nfsdl_stateid.other[1] = *tl++;
 			ndp->nfsdl_stateid.other[2] = *tl++;
 			/* Non-zero means NFSCLDL_RECALL gets set below. */
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				ndp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					ndp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					/* Limit is the product of two words. */
 					ndp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					ndp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				};
 			} else
 				ndp->nfsdl_flags = NFSCLDL_READ;
 			if (ret != 0)
 				ndp->nfsdl_flags |= NFSCLDL_RECALL;
 			error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, false,
 			    &ret, &acesize, p);
 			if (error != 0)
 				goto nfsmout;
 		} else if (deleg == NFSV4OPEN_DELEGATENONEEXT &&
 		    NFSHASNFSV4N(nmp)) {
 			/*
 			 * No delegation, with a reason word; the contention
 			 * and resource reasons carry one more word, which is
 			 * dissected and ignored.
 			 */
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			deleg = fxdr_unsigned(uint32_t, *tl);
 			if (deleg == NFSV4OPEN_CONTENTION ||
 			    deleg == NFSV4OPEN_RESOURCE)
 				NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) != 0 ||
 		    nfscl_assumeposixlocks)
 			op->nfso_posixlock = 1;
 		else
 			op->nfso_posixlock = 0;
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		/* If the 2nd element == NFS_OK, the Getattr succeeded. */
 		if (*++tl == 0) {
 			error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 			    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 			    NULL, NULL, NULL, p, cred);
 			if (error != 0)
 				goto nfsmout;
 			if (ndp != NULL) {
 				/* Hand the finished delegation to the caller. */
 				ndp->nfsdl_change = nfsva.na_filerev;
 				ndp->nfsdl_modtime = nfsva.na_mtime;
 				ndp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 				*dpp = ndp;
 				ndp = NULL;
 			}
 			/*
 			 * At this point, the Open has succeeded, so set
 			 * nd_repstat = NFS_OK.  If the Layoutget failed,
 			 * this function just won't return a layout.
 			 */
 			if (nd->nd_repstat == 0) {
 				NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 				*laystatp = fxdr_unsigned(int, *++tl);
 				if (*laystatp == 0) {
 					error = nfsrv_parselayoutget(nmp, nd,
 					    stateidp, retonclosep, flhp);
 					if (error != 0)
 						*laystatp = error;
 				}
 			} else
 				nd->nd_repstat = 0;	/* Return 0 for Open. */
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	/* ndp is NULL here if it was never allocated or was passed to *dpp. */
 	free(ndp, M_NFSCLDELEG);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Similar to nfsrpc_createv4(), but also does the LayoutGet operation.
  * Used only for mounts with pNFS enabled.
  *
  * The compound built here is: Open (create), SaveFH, GetFH, Getattr for
  * the new file, PutFH and Getattr for the directory, RestoreFH and
  * LayoutGet.
  *
  * Outputs: *nfhpp and *nnap/*attrflagp describe the new file, *dnap and
  * *dattrflagp the directory, *dpp any delegation (set only when 0 is
  * returned), and *unlockedp is set to 1 once nfscl_openrelease() has been
  * called.  *laystatp starts out as ENXIO and is replaced by a non-zero
  * nd_repstat, by the LayoutGet status from the reply, or by the error from
  * nfsrv_parselayoutget().
  * Returns ENAMETOOLONG if namelen exceeds NFS_MAXNAMLEN.
  */
 static int
 nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, int *unlockedp, nfsv4stateid_t *stateidp,
     int usecurstateid, int layouttype, int layoutlen, int *retonclosep,
     struct nfsclflayouthead *flhp, int *laystatp)
 {
 	uint32_t *tl;
 	int error = 0, deleg, newone, ret, acesize, limitby;
 	struct nfsrv_descript nfsd, *nd = &nfsd;
 	struct nfsclopen *op;
 	struct nfscldeleg *dp = NULL;
 	struct nfsnode *np;
 	struct nfsfh *nfhp;
 	struct nfsclsession *tsep;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t stateid;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(dvp->v_mount);
 	np = VTONFS(dvp);
 	/* Give every output a defined value before any early return. */
 	*laystatp = ENXIO;
 	*unlockedp = 0;
 	*nfhpp = NULL;
 	*dpp = NULL;
 	*attrflagp = 0;
 	*dattrflagp = 0;
 	if (namelen > NFS_MAXNAMLEN)
 		return (ENAMETOOLONG);
 	NFSCL_REQSTART(nd, NFSPROC_CREATELAYGET, dvp, cred);
 	/*
 	 * For V4, this is actually an Open op.
 	 */
 	NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(owp->nfsow_seqid);
 	if (NFSHASNFSV4N(nmp)) {
 		/*
 		 * Ask for a write delegation only when pNFS is not in use,
 		 * callbacks are enabled and a callback daemon is running.
 		 */
 		if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 &&
 		    nfs_numnfscbd > 0)
 			*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 			    NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG);
 		else
 			*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 			    NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG);
 	} else
 		*tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE |
 		    NFSV4OPEN_ACCESSREAD);
 	*tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE);
 	tsep = nfsmnt_mdssession(nmp);
 	*tl++ = tsep->nfsess_clientid.lval[0];
 	*tl = tsep->nfsess_clientid.lval[1];
 	nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN);
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OPEN_CREATE);
 	if ((fmode & O_EXCL) != 0) {
 		if (NFSHASSESSPERSIST(nmp)) {
 			/* Use GUARDED for persistent sessions. */
 			*tl = txdr_unsigned(NFSCREATE_GUARDED);
 			nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0);
 		} else {
 			/* Otherwise, use EXCLUSIVE4_1. */
 			*tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41);
 			NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 			*tl++ = cverf.lval[0];
 			*tl = cverf.lval[1];
 			nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0);
 		}
 	} else {
 		*tl = txdr_unsigned(NFSCREATE_UNCHECKED);
 		nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0);
 	}
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL);
 	nfsm_strtom(nd, name, namelen);
 	/* Get the new file's handle and attributes, plus save the FH. */
 	NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OP_SAVEFH);
 	*tl++ = txdr_unsigned(NFSV4OP_GETFH);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	/* Get the directory's post-op attributes. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	(void)nfsm_fhtom(nmp, nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	nfsrv_putattrbit(nd, &attrbits);
 	/* Go back to the new file's handle and get a RW layout for it. */
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = txdr_unsigned(NFSV4OP_RESTOREFH);
 	*tl = txdr_unsigned(NFSV4OP_LAYOUTGET);
 	nfsrv_setuplayoutget(nd, NFSLAYOUTIOMODE_RW, 0, UINT64_MAX, 0, stateidp,
 	    layouttype, layoutlen, usecurstateid);
 	error = nfscl_request(nd, dvp, p, cred);
 	if (error != 0)
 		return (error);
 	NFSCL_DEBUG(4, "nfsrpc_createlayout stat=%d err=%d\n", nd->nd_repstat,
 	    error);
 	if (nd->nd_repstat != 0)
 		*laystatp = nd->nd_repstat;
 	NFSCL_INCRSEQID(owp->nfsow_seqid, nd);
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 		NFSCL_DEBUG(4, "nfsrpc_createlayout open succeeded\n");
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 		    6 * NFSX_UNSIGNED);
 		/* Hold the open stateid until the Open structure exists. */
 		stateid.seqid = *tl++;
 		stateid.other[0] = *tl++;
 		stateid.other[1] = *tl++;
 		stateid.other[2] = *tl;
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error != 0)
 			goto nfsmout;
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		deleg = fxdr_unsigned(int, *tl);
 		if (deleg == NFSV4OPEN_DELEGATEREAD ||
 		    deleg == NFSV4OPEN_DELEGATEWRITE) {
 			if (!(owp->nfsow_clp->nfsc_flags &
 			      NFSCLFLAGS_FIRSTDELEG))
 				owp->nfsow_clp->nfsc_flags |=
 				  (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG);
 			/*
 			 * The new file's handle is not known yet, so leave
 			 * room for a maximum sized one; it is copied in below.
 			 */
 			dp = malloc(sizeof(struct nfscldeleg) + NFSX_V4FHMAX,
 			    M_NFSCLDELEG, M_WAITOK);
 			LIST_INIT(&dp->nfsdl_owner);
 			LIST_INIT(&dp->nfsdl_lock);
 			dp->nfsdl_clp = owp->nfsow_clp;
 			newnfs_copyincred(cred, &dp->nfsdl_cred);
 			nfscl_lockinit(&dp->nfsdl_rwlock);
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID +
 			    NFSX_UNSIGNED);
 			dp->nfsdl_stateid.seqid = *tl++;
 			dp->nfsdl_stateid.other[0] = *tl++;
 			dp->nfsdl_stateid.other[1] = *tl++;
 			dp->nfsdl_stateid.other[2] = *tl++;
 			/* Non-zero means NFSCLDL_RECALL gets set below. */
 			ret = fxdr_unsigned(int, *tl);
 			if (deleg == NFSV4OPEN_DELEGATEWRITE) {
 				dp->nfsdl_flags = NFSCLDL_WRITE;
 				/*
 				 * Indicates how much the file can grow.
 				 */
 				NFSM_DISSECT(tl, u_int32_t *,
 				    3 * NFSX_UNSIGNED);
 				limitby = fxdr_unsigned(int, *tl++);
 				switch (limitby) {
 				case NFSV4OPEN_LIMITSIZE:
 					dp->nfsdl_sizelimit = fxdr_hyper(tl);
 					break;
 				case NFSV4OPEN_LIMITBLOCKS:
 					/* Limit is the product of two words. */
 					dp->nfsdl_sizelimit =
 					    fxdr_unsigned(u_int64_t, *tl++);
 					dp->nfsdl_sizelimit *=
 					    fxdr_unsigned(u_int64_t, *tl);
 					break;
 				default:
 					error = NFSERR_BADXDR;
 					goto nfsmout;
 				};
 			} else {
 				dp->nfsdl_flags = NFSCLDL_READ;
 			}
 			if (ret != 0)
 				dp->nfsdl_flags |= NFSCLDL_RECALL;
 			error = nfsrv_dissectace(nd, &dp->nfsdl_ace, false,
 			    &ret, &acesize, p);
 			if (error != 0)
 				goto nfsmout;
 		} else if (deleg != NFSV4OPEN_DELEGATENONE) {
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 
 		/* Now, we should have the status for the SaveFH. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl == 0) {
 			NFSCL_DEBUG(4, "nfsrpc_createlayout SaveFH ok\n");
 			/*
 			 * Now, process the GetFH and Getattr for the newly
 			 * created file. nfscl_mtofh() will set
 			 * ND_NOMOREDATA if these weren't successful.
 			 */
 			error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp);
 			NFSCL_DEBUG(4, "aft nfscl_mtofh err=%d\n", error);
 			if (error != 0)
 				goto nfsmout;
 		} else
 			nd->nd_flag |= ND_NOMOREDATA;
 		/* Now we have the PutFH and Getattr for the directory. */
 		if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			/* A non-zero status for either op ends the parse. */
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			if (*++tl != 0)
 				nd->nd_flag |= ND_NOMOREDATA;
 			else {
 				NFSM_DISSECT(tl, uint32_t *, 2 *
 				    NFSX_UNSIGNED);
 				if (*++tl != 0)
 					nd->nd_flag |= ND_NOMOREDATA;
 			}
 		}
 		if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			/* Load the directory attributes. */
 			error = nfsm_loadattr(nd, dnap);
 			NFSCL_DEBUG(4, "aft nfsm_loadattr err=%d\n", error);
 			if (error != 0)
 				goto nfsmout;
 			*dattrflagp = 1;
 			if (dp != NULL && *attrflagp != 0) {
 				dp->nfsdl_change = nnap->na_filerev;
 				dp->nfsdl_modtime = nnap->na_mtime;
 				dp->nfsdl_flags |= NFSCLDL_MODTIMESET;
 			}
 			/*
 			 * We can now complete the Open state.
 			 */
 			nfhp = *nfhpp;
 			if (dp != NULL) {
 				dp->nfsdl_fhlen = nfhp->nfh_len;
 				NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh,
 				    nfhp->nfh_len);
 			}
 			/*
 			 * Get an Open structure that will be
 			 * attached to the OpenOwner, acquired already.
 			 */
 			error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, 
 			    (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0,
 			    cred, p, NULL, &op, &newone, NULL, 0, false);
 			if (error != 0)
 				goto nfsmout;
 			op->nfso_stateid = stateid;
 			newnfs_copyincred(cred, &op->nfso_cred);
 
 			nfscl_openrelease(nmp, op, error, newone);
 			/* Tell the caller the open has been released. */
 			*unlockedp = 1;
 
 			/* Now, handle the RestoreFH and LayoutGet. */
 			if (nd->nd_repstat == 0) {
 				NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED);
 				/* The 4th word is the LayoutGet status. */
 				*laystatp = fxdr_unsigned(int, *(tl + 3));
 				if (*laystatp == 0) {
 					error = nfsrv_parselayoutget(nmp, nd,
 					    stateidp, retonclosep, flhp);
 					if (error != 0)
 						*laystatp = error;
 				}
 				NFSCL_DEBUG(4, "aft nfsrv_parselayout err=%d\n",
 				    error);
 			} else
 				/* The create worked; only the layout failed. */
 				nd->nd_repstat = 0;
 		}
 	}
 	if (nd->nd_repstat != 0 && error == 0)
 		error = nd->nd_repstat;
 	if (error == NFSERR_STALECLIENTID)
 		nfscl_initiate_recovery(owp->nfsow_clp);
 nfsmout:
 	NFSCL_DEBUG(4, "eo nfsrpc_createlayout err=%d\n", error);
 	/* The delegation is only handed back on success. */
 	if (error == 0)
 		*dpp = dp;
 	else
 		free(dp, M_NFSCLDELEG);
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Similar to nfsrpc_getopenlayout(), except that it is used for the Create
  * case: the file is created and opened via nfsrpc_createlayout(), which also
  * attempts a LayoutGet, and the layout status is then processed by
  * nfsrpc_layoutgetres().  The returned value is the status of the create.
  */
 static int
 nfsrpc_getcreatelayout(vnode_t dvp, char *name, int namelen, struct vattr *vap,
     nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp,
     struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap,
     struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp,
     int *dattrflagp, int *unlockedp)
 {
 	struct nfsclflayouthead layouts;
 	struct nfscllayout *lyp = NULL;
 	struct nfsclsession *sep;
 	struct nfsmount *nmp = VFSTONFS(dvp->v_mount);
 	nfsv4stateid_t stateid;
 	uint8_t *fhp;
 	int error, fhlen, layoutlen, layouttype, laystat, retonclose;
 
 	error = 0;
 	layouttype = NFSHASFLEXFILE(nmp) ? NFSLAYOUT_FLEXFILE :
 	    NFSLAYOUT_NFSV4_1_FILES;
 	LIST_INIT(&layouts);
 	sep = nfsmnt_mdssession(nmp);
 	layoutlen = sep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED);
 
 	/* Always uses the "current stateid" for the LayoutGet. */
 	error = nfsrpc_createlayout(dvp, name, namelen, vap, cverf, fmode,
 	    owp, dpp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp,
 	    unlockedp, &stateid, 1, layouttype, layoutlen, &retonclose,
 	    &layouts, &laystat);
 	NFSCL_DEBUG(4, "aft nfsrpc_createlayoutrpc laystat=%d err=%d\n",
 	    laystat, error);
 
 	/* The new file handle is only passed on when a layout was returned. */
 	if (laystat == 0) {
 		fhp = (*nfhpp)->nfh_fh;
 		fhlen = (*nfhpp)->nfh_len;
 	} else {
 		fhp = NULL;
 		fhlen = 0;
 	}
 	laystat = nfsrpc_layoutgetres(nmp, dvp, fhp, fhlen, &stateid,
 	    retonclose, NULL, &lyp, &layouts, layouttype, laystat, NULL,
 	    cred, p);
 	if (laystat == 0)
 		nfscl_rellayout(lyp, 0);
 	return (error);
 }
 
 /*
  * Process the results of a layoutget() operation.
  *
  * laystat is the status of the layoutget.  NFSERR_UNKNLAYOUTTYPE adjusts
  * the mount's layout state bits; any non-zero value is then handed straight
  * back to the caller.  For a zero status, device information is attached to
  * every entry on flhp (fetching it with nfsrpc_getdeviceinfo() when
  * nfscl_adddevinfo() cannot attach it without one) and nfscl_layout() is
  * called.  When that succeeds and islockedp is not NULL, *islockedp is set
  * to 1.
  *
  * Returns the resulting layout status.
  */
 static int
 nfsrpc_layoutgetres(struct nfsmount *nmp, vnode_t vp, uint8_t *newfhp,
     int newfhlen, nfsv4stateid_t *stateidp, int retonclose, uint32_t *notifybit,
     struct nfscllayout **lypp, struct nfsclflayouthead *flhp, int layouttype,
     int laystat, int *islockedp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfscldevinfo *dip;
 	struct nfsclflayout *tflp;
 	uint8_t *dev;
 	int i, mirrorcnt;
 
 	if (laystat == NFSERR_UNKNLAYOUTTYPE) {
 		NFSLOCKMNT(nmp);
 		if (!NFSHASFLEXFILE(nmp)) {
 			/* Switch to using Flex File Layout. */
 			nmp->nm_state |= NFSSTA_FLEXFILE;
 		} else if (layouttype == NFSLAYOUT_FLEXFILE) {
 			/* Disable pNFS. */
 			NFSCL_DEBUG(1, "disable PNFS\n");
 			nmp->nm_state &= ~(NFSSTA_PNFS | NFSSTA_FLEXFILE);
 		}
 		NFSUNLOCKMNT(nmp);
 	}
 
 	/* A failed layoutget leaves nothing further to process. */
 	if (laystat != 0)
 		return (laystat);
 
 	NFSCL_DEBUG(4, "nfsrpc_layoutgetres at FOREACH\n");
 	LIST_FOREACH(tflp, flhp, nfsfl_list) {
 		mirrorcnt = 1;
 		if (layouttype == NFSLAYOUT_FLEXFILE)
 			mirrorcnt = tflp->nfsfl_mirrorcnt;
 		for (i = 0; i < mirrorcnt; i++) {
 			/* First try without freshly fetched device info. */
 			laystat = nfscl_adddevinfo(nmp, NULL, i, tflp);
 			NFSCL_DEBUG(4, "aft adddev=%d\n", laystat);
 			if (laystat == 0)
 				continue;
 
 			if (layouttype != NFSLAYOUT_FLEXFILE)
 				dev = tflp->nfsfl_dev;
 			else
 				dev = tflp->nfsfl_ffm[i].dev;
 			laystat = nfsrpc_getdeviceinfo(nmp, dev, layouttype,
 			    notifybit, &dip, cred, p);
 			NFSCL_DEBUG(4, "aft nfsrpc_gdi=%d\n", laystat);
 			if (laystat != 0)
 				return (laystat);
 			laystat = nfscl_adddevinfo(nmp, dip, i, tflp);
 			if (laystat != 0)
 				printf("nfsrpc_layoutgetresout"
 				    ": cannot add\n");
 		}
 	}
 	if (laystat != 0)
 		return (laystat);
 
 	/*
 	 * nfscl_layout() always returns with the nfsly_lock
 	 * set to a refcnt (shared lock).
 	 * Passing in dvp is sufficient, since it is only used to
 	 * get the fsid for the file system.
 	 */
 	laystat = nfscl_layout(nmp, vp, newfhp, newfhlen, stateidp,
 	    layouttype, retonclose, flhp, lypp, cred, p);
 	NFSCL_DEBUG(4, "nfsrpc_layoutgetres: aft nfscl_layout=%d\n",
 	    laystat);
 	if (laystat == 0 && islockedp != NULL)
 		*islockedp = 1;
 	return (laystat);
 }
 
 /*
  * nfs copy_file_range operation.
  *
  * Fetches a read stateid for invp and a write stateid for outvp, performs
  * the copy via nfsrpc_copyrpc() and repeats the whole sequence while the RPC
  * fails with one of the recoverable errors tested by the loop condition.
  *
  * On success *inoffp and *outoffp are advanced by the byte count that
  * nfsrpc_copyrpc() stored in *lenp, and *must_commitp is set to true when
  * the reported commit level is not NFSWRITE_FILESYNC (it is never set to
  * false here).  The "flags" argument is not referenced in this function.
  *
  * Returns 0 or an error.  A remaining error is converted to EIO when the
  * retry count reached 4 or when it is one of NFSERR_STALESTATEID,
  * NFSERR_BADSESSION or NFSERR_STALEDONTRECOVER.
  */
 int
 nfsrpc_copy_file_range(vnode_t invp, off_t *inoffp, vnode_t outvp,
     off_t *outoffp, size_t *lenp, unsigned int flags, int *inattrflagp,
     struct nfsvattr *innap, int *outattrflagp, struct nfsvattr *outnap,
     struct ucred *cred, bool consecutive, bool *must_commitp)
 {
 	int commit, error, expireret = 0, retrycnt;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(invp->v_mount);
 	struct nfsfh *innfhp = NULL, *outnfhp = NULL;
 	nfsv4stateid_t instateid, outstateid;
 	void *inlckp, *outlckp;
 
 	/* clidrev stays 0 when the mount has no client structure. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	innfhp = VTONFS(invp)->n_fhp;
 	outnfhp = VTONFS(outvp)->n_fhp;
 	retrycnt = 0;
 	do {
 		/* Get both stateids. */
 		inlckp = NULL;
 		nfscl_getstateid(invp, innfhp->nfh_fh, innfhp->nfh_len,
 		    NFSV4OPEN_ACCESSREAD, 0, NULL, curthread, &instateid,
 		    &inlckp);
 		outlckp = NULL;
 		nfscl_getstateid(outvp, outnfhp->nfh_fh, outnfhp->nfh_len,
 		    NFSV4OPEN_ACCESSWRITE, 0, NULL, curthread, &outstateid,
 		    &outlckp);
 
 		error = nfsrpc_copyrpc(invp, *inoffp, outvp, *outoffp, lenp,
 		    &instateid, &outstateid, innap, inattrflagp, outnap,
 		    outattrflagp, consecutive, &commit, cred, curthread);
 		if (error == 0) {
 			if (commit != NFSWRITE_FILESYNC)
 				*must_commitp = true;
 			/* *lenp now holds the count reported by the RPC. */
 			*inoffp += *lenp;
 			*outoffp += *lenp;
 		} else if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		/* Drop whatever nfscl_getstateid() handed back via *lckp. */
 		if (inlckp != NULL)
 			nfscl_lockderef(inlckp);
 		if (outlckp != NULL)
 			nfscl_lockderef(outlckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_cfr");
 		} else if ((error == NFSERR_EXPIRED || (!NFSHASINT(nmp) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev,
 			    curthread);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp)) {
 			/* Not retried: EIO matches none of the loop conditions. */
 			error = EIO;
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_DELAY ||
 	    error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4));
 	if (error != 0 && (retrycnt >= 4 ||
 	    error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION ||
 	      error == NFSERR_STALEDONTRECOVER))
 		error = EIO;
 	return (error);
 }
 
 /*
  * The copy RPC.
  *
  * Builds one compound request that contains, in this order: an optional
  * Setattr on the input file (only when the mount lacks MNT_NOATIME), a
  * Getattr for invp, a PutFH for outvp, the Copy itself and a Getattr for
  * outvp, then parses the reply in the same order.
  *
  * *lenp is the requested byte count on entry (clipped to nfs_maxcopyrange)
  * and is zeroed before the request is sent; it is set to the length taken
  * from the reply only when nd_repstat is still 0 after parsing.
  * *inattrflagp and *outattrflagp start at 0 and are set once the matching
  * attributes were loaded into innap/outnap.  *commitp starts as
  * NFSWRITE_UNSTABLE and is replaced by the commit level from the reply.
  *
  * Returns a local error, or else nd_repstat.
  *
  * NOTE(review): NFSM_DISSECT() presumably sets error and jumps to nfsmout
  * when the reply is too short -- confirm against the macro definition.
  */
 static int
 nfsrpc_copyrpc(vnode_t invp, off_t inoff, vnode_t outvp, off_t outoff,
     size_t *lenp, nfsv4stateid_t *instateidp, nfsv4stateid_t *outstateidp,
     struct nfsvattr *innap, int *inattrflagp, struct nfsvattr *outnap,
     int *outattrflagp, bool consecutive, int *commitp, struct ucred *cred,
     NFSPROC_T *p)
 {
 	uint32_t *tl, *opcntp;
 	int error;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct nfsmount *nmp;
 	nfsattrbit_t attrbits;
 	struct vattr va;
 	uint64_t len;
 
 	nmp = VFSTONFS(invp->v_mount);
 	*inattrflagp = *outattrflagp = 0;
 	*commitp = NFSWRITE_UNSTABLE;
 	len = *lenp;
 	*lenp = 0;
 	if (len > nfs_maxcopyrange)
 		len = nfs_maxcopyrange;
 	nfscl_reqstart(nd, NFSPROC_COPY, nmp, VTONFS(invp)->n_fhp->nfh_fh,
 	    VTONFS(invp)->n_fhp->nfh_len, &opcntp, NULL, 0, 0, cred);
 	/*
 	 * First do a Setattr of atime to the server's clock
 	 * time.  The FreeBSD "collective" was of the opinion
 	 * that setting atime was necessary for this syscall.
 	 * Do the Setattr before the Copy, so that it can be
 	 * handled well if the server replies NFSERR_DELAY to
 	 * the Setattr operation.
 	 */
 	if ((nmp->nm_mountp->mnt_flag & MNT_NOATIME) == 0) {
 		NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 		*tl = txdr_unsigned(NFSV4OP_SETATTR);
 		nfsm_stateidtom(nd, instateidp, NFSSTATEID_PUTSTATEID);
 		VATTR_NULL(&va);
 		va.va_atime.tv_sec = va.va_atime.tv_nsec = 0;
 		va.va_vaflags = VA_UTIMES_NULL;
 		nfscl_fillsattr(nd, &va, invp, 0, 0);
 		/* Bump opcnt from 7 to 8. */
 		*opcntp = txdr_unsigned(8);
 	}
 
 	/* Now Getattr the invp attributes. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 
 	/* Set outvp. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_PUTFH);
 	(void)nfsm_fhtom(nmp, nd, VTONFS(outvp)->n_fhp->nfh_fh,
 	    VTONFS(outvp)->n_fhp->nfh_len, 0);
 
 	/* Do the Copy. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_COPY);
 	nfsm_stateidtom(nd, instateidp, NFSSTATEID_PUTSTATEID);
 	nfsm_stateidtom(nd, outstateidp, NFSSTATEID_PUTSTATEID);
 	/*
 	 * Three hypers (input offset, output offset, length) followed by
 	 * four words; the last of those words is the opcode of the trailing
 	 * Getattr, which is why no separate NFSM_BUILD precedes it below.
 	 */
 	NFSM_BUILD(tl, uint32_t *, 3 * NFSX_HYPER + 4 * NFSX_UNSIGNED);
 	txdr_hyper(inoff, tl); tl += 2;
 	txdr_hyper(outoff, tl); tl += 2;
 	txdr_hyper(len, tl); tl += 2;
 	if (consecutive)
 		*tl++ = newnfs_true;
 	else
 		*tl++ = newnfs_false;
 	/*
 	 * NOTE(review): the next two words are presumably the "synchronous"
 	 * flag and an empty source server list -- confirm against the Copy
 	 * argument definition.
 	 */
 	*tl++ = newnfs_true;
 	*tl++ = 0;
 
 	/* Get the outvp attributes. */
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSWRITEGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 
 	error = nfscl_request(nd, invp, p, cred);
 	if (error != 0)
 		return (error);
 	/* Skip over the Setattr reply. */
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0 &&
 	    (nmp->nm_mountp->mnt_flag & MNT_NOATIME) == 0) {
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		/* The second word is the status of the operation. */
 		if (*(tl + 1) == 0) {
 			error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 			if (error != 0)
 				goto nfsmout;
 		} else
 			nd->nd_flag |= ND_NOMOREDATA;
 	}
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 		/* Get the input file's attributes. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		if (*(tl + 1) == 0) {
 			error = nfsm_loadattr(nd, innap);
 			if (error != 0)
 				goto nfsmout;
 			*inattrflagp = 1;
 		} else
 			nd->nd_flag |= ND_NOMOREDATA;
 	}
 	/* Skip over return stat for PutFH. */
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl != 0)
 			nd->nd_flag |= ND_NOMOREDATA;
 	}
 	/* Skip over return stat for Copy. */
 	if ((nd->nd_flag & ND_NOMOREDATA) == 0)
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		if (*tl != 0) {
 			/* There should be no callback ids. */
 			error = NFSERR_BADXDR;
 			goto nfsmout;
 		}
 		/*
 		 * Layout of what follows: length (hyper), commit level
 		 * (one word), write verifier (NFSX_VERF bytes), then two
 		 * more words.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED +
 		    NFSX_VERF);
 		len = fxdr_hyper(tl); tl += 2;
 		*commitp = fxdr_unsigned(int, *tl++);
 		/*
 		 * Record the verifier on first sight; if it differs from
 		 * the stored one, store the new one and report
 		 * NFSERR_STALEWRITEVERF.
 		 */
 		NFSLOCKMNT(nmp);
 		if (!NFSHASWRITEVERF(nmp)) {
 			NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 			NFSSETWRITEVERF(nmp);
 	    	} else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) {
 			NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF);
 			nd->nd_repstat = NFSERR_STALEWRITEVERF;
 		}
 		NFSUNLOCKMNT(nmp);
 		tl += (NFSX_VERF / NFSX_UNSIGNED);
 		/*
 		 * *++tl steps over the first of the two trailing words and
 		 * tests the second one.
 		 */
 		if (nd->nd_repstat == 0 && *++tl != newnfs_true)
 			/* Must be a synchronous copy. */
 			nd->nd_repstat = NFSERR_NOTSUPP;
 		/* Step over the two words in front of the outvp attributes. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsm_loadattr(nd, outnap);
 		if (error == 0)
 			*outattrflagp = NFS_LATTR_NOSHRINK;
 		if (nd->nd_repstat == 0)
 			*lenp = len;
 	} else if (nd->nd_repstat == NFSERR_OFFLOADNOREQS) {
 		/*
 		 * For the case where consecutive is not supported, but
 		 * synchronous is supported, we can try consecutive == false
 		 * by returning this error.  Otherwise, return NFSERR_NOTSUPP,
 		 * since Copy cannot be done.
 		 */
 		if ((nd->nd_flag & ND_NOMOREDATA) == 0) {
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			if (!consecutive || *++tl == newnfs_false)
 				nd->nd_repstat = NFSERR_NOTSUPP;
 		} else
 			nd->nd_repstat = NFSERR_BADXDR;
 	}
 	/* A local parse error takes precedence over the reply status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Seek operation.
  *
  * Obtains a read stateid for vp, performs the seek through nfsrpc_seekrpc()
  * and repeats while the RPC fails with one of the recoverable errors tested
  * by the loop condition.  offp, eofp, content, nap and attrflagp are passed
  * through to nfsrpc_seekrpc() unchanged.
  *
  * Returns 0 or an error; any error still set once the retry count has
  * reached 4 is converted to EIO.
  */
 int
 nfsrpc_seek(vnode_t vp, off_t *offp, bool *eofp, int content,
     struct ucred *cred, struct nfsvattr *nap, int *attrflagp)
 {
 	int error, expireret = 0, retrycnt;
 	u_int32_t clidrev = 0;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsfh *nfhp = NULL;
 	nfsv4stateid_t stateid;
 	void *lckp;
 
 	/* clidrev stays 0 when the mount has no client structure. */
 	if (nmp->nm_clp != NULL)
 		clidrev = nmp->nm_clp->nfsc_clientidrev;
 	nfhp = np->n_fhp;
 	retrycnt = 0;
 	do {
 		lckp = NULL;
 		nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len,
 		    NFSV4OPEN_ACCESSREAD, 0, cred, curthread, &stateid, &lckp);
 		error = nfsrpc_seekrpc(vp, offp, &stateid, eofp, content,
 		    nap, attrflagp, cred);
 		if (error == NFSERR_STALESTATEID)
 			nfscl_initiate_recovery(nmp->nm_clp);
 		/* Drop whatever nfscl_getstateid() handed back via lckp. */
 		if (lckp != NULL)
 			nfscl_lockderef(lckp);
 		if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 		    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 		    error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) {
 			(void) nfs_catnap(PZERO, error, "nfs_seek");
 		} else if ((error == NFSERR_EXPIRED || (!NFSHASINT(nmp) &&
 		    error == NFSERR_BADSTATEID)) && clidrev != 0) {
 			expireret = nfscl_hasexpired(nmp->nm_clp, clidrev,
 			    curthread);
 		} else if (error == NFSERR_BADSTATEID && NFSHASINT(nmp)) {
 			/* Not retried: EIO matches none of the loop conditions. */
 			error = EIO;
 		}
 		retrycnt++;
 	} while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID ||
 	    error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY ||
 	    error == NFSERR_BADSESSION ||
 	    (error == NFSERR_OLDSTATEID && retrycnt < 20) ||
 	    ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) &&
 	     expireret == 0 && clidrev != 0 && retrycnt < 4) ||
 	    (error == NFSERR_OPENMODE && retrycnt < 4));
 	if (error && retrycnt >= 4)
 		error = EIO;
 	return (error);
 }
 
 /*
  * The seek RPC.
  *
  * Sends a Seek for *offp with the given stateid and content type, followed
  * by a Getattr.  When the reply status is 0, *eofp and *offp are updated
  * from the reply and the file attributes are loaded into nap, with
  * *attrflagp set to 1 once that succeeded (it is 0 otherwise).
  *
  * Returns a local error (request failure or a reply that could not be
  * decoded), or else the status carried in the reply.
  */
 static int
 nfsrpc_seekrpc(vnode_t vp, off_t *offp, nfsv4stateid_t *stateidp, bool *eofp,
     int content, struct nfsvattr *nap, int *attrflagp, struct ucred *cred)
 {
 	uint32_t *tl;
 	int error;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_SEEK, vp, cred);
 	nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID);
 	/* Offset, content type and the opcode of the trailing Getattr. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	txdr_hyper(*offp, tl); tl += 2;
 	*tl++ = txdr_unsigned(content);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, curthread, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * One word of eof flag, the resulting offset as a hyper and
 		 * the two words that precede the Getattr attributes.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_HYPER);
 		if (*tl++ == newnfs_true)
 			*eofp = true;
 		else
 			*eofp = false;
 		*offp = fxdr_hyper(tl);
 		/* Just skip over Getattr op status. */
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = 1;
 	}
 	/*
 	 * Do not let the reply status hide a failure to decode the
 	 * attributes; this matches the other RPCs in this file.
 	 */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The getextattr RPC.
  *
  * Requests the extended attribute "name" of vp, followed by a Getattr.
  * When the reply status is 0, *lenp is set to the attribute length taken
  * from the reply.  With a non-NULL uiop the value is copied into it,
  * truncated to uiop->uio_resid when the attribute is longer; with a NULL
  * uiop the data is skipped so that only the length is reported.  A
  * non-empty attribute that fails the length sanity check yields ENOATTR.
  * The file attributes are then loaded into nap and *attrflagp is set to 1
  * once that succeeded (it is 0 otherwise).
  *
  * Returns a local error, or else the status carried in the reply.
  */
 int
 nfsrpc_getextattr(vnode_t vp, const char *name, struct uio *uiop, ssize_t *lenp,
     struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int error;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	uint32_t len, len2;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_GETEXTATTR, vp, cred);
 	nfsm_strtom(nd, name, strlen(name));
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 		len = fxdr_unsigned(uint32_t, *tl);
 		/* Sanity check lengths. */
 		if (uiop != NULL && len > 0 && len <= IOSIZE_MAX &&
 		    uiop->uio_resid <= UINT32_MAX) {
 			len2 = uiop->uio_resid;
 			if (len2 >= len)
 				error = nfsm_mbufuio(nd, uiop, len);
 			else {
 				error = nfsm_mbufuio(nd, uiop, len2);
 				if (error == 0) {
 					/*
 					 * nfsm_mbufuio() advances to a multiple
 					 * of 4, so round up len2 as well.  Then
 					 * we need to advance over the rest of
 					 * the data, rounding up the remaining
 					 * length.
 					 */
 					len2 = NFSM_RNDUP(len2);
 					/*
 					 * NOTE(review): the rounded len2 can
 					 * exceed len by up to 3; this appears
 					 * to rely on the uint32_t subtraction
 					 * wrapping and NFSM_RNDUP() of the
 					 * wrapped value yielding 0 -- confirm
 					 * against the macro definition.
 					 */
 					len2 = NFSM_RNDUP(len - len2);
 					if (len2 > 0)
 						error = nfsm_advance(nd, len2,
 						    -1);
 				}
 			}
 		} else if (uiop == NULL && len > 0) {
 			/* Just wants the length and not the data. */
 			error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 		} else if (len > 0)
 			error = ENOATTR;
 		if (error != 0)
 			goto nfsmout;
 		*lenp = len;
 		/* Just skip over Getattr op status. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = 1;
 	}
 	/* A local parse error takes precedence over the reply status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The setextattr RPC.
  *
  * Sends the extended attribute "name" of vp with the uiop->uio_resid bytes
  * described by uiop as its value, followed by a Getattr.  EINVAL is
  * returned without sending anything when the value is larger than
  * nd_maxreq.  When the reply status is 0, the file attributes are loaded
  * into nap and *attrflagp is set to 1 once that succeeded (it is 0
  * otherwise).
  *
  * Returns a local error, or else the status carried in the reply.
  */
 int
 nfsrpc_setextattr(vnode_t vp, const char *name, struct uio *uiop,
     struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int error;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_SETEXTATTR, vp, cred);
 	if (uiop->uio_resid > nd->nd_maxreq) {
 		/* nd_maxreq is set by NFSCL_REQSTART(). */
 		m_freem(nd->nd_mreq);
 		return (EINVAL);
 	}
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4SXATTR_EITHER);
 	nfsm_strtom(nd, name, strlen(name));
 	/* The value goes out as a length word followed by the data. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(uiop->uio_resid);
 	error = nfsm_uiombuf(nd, uiop, uiop->uio_resid);
 	if (error != 0) {
 		/* The request was never sent, so free it here. */
 		m_freem(nd->nd_mreq);
 		return (error);
 	}
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Just skip over the reply and Getattr op status. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 *
 		    NFSX_UNSIGNED);
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = 1;
 	}
 	/* A local parse error takes precedence over the reply status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The removeextattr RPC.
  *
  * Sends a request naming the extended attribute "name" of vp, followed by
  * a Getattr.  When the reply status is 0, the file attributes are loaded
  * into nap and *attrflagp is set to 1 once that succeeded (it is 0
  * otherwise).
  *
  * Returns a local error, or else the status carried in the reply.
  */
 int
 nfsrpc_rmextattr(vnode_t vp, const char *name, struct nfsvattr *nap,
     int *attrflagp, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int error;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_RMEXTATTR, vp, cred);
 	nfsm_strtom(nd, name, strlen(name));
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	if (nd->nd_repstat == 0) {
 		/* Just skip over the reply and Getattr op status. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 *
 		    NFSX_UNSIGNED);
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = 1;
 	}
 	/* A local parse error takes precedence over the reply status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * The listextattr RPC.
  *
  * Sends *cookiep and *lenp to the server, followed by a Getattr.  After the
  * request has been sent, *eofp is preset to true and *lenp to 0.  When the
  * reply status is 0:
  *  - *cookiep is replaced by the cookie from the reply;
  *  - each returned name is either copied to uiop as one length byte
  *    followed by the name, or skipped when uiop is NULL or has less than
  *    len + 1 bytes left (the latter also clears *eofp);
  *  - *lenp grows by len + 1 for every name, copied or not;
  *  - *eofp is also cleared when the first word after the names is 0;
  *  - the file attributes are loaded into nap and *attrflagp is set to 1
  *    once that succeeded (it is 0 otherwise).
  *
  * Returns a local error (EBADRPC for a negative count or a name length
  * outside 1..EXTATTR_MAXNAMELEN), or else the status carried in the reply.
  */
 int
 nfsrpc_listextattr(vnode_t vp, uint64_t *cookiep, struct uio *uiop,
     size_t *lenp, bool *eofp, struct nfsvattr *nap, int *attrflagp,
     struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	int cnt, error, i, len;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	nfsattrbit_t attrbits;
 	u_char c;
 
 	*attrflagp = 0;
 	NFSCL_REQSTART(nd, NFSPROC_LISTEXTATTR, vp, cred);
 	/* Cookie, *lenp and the opcode of the trailing Getattr. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	txdr_hyper(*cookiep, tl); tl += 2;
 	*tl++ = txdr_unsigned(*lenp);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = nfscl_request(nd, vp, p, cred);
 	if (error != 0)
 		return (error);
 	*eofp = true;
 	*lenp = 0;
 	if (nd->nd_repstat == 0) {
 		NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED);
 		*cookiep = fxdr_hyper(tl); tl += 2;
 		cnt = fxdr_unsigned(int, *tl);
 		if (cnt < 0) {
 			error = EBADRPC;
 			goto nfsmout;
 		}
 		for (i = 0; i < cnt; i++) {
 			NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED);
 			len = fxdr_unsigned(int, *tl);
 			if (len <= 0 || len > EXTATTR_MAXNAMELEN) {
 				error = EBADRPC;
 				goto nfsmout;
 			}
 			if (uiop == NULL)
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 			else if (uiop->uio_resid >= len + 1) {
 				/* Emit a one byte length ahead of the name. */
 				c = len;
 				error = uiomove(&c, sizeof(c), uiop);
 				if (error == 0)
 					error = nfsm_mbufuio(nd, uiop, len);
 			} else {
 				/* No room left: skip the name, note truncation. */
 				error = nfsm_advance(nd, NFSM_RNDUP(len), -1);
 				*eofp = false;
 			}
 			if (error != 0)
 				goto nfsmout;
 			*lenp += (len + 1);
 		}
 		/* Get the eof and skip over the Getattr op status. */
 		NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED);
 		/*
 		 * *eofp is set false above, because it wasn't able to copy
 		 * all of the reply.
 		 */
 		if (*eofp && *tl == 0)
 			*eofp = false;
 		error = nfsm_loadattr(nd, nap);
 		if (error == 0)
 			*attrflagp = 1;
 	}
 	/* A local parse error takes precedence over the reply status. */
 	if (error == 0)
 		error = nd->nd_repstat;
 nfsmout:
 	m_freem(nd->nd_mrep);
 	return (error);
 }
 
 /*
  * Split an mbuf list.  For non-M_EXTPG mbufs, just use m_split().
  *
  * Splits the chain headed by mp at byte offset xfer and returns the part
  * that follows the split point; mp is left holding the first xfer bytes.
  * Only the flags of the first mbuf decide which path is taken.  On the
  * M_EXTPG path NULL is returned when the chain ends before xfer bytes have
  * been passed.
  *
  * When the split point falls inside an M_EXTPG mbuf, a new M_EXTPG mbuf is
  * allocated for the tail.  Data that follows the split point within the
  * same page is copied into a newly allocated page; the pages after it are
  * moved to the new mbuf by physical address.
  */
 static struct mbuf *
 nfsm_split(struct mbuf *mp, uint64_t xfer)
 {
 	struct mbuf *m, *m2;
 	vm_page_t pg;
 	int i, j, left, pgno, plen, trim;
 	char *cp, *cp2;
 
 	if ((mp->m_flags & M_EXTPG) == 0) {
 		m = m_split(mp, xfer, M_WAITOK);
 		return (m);
 	}
 
 	/* Find the correct mbuf to split at. */
 	for (m = mp; m != NULL && xfer > m->m_len; m = m->m_next)
 		xfer -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 
 	/* If xfer == m->m_len, we can just split the mbuf list. */
 	if (xfer == m->m_len) {
 		m2 = m->m_next;
 		m->m_next = NULL;
 		return (m2);
 	}
 
 	/* Find the page to split at. */
 	pgno = 0;
 	left = xfer;
 	/*
 	 * On exit from the loop, pgno is the page containing the split
 	 * point, plen is that page's data length and left is the number
 	 * of bytes of it that stay in m.
 	 */
 	do {
 		if (pgno == 0)
 			plen = m_epg_pagelen(m, 0, m->m_epg_1st_off);
 		else
 			plen = m_epg_pagelen(m, pgno, 0);
 		if (left <= plen)
 			break;
 		left -= plen;
 		pgno++;
 	} while (pgno < m->m_epg_npgs);
 	if (pgno == m->m_epg_npgs)
 		panic("nfsm_split: eroneous ext_pgs mbuf");
 
-	m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs);
+	m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs, 0);
 	m2->m_epg_flags |= EPG_FLAG_ANON;
 
 	/*
 	 * If left < plen, allocate a new page for the new mbuf
 	 * and copy the data after left in the page to this new
 	 * page.
 	 */
 	if (left < plen) {
 		pg = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_NODUMP |
 		    VM_ALLOC_WIRED);
 		m2->m_epg_pa[0] = VM_PAGE_TO_PHYS(pg);
 		m2->m_epg_npgs = 1;
 
 		/* Copy the data after left to the new page. */
 		trim = plen - left;
 		cp = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[pgno]);
 		if (pgno == 0)
 			cp += m->m_epg_1st_off;
 		cp += left;
 		cp2 = (char *)(void *)PHYS_TO_DMAP(m2->m_epg_pa[0]);
 		/*
 		 * If the split page was the last page of m, the copied
 		 * bytes are all of the new mbuf's data and sit at the start
 		 * of the new page.  Otherwise they are placed at the end of
 		 * the new page and the first-page offset is set to match.
 		 */
 		if (pgno == m->m_epg_npgs - 1)
 			m2->m_epg_last_len = trim;
 		else {
 			cp2 += PAGE_SIZE - trim;
 			m2->m_epg_1st_off = PAGE_SIZE - trim;
 			m2->m_epg_last_len = m->m_epg_last_len;
 		}
 		memcpy(cp2, cp, trim);
 		m2->m_len = trim;
 	} else {
 		/* The split is on a page boundary, so nothing is copied. */
 		m2->m_len = 0;
 		m2->m_epg_last_len = m->m_epg_last_len;
 	}
 
 	/* Move the pages beyond pgno to the new mbuf. */
 	for (i = pgno + 1, j = m2->m_epg_npgs; i < m->m_epg_npgs; i++, j++) {
 		m2->m_epg_pa[j] = m->m_epg_pa[i];
 		/* Never moves page 0. */
 		m2->m_len += m_epg_pagelen(m, i, 0);
 	}
 	m2->m_epg_npgs = j;
 	/* Trim m so that it ends at the split point. */
 	m->m_epg_npgs = pgno + 1;
 	m->m_epg_last_len = left;
 	m->m_len = xfer;
 
 	/* The rest of the original chain now follows the new mbuf. */
 	m2->m_next = m->m_next;
 	m->m_next = NULL;
 	return (m2);
 }
 
 /*
  * Do the NFSv4.1 Bind Connection to Session.
  * Called from the reconnect layer of the krpc (sys/rpc/clnt_rc.c).
  *
  * arg points to a struct nfscl_reconarg that supplies the session id and
  * the minor version.  The request is sent directly on cl with a 30 second
  * timeout, using credentials built from cr.  Nothing is returned to the
  * caller; failures and unexpected replies are only reported via printf().
  */
 void
 nfsrpc_bindconnsess(CLIENT *cl, void *arg, struct ucred *cr)
 {
 	struct nfscl_reconarg *rcp = (struct nfscl_reconarg *)arg;
 	uint32_t res, *tl;
 	struct nfsrv_descript nfsd;
 	struct nfsrv_descript *nd = &nfsd;
 	struct rpc_callextra ext;
 	struct timeval utimeout;
 	enum clnt_stat stat;
 	int error;
 
 	nfscl_reqstart(nd, NFSPROC_BINDCONNTOSESS, NULL, NULL, 0, NULL, NULL,
 	    NFS_VER4, rcp->minorvers, NULL);
 	/* Session id followed by two words. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED);
 	memcpy(tl, rcp->sessionid, NFSX_V4SESSIONID);
 	tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 	*tl++ = txdr_unsigned(NFSCDFC4_FORE_OR_BOTH);
 	*tl = newnfs_false;
 
 	memset(&ext, 0, sizeof(ext));
 	utimeout.tv_sec = 30;
 	utimeout.tv_usec = 0;
 	ext.rc_auth = authunix_create(cr);
 	nd->nd_mrep = NULL;
 	stat = CLNT_CALL_MBUF(cl, &ext, NFSV4PROC_COMPOUND, nd->nd_mreq,
 	    &nd->nd_mrep, utimeout);
 	AUTH_DESTROY(ext.rc_auth);
 	if (stat != RPC_SUCCESS) {
 		printf("nfsrpc_bindconnsess: call failed stat=%d\n", stat);
 		return;
 	}
 	if (nd->nd_mrep == NULL) {
 		printf("nfsrpc_bindconnsess: no reply args\n");
 		return;
 	}
 	error = 0;
 	/* Set up the descriptor for parsing the reply. */
 	newnfs_realign(&nd->nd_mrep, M_WAITOK);
 	nd->nd_md = nd->nd_mrep;
 	nd->nd_dpos = mtod(nd->nd_md, char *);
 	NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 	nd->nd_repstat = fxdr_unsigned(uint32_t, *tl++);
 	if (nd->nd_repstat == NFSERR_OK) {
 		/* The second word is a byte count to be skipped. */
 		res = fxdr_unsigned(uint32_t, *tl);
 		if (res > 0 && (error = nfsm_advance(nd, NFSM_RNDUP(res),
 		    -1)) != 0)
 			goto nfsmout;
 		/* Three words, the session id, then one more word. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID +
 		    4 * NFSX_UNSIGNED);
 		tl += 3;
 		if (!NFSBCMP(tl, rcp->sessionid, NFSX_V4SESSIONID)) {
 			tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
 			res = fxdr_unsigned(uint32_t, *tl);
 			if (res != NFSCDFS4_BOTH)
 				printf("nfsrpc_bindconnsess: did not "
 				    "return FS4_BOTH\n");
 		} else
 			printf("nfsrpc_bindconnsess: not same "
 			    "sessionid\n");
 	} else if (nd->nd_repstat != NFSERR_BADSESSION)
 		printf("nfsrpc_bindconnsess: returned %d\n", nd->nd_repstat);
 nfsmout:
 	if (error != 0)
 		printf("nfsrpc_bindconnsess: reply bad xdr\n");
 	m_freem(nd->nd_mrep);
 }
 
 /*
  * Do roughly what nfs_statfs() does for NFSv4, but when called with a shared
  * locked vnode.
  *
  * Calls nfsrpc_statfs() and, when attributes came back, loads them into
  * the attribute cache.  If the RPC succeeded, the client's renew value is
  * updated from the lease (when the mount has a client structure) and the
  * file system and statfs information is loaded into the mount while
  * holding nm_mtx.  Nothing is returned; an RPC error simply skips the
  * updates.
  */
 static void
 nfscl_statfs(struct vnode *vp, struct ucred *cred, NFSPROC_T *td)
 {
 	struct mount *mp = vp->v_mount;
 	struct nfsmount *nmp = VFSTONFS(mp);
 	struct nfsstatfs sbinfo;
 	struct nfsfsinfo fsinfo;
 	struct nfsvattr attrs;
 	uint32_t lease;
 	int error, gotattr;
 
 	error = nfsrpc_statfs(vp, &sbinfo, &fsinfo, &lease, cred, td, &attrs,
 	    &gotattr);
 	if (gotattr != 0)
 		(void) nfscl_loadattrcache(&vp, &attrs, NULL, 0, 1);
 	if (error != 0)
 		return;
 
 	NFSLOCKCLSTATE();
 	if (nmp->nm_clp != NULL)
 		nmp->nm_clp->nfsc_renew = NFSCL_RENEW(lease);
 	NFSUNLOCKCLSTATE();
 
 	mtx_lock(&nmp->nm_mtx);
 	nfscl_loadfsinfo(nmp, &fsinfo);
 	nfscl_loadsbinfo(nmp, &sbinfo, &mp->mnt_stat);
 	mp->mnt_stat.f_iosize = newnfs_iosize(nmp);
 	mtx_unlock(&nmp->nm_mtx);
 }
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
index 5c1c5b095449..73c98209474a 100644
--- a/sys/kern/kern_mbuf.c
+++ b/sys/kern/kern_mbuf.c
@@ -1,1801 +1,1802 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2004, 2005,
  *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_param.h"
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/domainset.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/refcount.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 #include <vm/uma_dbg.h>
 
 _Static_assert(MJUMPAGESIZE > MCLBYTES,
     "Cluster must be smaller than a jumbo page");
 
 /*
  * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
  * Zones.
  *
  * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
  * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
  * administrator so desires.
  *
  * Mbufs are allocated from a UMA Primary Zone called the Mbuf
  * Zone.
  *
  * Additionally, FreeBSD provides a Packet Zone, which it
  * configures as a Secondary Zone to the Mbuf Primary Zone,
  * thus sharing backend Slab kegs with the Mbuf Primary Zone.
  *
  * Thus common-case allocations and locking are simplified:
  *
  *  m_clget()                m_getcl()
  *    |                         |
  *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
  *    |   |             [     Packet   ]            |
  *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
  *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Primary Zone ]
  *        |                       \________         |
  *  [ Cluster Keg   ]                      \       /
  *        |	                         [ Mbuf Keg   ]
  *  [ Cluster Slabs ]                         |
  *        |                              [ Mbuf Slabs ]
  *         \____________(VM)_________________/
  *
  *
  * Whenever an object is allocated with uma_zalloc() out of
  * one of the Zones its _ctor_ function is executed.  Likewise,
  * for any deallocation through uma_zfree() the _dtor_ function
  * is executed.
  *
  * Caches are per-CPU and are filled from the Primary Zone.
  *
  * Whenever an object is allocated from the underlying global
  * memory pool it gets pre-initialized with the _zinit_ functions.
  * When the Kegs are overfull, objects get decommissioned with
  * _zfini_ functions and freed back to the global memory pool.
  *
  */
 
 int nmbufs;			/* limits number of mbufs */
 int nmbclusters;		/* limits number of mbuf clusters */
 int nmbjumbop;			/* limits number of page size jumbo clusters */
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
 int nmbjumbo16;			/* limits number of 16k jumbo clusters */
 
 bool mb_use_ext_pgs = false;	/* use M_EXTPG mbufs for sendfile & TLS */
 
 /*
  * Sysctl handler for kern.ipc.mb_use_ext_pgs.
  *
  * The bool is exposed through a temporary int.  A write that would enable
  * unmapped mbufs when PMAP_HAS_DMAP is false is rejected with EOPNOTSUPP;
  * any other write stores "extpg != 0".  Returns 0 or an errno value.
  */
 static int
 sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS)
 {
 	int error, extpg;
 
 	extpg = mb_use_ext_pgs;
 	error = sysctl_handle_int(oidp, &extpg, 0, req);
 	/* req->newptr is non-NULL only when a new value was supplied. */
 	if (error == 0 && req->newptr != NULL) {
 		if (extpg != 0 && !PMAP_HAS_DMAP)
 			error = EOPNOTSUPP;
 		else
 			mb_use_ext_pgs = extpg != 0;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
     &mb_use_ext_pgs, 0, sysctl_mb_use_ext_pgs, "IU",
     "Use unmapped mbufs for sendfile(2) and TLS offload");
 
 static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */
 
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
     "Maximum real memory allocatable to various mbuf types");
 
 static counter_u64_t snd_tag_count;
 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
     &snd_tag_count, "# of active mbuf send tags");
 
 /*
  * tunable_mbinit() has to be run before any mbuf allocations are done.
  *
  * Computes maxmbufmem and the per-type limits (nmbclusters, nmbjumbop,
  * nmbjumbo9, nmbjumbo16, nmbufs) and the mb_use_ext_pgs default.  Each
  * value is first fetched from its kern.ipc.* tunable; a cluster limit
  * that is still 0 afterwards gets a default derived from maxmbufmem.
  * The 'dummy' SYSINIT argument is not used.
  */
 static void
 tunable_mbinit(void *dummy)
 {
 	quad_t realmem;
 	int extpg;
 
 	/*
 	 * The default limit for all mbuf related memory is 1/2 of all
 	 * available kernel memory (physical or kmem).
 	 * At most it can be 3/4 of available kernel memory.
 	 */
 	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
 	maxmbufmem = realmem / 2;
 	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
 	/* Clamp a tunable-supplied value to the 3/4 ceiling. */
 	if (maxmbufmem > realmem / 4 * 3)
 		maxmbufmem = realmem / 4 * 3;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 	if (nmbclusters == 0)
 		nmbclusters = maxmbufmem / MCLBYTES / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
 	if (nmbjumbop == 0)
 		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
 	if (nmbjumbo9 == 0)
 		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
 	if (nmbjumbo16 == 0)
 		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
 
 	/*
 	 * We need at least as many mbufs as we have clusters of
 	 * the various types added together.
 	 */
 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
 		nmbufs = lmax(maxmbufmem / MSIZE / 5,
 		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
 
 	/*
 	 * Unmapped mbufs can only safely be used on platforms with a direct
 	 * map.  There they default to enabled unless the tunable says
 	 * otherwise; without a direct map mb_use_ext_pgs keeps its
 	 * initial value and the tunable is not consulted.
 	 */
 	if (PMAP_HAS_DMAP) {
 		extpg = 1;
 		TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg);
 		mb_use_ext_pgs = extpg != 0;
 	}
 }
 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
 
 /*
  * Sysctl handler for kern.ipc.nmbclusters.
  *
  * A changed value is accepted only if it raises the limit and nmbufs still
  * covers the sum of the cluster limits as they stand before the change;
  * any other changed value fails with EINVAL.  Writing the current value is
  * a no-op.  The value stored is the one returned by uma_zone_set_max(),
  * and the nmbclusters_change event is raised afterwards.
  */
 static int
 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbclusters;
 
 	newnmbclusters = nmbclusters;
 	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
 	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
 		if (newnmbclusters > nmbclusters &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbclusters = newnmbclusters;
 			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 			EVENTHANDLER_INVOKE(nmbclusters_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     &nmbclusters, 0, sysctl_nmbclusters, "IU",
     "Maximum number of mbuf clusters allowed");
 
 /*
  * Sysctl handler for kern.ipc.nmbjumbop.
  *
  * A changed value is accepted only if it raises the limit and nmbufs still
  * covers the sum of the cluster limits as they stand before the change;
  * any other changed value fails with EINVAL.  Writing the current value is
  * a no-op.  The value stored is the one returned by uma_zone_set_max().
  */
 static int
 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbop;
 
 	newnmbjumbop = nmbjumbop;
 	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
 		if (newnmbjumbop > nmbjumbop &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbop = newnmbjumbop;
 			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
     "Maximum number of mbuf page size jumbo clusters allowed");
 
 /*
  * Sysctl handler for kern.ipc.nmbjumbo9.
  *
  * A changed value is accepted only if it raises the limit and nmbufs still
  * covers the sum of the cluster limits as they stand before the change;
  * any other changed value fails with EINVAL.  Writing the current value is
  * a no-op.  The value stored is the one returned by uma_zone_set_max().
  */
 static int
 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo9;
 
 	newnmbjumbo9 = nmbjumbo9;
 	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
 		if (newnmbjumbo9 > nmbjumbo9 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo9 = newnmbjumbo9;
 			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
     "Maximum number of mbuf 9k jumbo clusters allowed");
 
 /*
  * Sysctl handler for kern.ipc.nmbjumbo16.
  *
  * A changed value is accepted only if it raises the limit and nmbufs still
  * covers the sum of the cluster limits as they stand before the change;
  * any other changed value fails with EINVAL.  Writing the current value is
  * a no-op.  The value stored is the one returned by uma_zone_set_max().
  */
 static int
 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo16;
 
 	newnmbjumbo16 = nmbjumbo16;
 	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
 		if (newnmbjumbo16 > nmbjumbo16 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo16 = newnmbjumbo16;
 			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
 
 /*
  * Sysctl handler for kern.ipc.nmbufs.
  *
  * A changed value is accepted only if it raises the limit; a lower value
  * fails with EINVAL and writing the current value is a no-op.  Unlike the
  * cluster handlers, no check against the cluster limits is made here.
  * The value stored is the one returned by uma_zone_set_max(), and the
  * nmbufs_change event is raised afterwards.
  */
 static int
 sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbufs;
 
 	newnmbufs = nmbufs;
 	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
 	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
 		if (newnmbufs > nmbufs) {
 			nmbufs = newnmbufs;
 			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 			EVENTHANDLER_INVOKE(nmbufs_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     &nmbufs, 0, sysctl_nmbufs, "IU",
     "Maximum number of mbufs allowed");
 
 /*
  * Zones from which we allocate.
  */
 uma_zone_t	zone_mbuf;
 uma_zone_t	zone_clust;
 uma_zone_t	zone_pack;
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
 
 /*
  * Local prototypes.
  */
 static int	mb_ctor_mbuf(void *, int, void *, int);
 static int	mb_ctor_clust(void *, int, void *, int);
 static int	mb_ctor_pack(void *, int, void *, int);
 static void	mb_dtor_mbuf(void *, int, void *);
 static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
 _Static_assert(sizeof(struct mbuf) <= MSIZE,
     "size of mbuf exceeds MSIZE");
 /*
  * Initialize FreeBSD Network buffer allocation.
  *
  * Creates the mbuf, cluster, packet and three jumbo zones and allocates
  * the send-tag counter.  For every zone with a limit variable, the limit
  * is applied only when that variable is positive, and the variable is
  * then overwritten with the value uma_zone_set_max() returns.  Each
  * limited zone also gets a "limit reached" warning string and
  * mb_reclaim() as its max-action callback.  'dummy' is not used.
  */
 static void
 mbuf_init(void *dummy)
 {
 
 	/*
 	 * Configure UMA zones for Mbufs, Clusters, and Packets.
 	 */
 	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
 	    mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
 	if (nmbufs > 0)
 		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
 	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);
 
 	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbclusters > 0)
 		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
 	uma_zone_set_maxaction(zone_clust, mb_reclaim);
 
 	/* The packet zone is a secondary zone layered on zone_mbuf. */
 	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
 	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
 
 	/* Make jumbo frame zone too. Page size, 9k and 16k. */
 	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbop > 0)
 		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
 	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);
 
 	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo9 > 0)
 		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
 	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);
 
 	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo16 > 0)
 		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
 	/* Backs the kern.ipc.num_snd_tags sysctl counter. */
 	snd_tag_count = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
 
 #ifdef DEBUGNET
 /*
  * debugnet makes use of a pre-allocated pool of mbufs and clusters.  When
  * debugnet is configured, we initialize a set of UMA cache zones which return
  * items from this pool.  At panic-time, the regular UMA zone pointers are
  * overwritten with those of the cache zones so that drivers may allocate and
  * free mbufs and clusters without attempting to allocate physical memory.
  *
  * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
  * the purpose of caching clusters, we treat them as mbufs.
  */
 /*
  * Pools of pre-allocated mbufs and clusters.  The trailing "0, INT_MAX"
  * initializers presumably set the queue length and its maximum -- TODO
  * confirm against struct mbufq.
  */
 static struct mbufq dn_mbufq =
     { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX };
 static struct mbufq dn_clustq =
     { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX };
 
 /* Cluster size in use; set by debugnet_mbuf_reinit(). */
 static int dn_clsize;
 /* Cache zones created by debugnet_mbuf_reinit(); NULL when not set up. */
 static uma_zone_t dn_zone_mbuf;
 static uma_zone_t dn_zone_clust;
 static uma_zone_t dn_zone_pack;
 
 /*
  * Regular zone pointers saved by debugnet_mbuf_start() and restored by
  * debugnet_mbuf_finish().  dsz_debugnet_zones_enabled is true while the
  * debugnet zones are installed.
  */
 static struct debugnet_saved_zones {
 	uma_zone_t dsz_mbuf;
 	uma_zone_t dsz_clust;
 	uma_zone_t dsz_pack;
 	uma_zone_t dsz_jumbop;
 	uma_zone_t dsz_jumbo9;
 	uma_zone_t dsz_jumbo16;
 	bool dsz_debugnet_zones_enabled;
 } dn_saved_zones;
 
 /*
  * Import callback for the debugnet mbuf and cluster cache zones.
  *
  * 'arg' is the mbufq backing the zone (&dn_mbufq or &dn_clustq).  Up to
  * 'count' items are dequeued into 'store'; the number actually supplied
  * is returned and may be smaller if the queue runs dry.  An M_WAITOK
  * request that cannot be fully satisfied trips the KASSERT.
  */
 static int
 dn_buf_import(void *arg, void **store, int count, int domain __unused,
     int flags)
 {
 	struct mbufq *q;
 	struct mbuf *m;
 	int i;
 
 	q = arg;
 
 	for (i = 0; i < count; i++) {
 		m = mbufq_dequeue(q);
 		if (m == NULL)
 			break;
 		/* Item size depends on which pool this zone draws from. */
 		trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags);
 		store[i] = m;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || i == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (i);
 }
 
 /*
  * Release callback for the debugnet mbuf and cluster cache zones: puts
  * the 'count' items in 'store' back on the mbufq passed as 'arg'.  The
  * result of mbufq_enqueue() is deliberately ignored.
  */
 static void
 dn_buf_release(void *arg, void **store, int count)
 {
 	struct mbufq *q;
 	struct mbuf *m;
 	int i;
 
 	q = arg;
 
 	for (i = 0; i < count; i++) {
 		m = store[i];
 		(void)mbufq_enqueue(q, m);
 	}
 }
 
 /*
  * Import callback for the debugnet packet cache zone.
  *
  * Builds up to 'count' mbuf+cluster pairs: the mbuf comes from m_get()
  * and the cluster from dn_zone_clust, attached with mb_ctor_clust().
  * Stops at the first allocation failure and returns the number of
  * packets stored.
  *
  * NOTE(review): 'flags' is marked __unused but is read by the KASSERT
  * below; the annotation looks misleading -- confirm.
  */
 static int
 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused,
     int flags __unused)
 {
 	struct mbuf *m;
 	void *clust;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		m = m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			break;
 		clust = uma_zalloc(dn_zone_clust, M_NOWAIT);
 		if (clust == NULL) {
 			/* Do not leak the mbuf that has no cluster. */
 			m_free(m);
 			break;
 		}
 		mb_ctor_clust(clust, dn_clsize, m, 0);
 		store[i] = m;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || i == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (i);
 }
 
 /*
  * Release callback for the debugnet packet cache zone: splits each of
  * the 'count' packets in 'store' and returns the cluster to
  * dn_zone_clust and the mbuf to dn_zone_mbuf.
  */
 static void
 dn_pack_release(void *arg __unused, void **store, int count)
 {
 	struct mbuf *m;
 	void *clust;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		m = store[i];
 		/* Read the cluster pointer before the mbuf is freed. */
 		clust = m->m_ext.ext_buf;
 		uma_zfree(dn_zone_clust, clust);
 		uma_zfree(dn_zone_mbuf, m);
 	}
 }
 
 /*
  * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy
  * the corresponding UMA cache zones.
  *
  * Zones that were never created (NULL) are skipped, so this may be called
  * before the first debugnet_mbuf_reinit().
  */
 void
 debugnet_mbuf_drain(void)
 {
 	struct mbuf *m;
 	void *item;
 
 	if (dn_zone_mbuf != NULL) {
 		uma_zdestroy(dn_zone_mbuf);
 		dn_zone_mbuf = NULL;
 	}
 	if (dn_zone_clust != NULL) {
 		uma_zdestroy(dn_zone_clust);
 		dn_zone_clust = NULL;
 	}
 	if (dn_zone_pack != NULL) {
 		uma_zdestroy(dn_zone_pack);
 		dn_zone_pack = NULL;
 	}
 
 	/*
 	 * Whatever is left in the pools goes back through m_free() and the
 	 * zone that m_getzone() selects for dn_clsize.
 	 */
 	while ((m = mbufq_dequeue(&dn_mbufq)) != NULL)
 		m_free(m);
 	while ((item = mbufq_dequeue(&dn_clustq)) != NULL)
 		uma_zfree(m_getzone(dn_clsize), item);
 }
 
 /*
  * Callback invoked immediately prior to starting a debugnet connection.
  *
  * Saves the six regular zone pointers in dn_saved_zones and replaces them
  * with the debugnet cache zones.  Must not be called while the debugnet
  * zones are already installed (asserted with MPASS).
  */
 void
 debugnet_mbuf_start(void)
 {
 
 	MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	/* Save the old zone pointers to restore when debugnet is closed. */
 	dn_saved_zones = (struct debugnet_saved_zones) {
 		.dsz_debugnet_zones_enabled = true,
 		.dsz_mbuf = zone_mbuf,
 		.dsz_clust = zone_clust,
 		.dsz_pack = zone_pack,
 		.dsz_jumbop = zone_jumbop,
 		.dsz_jumbo9 = zone_jumbo9,
 		.dsz_jumbo16 = zone_jumbo16,
 	};
 
 	/*
 	 * All cluster zones return buffers of the size requested by the
 	 * drivers.  It's up to the driver to reinitialize the zones if the
 	 * MTU of a debugnet-enabled interface changes.
 	 */
 	printf("debugnet: overwriting mbuf zone pointers\n");
 	zone_mbuf = dn_zone_mbuf;
 	zone_clust = dn_zone_clust;
 	zone_pack = dn_zone_pack;
 	/* Every jumbo zone is served by the single debugnet cluster zone. */
 	zone_jumbop = dn_zone_clust;
 	zone_jumbo9 = dn_zone_clust;
 	zone_jumbo16 = dn_zone_clust;
 }
 
 /*
  * Callback invoked when a debugnet connection is closed/finished.
  *
  * Restores the zone pointers saved by debugnet_mbuf_start() and clears
  * dn_saved_zones, which also resets dsz_debugnet_zones_enabled.  Must
  * only be called while the debugnet zones are installed (asserted with
  * MPASS).
  */
 void
 debugnet_mbuf_finish(void)
 {
 
 	MPASS(dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	printf("debugnet: restoring mbuf zone pointers\n");
 	zone_mbuf = dn_saved_zones.dsz_mbuf;
 	zone_clust = dn_saved_zones.dsz_clust;
 	zone_pack = dn_saved_zones.dsz_pack;
 	zone_jumbop = dn_saved_zones.dsz_jumbop;
 	zone_jumbo9 = dn_saved_zones.dsz_jumbo9;
 	zone_jumbo16 = dn_saved_zones.dsz_jumbo16;
 
 	memset(&dn_saved_zones, 0, sizeof(dn_saved_zones));
 }
 
 /*
  * Reinitialize the debugnet mbuf+cluster pool and cache zones.
  *
  * Drains any previous pool, records 'clsize' in dn_clsize, recreates the
  * three cache zones and then pre-allocates 'nmbuf' mbufs and 'nclust'
  * clusters of size 'clsize'.  The allocations use M_WAITOK.
  */
 void
 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize)
 {
 	struct mbuf *m;
 	void *item;
 
 	debugnet_mbuf_drain();
 
 	dn_clsize = clsize;
 
 	dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME,
 	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_mbufq, UMA_ZONE_NOBUCKET);
 
 	dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME,
 	    clsize, mb_ctor_clust, NULL, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_clustq, UMA_ZONE_NOBUCKET);
 
 	dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME,
 	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
 	    dn_pack_import, dn_pack_release,
 	    NULL, UMA_ZONE_NOBUCKET);
 
 	/*
 	 * Fill the pools: allocate from the regular allocators and free
 	 * into the cache zones, whose release callback is dn_buf_release()
 	 * (presumably that is how the items land on the queues -- TODO
 	 * confirm against UMA_ZONE_NOBUCKET behaviour).
 	 */
 	while (nmbuf-- > 0) {
 		m = m_get(M_WAITOK, MT_DATA);
 		uma_zfree(dn_zone_mbuf, m);
 	}
 	while (nclust-- > 0) {
 		item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK);
 		uma_zfree(dn_zone_clust, item);
 	}
 }
 #endif /* DEBUGNET */
 
 /*
  * Constructor for Mbuf primary zone.
  *
  * The 'arg' pointer points to a mb_args structure which
  * contains call-specific information required to support the
  * mbuf allocation API.  See mbuf.h.
  *
  * Returns 0 without touching 'mem' when the requested type is MT_NOINIT;
  * otherwise returns the result of m_init().  'size' is not used.
  */
 static int
 mb_ctor_mbuf(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 	struct mb_args *args;
 	int error;
 	int flags;
 	short type;
 
 	args = (struct mb_args *)arg;
 	type = args->type;
 
 	/*
 	 * The mbuf is initialized later.  The caller has the
 	 * responsibility to set up any MAC labels too.
 	 */
 	if (type == MT_NOINIT)
 		return (0);
 
 	m = (struct mbuf *)mem;
 	flags = args->flags;
 	/* M_NOFREE must not be requested through this zone. */
 	MPASS((flags & M_NOFREE) == 0);
 
 	error = m_init(m, how, type, flags);
 
 	return (error);
 }
 
 /*
  * The Mbuf primary zone destructor.
  *
  * Deletes the packet tag chain of a packet-header mbuf that still has
  * tags.  'arg' is interpreted as a flag word only for the assertion that
  * bit 0 (the obsolete MB_DTOR_SKIP) is clear.  'size' is not used.
  */
 static void
 mb_dtor_mbuf(void *mem, int size, void *arg)
 {
 	struct mbuf *m;
 	unsigned long flags __diagused;
 
 	m = (struct mbuf *)mem;
 	flags = (unsigned long)arg;
 
 	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
 	KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__));
 	if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
 		m_tag_delete_chain(m, NULL);
 }
 
 /*
  * The Mbuf Packet zone destructor.
  *
  * Deletes the tag chain of a packet-header mbuf, asserts that the attached
  * cluster is still in the state the packet zone expects, and drains the
  * packet zone when zone_clust reports exhaustion.  'size' and 'arg' are
  * not used.
  */
 static void
 mb_dtor_pack(void *mem, int size, void *arg)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;
 	if ((m->m_flags & M_PKTHDR) != 0)
 		m_tag_delete_chain(m, NULL);
 
 	/* Make sure we've got a clean cluster back. */
 	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
 	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
 	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
 	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
 	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
 	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
 	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_dtor(m->m_ext.ext_buf, MCLBYTES, zone_clust);
 #endif
 	/*
 	 * If there are processes blocked on zone_clust, waiting for pages
 	 * to be freed up, cause them to be woken up by draining the
 	 * packet zone.  We are exposed to a race here (in the check for
 	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
 	 * is deliberate. We don't want to acquire the zone lock for every
 	 * mbuf free.
 	 *
 	 * NOTE(review): the check below is uma_zone_exhausted(); the
 	 * UMA_ZFLAG_FULL name above may be stale -- confirm.
 	 */
 	if (uma_zone_exhausted(zone_clust))
 		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
 /*
  * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
  *
  * Here the 'arg' pointer points to the Mbuf which we
  * are configuring cluster storage for.  If 'arg' is
  * empty we allocate just the cluster without setting
  * the mbuf to it.  See mbuf.h.
  *
  * When an mbuf is given, the cluster 'mem' of 'size' bytes becomes its
  * external storage with an embedded reference count of 1.  Always
  * returns 0; 'how' is not used.
  */
 static int
 mb_ctor_clust(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)arg;
 	if (m != NULL) {
 		m->m_ext.ext_buf = (char *)mem;
 		m->m_data = m->m_ext.ext_buf;
 		m->m_flags |= M_EXT;
 		m->m_ext.ext_free = NULL;
 		m->m_ext.ext_arg1 = NULL;
 		m->m_ext.ext_arg2 = NULL;
 		m->m_ext.ext_size = size;
 		/* The external storage type is derived from the size. */
 		m->m_ext.ext_type = m_gettype(size);
 		m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 		m->m_ext.ext_count = 1;
 	}
 
 	return (0);
 }
 
 /*
  * The Packet secondary zone's init routine, executed on the
  * object's transition from mbuf keg slab to zone cache.
  *
  * Allocates a cluster from zone_clust with the mbuf as the constructor
  * argument, then marks the external storage as EXT_PACKET.  Returns
  * ENOMEM if no cluster could be attached, 0 otherwise.  'size' is not
  * used.
  */
 static int
 mb_zinit_pack(void *mem, int size, int how)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;		/* m is virgin. */
 	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
 	    m->m_ext.ext_buf == NULL)
 		return (ENOMEM);
 	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
 #endif
 	return (0);
 }
 
 /*
  * The Packet secondary zone's fini routine, executed on the
  * object's transition from zone cache to keg slab.
  *
  * Returns the attached cluster to zone_clust.  The trash_* calls are
  * compiled in only with INVARIANTS and without KMSAN.
  */
 static void
 mb_zfini_pack(void *mem, int size)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_fini(m->m_ext.ext_buf, MCLBYTES);
 #endif
 	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_dtor(mem, size, zone_clust);
 #endif
 }
 
 /*
  * The Packet zone constructor (registered as the ctor of zone_pack in
  * mbuf_init()).
  *
  * 'arg' points to a mb_args structure supplying the mbuf type and flags.
  * The mbuf is initialized with m_init() and then pointed at the cluster
  * already attached to it.  Returns the result of m_init(); note that
  * m_data and m_flags are set even when m_init() reports an error.
  * 'size' is not used.
  */
 static int
 mb_ctor_pack(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 	struct mb_args *args;
 	int error, flags;
 	short type;
 
 	m = (struct mbuf *)mem;
 	args = (struct mb_args *)arg;
 	flags = args->flags;
 	type = args->type;
 	MPASS((flags & M_NOFREE) == 0);
 
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_ctor(m->m_ext.ext_buf, MCLBYTES, zone_clust, how);
 #endif
 
 	error = m_init(m, how, type, flags);
 
 	/* m_ext is already initialized. */
 	m->m_data = m->m_ext.ext_buf;
  	m->m_flags = (flags | M_EXT);
 
 	return (error);
 }
 
 /*
  * This is the protocol drain routine.  Called by UMA whenever any of the
  * mbuf zones is close to its limit.
  *
  * It is installed with uma_zone_set_maxaction() in mbuf_init() and only
  * raises the mbuf_lowmem event with VM_LOW_MBUFS; both arguments are
  * ignored.
  */
 static void
 mb_reclaim(uma_zone_t zone __unused, int pending __unused)
 {
 
 	EVENTHANDLER_INVOKE(mbuf_lowmem, VM_LOW_MBUFS);
 }
 
 /*
  * Free "count" units of I/O from an mbuf chain.  They could be held
  * in M_EXTPG or just as a normal mbuf.  This code is intended to be
  * called in an error path (I/O error, closed connection, etc).
  *
  * A normal mbuf accounts for one unit.  An M_EXTPG mbuf accounts for one
  * unit per decrement of m_epg_nrdy and is freed only when that counter
  * reaches zero, so the same mbuf may be visited on several iterations.
  * The KASSERT fires if the chain ends before 'count' units were consumed.
  */
 void
 mb_free_notready(struct mbuf *m, int count)
 {
 	int i;
 
 	for (i = 0; i < count && m != NULL; i++) {
 		if ((m->m_flags & M_EXTPG) != 0) {
 			m->m_epg_nrdy--;
 			if (m->m_epg_nrdy != 0)
 				continue;
 		}
 		/* m_free() hands back the next mbuf in the chain. */
 		m = m_free(m);
 	}
 	KASSERT(i == count, ("Removed only %d items from %p", i, m));
 }
 
 /*
  * Compress an unmapped mbuf into a simple mbuf when it holds a small
  * amount of data.  This is used as a DOS defense to avoid having
  * small packets tie up wired pages, an ext_pgs structure, and an
  * mbuf.  Since this converts the existing mbuf in place, it can only
  * be used if there are no other references to 'm'.
  *
  * Returns EBUSY, leaving 'm' untouched, if the reference count is not 1;
  * returns 0 after the conversion.  The caller must pass an M_EXTPG mbuf
  * without a packet header whose m_len is at most MLEN (both asserted).
  */
 int
 mb_unmapped_compress(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	char buf[MLEN];
 
 	/*
 	 * Assert that 'm' does not have a packet header.  If 'm' had
 	 * a packet header, it would only be able to hold MHLEN bytes
 	 * and m_data would have to be initialized differently.
 	 */
 	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG),
             ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
 	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
 
 	/* Find the reference count: embedded in 'm' or reached via ext_cnt. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 	}
 
 	if (*refcnt != 1)
 		return (EBUSY);
 
 	/* Stash the payload on the stack before the pages are released. */
 	m_copydata(m, 0, m->m_len, buf);
 
 	/* Free the backing pages. */
 	m->m_ext.ext_free(m);
 
 	/* Turn 'm' into a "normal" mbuf. */
 	m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
 	m->m_data = m->m_dat;
 
 	/* Copy data back into m. */
 	bcopy(buf, mtod(m, char *), m->m_len);
 
 	return (0);
 }
 
 /*
  * These next few routines are used to permit downgrading an unmapped
  * mbuf to a chain of mapped mbufs.  This is used when an interface
  * doesn't support unmapped mbufs or if checksums need to be
  * computed in software.
  *
  * Each unmapped mbuf is converted to a chain of mbufs.  First, any
  * TLS header data is stored in a regular mbuf.  Second, each page of
  * unmapped data is stored in an mbuf with an EXT_SFBUF external
  * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
  * associated physical page.  They also hold a reference on the
  * original M_EXTPG mbuf to ensure the physical page doesn't go away.
  * Finally, any TLS trailer data is stored in a regular mbuf.
  *
  * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
  * mbufs.  It frees the associated sf_buf and releases its reference
  * on the original M_EXTPG mbuf.
  *
  * _mb_unmapped_to_ext() is a helper function that converts a single
  * unmapped mbuf into a chain of mbufs.
  *
  * mb_unmapped_to_ext() is the public function that walks an mbuf
  * chain converting any unmapped mbufs to mapped mbufs.  It returns
  * the new chain of mapped mbufs on success.  On failure it frees
  * the original mbuf chain and returns NULL.
  */
 /*
  * ext_free handler for the EXT_SFBUF mbufs created by
  * _mb_unmapped_to_ext(): ext_arg1 holds the sf_buf to free and ext_arg2
  * the backing M_EXTPG mbuf whose reference is dropped.
  */
 static void
 mb_unmapped_free_mext(struct mbuf *m)
 {
 	struct sf_buf *sf;
 	struct mbuf *old_m;
 
 	sf = m->m_ext.ext_arg1;
 	sf_buf_free(sf);
 
 	/* Drop the reference on the backing M_EXTPG mbuf. */
 	old_m = m->m_ext.ext_arg2;
 	mb_free_extpg(old_m);
 }
 
 /*
  * Convert the single M_EXTPG mbuf 'm' into a chain of mapped mbufs:
  * an optional header mbuf, one EXT_SFBUF mbuf per page that still holds
  * data, and an optional trailer mbuf.
  *
  * 'm' is released with m_free() on both the success and the failure
  * path, so the caller must not use it afterwards.  Returns the head of
  * the new chain, or NULL if an mbuf or sf_buf allocation fails, in which
  * case the partially built chain has been freed.
  *
  * NOTE(review): if m_len is 0 nothing is allocated and NULL is returned
  * from the success path too -- confirm callers never pass an empty mbuf.
  */
 static struct mbuf *
 _mb_unmapped_to_ext(struct mbuf *m)
 {
 	struct mbuf *m_new, *top, *prev, *mref;
 	struct sf_buf *sf;
 	vm_page_t pg;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	volatile u_int *refcnt;
 	u_int ref_inc = 0;
 
 	M_ASSERTEXTPG(m);
 	len = m->m_len;
 	KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p",
 	    __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	top = NULL;
 	if (m->m_epg_hdrlen != 0) {
 		if (off >= m->m_epg_hdrlen) {
 			/* The whole header was already trimmed away. */
 			off -= m->m_epg_hdrlen;
 		} else {
 			/* Copy what is left of the header into a plain mbuf. */
 			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			m_new = m_get(M_NOWAIT, MT_DATA);
 			if (m_new == NULL)
 				goto fail;
 			m_new->m_len = seglen;
 			prev = top = m_new;
 			memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
 			    seglen);
 		}
 	}
 	pgoff = m->m_epg_1st_off;
 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* Pages entirely covered by 'off' carry no live data. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		/*
 		 * Link m_new into the chain before sf_buf_alloc() so that
 		 * the fail path frees it together with the rest of 'top'.
 		 */
 		if (top == NULL) {
 			top = prev = m_new;
 		} else {
 			prev->m_next = m_new;
 			prev = m_new;
 		}
 		sf = sf_buf_alloc(pg, SFB_NOWAIT);
 		if (sf == NULL)
 			goto fail;
 
 		/* One reference per EXT_SFBUF mbuf, added in bulk below. */
 		ref_inc++;
 		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
-		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+		    mb_unmapped_free_mext, sf, mref, m->m_flags & M_RDONLY,
+		    EXT_SFBUF);
 		m_new->m_data += segoff;
 		m_new->m_len = seglen;
 
 		pgoff = 0;
 	};
 	/* Anything still unaccounted for must come from the trailer. */
 	if (len != 0) {
 		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d)", off, len,
 		    m->m_epg_trllen));
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		if (top == NULL)
 			top = m_new;
 		else
 			prev->m_next = m_new;
 		m_new->m_len = len;
 		memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
 	}
 
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be dropped
 		 * in mb_unmapped_free_mext().
 		 *
 		 * A count of exactly 1 is bumped without an atomic op,
 		 * presumably because no other holder can race with us
 		 * then -- TODO confirm.
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	/* Give up the reference that 'm' itself represented. */
 	m_free(m);
 	return (top);
 
 fail:
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be
 		 * immediately dropped when these mbufs are freed
 		 * below.
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	m_free(m);
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Walk the chain 'top' and replace every M_EXTPG mbuf with the chain of
  * mapped mbufs produced by _mb_unmapped_to_ext().  Other mbufs are kept
  * as they are.
  *
  * Returns the head of the resulting chain, or NULL if a conversion
  * fails.  On failure every mbuf of the original chain has been released
  * and the caller must not touch it.
  */
 struct mbuf *
 mb_unmapped_to_ext(struct mbuf *top)
 {
 	struct mbuf *m, *next, *prev;
 
 	prev = NULL;
 	for (m = top; m != NULL; m = next) {
 		/* m might be freed, so cache the next pointer. */
 		next = m->m_next;
 		if (m->m_flags & M_EXTPG) {
 			if (prev != NULL) {
 				/*
 				 * Remove 'm' from the new chain so
 				 * that the 'top' chain terminates
 				 * before 'm' in case 'top' is freed
 				 * due to an error.
 				 */
 				prev->m_next = NULL;
 			}
 			m = _mb_unmapped_to_ext(m);
 			if (m == NULL) {
 				/*
 				 * _mb_unmapped_to_ext() has already
 				 * released the mbuf it was handed.  If
 				 * that mbuf was the head of the chain
 				 * (prev == NULL), 'top' still points at
 				 * it and must not be freed a second
 				 * time.  Otherwise 'top' is the already
 				 * processed prefix, terminated at 'prev'
 				 * above.
 				 */
 				if (prev != NULL)
 					m_freem(top);
 				m_freem(next);
 				return (NULL);
 			}
 			if (prev == NULL) {
 				top = m;
 			} else {
 				prev->m_next = m;
 			}
 
 			/*
 			 * Replaced one mbuf with a chain, so we must
 			 * find the end of chain.
 			 */
 			prev = m_last(m);
 		} else {
 			if (prev != NULL) {
 				prev->m_next = m;
 			}
 			prev = m;
 		}
 	}
 	return (top);
 }
 
 /*
  * Allocate an empty M_EXTPG mbuf.  The ext_free routine is
  * responsible for freeing any pages backing this mbuf when it is
  * freed.
  *
  * 'how' is handed to m_get(); NULL is returned if that allocation
  * fails.  All m_epg_* bookkeeping fields start out zero/NULL, m_data is
  * NULL, and the reference count is embedded in the mbuf with an initial
  * value of 1.
  *
  * In the patched signature the caller-supplied 'flags' are ORed into
  * m_flags alongside M_EXT and M_EXTPG; the previous version set
  * M_RDONLY unconditionally.
  */
 struct mbuf *
-mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
+mb_alloc_ext_pgs(int how, m_ext_free_t ext_free, int flags)
 {
 	struct mbuf *m;
 
 	m = m_get(how, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	/* No pages attached yet: clear all page/TLS bookkeeping. */
 	m->m_epg_npgs = 0;
 	m->m_epg_nrdy = 0;
 	m->m_epg_1st_off = 0;
 	m->m_epg_last_len = 0;
 	m->m_epg_flags = 0;
 	m->m_epg_hdrlen = 0;
 	m->m_epg_trllen = 0;
 	m->m_epg_tls = NULL;
 	m->m_epg_so = NULL;
 	m->m_data = NULL;
-	m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG);
+	m->m_flags |= M_EXT | M_EXTPG | flags;
 	/* The reference count lives in this mbuf itself. */
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 	m->m_ext.ext_size = 0;
 	m->m_ext.ext_free = ext_free;
 	return (m);
 }
 
 /*
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  *
  * 'm' must have M_EXT set.  The reference count is either embedded in
  * 'm' (EXT_FLAG_EMBREF) or lives in another mbuf, 'mref', located via
  * the ext_cnt pointer.  When the last reference is dropped the storage
  * is released according to ext_type.  'm' itself is released at the end
  * unless it is M_NOFREE or it is 'mref' (which the switch handles).
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 	int freembuf;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/*
 	 * Check if the header is embedded in the cluster.  It is
 	 * important that we can't touch any of the mbuf fields
 	 * after we have freed the external storage, since mbuf
 	 * could have been embedded in it.  For now, the mbufs
 	 * embedded into the cluster are always of type EXT_EXTREF,
 	 * and for this type we won't free the mref.
 	 */
 	if (m->m_flags & M_NOFREE) {
 		freembuf = 0;
 		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
 		    m->m_ext.ext_type == EXT_RXRING,
 		    ("%s: no-free mbuf %p has wrong type", __func__, m));
 	} else
 		freembuf = 1;
 
 	/*
 	 * Free attached storage if this mbuf is the only reference to it.
 	 * The plain read short-circuits the atomic when the count is
 	 * already 1; otherwise the atomic decrement decides who is last.
 	 */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:
 			/*
 			 * The packet zone is special.  If the atomic
 			 * decrement took the count to 0, put it back to 1
 			 * before handing the mbuf to zone_pack.
 			 */
 			if (*refcnt == 0)
 				*refcnt = 1;
 			uma_zfree(zone_pack, mref);
 			break;
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBO9:
 			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_SFBUF:
 		case EXT_NET_DRV:
 		case EXT_CTL:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
 			/* Owner-supplied free routine, invoked on mref. */
 			KASSERT(mref->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			mref->m_ext.ext_free(mref);
 			m_free_raw(mref);
 			break;
 		case EXT_EXTREF:
 			/* Free routine is invoked on 'm'; mref is not freed. */
 			KASSERT(m->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			m->m_ext.ext_free(m);
 			break;
 		case EXT_RXRING:
 			/* Nothing to release here. */
 			KASSERT(m->m_ext.ext_free == NULL,
 			    ("%s: ext_free is set", __func__));
 			break;
 		default:
 			KASSERT(m->m_ext.ext_type == 0,
 			    ("%s: unknown ext_type", __func__));
 		}
 	}
 
 	/* mref, if it needed freeing, was already handled above. */
 	if (freembuf && m != mref)
 		m_free_raw(m);
 }
 
 /*
  * Clean up after mbufs with M_EXTPG storage attached to them if the
  * reference count hits 1.
  *
  * As in mb_free_ext(), the count is either embedded in 'm' or held by
  * another mbuf 'mref' reached through ext_cnt.  When the last reference
  * goes away, mref's ext_free routine is called and mref is released.
  * 'm' is released separately when it is not mref.
  */
 void
 mb_free_extpg(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 
 	M_ASSERTEXTPG(m);
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/* Free attached storage if this mbuf is the only reference to it. */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		KASSERT(mref->m_ext.ext_free != NULL,
 		    ("%s: ext_free not set", __func__));
 
 		mref->m_ext.ext_free(mref);
 #ifdef KERN_TLS
 		/*
 		 * With a TLS session attached, try to drop our session
 		 * reference.  If that would be the last one
 		 * (refcount_release_if_not_last() returns false), defer the
 		 * free via ktls_enqueue_to_free(); otherwise free mref here.
 		 */
 		if (mref->m_epg_tls != NULL &&
 		    !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
 			ktls_enqueue_to_free(mref);
 		else
 #endif
 			m_free_raw(mref);
 	}
 
 	if (m != mref)
 		m_free_raw(m);
 }
 
 /*
  * Official mbuf(9) allocation KPI for stack and drivers:
  *
  * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
  * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
  * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
  * m_clget()	- attach cluster to already allocated mbuf.
  * m_cljget()	- attach jumbo cluster to already allocated mbuf.
  * m_get2()	- allocate minimum mbuf that would fit size argument.
  * m_getm2()	- allocate a chain of mbufs/clusters.
  * m_extadd()	- attach external cluster to mbuf.
  *
  * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
  * m_freem()	- free chain of mbufs.
  */
 
 /*
  * Attach a cluster from zone_clust to an mbuf that has no external
  * storage yet.  For M_NOWAIT requests a failed first attempt triggers
  * one drain of the packet zone followed by a single retry.
  *
  * Returns non-zero (the M_EXT bit of m_flags) when a cluster is
  * attached, zero otherwise.
  */
 int
 m_clget(struct mbuf *m, int how)
 {
 	int attached;
 
 	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 	    __func__, m));
 
 	/* A NULL ext_buf afterwards means the zone allocation failed. */
 	m->m_ext.ext_buf = NULL;
 	(void)uma_zalloc_arg(zone_clust, m, how);
 	if (m->m_ext.ext_buf == NULL && (how & M_NOWAIT) != 0) {
 		/*
 		 * Draining the packet zone may loosen a few clusters, so
 		 * give the allocation one more chance.
 		 */
 		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 		(void)uma_zalloc_arg(zone_clust, m, how);
 	}
 	MBUF_PROBE2(m__clget, m, how);
 
 	attached = m->m_flags & M_EXT;
 	return (attached);
 }
 
 /*
  * Allocate a cluster of the given size (MCLBYTES, MJUMPAGESIZE,
  * MJUM9BYTES or MJUM16BYTES).  Unlike m_clget(), the mbuf is optional:
  * with m == NULL the bare cluster is returned; with an mbuf supplied
  * the mbuf is passed to the zone allocator as its argument and the
  * return value may be ignored.
  */
 void *
 m_cljget(struct mbuf *m, int how, int size)
 {
 	void *cl;
 
 	if (m != NULL) {
 		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 		    __func__, m));
 		m->m_ext.ext_buf = NULL;
 	}
 
 	cl = uma_zalloc_arg(m_getzone(size), m, how);
 	MBUF_PROBE4(m__cljget, m, how, size, cl);
 	return (cl);
 }
 
 /*
  * m_get2() allocates minimum mbuf that would fit "size" argument.
  */
 struct mbuf *
 m_get2(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *m, *n;
 
 	args.flags = flags;
 	args.type = type;
 
 	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
 		return (uma_zalloc_arg(zone_mbuf, &args, how));
 	if (size <= MCLBYTES)
 		return (uma_zalloc_arg(zone_pack, &args, how));
 
 	if (size > MJUMPAGESIZE)
 		return (NULL);
 
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m == NULL)
 		return (NULL);
 
 	n = uma_zalloc_arg(zone_jumbop, m, how);
 	if (n == NULL) {
 		m_free_raw(m);
 		return (NULL);
 	}
 
 	return (m);
 }
 
 /*
  * m_get3() is m_get2() extended to clusters of up to MJUM16BYTES.
  * Sizes up to MJUMPAGESIZE are delegated to m_get2(); larger ones get
  * an mbuf with a 9k or 16k jumbo cluster.  Returns NULL for oversized
  * requests and on allocation failure.
  */
 struct mbuf *
 m_get3(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 	uma_zone_t clzone;
 
 	if (size <= MJUMPAGESIZE)
 		return (m_get2(size, how, type, flags));
 	if (size > MJUM16BYTES)
 		return (NULL);
 
 	args.type = type;
 	args.flags = flags;
 
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	clzone = (size <= MJUM9BYTES) ? zone_jumbo9 : zone_jumbo16;
 	if (uma_zalloc_arg(clzone, mb, how) == NULL) {
 		/* No cluster available: give the bare mbuf back. */
 		m_free_raw(mb);
 		return (NULL);
 	}
 
 	return (mb);
 }
 
 /*
  * m_getjcl() hands back an mbuf with a cluster of exactly the requested
  * size attached.  Valid sizes are MCLBYTES, MJUMPAGESIZE, MJUM9BYTES and
  * MJUM16BYTES; MCLBYTES is served by m_getcl().  Returns NULL if either
  * the mbuf or the cluster cannot be allocated.
  */
 struct mbuf *
 m_getjcl(int how, short type, int flags, int size)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 
 	if (size == MCLBYTES)
 		return (m_getcl(how, type, flags));
 
 	args.type = type;
 	args.flags = flags;
 
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	if (uma_zalloc_arg(m_getzone(size), mb, how) == NULL) {
 		/* Cluster allocation failed: release the bare mbuf. */
 		m_free_raw(mb);
 		return (NULL);
 	}
 
 	MBUF_PROBE5(m__getjcl, how, type, flags, size, mb);
 	return (mb);
 }
 
 /*
  * Allocate mchain of a given length of mbufs and/or clusters (whatever fits
  * best).  May fail due to ENOMEM.  In case of failure state of mchain is
  * inconsistent.
  *
  * Only M_PKTHDR and M_EOR are honoured in 'flags': M_PKTHDR is applied
  * to the first mbuf only, M_EOR to the last.  Returns 0 on success.  On
  * ENOMEM the mbufs gathered so far have been freed.
  */
 int
 mc_get(struct mchain *mc, u_int length, int how, short type, int flags)
 {
 	struct mbuf *mb;
 	u_int progress;
 
 	/* NOTE(review): 'length' is u_int, so this assertion is always true. */
 	MPASS(length >= 0);
 
 	*mc = MCHAIN_INITIALIZER(mc);
 	flags &= (M_PKTHDR | M_EOR);
 	progress = 0;
 
 	/* Loop and append maximum sized mbufs to the chain tail. */
 	do {
 		if (length - progress > MCLBYTES) {
 			/*
 			 * M_NOWAIT here is intentional, it avoids blocking if
 			 * the jumbop zone is exhausted. See 796d4eb89e2c and
 			 * D26150 for more detail.
 			 */
 			mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR),
 			    MJUMPAGESIZE);
 		} else
 			mb = NULL;
 		/*
 		 * Either no jumbo cluster was wanted or none was available:
 		 * fall back to a regular cluster or a plain mbuf, this time
 		 * honouring the caller's 'how'.
 		 */
 		if (mb == NULL) {
 			if (length - progress >= MINCLSIZE)
 				mb = m_getcl(how, type, (flags & M_PKTHDR));
 			else if (flags & M_PKTHDR)
 				mb = m_gethdr(how, type);
 			else
 				mb = m_get(how, type);
 
 			/*
 			 * Fail the whole operation if one mbuf can't be
 			 * allocated.
 			 */
 			if (mb == NULL) {
 				m_freem(mc_first(mc));
 				return (ENOMEM);
 			}
 		}
 
 		/* Account for the full capacity of the new mbuf. */
 		progress += M_SIZE(mb);
 		mc_append(mc, mb);
 		/* Only valid on the first mbuf. */
 		flags &= ~M_PKTHDR;
 	} while (progress < length);
 	if (flags & M_EOR)
 		/* Only valid on the last mbuf. */
 		mc_last(mc)->m_flags |= M_EOR;
 
 	return (0);
 }
 
 /*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
  * to the existing one and return a pointer to the provided mbuf.
  */
 struct mbuf *
 m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
 	struct mchain mc;
 
 	/* Packet header mbuf must be first in chain. */
 	if (m != NULL && (flags & M_PKTHDR))
 		flags &= ~M_PKTHDR;
 
 	if (__predict_false(mc_get(&mc, len, how, type, flags) != 0))
 		return (NULL);
 
 	/* If mbuf was supplied, append new chain to the end of it. */
 	if (m != NULL) {
 		struct mbuf *mtail;
 
 		mtail = m_last(m);
 		mtail->m_next = mc_first(&mc);
 		mtail->m_flags &= ~M_EOR;
 	} else
 		m = mc_first(&mc);
 
 	return (m);
 }
 
 /*-
  * Attach caller-provided external storage to an mbuf and initialize
  * the reference counting for it.
  *
  * Arguments:
  *    mb     The mbuf that receives the external buffer.
  *    buf    Start address of the external buffer; also becomes m_data.
  *    size   Size of the external buffer.
  *    freef  Routine stored as the buffer's ext_free callback.
  *    arg1   First opaque argument stored for freef (ext_arg1).
  *    arg2   Second opaque argument stored for freef (ext_arg2).
  *    flags  Additional m_flags to set on the mbuf besides M_EXT.
  *    type   External storage type; EXT_CLUSTER is not allowed here.
  *
  * For every type except EXT_EXTREF the reference count is embedded in
  * the mbuf and starts at 1; for EXT_EXTREF no embedded count is set up.
  *
  * Returns:
  *    Nothing.
  */
 void
 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
     void *arg1, void *arg2, int flags, int type)
 {
 
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	mb->m_flags |= (M_EXT | flags);
 
 	/* Describe the buffer and point the data pointer at its start. */
 	mb->m_ext.ext_buf = buf;
 	mb->m_data = mb->m_ext.ext_buf;
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_type = type;
 
 	/* Remember how to release the buffer. */
 	mb->m_ext.ext_free = freef;
 	mb->m_ext.ext_arg1 = arg1;
 	mb->m_ext.ext_arg2 = arg2;
 
 	if (type == EXT_EXTREF) {
 		mb->m_ext.ext_flags = 0;
 	} else {
 		mb->m_ext.ext_count = 1;
 		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	}
 }
 
 /*
  * Release every mbuf linked through m_next starting at 'mb', together
  * with any external buffers they carry.  A NULL argument is a no-op
  * apart from firing the probe.
  */
 void
 m_freem(struct mbuf *mb)
 {
 
 	MBUF_PROBE1(m__freem, mb);
 	/* m_free() returns the successor of the mbuf it released. */
 	for (; mb != NULL; mb = m_free(mb))
 		continue;
 }
 
 /*
  * Release a list of packets: for each packet linked through m_nextpkt,
  * free its whole m_next chain and any external buffers.
  * Note: doesn't support NULL argument.
  */
 void
 m_freemp(struct mbuf *m)
 {
 	struct mbuf *pkt_next;
 
 	MBUF_PROBE1(m__freemp, m);
 	for (;;) {
 		/* Save the packet link before the head mbuf goes away. */
 		pkt_next = m->m_nextpkt;
 		while (m != NULL)
 			m = m_free(m);
 		if (pkt_next == NULL)
 			break;
 		m = pkt_next;
 	}
 }
 
 /*
  * Temporary primitive to allow freeing without going through m_free.
  *
  * Returns 'mb' straight to zone_mbuf; no external storage handling is
  * performed here.
  */
 void
 m_free_raw(struct mbuf *mb)
 {
 
 	uma_zfree(zone_mbuf, mb);
 }
 
 /*
  * Allocate a send tag on 'ifp'.  Thin wrapper that forwards all
  * arguments to if_snd_tag_alloc() and returns its result unchanged.
  */
 int
 m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **mstp)
 {
 
 	return (if_snd_tag_alloc(ifp, params, mstp));
 }
 
 /*
  * Initialize a send tag: take a reference on 'ifp' and record it, set
  * the tag's reference count to 1, store the method table 'sw', and bump
  * the global send tag counter.
  */
 void
 m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp,
     const struct if_snd_tag_sw *sw)
 {
 
 	if_ref(ifp);
 	mst->ifp = ifp;
 	refcount_init(&mst->refcount, 1);
 	mst->sw = sw;
 	counter_u64_add(snd_tag_count, 1);
 }
 
 /*
  * Tear down a send tag: invoke the tag's snd_tag_free method, drop the
  * interface reference taken in m_snd_tag_init(), and decrement the
  * global send tag counter.
  */
 void
 m_snd_tag_destroy(struct m_snd_tag *mst)
 {
 	struct ifnet *ifp;
 
 	/* Save ifp first: 'mst' is not touched after snd_tag_free(). */
 	ifp = mst->ifp;
 	mst->sw->snd_tag_free(mst);
 	if_rele(ifp);
 	counter_u64_add(snd_tag_count, -1);
 }
 
 /*
  * Replace the receive interface pointers in the packet header of 'm'
  * with (index, generation) pairs obtained from if_getindex() and
  * if_getidxgen().  When no leaf interface is set, the leaf index is
  * stored as (u_short)-1 with generation 0.
  *
  * NOTE(review): each index/generation pair is fetched into locals
  * before either field is stored, which suggests rcvif and
  * rcvidx/rcvgen (and the leaf_* equivalents) share storage; confirm
  * against struct pkthdr before reordering any of these statements.
  */
 void
 m_rcvif_serialize(struct mbuf *m)
 {
 	u_short idx, gen;
 
 	M_ASSERTPKTHDR(m);
 	idx = if_getindex(m->m_pkthdr.rcvif);
 	gen = if_getidxgen(m->m_pkthdr.rcvif);
 	m->m_pkthdr.rcvidx = idx;
 	m->m_pkthdr.rcvgen = gen;
 	if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) {
 		idx = if_getindex(m->m_pkthdr.leaf_rcvif);
 		gen = if_getidxgen(m->m_pkthdr.leaf_rcvif);
 	} else {
 		/* Sentinel recognised by m_rcvif_restore(). */
 		idx = -1;
 		gen = 0;
 	}
 	m->m_pkthdr.leaf_rcvidx = idx;
 	m->m_pkthdr.leaf_rcvgen = gen;
 }
 
 /*
  * Inverse of m_rcvif_serialize(): look the receive interface up again
  * from the stored (index, generation) pair and write the pointers back
  * into the packet header.  Must be called inside the network epoch.
  *
  * Returns the receive interface, or NULL if it no longer exists or is
  * marked IFF_DYING; in that case the packet header is left unmodified.
  * A leaf interface that is missing or dying is restored as NULL.
  */
 struct ifnet *
 m_rcvif_restore(struct mbuf *m)
 {
 	struct ifnet *ifp, *leaf_ifp;
 
 	M_ASSERTPKTHDR(m);
 	NET_EPOCH_ASSERT();
 
 	ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen);
 	if (ifp == NULL || (if_getflags(ifp) & IFF_DYING))
 		return (NULL);
 
 	/* (u_short)-1 is the "no leaf interface" marker. */
 	if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) {
 		leaf_ifp = NULL;
 	} else {
 		leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx,
 		    m->m_pkthdr.leaf_rcvgen);
 		if (__predict_false(leaf_ifp != NULL && (if_getflags(leaf_ifp) & IFF_DYING)))
 			leaf_ifp = NULL;
 	}
 
 	/* All index fields have been read; store the pointers now. */
 	m->m_pkthdr.leaf_rcvif = leaf_ifp;
 	m->m_pkthdr.rcvif = ifp;
 
 	return (ifp);
 }
 
 /*
  * Allocate an mbuf with anonymous external pages.
  *
  * Enough wired pages to cover 'len' bytes are allocated and their
  * physical addresses recorded in m_epg_pa[].  The mbuf is flagged
  * EPG_FLAG_ANON and uses mb_free_mext_pgs as its free routine.
  *
  * Returns NULL if the mbuf cannot be allocated, or if a page cannot be
  * allocated and 'how' is M_NOWAIT; otherwise the function waits for
  * memory and retries.
  */
 struct mbuf *
 mb_alloc_ext_plus_pages(int len, int how)
 {
 	struct mbuf *m;
 	vm_page_t pg;
 	int i, npgs;
 
-	m = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
+	m = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
 	if (m == NULL)
 		return (NULL);
 	m->m_epg_flags |= EPG_FLAG_ANON;
 	npgs = howmany(len, PAGE_SIZE);
 	for (i = 0; i < npgs; i++) {
 		do {
 			pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
 			    VM_ALLOC_WIRED);
 			if (pg == NULL) {
 				/*
 				 * NOTE(review): this is an exact equality
 				 * test, not a bit test on 'how'.
 				 */
 				if (how == M_NOWAIT) {
 					/*
 					 * Record how many pages were
 					 * obtained before freeing the mbuf.
 					 */
 					m->m_epg_npgs = i;
 					m_free(m);
 					return (NULL);
 				}
 				vm_wait(NULL);
 			}
 		} while (pg == NULL);
 		m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg);
 	}
 	m->m_epg_npgs = npgs;
 	return (m);
 }
 
 /*
  * Copy the data in the mbuf chain to a chain of mbufs with anonymous external
  * unmapped pages.
  * len is the length of data in the input mbuf chain.
  * mlen is the maximum number of bytes put into each ext_page mbuf.
  * how is passed through to mb_alloc_ext_plus_pages().
  * If mlast is not NULL, the last mbuf of the new chain is stored there.
  *
  * Returns the head of the new chain, or NULL when len is 0, when an
  * allocation fails, or when the input chain ends before len bytes have
  * been copied.  The input chain 'mp' is only read, never freed.
  */
 struct mbuf *
 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how,
     struct mbuf **mlast)
 {
 	struct mbuf *m, *mout;
 	char *pgpos, *mbpos;
 	int i, mblen, mbufsiz, pglen, xfer;
 
 	if (len == 0)
 		return (NULL);
 	mbufsiz = min(mlen, len);
 	m = mout = mb_alloc_ext_plus_pages(mbufsiz, how);
 	if (m == NULL)
 		return (m);
 	/* Pages are written through the direct map. */
 	pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
 	pglen = PAGE_SIZE;
 	mblen = 0;
 	i = 0;
 	do {
 		/* Current destination page is full: move to the next one. */
 		if (pglen == 0) {
 			if (++i == m->m_epg_npgs) {
 				/*
 				 * All pages of this mbuf are full; chain a
 				 * new one sized for the remaining data.
 				 */
 				m->m_epg_last_len = PAGE_SIZE;
 				mbufsiz = min(mlen, len);
 				m->m_next = mb_alloc_ext_plus_pages(mbufsiz,
 				    how);
 				m = m->m_next;
 				if (m == NULL) {
 					m_freem(mout);
 					return (m);
 				}
 				i = 0;
 			}
 			pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]);
 			pglen = PAGE_SIZE;
 		}
 		/* Current source mbuf is drained: advance, skipping empties. */
 		while (mblen == 0) {
 			if (mp == NULL) {
 				/* Input ran out before 'len' bytes were seen. */
 				m_freem(mout);
 				return (NULL);
 			}
 			KASSERT((mp->m_flags & M_EXTPG) == 0,
 			    ("mb_copym_ext_pgs: ext_pgs input mbuf"));
 			mbpos = mtod(mp, char *);
 			mblen = mp->m_len;
 			mp = mp->m_next;
 		}
 		/* Copy as much as both the source and the page allow. */
 		xfer = min(mblen, pglen);
 		memcpy(pgpos, mbpos, xfer);
 		pgpos += xfer;
 		mbpos += xfer;
 		pglen -= xfer;
 		mblen -= xfer;
 		len -= xfer;
 		m->m_len += xfer;
 	} while (len > 0);
 	/* The final page is only filled up to where copying stopped. */
 	m->m_epg_last_len = PAGE_SIZE - pglen;
 	if (mlast != NULL)
 		*mlast = m;
 	return (mout);
 }
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 323e7fcde07b..05a820fe5ac1 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1,1373 +1,1373 @@
 /*-
  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktls.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 /* malloc(9) type used for sendfile's dynamically allocated state. */
 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
 
 /*
  * sendfile-private meanings for the vendor ext_flags bits, as consumed
  * by the free routines in this file:
  *  SYNC       - a struct sendfile_sync is attached and must be signalled.
  *  NOCACHE    - release pages with VPR_TRYFREE.
  *  CACHE_LAST - release the last page of an M_EXTPG mbuf with flags 0.
  */
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
  * several underlying pager I/Os.
  *
  * The syscall context allocates the structure and initializes 'nios'
  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
  * paging operations, it increments 'nios'.
  *
  * Every I/O completion calls sendfile_iodone(), which decrements 'nios',
  * and the syscall also calls sendfile_iodone() after allocating all mbufs,
  * linking them and sending to socket.  Whoever brings 'nios' to zero is
  * responsible for calling pru_ready on the socket, to notify it of the
  * readiness of the data.
  */
 struct sf_io {
 	/* Outstanding I/O count, managed with refcount_*(). */
 	volatile u_int	nios;
 	/* Last non-zero error reported to sendfile_iodone(). */
 	u_int		error;
 	/* Number of entries in pa[]. */
 	int		npages;
 	struct socket	*so;
 	/* First mbuf of this I/O; NULL if nothing was sent to the socket. */
 	struct mbuf	*m;
 	vm_object_t	obj;
 	/* Page index within obj that corresponds to pa[0]. */
 	vm_pindex_t	pindex0;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	/* Pages backing this I/O (flexible array member). */
 	vm_page_t	pa[];
 };
 
 /*
  * Structure used to track requests with SF_SYNC flag.
  *
  * 'count' is decremented by sendfile_sync_signal() under 'mtx'.  When it
  * reaches zero, the waiter is woken through 'cv' if 'waiting' is set;
  * otherwise the structure is destroyed by the signaller.
  */
 struct sendfile_sync {
 	struct mtx	mtx;
 	struct cv	cv;
 	unsigned	count;
 	bool		waiting;
 };
 
 /*
  * Destroy the condition variable and mutex of 'sfs' and free it.
  * The structure must no longer be busy (count == 0).
  */
 static void
 sendfile_sync_destroy(struct sendfile_sync *sfs)
 {
 	KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs));
 
 	cv_destroy(&sfs->cv);
 	mtx_destroy(&sfs->mtx);
 	free(sfs, M_SENDFILE);
 }
 
 /*
  * Drop one pending item from 'sfs'.  When the last item goes away,
  * either wake the sendfile() caller waiting on the condition variable
  * or, if nobody is waiting any more, destroy the structure.
  */
 static void
 sendfile_sync_signal(struct sendfile_sync *sfs)
 {
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs));
 
 	sfs->count--;
 	if (sfs->count != 0) {
 		/* Other items are still outstanding. */
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 
 	if (sfs->waiting) {
 		cv_signal(&sfs->cv);
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 
 	/*
 	 * The sendfile() waiter was interrupted by a signal, so the last
 	 * signaller cleans up.  The mutex is destroyed with the structure
 	 * and therefore not unlocked here.
 	 */
 	sendfile_sync_destroy(sfs);
 }
 
 /* One counter per uint64_t-sized slot of struct sfstat. */
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 /*
  * Allocate the sendfile statistics counters.  Registered below to run
  * at SI_SUB_MBUF / SI_ORDER_FIRST; the argument is unused.
  */
 static void
 sfstat_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 	    M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 /*
  * Handler for the kern.ipc.sfstat sysctl: snapshot the counters into a
  * struct sfstat and copy it out.  If the request carries new data
  * (req->newptr is set), the counters are zeroed after the snapshot; the
  * written value itself is not examined.
  */
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     sfstat_sysctl, "I",
     "sendfile statistics");
 
 /*
  * ext_free routine for EXT_SFBUF mbufs created by sendfile: release
  * the sf_buf stored in ext_arg1 and the page it maps, then signal the
  * sendfile_sync in ext_arg2 if EXT_FLAG_SYNC is set.
  */
 static void
 sendfile_free_mext(struct mbuf *m)
 {
 	struct sendfile_sync *sfs;
 	struct sf_buf *sfbuf;
 	vm_page_t page;
 	int relflags;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
 	    ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
 
 	sfbuf = m->m_ext.ext_arg1;
 	page = sf_buf_page(sfbuf);
 	if ((m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0)
 		relflags = VPR_TRYFREE;
 	else
 		relflags = 0;
 
 	/* The page pointer was fetched before the sf_buf goes away. */
 	sf_buf_free(sfbuf);
 	vm_page_release(page, relflags);
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 	sfs = m->m_ext.ext_arg2;
 	sendfile_sync_signal(sfs);
 }
 
 /*
  * ext_free routine for M_EXTPG mbufs created by sendfile: release
  * every page referenced by the mbuf, then signal the sendfile_sync in
  * ext_arg1 if EXT_FLAG_SYNC is set.
  *
  * Pages are released with VPR_TRYFREE when EXT_FLAG_NOCACHE is set,
  * except that EXT_FLAG_CACHE_LAST forces flags 0 for the last page.
  */
 static void
 sendfile_free_mext_pg(struct mbuf *m)
 {
 	struct sendfile_sync *sfs;
 	vm_page_t page;
 	int idx, last, relflags;
 	bool keep_last;
 
 	M_ASSERTEXTPG(m);
 
 	keep_last = (m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST) != 0;
 	if ((m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0)
 		relflags = VPR_TRYFREE;
 	else
 		relflags = 0;
 
 	last = m->m_epg_npgs - 1;
 	for (idx = 0; idx <= last; idx++) {
 		page = PHYS_TO_VM_PAGE(m->m_epg_pa[idx]);
 		vm_page_release(page,
 		    (keep_last && idx == last) ? 0 : relflags);
 	}
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 	sfs = m->m_ext.ext_arg1;
 	sendfile_sync_signal(sfs);
 }
 
 /*
  * Compute how many bytes of a transfer of 'len' bytes starting at file
  * offset 'off' land in page i of n.  The first page is limited by the
  * distance to its end (and by len); the last page is partial when the
  * transfer does not end on a page boundary; all others are full pages.
  */
 static inline off_t
 xfsize(int i, int n, off_t off, off_t len)
 {
 	off_t tail;
 
 	if (i == 0)
 		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
 
 	if (i == n - 1) {
 		/* Bytes past the last page boundary, if any. */
 		tail = (off + len) & PAGE_MASK;
 		if (tail > 0)
 			return (tail);
 	}
 
 	return (PAGE_SIZE);
 }
 
 /*
  * Return the offset within the object at which the data of page i
  * begins: 'off' itself for the first page, and the page-truncated
  * offset of the i-th following page otherwise.
  */
 static inline vm_ooffset_t
 vmoff(int i, off_t off)
 {
 	vm_ooffset_t start;
 
 	if (i != 0)
 		start = trunc_page(off + i * PAGE_SIZE);
 	else
 		start = (vm_ooffset_t)off;
 
 	return (start);
 }
 
 /*
  * Helper function used when allocation of a page or sf_buf failed.
  * Pretend as if we don't have enough space, subtract xfsize() of
  * all pages that failed.
  *
  * 'old' is the number of pages originally planned, 'new' the number
  * actually obtained (old > new), 'off' the starting file offset, and
  * '*space' the byte budget that is reduced in place.
  */
 static inline void
 fixspace(int old, int new, off_t off, int *space)
 {
 
 	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
 
 	/* Subtract last one. */
 	*space -= xfsize(old - 1, old, off, *space);
 	old--;
 
 	if (new == old)
 		/* There was only one page. */
 		return;
 
 	/* Subtract first one. */
 	if (new == 0) {
 		*space -= xfsize(0, old, off, *space);
 		new++;
 	}
 
 	/* Rest of pages are full sized. */
 	*space -= (old - new) * PAGE_SIZE;
 
 	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
 }
 
 /*
  * Wait for all in-flight ios to complete, we must not unwire pages
  * under them.
  *
  * Polls sfio->nios, sleeping one tick at a time under the wait message
  * 'wmesg', until only a single reference remains.
  */
 static void
 sendfile_iowait(struct sf_io *sfio, const char *wmesg)
 {
 	while (atomic_load_int(&sfio->nios) != 1)
 		pause(wmesg, 1);
 }
 
 /*
  * I/O completion callback.
  *
  * 'arg' is the struct sf_io of the request, 'pa'/'count' describe the
  * pages of the pager I/O that just finished (count may be 0), and
  * 'error' is its status.  Each call drops one reference on sfio->nios.
  * The caller that drops the last reference finishes the request: it
  * either frees sfio when nothing was sent, aborts the socket on error,
  * hands the mbufs to KTLS for software encryption, or marks them ready
  * on the socket.
  */
 static void
 sendfile_iodone(void *arg, vm_page_t *pa, int count, int error)
 {
 	struct sf_io *sfio = arg;
 	struct socket *so;
 	int i;
 
 	/* Remember the failure; it is acted upon by the last caller. */
 	if (error != 0)
 		sfio->error = error;
 
 	/*
 	 * Restore the valid page pointers.  They are already
 	 * unbusied, but still wired.
 	 *
 	 * XXXKIB since pages are only wired, and we do not
 	 * own the object lock, other users might have
 	 * invalidated them in meantime.  Similarly, after we
 	 * unbusied the swapped-in pages, they can become
 	 * invalid under us.
 	 */
 	MPASS(count == 0 || pa[0] != bogus_page);
 	for (i = 0; i < count; i++) {
 		if (pa[i] == bogus_page) {
 			/*
 			 * Replace the placeholder with the real page, both
 			 * in the caller's array and in sfio->pa[].
 			 */
 			sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] =
 			    pa[i] = vm_page_relookup(sfio->obj,
 			    pa[0]->pindex + i);
 			KASSERT(pa[i] != NULL,
 			    ("%s: page %p[%d] disappeared",
 			    __func__, pa, i));
 		} else {
 			vm_page_xunbusy_unchecked(pa[i]);
 		}
 	}
 
 	/* Not the last outstanding I/O: nothing more to do. */
 	if (!refcount_release(&sfio->nios))
 		return;
 
 #ifdef INVARIANTS
 	/*
 	 * NOTE(review): the loop starts at 1, so the "i == 0" check below
 	 * never fires and pa[0] is not itself asserted to be wired.
 	 */
 	for (i = 1; i < sfio->npages; i++) {
 		if (sfio->pa[i] == NULL)
 			break;
 		KASSERT(vm_page_wired(sfio->pa[i]),
 		    ("sfio %p page %d %p not wired", sfio, i, sfio->pa[i]));
 		if (i == 0)
 			continue;
 		KASSERT(sfio->pa[0]->object == sfio->pa[i]->object,
 		    ("sfio %p page %d %p wrong owner %p %p", sfio, i,
 		    sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object));
 		KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex,
 		    ("sfio %p page %d %p wrong index %jx %jx", sfio, i,
 		    sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex,
 		    (uintmax_t)sfio->pa[i]->pindex));
 	}
 #endif
 
 	vm_object_pip_wakeup(sfio->obj);
 
 	if (sfio->m == NULL) {
 		/*
 		 * Either I/O operation failed, or we failed to allocate
 		 * buffers, or we bailed out on first busy page, or we
 		 * succeeded filling the request without any I/Os. Anyway,
 		 * pru_send hadn't been executed - nothing had been sent
 		 * to the socket yet.
 		 */
 		MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
 		free(sfio, M_SENDFILE);
 		return;
 	}
 
 #if defined(KERN_TLS) && defined(INVARIANTS)
 	if ((sfio->m->m_flags & M_EXTPG) != 0)
 		KASSERT(sfio->tls == sfio->m->m_epg_tls,
 		    ("TLS session mismatch"));
 	else
 		KASSERT(sfio->tls == NULL,
 		    ("non-ext_pgs mbuf with TLS session"));
 #endif
 	so = sfio->so;
 	CURVNET_SET(so->so_vnet);
 	if (__predict_false(sfio->error)) {
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all what we can do is to tear
 		 * it down. Protocol abort method would tear down protocol
 		 * state, free all ready mbufs and detach not ready ones.
 		 * We will free the mbufs corresponding to this I/O manually.
 		 *
 		 * The socket would be marked with EIO and made available
 		 * for read, so that application receives EIO on next
 		 * syscall and eventually closes the socket.
 		 */
 		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 
 		mb_free_notready(sfio->m, sfio->npages);
 #ifdef KERN_TLS
 	} else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) {
 		/*
 		 * I/O operation is complete, but we still need to
 		 * encrypt.  We cannot do this in the interrupt thread
 		 * of the disk controller, so forward the mbufs to a
 		 * different thread.
 		 *
 		 * Donate the socket reference held by sfio to the
 		 * enqueued work rather than explicitly invoking soref();
 		 * this is why sorele() is skipped on this path.
 		 */
 		ktls_enqueue(sfio->m, so, sfio->npages);
 		goto out_with_ref;
 #endif
 	} else
 		(void)so->so_proto->pr_ready(so, sfio->m, sfio->npages);
 
 	sorele(so);
 #ifdef KERN_TLS
 out_with_ref:
 #endif
 	CURVNET_RESTORE();
 	free(sfio, M_SENDFILE);
 }
 
 /*
  * Iterate through pages vector and request paging for non-valid pages.
  *
  * obj      VM object the pages belong to.
  * sfio     Request state; sfio->pa[] and sfio->npages describe the pages.
  * nios     Out: number of asynchronous pager I/Os started.
  * off/len  Byte range of the object being sent.
  * rhpages  Read-ahead hint passed to the pager for the final run.
  * flags    sendfile flags; only SF_NODISKIO is examined here.
  *
  * Returns 0 on success, or EIO if the pager refuses an I/O, in which
  * case all grabbed pages have been unwired and pa[] entries cleared.
  */
 static int
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int rhpages, int flags)
 {
 	vm_page_t *pa;
 	int a, count, count1, grabbed, i, j, npages, rv;
 
 	pa = sfio->pa;
 	npages = sfio->npages;
 	*nios = 0;
 	/* From here on 'flags' holds page allocation flags. */
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
 	sfio->pindex0 = OFF_TO_IDX(off);
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
 	 * only required pages.  Readahead pages are dealt with later.
 	 */
 	grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off),
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
 	if (grabbed < npages) {
 		/*
 		 * Short grab: clear the unused slots and work with what
 		 * we got.  NOTE(review): this 'i' shadows the outer 'i'.
 		 */
 		for (int i = grabbed; i < npages; i++)
 			pa[i] = NULL;
 		npages = grabbed;
 		rhpages = 0;
 	}
 
 	for (i = 0; i < npages;) {
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
 			vm_page_xunbusy(pa[i]);
 			SFSTAT_INC(sf_pages_valid);
 			i++;
 			continue;
 		}
 
 		/*
 		 * Next page is invalid.  Check if it belongs to pager.  It
 		 * may not be there, which is a regular situation for shmem
 		 * pager.  For vnode pager this happens only in case of
 		 * a sparse file.
 		 *
 		 * Important feature of vm_pager_has_page() is the hint
 		 * stored in 'a', about how many pages we can pagein after
 		 * this page in a single I/O.
 		 */
 		VM_OBJECT_RLOCK(obj);
 		if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
 		    &a)) {
 			VM_OBJECT_RUNLOCK(obj);
 			/* No backing store: the page reads as zeroes. */
 			pmap_zero_page(pa[i]);
 			vm_page_valid(pa[i]);
 			MPASS(pa[i]->dirty == 0);
 			vm_page_xunbusy(pa[i]);
 			i++;
 			continue;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 
 		/*
 		 * We want to pagein as many pages as possible, limited only
 		 * by the 'a' hint and actual request.
 		 */
 		count = min(a + 1, npages - i);
 
 		/*
 		 * We should not pagein into a valid page because
 		 * there might be still unfinished write tracked by
 		 * e.g. a buffer, thus we substitute any valid pages
 		 * with the bogus one.
 		 *
 		 * We must not leave around xbusy pages which are not
 		 * part of the run passed to vm_pager_getpages(),
 		 * otherwise pager might deadlock waiting for the busy
 		 * status of the page, e.g. if it constitutes the
 		 * buffer needed to validate other page.
 		 *
 		 * First trim the end of the run consisting of the
 		 * valid pages, then replace the rest of the valid
 		 * with bogus.
 		 */
 		/* count1 keeps the untrimmed run length for advancing 'i'. */
 		count1 = count;
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				count--;
 			} else {
 				break;
 			}
 		}
 
 		/*
 		 * The last page in the run pa[i + count - 1] is
 		 * guaranteed to be invalid by the trim above, so it
 		 * is not replaced with bogus, thus -1 in the loop end
 		 * condition.
 		 */
 		MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
 		for (j = i + 1; j < i + count - 1; j++) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
 		}
 
 		/* One more reference for the I/O about to be started. */
 		refcount_acquire(&sfio->nios);
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (__predict_false(rv != VM_PAGER_OK)) {
 			/* Let the I/Os already in flight drain first. */
 			sendfile_iowait(sfio, "sferrio");
 
 			/*
 			 * Do remaining pages recovery before returning EIO.
 			 * Pages from 0 to npages are wired.
 			 * Pages from (i + count1) to npages are busied.
 			 */
 			for (j = 0; j < npages; j++) {
 				if (j >= i + count1)
 					vm_page_xunbusy(pa[j]);
 				KASSERT(pa[j] != NULL && pa[j] != bogus_page,
 				    ("%s: page %p[%d] I/O recovery failure",
 				    __func__, pa, j));
 				vm_page_unwire(pa[j], PQ_INACTIVE);
 				pa[j] = NULL;
 			}
 			return (EIO);
 		}
 
 		SFSTAT_INC(sf_iocnt);
 		SFSTAT_ADD(sf_pages_read, count);
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
 		/* Skip the whole run, including the trimmed valid tail. */
 		i += count1;
 		(*nios)++;
 	}
 
 	if (*nios == 0 && npages != 0)
 		SFSTAT_INC(sf_noiocnt);
 
 	return (0);
 }
 
 /*
  * Resolve the file 'fp' to the VM object that backs it.
  *
  * Two descriptor types are accepted: DTYPE_VNODE, where the vnode must be
  * VREG and have a v_object, and DTYPE_SHM.  Anything else fails with
  * EINVAL.  An object flagged OBJ_DEAD fails with EBADF.
  *
  * On success 0 is returned and:
  *   *obj_res   holds the object, with one reference added here;
  *   *vp_res    holds the vnode, or NULL for a DTYPE_SHM file;
  *   *shmfd_res holds the shmfd, or NULL for a DTYPE_VNODE file;
  *   *obj_size  holds the object size;
  *   *bsize     holds the mount's f_iosize for vnodes, 0 otherwise.
  *
  * On failure *vp_res and *shmfd_res are NULL and *obj_res is not written;
  * *obj_size and *bsize may or may not have been updated.
  *
  * For the vnode case the vnode is locked shared for the duration of the
  * lookup and unlocked again before returning.
  */
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	vm_object_t obj;
 	struct vnode *vp;
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	vp = *vp_res = NULL;
 	obj = NULL;
 	shmfd = *shmfd_res = NULL;
 	*bsize = 0;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		*bsize = vp->v_mount->mnt_stat.f_iosize;
 		obj = vp->v_object;
 		if (obj == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * Use the pager size when available to simplify synchronization
 		 * with filesystems, which otherwise must atomically update both
 		 * the vnode pager size and file size.
 		 */
 		if (obj->type == OBJT_VNODE) {
 			VM_OBJECT_RLOCK(obj);
 			*obj_size = obj->un_pager.vnp.vnp_size;
 		} else {
 			/*
 			 * Query the size first and take the object lock only
 			 * on success, so the error path leaves it unlocked.
 			 */
 			error = vn_getsize_locked(vp, obj_size, td->td_ucred);
 			if (error != 0)
 				goto out;
 			VM_OBJECT_RLOCK(obj);
 		}
 	} else if (fp->f_type == DTYPE_SHM) {
 		shmfd = fp->f_data;
 		obj = shmfd->shm_object;
 		VM_OBJECT_RLOCK(obj);
 		*obj_size = shmfd->shm_size;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Every path reaching this point holds the object read lock. */
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		VM_OBJECT_RUNLOCK(obj);
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * Temporarily increase the backing VM object's reference
 	 * count so that a forced reclamation of its vnode does not
 	 * immediately destroy it.
 	 */
 	vm_object_reference_locked(obj);
 	VM_OBJECT_RUNLOCK(obj);
 	*obj_res = obj;
 	*vp_res = vp;
 	*shmfd_res = shmfd;
 
 out:
 	/* vp is non-NULL only on the vnode path, where it was locked above. */
 	if (vp != NULL)
 		VOP_UNLOCK(vp);
 	return (error);
 }
 
 /*
  * Look up descriptor 's' as a socket usable as a sendfile destination.
  *
  * Both output pointers are cleared first.  If getsock() fails its error
  * is returned and the outputs stay NULL.  Otherwise *sock_fp and *so are
  * filled in even when EINVAL is returned, so the caller still has to
  * release the file reference in that case.
  */
 static int
 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	struct socket *sock;
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	error = getsock(td, s, &cap_send_rights, sock_fp);
 	if (error != 0)
 		return (error);
 
 	sock = (*sock_fp)->f_data;
 	*so = sock;
 
 	/*
 	 * Only stream sockets are accepted.  SCTP one-to-one style
 	 * sockets currently don't work with sendfile(), so they are
 	 * rejected with EINVAL as well.
 	 */
 	if (sock->so_type != SOCK_STREAM ||
 	    sock->so_proto->pr_protocol == IPPROTO_SCTP)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Send the contents of the file 'fp' over the socket descriptor 'sockfd',
  * optionally preceded by the headers in 'hdr_uio' and followed by the
  * trailers in 'trl_uio'.
  *
  * 'offset' is the starting offset within the file and 'nbytes' the number
  * of bytes to send; nbytes == 0 means everything from 'offset' to the end
  * of the object.  'flags' carries the SF_* options tested below (SF_SYNC,
  * SF_NOCACHE, SF_USER_READAHEAD) and the readahead count extracted with
  * SF_READAHEAD().
  *
  * If 'sent' is not NULL it is set to the byte count accumulated in
  * 'sbytes' on every exit path except the early return taken when
  * sendfile_getobj() fails.
  *
  * Returns 0 or an errno value; ERESTART is converted to EINTR before
  * returning.
  */
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	vm_page_t pga;
 	struct socket *so;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size, nobj_size;
 	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
 #ifdef KERN_TLS
 	int tls_enq_cnt;
 #endif
 	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
 	m = mh = NULL;
 	sfs = NULL;
 #ifdef KERN_TLS
 	tls = NULL;
 #endif
 	hdrlen = sbytes = 0;
 	softerr = 0;
 	use_ext_pgs = false;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	SFSTAT_INC(sf_syscalls);
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 		sfs->waiting = true;
 	}
 
 	/*
 	 * Bytes left to send: the caller's limit capped at what the object
 	 * holds past 'offset', or all of it when nbytes is zero.
 	 */
 	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT | SBL_NOINTR);
 	if (error != 0)
 		goto out;
 #ifdef KERN_TLS
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 #endif
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
 		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *m0, *mtail;
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto done;
 		}
 
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(so, SO_SND);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
 		 * the socket buffer by the size of the header mbuf chain.
 		 * Clear hdr_uio here and hdrlen at the end of the first loop.
 		 */
 		if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 			hdr_uio->uio_td = td;
 			hdr_uio->uio_rw = UIO_WRITE;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				mh = m_uiotombuf(hdr_uio, M_WAITOK, space,
 				    tls->params.max_frame_len, M_EXTPG);
 			else
 #endif
 				mh = m_uiotombuf(hdr_uio, M_WAITOK,
 				    space, 0, 0);
 			hdrlen = m_length(mh, &mhtail);
 			space -= hdrlen;
 			/*
 			 * If header consumed all the socket buffer space,
 			 * don't waste CPU cycles and jump to the end.
 			 */
 			if (space == 0) {
 				sfio = NULL;
 				nios = 0;
 				goto prepend_header;
 			}
 			hdr_uio = NULL;
 		}
 
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 
 			/*
 			 * Check to see if the file size has changed.
 			 */
 			if (obj->type == OBJT_VNODE) {
 				VM_OBJECT_RLOCK(obj);
 				nobj_size = obj->un_pager.vnp.vnp_size;
 				VM_OBJECT_RUNLOCK(obj);
 			} else {
 				error = VOP_GETATTR(vp, &va, td->td_ucred);
 				if (error != 0) {
 					VOP_UNLOCK(vp);
 					goto done;
 				}
 				nobj_size = va.va_size;
 			}
 			if (off >= nobj_size) {
 				VOP_UNLOCK(vp);
 				goto done;
 			}
 			if (nobj_size != obj_size) {
 				obj_size = nobj_size;
 				rem = nbytes ? omin(nbytes + offset, obj_size) :
 				    obj_size;
 				rem -= off;
 			}
 		}
 
 		if (space > rem)
 			space = rem;
 		else if (space > PAGE_SIZE) {
 			/*
 			 * Use page boundaries when possible for large
 			 * requests.
 			 */
 			if (off & PAGE_MASK)
 				space -= (PAGE_SIZE - (off & PAGE_MASK));
 			space = trunc_page(space);
 			if (off & PAGE_MASK)
 				space += (PAGE_SIZE - (off & PAGE_MASK));
 		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
 
 		/*
 		 * Calculate maximum allowed number of pages for readahead
 		 * at this iteration.  If SF_USER_READAHEAD was set, we don't
 		 * do any heuristics and use exactly the value supplied by
 		 * application.  Otherwise, we allow readahead up to "rem".
 		 * If application wants more, let it be, but there is no
 		 * reason to go above maxphys.  Also check against "obj_size",
 		 * since vm_pager_has_page() can hint beyond EOF.
 		 */
 		if (flags & SF_USER_READAHEAD) {
 			rhpages = SF_READAHEAD(flags);
 		} else {
 			rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
 			    npages;
 			rhpages += SF_READAHEAD(flags);
 		}
 		rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages);
 		rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
 		    npages, rhpages);
 
 		/* The page array sfio->pa is allocated inline, after the header. */
 		sfio = malloc(sizeof(struct sf_io) +
 		    npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->obj = obj;
 		sfio->error = 0;
 		sfio->m = NULL;
 		sfio->npages = npages;
 #ifdef KERN_TLS
 		/*
 		 * This doesn't use ktls_hold() because sfio->m will
 		 * also have a reference on 'tls' that will be valid
 		 * for all of sfio's lifetime.
 		 */
 		sfio->tls = tls;
 #endif
 		vm_object_pip_add(obj, 1);
 		error = sendfile_swapin(obj, sfio, &nios, off, space, rhpages,
 		    flags);
 		if (error != 0) {
 			if (vp != NULL)
 				VOP_UNLOCK(vp);
 			sendfile_iodone(sfio, NULL, 0, error);
 			goto done;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
 
 		/*
 		 * Use unmapped mbufs if enabled for TCP.  Unmapped
 		 * bufs are restricted to TCP as that is what has been
 		 * tested.  In particular, unmapped mbufs have not
 		 * been tested with UNIX-domain sockets.
 		 *
 		 * TLS frames always require unmapped mbufs.
 		 */
 		if ((mb_use_ext_pgs &&
 		    so->so_proto->pr_protocol == IPPROTO_TCP)
 #ifdef KERN_TLS
 		    || tls != NULL
 #endif
 		    ) {
 			use_ext_pgs = true;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				max_pgs = num_pages(tls->params.max_frame_len);
 			else
 #endif
 				max_pgs = MBUF_PEXT_MAX_PGS;
 
 			/* Start at last index, to wrap on first use. */
 			ext_pgs_idx = max_pgs - 1;
 		}
 
 		for (int i = 0; i < npages; i++) {
 			/*
 			 * If a page wasn't grabbed successfully, then
 			 * trim the array. Can happen only with SF_NODISKIO.
 			 */
 			if (pa[i] == NULL) {
 				SFSTAT_INC(sf_busy);
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				softerr = EBUSY;
 				break;
 			}
 			pga = pa[i];
 			if (pga == bogus_page)
 				pga = vm_page_relookup(obj, sfio->pindex0 + i);
 
 			if (use_ext_pgs) {
 				off_t xfs;
 
 				ext_pgs_idx++;
 				if (ext_pgs_idx == max_pgs) {
 					m0 = mb_alloc_ext_pgs(M_WAITOK,
-					    sendfile_free_mext_pg);
+					    sendfile_free_mext_pg, M_RDONLY);
 
 					if (flags & SF_NOCACHE) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_NOCACHE;
 
 						/*
 						 * See comment below regarding
 						 * ignoring SF_NOCACHE for the
 						 * last page.
 						 */
 						if ((npages - i <= max_pgs) &&
 						    ((off + space) & PAGE_MASK) &&
 						    (rem > space || rhpages > 0))
 							m0->m_ext.ext_flags |=
 							    EXT_FLAG_CACHE_LAST;
 					}
 					if (sfs != NULL) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_SYNC;
 						m0->m_ext.ext_arg1 = sfs;
 						mtx_lock(&sfs->mtx);
 						sfs->count++;
 						mtx_unlock(&sfs->mtx);
 					}
 					ext_pgs_idx = 0;
 
 					/* Append to mbuf chain. */
 					if (mtail != NULL)
 						mtail->m_next = m0;
 					else
 						m = m0;
 					mtail = m0;
 					m0->m_epg_1st_off =
 					    vmoff(i, off) & PAGE_MASK;
 				}
 				if (nios) {
 					mtail->m_flags |= M_NOTREADY;
 					m0->m_epg_nrdy++;
 				}
 
 				m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga);
 				m0->m_epg_npgs++;
 				xfs = xfsize(i, npages, off, space);
 				m0->m_epg_last_len = xfs;
 				MBUF_EXT_PGS_ASSERT_SANITY(m0);
 				mtail->m_len += xfs;
 				mtail->m_ext.ext_size += PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For subsequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pga,
 			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				sendfile_iowait(sfio, "sfnosf");
 				for (int j = i; j < npages; j++) {
 					vm_page_unwire(pa[j], PQ_INACTIVE);
 					pa[j] = NULL;
 				}
 				if (m == NULL)
 					softerr = ENOBUFS;
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				break;
 			}
 
 			m0 = m_get(M_WAITOK, MT_DATA);
 			m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
 			m0->m_ext.ext_size = PAGE_SIZE;
 			m0->m_ext.ext_arg1 = sf;
 			m0->m_ext.ext_type = EXT_SFBUF;
 			m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
 			m0->m_ext.ext_free = sendfile_free_mext;
 			/*
 			 * SF_NOCACHE sets the page as being freed upon send.
 			 * However, we ignore it for the last page in 'space',
 			 * if the page is truncated, and we got more data to
 			 * send (rem > space), or if we have readahead
 			 * configured (rhpages > 0).
 			 */
 			if ((flags & SF_NOCACHE) &&
 			    (i != npages - 1 ||
 			    !((off + space) & PAGE_MASK) ||
 			    !(rem > space || rhpages > 0)))
 				m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
 			if (sfs != NULL) {
 				m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
 				m0->m_ext.ext_arg2 = sfs;
 				mtx_lock(&sfs->mtx);
 				sfs->count++;
 				mtx_unlock(&sfs->mtx);
 			}
 			m0->m_ext.ext_count = 1;
 			m0->m_flags |= (M_EXT | M_RDONLY);
 			if (nios)
 				m0->m_flags |= M_NOTREADY;
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp);
 
 		/* Keep track of bytes processed. */
 		off += space;
 		rem -= space;
 
 		/*
 		 * Prepend header, if any.  Save pointer to first mbuf
 		 * with a page.
 		 *
 		 * The prepend_header label is also the target of the jump
 		 * taken above when the header used up all of 'space'; in
 		 * that case 'm' is still NULL here and sfio/nios were set
 		 * to NULL/0 just before the jump.
 		 */
 		if (hdrlen) {
 prepend_header:
 			m0 = mhtail->m_next = m;
 			m = mh;
 			mh = NULL;
 		} else
 			m0 = m;
 
 		if (m == NULL) {
 			KASSERT(softerr, ("%s: m NULL, no error", __func__));
 			error = softerr;
 			sendfile_iodone(sfio, NULL, 0, 0);
 			goto done;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
 		CURVNET_SET(so->so_vnet);
 #ifdef KERN_TLS
 		if (tls != NULL)
 			ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP);
 #endif
 		if (nios == 0) {
 			/*
 			 * If sendfile_swapin() didn't initiate any I/Os,
 			 * which happens if all data is cached in VM, or if
 			 * the header consumed all socket buffer space and
 			 * sfio is NULL, then we can send data right now
 			 * without the PRUS_NOTREADY flag.
 			 */
 			if (sfio != NULL)
 				sendfile_iodone(sfio, NULL, 0, 0);
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
 				error = so->so_proto->pr_send(so,
 				    PRUS_NOTREADY, m, NULL, NULL, td);
 				if (error != 0) {
 					m_freem(m);
 				} else {
 					soref(so);
 					ktls_enqueue(m, so, tls_enq_cnt);
 				}
 			} else
 #endif
 				error = so->so_proto->pr_send(so, 0, m, NULL,
 				    NULL, td);
 		} else {
 			sfio->so = so;
 			sfio->m = m0;
 			soref(so);
 			error = so->so_proto->pr_send(so, PRUS_NOTREADY, m,
 			    NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, error);
 		}
 #ifdef TCP_REQUEST_TRK
 		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 			/* log the sendfile call to the TCP log, if enabled */
 			tcp_log_sendfile(so, offset, nbytes, flags);
 		}
 #endif
 		CURVNET_RESTORE();
 
 		/*
 		 * The chain was handed to pr_send() above; forget the local
 		 * pointer so the cleanup at 'out' does not free it.
 		 */
 		m = NULL;
 		if (error)
 			goto done;
 		sbytes += space + hdrlen;
 		if (hdrlen)
 			hdrlen = 0;
 		if (softerr) {
 			error = softerr;
 			goto done;
 		}
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		SOCK_IO_SEND_UNLOCK(so);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 	if (mh)
 		m_freem(mh);
 
 	/*
 	 * SF_SYNC: wait for the count of mbufs tagged with 'sfs' to reach
 	 * zero.  If the wait ends with the count still non-zero, clear
 	 * 'waiting' and leave the structure allocated instead of destroying
 	 * it here.
 	 */
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		if (sfs->count != 0)
 			error = cv_wait_sig(&sfs->cv, &sfs->mtx);
 		if (sfs->count == 0) {
 			sendfile_sync_destroy(sfs);
 		} else {
 			sfs->waiting = false;
 			mtx_unlock(&sfs->mtx);
 		}
 	}
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	off_t sbytes;
 	int error;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	sbytes = 0;
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
 			    &hdr_uio);
 			if (error != 0)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
 			    &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	if (uap->sbytes != NULL)
 		(void)copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	freeuio(hdr_uio);
 	freeuio(trl_uio);
 	return (error);
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Send a file specified by 'fd' and starting at 'offset' to a socket
  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
  * 0.  Optionally add a header and/or trailer to the socket output.  If
  * specified, write the total number of bytes sent into *sbytes.
  *
  * This is the system call entry point: it forwards to sendfile() with
  * compat == 0 and returns its result unchanged.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
 
 	return (sendfile(td, uap, 0));
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point for sendfile(2).
  *
  * Repacks the old argument structure into a struct sendfile_args field
  * by field and calls sendfile() with compat == 1.
  */
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args native;
 	int error;
 
 	native.fd = uap->fd;
 	native.s = uap->s;
 	native.offset = uap->offset;
 	native.nbytes = uap->nbytes;
 	native.hdtr = uap->hdtr;
 	native.sbytes = uap->sbytes;
 	native.flags = uap->flags;
 
 	error = sendfile(td, &native, 1);
 	return (error);
 }
 #endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index f6ce9b5cc74b..f3e2f13e89ec 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -1,2390 +1,2390 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mbuf_profiling.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 
 /*
  * Statically defined tracing probes for mbuf events, all declared under
  * the "sdt" provider.  The probe names cover mbuf initialisation
  * (m__init), allocation (m__get*, m__getcl, m__getjcl), cluster
  * attachment (m__clget, m__cljget, m__cljset) and release (m__free,
  * m__freem, m__freemp).
  *
  * Each *_XLATE definition lists its arguments as pairs of type strings;
  * every "struct mbuf *" argument is paired with "mbufinfo_t *".
  * NOTE(review): presumably the pair is (native type, translated type) --
  * confirm against the SDT documentation.
  */
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr_raw,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get_raw,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__getjcl,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t",
     "void*", "void*");
 
 /* The only probe here defined without arguments. */
 SDT_PROBE_DEFINE(sdt, , , m__cljset);
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
         "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freemp,
     "struct mbuf *", "mbufinfo_t *");
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Provide minimum possible defaults for link and protocol header space,
  * assuming IPv4 over Ethernet.  Enabling IPv6, IEEE802.11 or some other
  * protocol may grow these values.
  *
  * max_hdr starts out as the sum of the other two (16 + 40) and is
  * recomputed as max_linkhdr + max_protohdr by max_hdr_grow().  All three
  * values are exported read-only (CTLFLAG_RD) under _kern_ipc.
  */
 u_int	max_linkhdr = 16;
 u_int	max_protohdr = 40;
 u_int	max_hdr = 16 + 40;
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 16, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
 	   &max_protohdr, 40, "Size of largest protocol layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
 	   &max_hdr, 16 + 40, "Size of largest link plus protocol header");
 
 /*
  * Recompute max_hdr from its two components and assert that the result
  * still fits within MHLEN.
  */
 static void
 max_hdr_grow(void)
 {
 
 	max_hdr = max_protohdr + max_linkhdr;
 	MPASS(max_hdr <= MHLEN);
 }
 
 /*
  * Raise max_linkhdr to 'new' if that is larger than the current value,
  * and refresh max_hdr accordingly.  The value is never lowered.
  */
 void
 max_linkhdr_grow(u_int new)
 {
 
 	if (new <= max_linkhdr)
 		return;
 
 	max_linkhdr = new;
 	max_hdr_grow();
 }
 
 /*
  * Raise max_protohdr to 'new' if that is larger than the current value,
  * and refresh max_hdr accordingly.  The value is never lowered.
  */
 void
 max_protohdr_grow(u_int new)
 {
 
 	if (new <= max_protohdr)
 		return;
 
 	max_protohdr = new;
 	max_hdr_grow();
 }
 
 #ifdef MBUF_STRESS_TEST
 /*
  * Counters that exist only in MBUF_STRESS_TEST kernels, exported under
  * _kern_ipc.  All are read-only (CTLFLAG_RD) except
  * m_defragrandomfailures, which is writable (CTLFLAG_RW).
  * NOTE(review): presumably maintained by the mbuf defragmentation code,
  * which is not visible here -- confirm.
  */
 int	m_defragpackets;
 int	m_defragbytes;
 int	m_defraguseless;
 int	m_defragfailure;
 int	m_defragrandomfailures;
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 	   &m_defragpackets, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 	   &m_defragbytes, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 	   &m_defraguseless, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 	   &m_defragfailure, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 	   &m_defragrandomfailures, 0, "");
 #endif
 
 /*
  * Ensure the correct size of various mbuf parameters.  It could be off due
  * to compiler-induced padding and alignment artifacts.
  */
 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
 
 /*
  * mbuf data storage should be 64-bit aligned regardless of architectural
  * pointer size; check this is the case with and without a packet header.
  */
 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
 
 /*
  * While the specific values here don't matter too much (i.e., +/- a few
  * words), we do want to ensure that changes to these values are carefully
  * reasoned about and properly documented.  This is especially the case as
  * network-protocol and device-driver modules encode these layouts, and must
  * be recompiled if the structures change.  Check these values at compile time
  * against the ones documented in comments in mbuf.h.
  *
  * NB: Possibly they should be documented there via #define's and not just
  * comments.
  */
 #if defined(__LP64__)
 /* Expected layout when __LP64__ is defined. */
 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
 CTASSERT(sizeof(struct pkthdr) == 64);
 CTASSERT(sizeof(struct m_ext) == 160);
 #else
 /* Expected layout when __LP64__ is not defined. */
 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
 CTASSERT(sizeof(struct pkthdr) == 56);
 #if defined(__powerpc__) && defined(BOOKE)
 /* PowerPC booke has 64-bit physical pointers. */
 CTASSERT(sizeof(struct m_ext) == 176);
 #else
 CTASSERT(sizeof(struct m_ext) == 172);
 #endif
 #endif
 
 /*
  * Assert that the queue(3) macros produce code of the same size as an old
  * plain pointer does.
  */
 #ifdef INVARIANTS
 /* Instance used only as an operand of the sizeof checks below. */
 static struct mbuf __used m_assertbuf;
 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
 #endif
 
 /*
  * Make mbuf 'n' share the external storage of mbuf 'm': copy the
  * relevant part of m_ext (and, for M_EXTPG, the unmapped-page fields)
  * from *m into *n, propagate the M_RDONLY/M_EXT/M_EXTPG flags and take
  * one more reference on the storage.
  *
  * 'm' must have M_EXT or M_EXTPG set and 'n' must have neither.
  */
 void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
 	volatile u_int *cntp;
 
 	KASSERT(m->m_flags & (M_EXT | M_EXTPG),
 	    ("%s: M_EXT | M_EXTPG not set on %p", __func__, m));
 	KASSERT(!(n->m_flags & (M_EXT | M_EXTPG)),
 	    ("%s: M_EXT | M_EXTPG set on %p", __func__, n));
 
 	/*
 	 * Cache access optimization: copy only as much of m_ext as the
 	 * storage type requires.
 	 *
 	 * o M_EXTPG data is split between the main part of the mbuf and
 	 *   m_ext; the main part is copied in full and the m_ext part is
 	 *   treated like regular M_EXT storage.
 	 * o EXT_EXTREF, where 'ext_cnt' doesn't point into an mbuf at all,
 	 *   needs a full copy of m_ext in each mbuf, since any copy could
 	 *   end up as the last to free.
 	 * o Any other M_EXT storage doesn't need a full copy, since the
 	 *   holder of 'ext_count' is responsible for carrying the free
 	 *   routine and its arguments.
 	 */
 	if ((m->m_flags & M_EXTPG) != 0) {
 		bcopy(&m->m_epg_startcopy, &n->m_epg_startcopy,
 		    __rangeof(struct mbuf, m_epg_startcopy, m_epg_endcopy));
 		bcopy(&m->m_ext, &n->m_ext, m_epg_ext_copylen);
 	} else {
 		bcopy(&m->m_ext, &n->m_ext,
 		    m->m_ext.ext_type == EXT_EXTREF ?
 		    sizeof(struct m_ext) : m_ext_copylen);
 	}
 
 	n->m_flags |= m->m_flags & (M_RDONLY | M_EXT | M_EXTPG);
 
 	/*
 	 * Locate the reference counter.  If 'm' carries it embedded, point
 	 * 'n' at m's counter and drop the embedded-refcount flag that was
 	 * just copied into 'n'.
 	 */
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) != 0) {
 		cntp = &m->m_ext.ext_count;
 		n->m_ext.ext_cnt = cntp;
 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		cntp = m->m_ext.ext_cnt;
 	}
 
 	/* A count of exactly one is bumped with a plain store. */
 	if (*cntp == 1)
 		*cntp += 1;
 	else
 		atomic_add_int(cntp, 1);
 }
 
 /*
  * Turn a packet-header mbuf into a plain one: delete its tag chain,
  * clear M_PKTHDR and zero the pkthdr area.  The mbuf must have a packet
  * header and no send tag, as asserted on entry.
  */
 void
 m_demote_pkthdr(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	M_ASSERT_NO_SND_TAG(m);
 
 	/* Tags go first, while the header is still intact. */
 	m_tag_delete_chain(m, NULL);
 	m->m_flags &= ~M_PKTHDR;
 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 }
 
 /*
  * Strip tags and packet headers from the mbufs of a chain and restrict
  * their flags to 'flags' plus M_DEMOTEFLAGS.  The first mbuf, m0, is
  * processed only when "all" is non-zero; otherwise processing starts at
  * m0->m_next.
  */
 void
 m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *cur;
 
 	flags |= M_DEMOTEFLAGS;
 
 	cur = all ? m0 : m0->m_next;
 	while (cur != NULL) {
 		KASSERT(cur->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
 		    __func__, cur, m0));
 		if ((cur->m_flags & M_PKTHDR) != 0)
 			m_demote_pkthdr(cur);
 		cur->m_flags &= flags;
 		cur = cur->m_next;
 	}
 }
 
 /*
  * Sanity checks on mbuf (chain) for use in KASSERT() and general
  * debugging.
  *
  * A failed check that is not repaired runs M_SANITY_ACTION, which
  * panics under INVARIANTS and only printf()s otherwise.  In the latter
  * case checking continues and the function still returns 1, so the
  * return value by itself does not signal a bad chain.
  *
  * "sanitize" selects the handling of the repairable checks: 0 runs
  * M_SANITY_ACTION, non-zero instead clears or garbles the offending
  * field so that things blow up later.  The basic m_data range checks
  * always run M_SANITY_ACTION.
  */
 int
 m_sanity(struct mbuf *m0, int sanitize)
 {
 	struct mbuf *m;
 	caddr_t a, b;
 	int pktlen = 0;
 
 #ifdef INVARIANTS
 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
 #else
 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
 #endif
 
 	for (m = m0; m != NULL; m = m->m_next) {
 		/*
 		 * Basic pointer checks.  If any of these fails then some
 		 * unrelated kernel memory before or after us is trashed.
 		 * No way to recover from that.
 		 */
 		a = M_START(m);
 		b = a + M_SIZE(m);
 		if ((caddr_t)m->m_data < a)
 			M_SANITY_ACTION("m_data outside mbuf data range left");
 		if ((caddr_t)m->m_data > b)
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
 			if (sanitize) {
 				/* Free the stray packet, then poison the link. */
 				m_freem(m->m_nextpkt);
 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
 			} else
 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
 		}
 
 		/* packet length (not mbuf length!) calculation */
 		if (m0->m_flags & M_PKTHDR)
 			pktlen += m->m_len;
 
 		/* m_tags may only be attached to first mbuf in chain. */
 		if (m != m0 && m->m_flags & M_PKTHDR &&
 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
 			if (sanitize) {
 				m_tag_delete_chain(m, NULL);
 				/* put in 0xDEADC0DE perhaps? */
 			} else
 				M_SANITY_ACTION("m_tags on in-chain mbuf");
 		}
 
 		/* M_PKTHDR may only be set on first mbuf in chain */
 		if (m != m0 && m->m_flags & M_PKTHDR) {
 			if (sanitize) {
 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 				m->m_flags &= ~M_PKTHDR;
 				/* put in 0xDEADC0DE and leave hdr flag in */
 			} else
 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
 		}
 	}
 	/*
 	 * Compare the summed m_len values against the header's idea of the
 	 * packet length.  pktlen stays 0 (and the check is skipped) when
 	 * the chain is empty or has no packet header.
 	 */
 	m = m0;
 	if (pktlen && pktlen != m->m_pkthdr.len) {
 		if (sanitize)
 			m->m_pkthdr.len = 0;
 		else
 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
 	}
 	return 1;
 
 #undef	M_SANITY_ACTION
 }
 
 /*
  * Non-inlined part of m_init().
  */
 int
 m_pkthdr_init(struct mbuf *m, int how)
 {
 #ifdef MAC
 	int error;
 #endif
 	m->m_data = m->m_pktdat;
 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 #ifdef NUMA
 	m->m_pkthdr.numa_domain = M_NODOM;
 #endif
 #ifdef MAC
 	/* If the label init fails, fail the alloc */
 	error = mac_mbuf_init(m, how);
 	if (error)
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * Transfer the packet header of "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.  Afterwards
  * "from" no longer has M_PKTHDR, owns no tags and no longer refers to a
  * send tag; "to" keeps only its own M_EXT/M_EXTPG flags and inherits the
  * M_COPYFLAGS subset of the flags of "from".
  */
 void
 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 {
 
 #if 0
 	/* see below for why these are not enabled */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
 	    ("m_move_pkthdr: to has tags"));
 #endif
 #ifdef MAC
 	/*
 	 * XXXMAC: It could be this should also occur for non-MAC?
 	 */
 	if ((to->m_flags & M_PKTHDR) != 0)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (to->m_flags & (M_EXT | M_EXTPG)) |
 	    (from->m_flags & M_COPYFLAGS);
 	/* Without external storage the data starts after the header. */
 	if (!(to->m_flags & M_EXT))
 		to->m_data = to->m_pktdat;
 
 	/* Structure copy; this hands over the tag list head as well. */
 	to->m_pkthdr = from->m_pkthdr;
 
 	/* Detach everything that now belongs to "to" from the source. */
 	from->m_flags &= ~M_PKTHDR;
 	SLIST_INIT(&from->m_pkthdr.tags);
 	if ((from->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0)
 		return;
 	from->m_pkthdr.snd_tag = NULL;
 	from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 }
 
 /*
  * Duplicate the packet header of "from" into "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  * The packet tags are deep-copied with m_tag_copy_chain(), whose result
  * is returned; callers in this file treat 0 as failure.  If "from"
  * carries a send tag, an additional reference is taken on it for "to".
  */
 int
 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
 {
 
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
 	 * when the mbuf is allocated with m_gethdr(). Many users
 	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
 #endif
 	MBUF_CHECKSLEEP(how);
 #ifdef MAC
 	if ((to->m_flags & M_PKTHDR) != 0)
 		m_tag_delete_chain(to, NULL);
 #endif
 	/* Keep our own storage flags, inherit the copyable ones. */
 	to->m_flags = (to->m_flags & (M_EXT | M_EXTPG)) |
 	    (from->m_flags & M_COPYFLAGS);
 	if (!(to->m_flags & M_EXT))
 		to->m_data = to->m_pktdat;
 
 	/* Shallow copy first; the tag list is rebuilt separately below. */
 	to->m_pkthdr = from->m_pkthdr;
 	SLIST_INIT(&to->m_pkthdr.tags);
 	if ((from->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0)
 		m_snd_tag_ref(from->m_pkthdr.snd_tag);
 
 	return (m_tag_copy_chain(to, from, how));
 }
 
 /*
  * Lesser-used path for M_PREPEND:
  * allocate a new mbuf, move the packet header (if any) into it and put
  * it in front of the chain with m_len set to "len".
  * On allocation failure the whole chain is freed and NULL is returned.
  */
 struct mbuf *
 m_prepend(struct mbuf *m, int len, int how)
 {
 	struct mbuf *head;
 	int has_hdr;
 
 	has_hdr = (m->m_flags & M_PKTHDR) != 0;
 	head = has_hdr ? m_gethdr(how, m->m_type) : m_get(how, m->m_type);
 	if (head == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (has_hdr)
 		m_move_pkthdr(head, m);
 	head->m_next = m;
 	if (len < M_SIZE(head))
 		M_ALIGN(head, len);
 	head->m_len = len;
 	return (head);
 }
 
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to the end of
  * the mbuf chain.
  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  *
  * Returns the head of the new chain.  NULL is returned when an
  * allocation or the packet header duplication fails (the partial copy
  * is freed), and also when there is nothing to copy (len == 0).
  * The packet header is only duplicated when off0 == 0 and the source
  * has M_PKTHDR set.
  */
 struct mbuf *
 m_copym(struct mbuf *m, int off0, int len, int wait)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
 	MBUF_CHECKSLEEP(wait);
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = 1;
 	/* Advance to the mbuf that contains the starting offset. */
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	/* np always points at the link where the next new mbuf goes. */
 	np = &top;
 	top = NULL;
 	while (len > 0) {
 		if (m == NULL) {
 			/* Running off the source is only legal for M_COPYALL. */
 			KASSERT(len == M_COPYALL,
 			    ("m_copym, length > size of mbuf chain"));
 			break;
 		}
 		if (copyhdr)
 			n = m_gethdr(wait, m->m_type);
 		else
 			n = m_get(wait, m->m_type);
 		/* Link first so that the nospace path frees n with top. */
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, wait))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			copyhdr = 0;
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & (M_EXT | M_EXTPG)) {
 			/* Share the external storage instead of copying. */
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 		/* M_COPYALL is kept as is so the loop runs to the end. */
 		if (len != M_COPYALL)
 			len -= n->m_len;
 		off = 0;
 		m = m->m_next;
 		np = &n->m_next;
 	}
 
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy a whole packet, header included (the source must have one).
  * This is the optimized form of `m_copym(m, 0, M_COPYALL, how)'.
  * Clusters are shared by bumping their reference counts rather than
  * copied, so the result is read-only.
  * The data offset inside the first mbuf is carried over, so leading
  * room left by the creator (e.g. for protocol headers) is still
  * available in the copy.
  * Returns NULL, freeing the partial copy, if anything fails.
  */
 struct mbuf *
 m_copypacket(struct mbuf *m, int how)
 {
 	struct mbuf *head, *tail, *fresh;
 
 	MBUF_CHECKSLEEP(how);
 	head = m_get(how, m->m_type);
 	if (head == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(head, m, how) == 0)
 		goto nospace;
 	head->m_len = m->m_len;
 	if ((m->m_flags & (M_EXT | M_EXTPG)) != 0) {
 		head->m_data = m->m_data;
 		mb_dupcl(head, m);
 	} else {
 		/* Same offset into the header data area as the source. */
 		head->m_data = head->m_pktdat + (m->m_data - m->m_pktdat);
 		bcopy(mtod(m, char *), mtod(head, char *), head->m_len);
 	}
 
 	tail = head;
 	for (m = m->m_next; m != NULL; m = m->m_next) {
 		fresh = m_get(how, m->m_type);
 		if (fresh == NULL)
 			goto nospace;
 
 		tail->m_next = fresh;
 		tail = fresh;
 
 		tail->m_len = m->m_len;
 		if ((m->m_flags & (M_EXT | M_EXTPG)) != 0) {
 			tail->m_data = m->m_data;
 			mb_dupcl(tail, m);
 		} else {
 			bcopy(mtod(m, char *), mtod(tail, char *),
 			    tail->m_len);
 		}
 	}
 	return (head);
 nospace:
 	m_freem(head);
 	return (NULL);
 }
 
 /*
  * Copy "len" bytes starting "off" bytes into the single mbuf "m" to the
  * kernel buffer "cp" by describing "cp" with a one-segment uio and
  * handing it to m_unmapped_uiomove().  Used by m_copydata() for M_EXTPG
  * mbufs.  A failure of m_unmapped_uiomove() is only caught by KASSERT.
  */
 static void
 m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	struct iovec iov;
 	struct uio uio;
 	int error __diagused;
 
 	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
 	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
 	/* The message names the quantity that is actually tested. */
 	KASSERT(off < m->m_len,
 	    ("m_copyfromunmapped: off exceeds mbuf length"));
 	iov.iov_base = cp;
 	iov.iov_len = len;
 	uio.uio_resid = len;
 	uio.uio_iov = &iov;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_rw = UIO_READ;
 	error = m_unmapped_uiomove(m, off, &uio, len);
 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
 	   len));
 }
 
 /*
  * Copy "len" bytes out of an mbuf chain into the buffer "cp", beginning
  * "off" bytes from the start of the chain.  The chain must contain at
  * least off + len bytes; this is only checked by KASSERT.
  */
 void
 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	u_int chunk;
 
 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
 
 	/* Skip the mbufs that lie entirely before the starting offset. */
 	for (; off > 0; m = m->m_next) {
 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 	}
 
 	/* Copy out mbuf by mbuf; only the first one has a non-zero off. */
 	for (; len > 0; m = m->m_next) {
 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
 		chunk = min(m->m_len - off, len);
 		if ((m->m_flags & M_EXTPG) == 0)
 			bcopy(mtod(m, caddr_t) + off, cp, chunk);
 		else
 			m_copyfromunmapped(m, off, chunk, cp);
 		cp += chunk;
 		len -= chunk;
 		off = 0;
 	}
 }
 
 /*
  * Copy a packet header mbuf chain into a completely new chain, including
  * copying any mbuf clusters.  Use this instead of m_copypacket() when
  * you need a writable copy of an mbuf chain.
  *
  * Returns the new chain, or NULL if "m" is NULL or an allocation or the
  * packet header duplication fails (the partial copy is freed).  The
  * amount copied is driven by m->m_pkthdr.len.
  */
 struct mbuf *
 m_dup(const struct mbuf *m, int how)
 {
 	struct mbuf **p, *top = NULL;
 	int remain, moff, nsize;
 
 	MBUF_CHECKSLEEP(how);
 	/* Sanity check */
 	if (m == NULL)
 		return (NULL);
 	M_ASSERTPKTHDR(m);
 
 	/* While there's more data, get a new mbuf, tack it on, and fill it */
 	remain = m->m_pkthdr.len;
 	moff = 0;		/* read offset inside the current source mbuf */
 	p = &top;		/* link where the next new mbuf is attached */
 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
 		struct mbuf *n;
 
 		/* Get the next new mbuf */
 		if (remain >= MINCLSIZE) {
 			n = m_getcl(how, m->m_type, 0);
 			nsize = MCLBYTES;
 		} else {
 			n = m_get(how, m->m_type);
 			nsize = MLEN;
 		}
 		if (n == NULL)
 			goto nospace;
 
 		if (top == NULL) {		/* First one, must be PKTHDR */
 			if (!m_dup_pkthdr(n, m, how)) {
 				/* n is not linked into top yet; free it here. */
 				m_free(n);
 				goto nospace;
 			}
 			/* The header takes space in a plain (non-M_EXT) mbuf. */
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
 			/* The copy owns its storage, so it is writable. */
 			n->m_flags &= ~M_RDONLY;
 		}
 		n->m_len = 0;
 
 		/* Link it into the new chain */
 		*p = n;
 		p = &n->m_next;
 
 		/* Copy data from original mbuf(s) into new mbuf */
 		while (n->m_len < nsize && m != NULL) {
 			int chunk = min(nsize - n->m_len, m->m_len - moff);
 
 			m_copydata(m, moff, chunk, n->m_data + n->m_len);
 			moff += chunk;
 			n->m_len += chunk;
 			remain -= chunk;
 			/* Source mbuf exhausted: move on to the next one. */
 			if (moff == m->m_len) {
 				m = m->m_next;
 				moff = 0;
 			}
 		}
 
 		/* Check correct total mbuf length */
 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
 		    	("%s: bogus m_pkthdr.len", __func__));
 	}
 	return (top);
 
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Concatenate mbuf chain n to m.
  * Both chains must be of the same type (e.g. MT_DATA).
  * Any m_pkthdr is not updated.
  */
 void
 m_cat(struct mbuf *m, struct mbuf *n)
 {
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
 		if (!M_WRITABLE(m) ||
 		    (n->m_flags & M_EXTPG) != 0 ||
 		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
 		}
 		/* splat the data from one into the other */
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		    (u_int)n->m_len);
 		m->m_len += n->m_len;
 		n = m_free(n);
 	}
 }
 
 /*
  * Concatenate two pkthdr mbuf chains: the packet length of n is added
  * to that of m, n is demoted to plain mbufs and then appended with
  * m_cat().
  */
 void
 m_catpkt(struct mbuf *m, struct mbuf *n)
 {
 	int extra;
 
 	M_ASSERTPKTHDR(m);
 	M_ASSERTPKTHDR(n);
 
 	/* Read the length before the demotion wipes n's header. */
 	extra = n->m_pkthdr.len;
 	m_demote(n, 1, 0);
 	m->m_pkthdr.len += extra;
 
 	m_cat(m, n);
 }
 
 /*
  * Trim data from the mbuf chain mp: req_len bytes from the head when
  * req_len is non-negative, -req_len bytes from the tail when it is
  * negative.  A NULL mp is ignored.
  *
  * Head trims free nothing; emptied mbufs stay in the chain with
  * m_len == 0.  A tail trim that reaches past the last mbuf truncates
  * the chain and frees the mbufs after the new last one.  If mp has a
  * packet header, m_pkthdr.len is adjusted to match.
  */
 void
 m_adj(struct mbuf *mp, int req_len)
 {
 	int len = req_len;
 	struct mbuf *m;
 	int count;
 
 	if ((m = mp) == NULL)
 		return;
 	if (len >= 0) {
 		/*
 		 * Trim from head.
 		 */
 		while (m != NULL && len > 0) {
 			if (m->m_len <= len) {
 				/* Whole mbuf consumed; leave it empty. */
 				len -= m->m_len;
 				m->m_len = 0;
 				m = m->m_next;
 			} else {
 				m->m_len -= len;
 				m->m_data += len;
 				len = 0;
 			}
 		}
 		/* len is what could not be trimmed (chain too short). */
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= (req_len - len);
 	} else {
 		/*
 		 * Trim from tail.  Scan the mbuf chain,
 		 * calculating its length and finding the last mbuf.
 		 * If the adjustment only affects this mbuf, then just
 		 * adjust and return.  Otherwise, rescan and truncate
 		 * after the remaining size.
 		 */
 		len = -len;
 		count = 0;
 		for (;;) {
 			count += m->m_len;
 			if (m->m_next == (struct mbuf *)0)
 				break;
 			m = m->m_next;
 		}
 		if (m->m_len >= len) {
 			m->m_len -= len;
 			if (mp->m_flags & M_PKTHDR)
 				mp->m_pkthdr.len -= len;
 			return;
 		}
 		/* Trimming more than the chain holds leaves it empty. */
 		count -= len;
 		if (count < 0)
 			count = 0;
 		/*
 		 * Correct length for chain is "count".
 		 * Find the mbuf with last data, adjust its length,
 		 * and toss data from remaining mbufs on chain.
 		 */
 		m = mp;
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len = count;
 		for (; m; m = m->m_next) {
 			if (m->m_len >= count) {
 				m->m_len = count;
 				if (m->m_next != NULL) {
 					m_freem(m->m_next);
 					m->m_next = NULL;
 				}
 				break;
 			}
 			count -= m->m_len;
 		}
 	}
 }
 
 /*
  * m_adj() variant for decapsulation: trim "len" bytes as m_adj() does
  * and then fix up the flow hash information of a packet header mbuf.
  */
 void
 m_adj_decap(struct mbuf *mp, int len)
 {
 	uint8_t rsstype;
 
 	m_adj(mp, len);
 	if ((mp->m_flags & M_PKTHDR) == 0)
 		return;
 
 	/*
 	 * A flowid that the card computed from the inner headers stays
 	 * valid for the decapsulated chain once the "inner" marker is
 	 * dropped; any other flowid is cleared.  This relies on m_adj()
 	 * leaving the pkthdr, and with it rsstype and flowid, untouched.
 	 */
 	rsstype = mp->m_pkthdr.rsstype;
 	if ((rsstype & M_HASHTYPE_INNER) == 0) {
 		M_HASHTYPE_CLEAR(mp);
 	} else {
 		M_HASHTYPE_SET(mp, rsstype & ~M_HASHTYPE_INNER);
 	}
 }
 
 /*
  * Rearrange an mbuf chain so that len bytes are contiguous
  * and in the data area of an mbuf (so that mtod will work
  * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
  *
  * Failure cases visible here: len > MHLEN when a new mbuf is needed,
  * allocation failure (M_NOWAIT), and a chain shorter than len.
  */
 struct mbuf *
 m_pullup(struct mbuf *n, int len)
 {
 	struct mbuf *m;
 	int count;
 	int space;
 
 	KASSERT((n->m_flags & M_EXTPG) == 0,
 	    ("%s: unmapped mbuf %p", __func__, n));
 
 	/*
 	 * If first mbuf has no cluster, and has room for len bytes
 	 * without shifting current data, pullup into it,
 	 * otherwise allocate a new mbuf to prepend to the chain.
 	 */
 	if ((n->m_flags & M_EXT) == 0 &&
 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
 		/* Already contiguous: nothing to do. */
 		if (n->m_len >= len)
 			return (n);
 		/* m is the destination; n now walks the rest of the chain. */
 		m = n;
 		n = n->m_next;
 		len -= m->m_len;
 	} else {
 		if (len > MHLEN)
 			goto bad;
 		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
 		if (n->m_flags & M_PKTHDR)
 			m_move_pkthdr(m, n);
 	}
 	/* Bytes still free behind the data currently held in m. */
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		/* Take at least len, up to max_protohdr, bounded by both ends. */
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		  (u_int)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);	/* drained; continue with its successor */
 	} while (len > 0 && n);
 	if (len > 0) {
 		/* Chain ran out first; m is not linked to n, free it alone. */
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
 bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Like m_pullup(), except a new mbuf is always allocated, and we allow
  * the amount of empty space before the data in the new mbuf to be specified
  * (in the event that the caller expects to prepend later).
  *
  * Returns the new chain head on success.  On failure (len does not fit
  * in MHLEN - dstoff, allocation failure, or a chain shorter than len)
  * the chain is freed and NULL is returned.
  */
 struct mbuf *
 m_copyup(struct mbuf *n, int len, int dstoff)
 {
 	struct mbuf *m;
 	int count, space;
 
 	if (len > (MHLEN - dstoff))
 		goto bad;
 	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
 	if (n->m_flags & M_PKTHDR)
 		m_move_pkthdr(m, n);
 	/* Reserve the requested leading space. */
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		/* Take at least len, up to max_protohdr, bounded by both ends. */
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
 		    (unsigned)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);	/* drained; continue with its successor */
 	} while (len > 0 && n);
 	if (len > 0) {
 		/* Chain ran out first; m is not linked to n, free it alone. */
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
  bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
  *
  * NULL is returned when the chain holds fewer than len0 bytes or when
  * an mbuf allocation fails.  When m0 has a packet header, the returned
  * tail gets one of its own: m_pkthdr.len is split between the two and
  * either a new reference on m0's send tag or m0's rcvif is carried over.
  *
  * Note that the resulting mbufs might be read-only, because the new
  * mbuf can end up sharing an mbuf cluster with the original mbuf if
  * the "breaking point" happens to lie within a cluster mbuf. Use the
  * M_WRITABLE() macro to check for this case.
  */
 struct mbuf *
 m_split(struct mbuf *m0, int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len = len0, remain;
 	int pktlen0;
 
 	MBUF_CHECKSLEEP(wait);
 	/* Find the mbuf containing the split point; len becomes local. */
 	for (m = m0; m && len > m->m_len; m = m->m_next)
 		len -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 	remain = m->m_len - len;
 	if (m0->m_flags & M_PKTHDR && remain == 0) {
 		/*
 		 * Split falls on an mbuf boundary: the tail only needs a
 		 * fresh (empty) header mbuf in front of the rest.
 		 */
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 			n->m_pkthdr.snd_tag =
 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 		} else
 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
 	} else if (m0->m_flags & M_PKTHDR) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 			n->m_pkthdr.snd_tag =
 			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
 			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 		} else
 			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		/* Remember the old length so a later failure can undo it. */
 		pktlen0 = m0->m_pkthdr.len;
 		n->m_pkthdr.len = pktlen0 - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & (M_EXT | M_EXTPG))
 			goto extpacket;
 		if (remain > MHLEN) {
 			/* m can't be the lead packet */
 			M_ALIGN(n, 0);
 			n->m_next = m_split(m, len, wait);
 			if (n->m_next == NULL) {
 				/*
 				 * The only change made to the original
 				 * chain so far is its packet length; put
 				 * it back so the caller really gets the
 				 * chain in its original state.
 				 */
 				m0->m_pkthdr.len = pktlen0;
 				(void) m_free(n);
 				return (NULL);
 			} else {
 				n->m_len = 0;
 				return (n);
 			}
 		} else
 			M_ALIGN(n, remain);
 	} else if (remain == 0) {
 		/* No header and a boundary split: just unlink the tail. */
 		n = m->m_next;
 		m->m_next = NULL;
 		return (n);
 	} else {
 		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
 	}
 extpacket:
 	if (m->m_flags & (M_EXT | M_EXTPG)) {
 		/* Share the external storage instead of copying. */
 		n->m_data = m->m_data + len;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
 	n->m_len = remain;
 	m->m_len = len;
 	n->m_next = m->m_next;
 	m->m_next = NULL;
 	return (n);
 }
 
 /*
  * Partition mchain in two pieces, keeping len0 bytes in head and transferring
  * remainder to tail.  In case of failure, both chains to be left untouched.
  * M_EOR is observed correctly.
  * Resulting mbufs might be read-only.
  *
  * Returns 0 on success and ENOMEM if the mbuf needed to split an mbuf
  * in the middle cannot be allocated.  If head does not extend past the
  * split point (the walk runs off the end), tail is set to an empty
  * mchain and 0 is returned.  The first mbuf of head must not have
  * M_PKTHDR set.
  */
 int
 mc_split(struct mchain *head, struct mchain *tail, u_int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len, mlen, remain;
 
 	MPASS(!(mc_first(head)->m_flags & M_PKTHDR));
 	MBUF_CHECKSLEEP(wait);
 
 	/*
 	 * Walk to the mbuf containing the split point, accumulating in
 	 * mlen the memory (MSIZE plus any external storage) of the mbufs
 	 * that stay in head.
 	 */
 	mlen = 0;
 	len = len0;
 	STAILQ_FOREACH(m, &head->mc_q, m_stailq) {
 		mlen += MSIZE;
 		if (m->m_flags & M_EXT)
 			mlen += m->m_ext.ext_size;
 		if (len > m->m_len)
 			len -= m->m_len;
 		else
 			break;
 	}
 	if (__predict_false(m == NULL)) {
 		*tail = MCHAIN_INITIALIZER(tail);
 		return (0);
 	}
 	/* Bytes of m that belong to the tail. */
 	remain = m->m_len - len;
 	if (remain > 0) {
 		/*
 		 * Prepare the mbuf for the second half of m before either
 		 * chain is modified, so that ENOMEM leaves both untouched.
 		 */
 		if (__predict_false((n = m_get(wait, m->m_type)) == NULL))
 			return (ENOMEM);
 		m_align(n, remain);
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data + len;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, char *) + len, mtod(n, char *), remain);
 	}
 
 	/* XXXGL: need STAILQ_SPLIT */
 	STAILQ_FIRST(&tail->mc_q) = STAILQ_NEXT(m, m_stailq);
 	tail->mc_q.stqh_last = head->mc_q.stqh_last;
 	tail->mc_len = head->mc_len - len0;
 	tail->mc_mlen = head->mc_mlen - mlen;
 	if (remain > 0) {
 		/* n->m_len is set only after mc_prepend(), per the MPASS. */
 		MPASS(n->m_len == 0);
 		mc_prepend(tail, n);
 		n->m_len = remain;
 		m->m_len -= remain;
 		/* The record now ends in n, so the marker moves with it. */
 		if (m->m_flags & M_EOR) {
 			m->m_flags &= ~M_EOR;
 			n->m_flags |= M_EOR;
 		}
 	}
 	/* Terminate head at m. */
 	head->mc_q.stqh_last = &STAILQ_NEXT(m, m_stailq);
 	STAILQ_NEXT(m, m_stailq) = NULL;
 	head->mc_len = len0;
 	head->mc_mlen = mlen;
 
 	return (0);
 }
 
 /*
  * Build an mbuf chain holding "totlen" bytes read from device local
  * memory at "buf".  "off" is the offset into the first mbuf of the new
  * chain at which the data is to start.  Data is moved with "copy" when
  * one is supplied and with bcopy() otherwise.
  * Returns NULL if "off" is out of range or an allocation fails.
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
     void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *head, *cur, **linkp;
 	int room;
 
 	if (off < 0 || off > MHLEN)
 		return (NULL);
 
 	head = NULL;
 	linkp = &head;
 	while (totlen > 0) {
 		if (head != NULL) {
 			/* Follow-on mbufs carry no packet header. */
 			if (totlen + off >= MINCLSIZE) {
 				cur = m_getcl(M_NOWAIT, MT_DATA, 0);
 				room = MCLBYTES;
 			} else {
 				cur = m_get(M_NOWAIT, MT_DATA);
 				room = MLEN;
 			}
 			if (cur == NULL) {
 				m_freem(head);
 				return (NULL);
 			}
 		} else {
 			/* The first mbuf must be a PKTHDR one. */
 			if (totlen + off >= MINCLSIZE) {
 				cur = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				room = MCLBYTES;
 			} else {
 				cur = m_gethdr(M_NOWAIT, MT_DATA);
 				room = MHLEN;
 
 				/*
 				 * A small packet that fits leaves
 				 * max_linkhdr bytes free in front of it.
 				 */
 				if (cur != NULL &&
 				    totlen + off + max_linkhdr <= MHLEN) {
 					cur->m_data += max_linkhdr;
 					room -= max_linkhdr;
 				}
 			}
 			if (cur == NULL)
 				return (NULL);
 			cur->m_pkthdr.rcvif = ifp;
 			cur->m_pkthdr.len = totlen;
 		}
 		/* The caller's offset applies to the first mbuf only. */
 		if (off != 0) {
 			cur->m_data += off;
 			room -= off;
 			off = 0;
 		}
 		room = min(totlen, room);
 		cur->m_len = room;
 		if (copy != NULL)
 			copy(buf, mtod(cur, caddr_t), (u_int)room);
 		else
 			bcopy(buf, mtod(cur, caddr_t), (u_int)room);
 		buf += room;
 		totlen -= room;
 		*linkp = cur;
 		linkp = &cur->m_next;
 	}
 	return (head);
 }
 
 /*
  * Copy "len" bytes from the kernel buffer "cp" into the single mbuf "m"
  * starting "off" bytes into it, by describing "cp" with a one-segment
  * uio and handing it to m_unmapped_uiomove().  Used by m_copyback() for
  * M_EXTPG mbufs.  A failure of m_unmapped_uiomove() is only caught by
  * KASSERT.
  */
 static void
 m_copytounmapped(const struct mbuf *m, int off, int len, c_caddr_t cp)
 {
 	struct iovec iov;
 	struct uio uio;
 	int error __diagused;
 
 	KASSERT(off >= 0, ("m_copytounmapped: negative off %d", off));
 	KASSERT(len >= 0, ("m_copytounmapped: negative len %d", len));
 	/* The message names the quantity that is actually tested. */
 	KASSERT(off < m->m_len, ("m_copytounmapped: off exceeds mbuf length"));
 	/* struct iovec has no const base; the source is only read. */
 	iov.iov_base = __DECONST(caddr_t, cp);
 	iov.iov_len = len;
 	uio.uio_resid = len;
 	uio.uio_iov = &iov;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_rw = UIO_WRITE;
 	error = m_unmapped_uiomove(m, off, &uio, len);
 	KASSERT(error == 0, ("m_unmapped_uiomove failed: off %d, len %d", off,
 	   len));
 }
 
 /*
  * Copy data from a buffer back into the indicated mbuf chain,
  * starting "off" bytes from the beginning, extending the mbuf
  * chain if necessary.
  *
  * New mbufs are allocated with M_NOWAIT.  If an allocation fails the
  * copy stops early without any indication to the caller (the function
  * returns void); m_pkthdr.len is still raised to cover whatever was
  * reached.  A NULL m0 is ignored.
  */
 void
 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
 {
 	int mlen;
 	struct mbuf *m = m0, *n;
 	int totlen = 0;
 
 	if (m0 == NULL)
 		return;
 	/*
 	 * Walk to the mbuf containing "off".  If the chain ends first,
 	 * append zero-filled mbufs to bridge the gap.
 	 */
 	while (off > (mlen = m->m_len)) {
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			bzero(mtod(n, caddr_t), MLEN);
 			n->m_len = min(MLEN, len + off);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	while (len > 0) {
 		/* Grow the last mbuf into its trailing space if needed. */
 		if (m->m_next == NULL && (len > m->m_len - off)) {
 			m->m_len += min(len - (m->m_len - off),
 			    M_TRAILINGSPACE(m));
 		}
 		mlen = min (m->m_len - off, len);
 		if ((m->m_flags & M_EXTPG) != 0)
 			m_copytounmapped(m, off, mlen, cp);
 		else
 			bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
 		cp += mlen;
 		len -= mlen;
 		/* totlen counts from the chain start, so include the skip. */
 		mlen += off;
 		off = 0;
 		totlen += mlen;
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	/* The packet length only ever grows here. */
 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
 		m->m_pkthdr.len = totlen;
 }
 
 /*
  * Append "len" bytes from "cp" to the end of the mbuf chain m0.
  * The trailing space of the last mbuf is used first; whatever does not
  * fit goes into newly allocated (M_NOWAIT) mbufs.
  *
  * Return 1 if able to complete the job; otherwise 0.  Data appended
  * before an allocation failure stays in the chain, and m_pkthdr.len
  * (if present) is advanced by the number of bytes actually appended.
  */
 int
 m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *tail, *fresh;
 	int left, room;
 
 	tail = m0;
 	while (tail->m_next != NULL)
 		tail = tail->m_next;
 
 	left = len;
 	room = M_TRAILINGSPACE(tail);
 	if (room > 0) {
 		/*
 		 * Fill up the last mbuf first.
 		 */
 		if (room > left)
 			room = left;
 		bcopy(cp, mtod(tail, caddr_t) + tail->m_len, room);
 		tail->m_len += room;
 		cp += room;
 		left -= room;
 	}
 	for (; left > 0; tail = fresh) {
 		/*
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
 		fresh = m_get(M_NOWAIT, tail->m_type);
 		if (fresh == NULL)
 			break;
 		fresh->m_len = min(MLEN, left);
 		bcopy(cp, mtod(fresh, caddr_t), fresh->m_len);
 		cp += fresh->m_len;
 		left -= fresh->m_len;
 		tail->m_next = fresh;
 	}
 	if ((m0->m_flags & M_PKTHDR) != 0)
 		m0->m_pkthdr.len += len - left;
 	return (left == 0);
 }
 
 /*
  * Apply function f to "len" bytes of a single M_EXTPG mbuf, starting
  * "off" bytes into its data.  The data is visited in up to three kinds
  * of pieces: the header (m_epg_hdr), each backing page (reached through
  * the direct map) and the trailer (m_epg_trail).  f is called once per
  * piece; the first non-zero return value from f stops the walk and is
  * returned, otherwise 0 is returned.
  *
  * Requires a direct map (PMAP_HAS_DMAP), which is only asserted.
  */
 static int
 m_apply_extpg_one(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	void *p;
 	u_int i, count, pgoff, pglen;
 	int rval;
 
 	KASSERT(PMAP_HAS_DMAP,
 	    ("m_apply_extpg_one does not support unmapped mbufs"));
 	/*
 	 * NOTE(review): for M_EXTPG mbufs m_data appears to hold an offset
 	 * into the header/pages/trailer layout rather than a pointer, which
 	 * is why it is added to "off" here -- confirm.
 	 */
 	off += mtod(m, vm_offset_t);
 	if (off < m->m_epg_hdrlen) {
 		count = min(m->m_epg_hdrlen - off, len);
 		rval = f(arg, m->m_epg_hdr + off, count);
 		if (rval)
 			return (rval);
 		len -= count;
 		off = 0;
 	} else
 		off -= m->m_epg_hdrlen;	/* start lies beyond the header */
 	/* Only the first page starts at a non-zero offset. */
 	pgoff = m->m_epg_1st_off;
 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		if (off < pglen) {
 			count = min(pglen - off, len);
 			p = (void *)PHYS_TO_DMAP(m->m_epg_pa[i] + pgoff + off);
 			rval = f(arg, p, count);
 			if (rval)
 				return (rval);
 			len -= count;
 			off = 0;
 		} else
 			off -= pglen;	/* start lies beyond this page */
 		pgoff = 0;
 	}
 	/* Whatever remains must come from the trailer. */
 	if (len > 0) {
 		KASSERT(off < m->m_epg_trllen,
 		    ("m_apply_extpg_one: offset beyond trailer"));
 		KASSERT(len <= m->m_epg_trllen - off,
 		    ("m_apply_extpg_one: length beyond trailer"));
 		return (f(arg, m->m_epg_trail + off, len));
 	}
 	return (0);
 }
 
 /*
  * Run f over "len" bytes of one mbuf, starting "off" bytes into its
  * data.  M_EXTPG mbufs are handed to m_apply_extpg_one(); for any other
  * mbuf f is called directly on the data.  Returns what f returns.
  */
 static int
 m_apply_one(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 
 	if ((m->m_flags & M_EXTPG) == 0)
 		return (f(arg, mtod(m, caddr_t) + off, len));
 	return (m_apply_extpg_one(m, off, len, f, arg));
 }
 
 /*
  * Call f on the data of an mbuf chain, beginning "off" bytes from the
  * start of the chain and covering "len" bytes.  The first non-zero
  * value returned by f ends the walk and is passed back to the caller;
  * 0 is returned once all "len" bytes have been visited.
  */
 int
 m_apply(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	u_int chunk;
 	int error;
 
 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
 
 	/* Skip the mbufs that lie entirely before the starting offset. */
 	for (; off > 0; m = m->m_next) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain "
 		    "(%d extra)", off));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 	}
 
 	/* Only the first visited mbuf has a non-zero offset. */
 	for (; len > 0; m = m->m_next) {
 		KASSERT(m != NULL, ("m_apply, length > size of mbuf chain "
 		    "(%d extra)", len));
 		chunk = min(m->m_len - off, len);
 		error = m_apply_one(m, off, chunk, f, arg);
 		if (error != 0)
 			return (error);
 		len -= chunk;
 		off = 0;
 	}
 	return (0);
 }
 
 /*
  * Locate byte position "loc" in an mbuf chain.  Returns the mbuf that
  * holds it and stores the offset within that mbuf in *off.  A position
  * exactly at the end of the chain yields the last mbuf with *off set to
  * its m_len.  NULL is returned (and *off left alone) for a negative
  * "loc" or one beyond the end of the chain.
  */
 struct mbuf *
 m_getptr(struct mbuf *m, int loc, int *off)
 {
 
 	for (; loc >= 0; m = m->m_next) {
 		/* Normal end of search. */
 		if (loc < m->m_len) {
 			*off = loc;
 			return (m);
 		}
 		loc -= m->m_len;
 		if (m->m_next != NULL)
 			continue;
 		/* Out of mbufs: only "one past the end" is acceptable. */
 		if (loc != 0)
 			return (NULL);
 		*off = m->m_len;
 		return (m);
 	}
 	return (NULL);
 }
 
 /*
  * Debugging aid: print one line per mbuf of a chain with its address,
  * length, successor and flags, followed by a hex dump of at most
  * "maxlen" data bytes (-1 for no limit).  For a packet header chain the
  * walk stops once m_pkthdr.len bytes are accounted for, and a shortfall
  * of the chain against that length is reported.
  */
 void
 m_print(const struct mbuf *m, int maxlen)
 {
 	const struct mbuf *cur;
 	int remaining, shown;
 
 	if (m == NULL) {
 		printf("mbuf: %p\n", m);
 		return;
 	}
 
 	/* -1 means "no packet length known, walk the whole chain". */
 	remaining = (m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : -1;
 	for (cur = m; cur != NULL && (remaining == -1 || remaining != 0);
 	    cur = cur->m_next) {
 		shown = cur->m_len;
 		if (maxlen != -1 && shown > maxlen)
 			shown = maxlen;
 		printf("mbuf: %p len: %d, next: %p, %b%s", cur, cur->m_len,
 		    cur->m_next, cur->m_flags, "\20\20freelist\17skipfw"
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", shown ? "" : "\n");
 		if (shown)
 			printf(", %*D\n", shown, (u_char *)cur->m_data, "-");
 		if (remaining != -1)
 			remaining -= cur->m_len;
 	}
 	if (remaining > 0)
 		printf("%d bytes unaccounted for.\n", remaining);
 }
 
 u_int
 m_fixhdr(struct mbuf *m0)
 {
 	u_int len;
 
 	len = m_length(m0, NULL);
 	m0->m_pkthdr.len = len;
 	return (len);
 }
 
 /*
  * Return the sum of m_len over the chain m0.  If "last" is not NULL it
  * receives a pointer to the final mbuf of the chain (NULL for an empty
  * chain).
  */
 u_int
 m_length(struct mbuf *m0, struct mbuf **last)
 {
 	struct mbuf *tail;
 	u_int total;
 
 	tail = m0;
 	total = 0;
 	if (tail != NULL) {
 		total += tail->m_len;
 		while (tail->m_next != NULL) {
 			tail = tail->m_next;
 			total += tail->m_len;
 		}
 	}
 	if (last != NULL)
 		*last = tail;
 	return (total);
 }
 
 /*
  * Defragment a mbuf chain, returning the shortest possible
  * chain of mbufs and clusters.  If allocation fails and
  * this cannot be completed, NULL will be returned, but
  * the passed in chain will be unchanged (apart from
  * m_pkthdr.len, which m_fixhdr() recomputes up front).
  * Upon success, the original chain will be freed, and the
  * new chain will be returned.
  *
  * If a non-packet header is passed in, the original
  * mbuf (chain?) will be returned unharmed.
  */
 struct mbuf *
 m_defrag(struct mbuf *m0, int how)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0, length;
 
 	MBUF_CHECKSLEEP(how);
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 #ifdef MBUF_STRESS_TEST
 	/* Optionally inject a failure for roughly 1 in 256 calls. */
 	if (m_defragrandomfailures) {
 		int temp = arc4random() & 0xff;
 		if (temp == 0xba)
 			goto nospace;
 	}
 #endif
 
 	/* The head of the new chain carries the packet header. */
 	if (m0->m_pkthdr.len > MHLEN)
 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m_final = m_gethdr(how, MT_DATA);
 
 	if (m_final == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;
 
 	/* The first pass of the loop fills m_final itself. */
 	m_new = m_final;
 
 	while (progress < m0->m_pkthdr.len) {
 		length = m0->m_pkthdr.len - progress;
 		if (length > MCLBYTES)
 			length = MCLBYTES;
 
 		if (m_new == NULL) {
 			if (length > MLEN)
 				m_new = m_getcl(how, MT_DATA, 0);
 			else
 				m_new = m_get(how, MT_DATA);
 			if (m_new == NULL)
 				goto nospace;
 		}
 
 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
 		progress += length;
 		m_new->m_len = length;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		/* Force a fresh allocation on the next pass. */
 		m_new = NULL;
 	}
 #ifdef MBUF_STRESS_TEST
 	if (m0->m_next == NULL)
 		m_defraguseless++;
 #endif
 	m_freem(m0);
 	m0 = m_final;
 #ifdef MBUF_STRESS_TEST
 	m_defragpackets++;
 	m_defragbytes += m0->m_pkthdr.len;
 #endif
 	return (m0);
 nospace:
 #ifdef MBUF_STRESS_TEST
 	m_defragfailure++;
 #endif
 	/* Only the partially built copy is released; m0 is kept. */
 	if (m_final)
 		m_freem(m_final);
 	return (NULL);
 }
 
 /*
  * Return the number of fragments an mbuf will use.  This is usually
  * used as a proxy for the number of scatter/gather elements needed by
  * a DMA engine to access an mbuf.  A mapped mbuf is assumed to be
  * backed by one physically contiguous buffer and counts as a single
  * fragment.  An unmapped (M_EXTPG) mbuf can span disjoint physical
  * pages and counts one fragment per page, plus one each for a
  * non-empty header and a non-empty trailer.
  *
  * XXX: This overestimates the number of fragments by assuming
  * all the backing physical pages are disjoint.
  */
 static int
 frags_per_mbuf(struct mbuf *m)
 {
 
 	if ((m->m_flags & M_EXTPG) == 0)
 		return (1);
 
 	return ((m->m_epg_hdrlen != 0) + m->m_epg_npgs +
 	    (m->m_epg_trllen != 0));
 }
 
 /*
  * Defragment an mbuf chain, returning at most maxfrags separate
  * mbufs+clusters.  If this is not possible NULL is returned and
  * the original mbuf chain is left in its present (potentially
  * modified) state.  We use two techniques: collapsing consecutive
  * mbufs and replacing consecutive mbufs by a cluster.
  *
  * NB: this should really be named m_defrag but that name is taken
  */
 /*
  * m0:       head of the chain to collapse; it is never replaced, so the
  *           value returned on success is always m0 itself.
  * how:      allocation flag passed to m_getcl() for replacement clusters.
  * maxfrags: target upper bound on the fragment count, as computed by
  *           frags_per_mbuf().
  * Returns m0 once the count is <= maxfrags, or NULL when no further
  * collapsing is possible or a cluster allocation fails.
  */
 struct mbuf *
 m_collapse(struct mbuf *m0, int how, int maxfrags)
 {
 	struct mbuf *m, *n, *n2, **prev;
 	u_int curfrags;
 
 	/*
 	 * Calculate the current number of frags.
 	 */
 	curfrags = 0;
 	for (m = m0; m != NULL; m = m->m_next)
 		curfrags += frags_per_mbuf(m);
 	/*
 	 * First, try to collapse mbufs.  Note that we always collapse
 	 * towards the front so we don't need to deal with moving the
 	 * pkthdr.  This may be suboptimal if the first mbuf has much
 	 * less data than the following.
 	 */
 	m = m0;
 again:
 	for (;;) {
 		n = m->m_next;
 		if (n == NULL)
 			break;
 		/*
 		 * Merge n into m when m is writable and n's data is
 		 * strictly smaller than m's trailing space; otherwise
 		 * advance to n.
 		 */
 		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			m_copydata(n, 0, n->m_len,
 			    mtod(m, char *) + m->m_len);
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			curfrags -= frags_per_mbuf(n);
 			m_free(n);
 			if (curfrags <= maxfrags)
 				return m0;
 		} else
 			m = n;
 	}
 	/* Reaching here means the in-place merge pass did not get us there. */
 	KASSERT(maxfrags > 1,
 		("maxfrags %u, but normal collapse failed", maxfrags));
 	/*
 	 * Collapse consecutive mbufs to a cluster.
 	 */
 	prev = &m0->m_next;		/* NB: not the first mbuf */
 	while ((n = *prev) != NULL) {
 		/* Look for two neighbours whose combined data fits a cluster. */
 		if ((n2 = n->m_next) != NULL &&
 		    n->m_len + n2->m_len < MCLBYTES) {
 			m = m_getcl(how, MT_DATA, 0);
 			if (m == NULL)
 				goto bad;
 			m_copydata(n, 0,  n->m_len, mtod(m, char *));
 			m_copydata(n2, 0,  n2->m_len,
 			    mtod(m, char *) + n->m_len);
 			m->m_len = n->m_len + n2->m_len;
 			m->m_next = n2->m_next;
 			/* Splice the cluster in where n used to be linked. */
 			*prev = m;
 			curfrags += 1;  /* For the new cluster */
 			curfrags -= frags_per_mbuf(n);
 			curfrags -= frags_per_mbuf(n2);
 			m_free(n);
 			m_free(n2);
 			if (curfrags <= maxfrags)
 				return m0;
 			/*
 			 * Still not there, try the normal collapse
 			 * again before we allocate another cluster.
 			 *
 			 * NB: m now points at the cluster just inserted,
 			 * so the merge pass resumes from there rather
 			 * than from m0.
 			 */
 			goto again;
 		}
 		prev = &n->m_next;
 	}
 	/*
 	 * No place where we can collapse to a cluster; punt.
 	 * This can occur if, for example, you request 2 frags
 	 * but the packet requires that both be clusters (we
 	 * never reallocate the first mbuf to avoid moving the
 	 * packet header).
 	 */
 bad:
 	return NULL;
 }
 
 #ifdef MBUF_STRESS_TEST
 
 /*
  * Fragment an mbuf chain.  There's no reason you'd ever want to do
  * this in normal usage, but it's great for stress testing various
  * mbuf consumers.
  *
  * If fragmentation is not possible, the original chain will be
  * returned.
  *
  * Possible length values:
  * 0	 no fragmentation will occur
  * > 0	each fragment will be of the specified length
  * -1	each fragment will be the same random value in length
  * -2	each fragment's length will be entirely random
  * (Random values range from 1 to 255)
  */
 struct mbuf *
 m_fragment(struct mbuf *m0, int how, int length)
 {
 	struct mbuf *m_first, *m_last;
 	int divisor = 255, progress = 0, fraglen;
 
 	/* Chains without a packet header are returned unchanged. */
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	/* 0 requests no fragmentation; values below -2 are not valid modes. */
 	if (length == 0 || length < -2)
 		return (m0);
 	/* Each fragment is copied into one cluster, so cap sizes at MCLBYTES. */
 	if (length > MCLBYTES)
 		length = MCLBYTES;
 	if (length < 0 && divisor > MCLBYTES)
 		divisor = MCLBYTES;
 	/* Mode -1: draw one random size now and use it for every fragment. */
 	if (length == -1)
 		length = 1 + (arc4random() % divisor);
 	/* In mode -2 fraglen is instead drawn afresh on each loop iteration. */
 	if (length > 0)
 		fraglen = length;
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
 	if (m_first == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_first, m0, how) == 0)
 		goto nospace;
 
 	m_last = m_first;
 
 	while (progress < m0->m_pkthdr.len) {
 		if (length == -2)
 			fraglen = 1 + (arc4random() % divisor);
 		/* Never copy past the end of the packet. */
 		if (fraglen > m0->m_pkthdr.len - progress)
 			fraglen = m0->m_pkthdr.len - progress;
 
 		/* The first fragment lands in m_first; later ones get a new cluster. */
 		if (progress != 0) {
 			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
 			if (m_new == NULL)
 				goto nospace;
 
 			m_last->m_next = m_new;
 			m_last = m_new;
 		}
 
 		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
 		progress += fraglen;
 		m_last->m_len = fraglen;
 	}
 	/* Success: the fragmented copy replaces the original chain. */
 	m_freem(m0);
 	m0 = m_first;
 	return (m0);
 nospace:
 	if (m_first)
 		m_freem(m_first);
 	/* Return the original chain on failure */
 	return (m0);
 }
 
 #endif
 
 /*
  * Free pages from mbuf_ext_pgs, assuming they were allocated via
  * vm_page_alloc() and aren't associated with any object.  Complement
  * to allocator from m_uiotombuf_nomap().
  */
 void
 mb_free_mext_pgs(struct mbuf *m)
 {
 	int idx;
 
 	M_ASSERTEXTPG(m);
 	/* Unwire and free each of the m_epg_npgs pages listed in m_epg_pa[]. */
 	idx = 0;
 	while (idx < m->m_epg_npgs) {
 		vm_page_t page = PHYS_TO_VM_PAGE(m->m_epg_pa[idx]);
 
 		vm_page_unwire_noq(page);
 		vm_page_free(page);
 		idx++;
 	}
 }
 
 /*
  * Copy data from a uio into a chain of M_EXTPG mbufs whose pages are
  * allocated here with vm_page_alloc_noobj().
  *
  * uio:    source of the data.
  * how:    mbuf allocation flag; with M_NOWAIT a page allocation failure
  *         fails the call, otherwise the code waits and retries.
  * len:    if > 0, upper bound on the bytes copied; otherwise the whole
  *         uio_resid is copied.
  * maxseg: maximum bytes per mbuf; 0 selects MBUF_PEXT_MAX_PGS * PAGE_SIZE.
  * flags:  must not contain M_PKTHDR (asserted below).
  *
  * Returns the head of the new chain, or NULL on failure.
  */
 static struct mbuf *
 m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
 {
 	struct mbuf *m, *mb, *prev;
 	vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
 	int error, length, i, needed;
 	ssize_t total;
 	int pflags = malloc2vm_flags(how) | VM_ALLOC_NODUMP | VM_ALLOC_WIRED;
 
 	MPASS((flags & M_PKTHDR) == 0);
 	MPASS((how & M_ZERO) == 0);
 
 	/*
 	 * len can be zero or an arbitrarily large value bounded by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = MIN(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	if (maxseg == 0)
 		maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
 
 	/*
 	 * If total is zero, return an empty mbuf.  This can occur
 	 * for TLS 1.0 connections which send empty fragments as
 	 * a countermeasure against the known-IV weakness in CBC
 	 * ciphersuites.
 	 */
 	if (__predict_false(total == 0)) {
-		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
+		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
 		if (mb == NULL)
 			return (NULL);
 		mb->m_epg_flags = EPG_FLAG_ANON;
 		return (mb);
 	}
 
 	/*
 	 * Allocate the pages
 	 */
 	m = NULL;
 	while (total > 0) {
-		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
+		mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs, 0);
 		if (mb == NULL)
 			goto failed;
 		/* Link mb at the tail; prev is only read once m is non-NULL. */
 		if (m == NULL)
 			m = mb;
 		else
 			prev->m_next = mb;
 		prev = mb;
 		mb->m_epg_flags = EPG_FLAG_ANON;
 		needed = length = MIN(maxseg, total);
 		/* One page per PAGE_SIZE (or final partial) slice of 'length'. */
 		for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
 retry_page:
 			pg_array[i] = vm_page_alloc_noobj(pflags);
 			if (pg_array[i] == NULL) {
 				if (how & M_NOWAIT) {
 					goto failed;
 				} else {
 					vm_wait(NULL);
 					goto retry_page;
 				}
 			}
 			/*
 			 * Record the page in the mbuf right away so that the
 			 * m_freem() on the failure path can see it via
 			 * m_epg_npgs.
 			 */
 			mb->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
 			mb->m_epg_npgs++;
 		}
 		/* Bytes used in the last page: whatever the full pages did not hold. */
 		mb->m_epg_last_len = length - PAGE_SIZE * (mb->m_epg_npgs - 1);
 		MBUF_EXT_PGS_ASSERT_SANITY(mb);
 		total -= length;
 		error = uiomove_fromphys(pg_array, 0, length, uio);
 		if (error != 0)
 			goto failed;
 		mb->m_len = length;
 		mb->m_ext.ext_size += PAGE_SIZE * mb->m_epg_npgs;
 		/*
 		 * NB: the MPASS at the top asserts that M_PKTHDR is never
 		 * set, so this branch is not expected to execute.
 		 */
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	return (m);
 
 failed:
 	/* Frees the whole partially built chain (m may still be NULL). */
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  * A compat KPI.  Users are recommended to use direct calls to backing
  * functions.
  */
 /*
  * uio:    source of the data.
  * how:    mbuf allocation flag (e.g. M_NOWAIT), passed to the allocators.
  * len:    limit on the bytes copied, passed through to the backend.
  * lspace: leading space reserved in the first mbuf; for M_EXTPG requests
  *         it is passed to m_uiotombuf_nomap() as maxseg instead.
  * flags:  M_EXTPG selects the unmapped backend; M_PKTHDR requests a
  *         packet header on the first mbuf.
  *
  * Returns the head of the new chain, or NULL on failure.
  */
 struct mbuf *
 m_uiotombuf(struct uio *uio, int how, int len, int lspace, int flags)
 {
 
 	if (flags & M_EXTPG) {
 		/* XXX: 'lspace' magically becomes maxseg! */
 		return (m_uiotombuf_nomap(uio, how, len, lspace, flags));
 	} else if (__predict_false(uio->uio_resid == 0)) {
 		struct mbuf *m;
 
 		/*
 		 * m_uiotombuf() is known to return zero length buffer, keep
 		 * this compatibility. mc_uiotomc() won't do that.
 		 */
 		if (flags & M_PKTHDR) {
 			m = m_gethdr(how, MT_DATA);
 			/*
 			 * The allocation may fail (e.g. with M_NOWAIT), so
 			 * only touch the packet header on success.
 			 */
 			if (m != NULL)
 				m->m_pkthdr.memlen = MSIZE;
 		} else
 			m = m_get(how, MT_DATA);
 		if (m != NULL)
 			m->m_data += lspace;
 		return (m);
 	} else {
 		struct mchain mc;
 		int error;
 
 		error = mc_uiotomc(&mc, uio, len, lspace, how, flags);
 		if (__predict_true(error == 0)) {
 			/* Publish the chain totals in the packet header. */
 			if (flags & M_PKTHDR) {
 				mc_first(&mc)->m_pkthdr.len = mc.mc_len;
 				mc_first(&mc)->m_pkthdr.memlen = mc.mc_mlen;
 			}
 			return (mc_first(&mc));
 		} else
 			return (NULL);
 	}
 }
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  * In case of failure state of mchain is inconsistent.
  * @param length Limit copyout length.  If 0 entire uio_resid is copied.
  * @param lspace Provide leading space in the first mbuf in the chain.
  */
 int
 mc_uiotomc(struct mchain *mc, struct uio *uio, u_int length, u_int lspace,
     int how, int flags)
 {
 	struct mbuf *mb;
 	u_int total;
 	int error;
 
 	/* Preconditions, checked only when MPASS is compiled in. */
 	MPASS(lspace < MHLEN);
 	MPASS(UINT_MAX - lspace >= length);
 	MPASS(uio->uio_rw == UIO_WRITE);
 	MPASS(uio->uio_resid >= 0);
 
 	/*
 	 * Work out how many bytes to copy.  When 'length' truncates the
 	 * uio, M_EOR is removed from the flags handed to mc_get().
 	 */
 	if (length > 0) {
 		if (uio->uio_resid > length) {
 			total = length;
 			flags &= ~M_EOR;
 		} else
 			total = uio->uio_resid;
 	} else if (__predict_false(uio->uio_resid + lspace > UINT_MAX))
 		return (EOVERFLOW);
 	else
 		total = uio->uio_resid;
 
 	/* Nothing to copy and no leading space: hand back an empty mchain. */
 	if (__predict_false(total + lspace == 0)) {
 		*mc = MCHAIN_INITIALIZER(mc);
 		return (0);
 	}
 
 	error = mc_get(mc, total + lspace, how, MT_DATA, flags);
 	if (__predict_false(error))
 		return (error);
 	/* Reserve the leading space in the first mbuf only. */
 	mc_first(mc)->m_data += lspace;
 
 	/* Fill all mbufs with uio data and update header information. */
 	STAILQ_FOREACH(mb, &mc->mc_q, m_stailq) {
 		u_int mlen;
 
 		/* Copy as much as fits here, bounded by what is still owed. */
 		mlen = min(M_TRAILINGSPACE(mb), total - mc->mc_len);
 		error = uiomove(mtod(mb, void *), mlen, uio);
 		if (__predict_false(error)) {
 			mc_freem(mc);
 			return (error);
 		}
 		mb->m_len = mlen;
 		mc->mc_len += mlen;
 	}
 	MPASS(mc->mc_len == total);
 
 	return (0);
 }
 
 /*
  * Copy data to/from an unmapped mbuf into a uio limited by len if set.
  */
 /*
  * m:     M_EXTPG mbuf (asserted).
  * m_off: byte offset into the mbuf's data at which to start.
  * uio:   the other side of the transfer.
  * len:   number of bytes to move.
  *
  * The mbuf is walked as header bytes, then the page array, then trailer
  * bytes.  Returns 0 or the first error from uiomove()/uiomove_fromphys().
  */
 int
 m_unmapped_uiomove(const struct mbuf *m, int m_off, struct uio *uio, int len)
 {
 	vm_page_t pg;
 	int error, i, off, pglen, pgoff, seglen, segoff;
 
 	M_ASSERTEXTPG(m);
 	error = 0;
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	off += m_off;
 	/* Header region: either skip it entirely or move its tail. */
 	if (m->m_epg_hdrlen != 0) {
 		if (off >= m->m_epg_hdrlen) {
 			off -= m->m_epg_hdrlen;
 		} else {
 			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			error = uiomove(__DECONST(void *,
 			    &m->m_epg_hdr[segoff]), seglen, uio);
 		}
 	}
 	/* Only the first page can start at a non-zero offset. */
 	pgoff = m->m_epg_1st_off;
 	for (i = 0; i < m->m_epg_npgs && error == 0 && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		/* The start offset lies beyond this page: consume and move on. */
 		if (off >= pglen) {
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		error = uiomove_fromphys(&pg, segoff, seglen, uio);
 		pgoff = 0;
 	};
 	/* Whatever remains after the pages must come from the trailer. */
 	if (len != 0 && error == 0) {
 		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
 		    m->m_epg_trllen, m_off));
 		error = uiomove(__DECONST(void *, &m->m_epg_trail[off]),
 		    len, uio);
 	}
 	return (error);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  */
 int
 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;
 
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		length = min(m->m_len, total - progress);
 
 		if ((m->m_flags & M_EXTPG) != 0)
 			error = m_unmapped_uiomove(m, 0, uio, length);
 		else
 			error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);
 
 		progress += length;
 	}
 
 	return (0);
 }
 
 /*
  * Create a writable copy of the mbuf chain.  While doing this
  * we compact the chain with a goal of producing a chain with
  * at most two mbufs.  The second mbuf in this chain is likely
  * to be a cluster.  The primary purpose of this work is to create
  * a writable packet for encryption, compression, etc.  The
  * secondary goal is to linearize the data so the data can be
  * passed to crypto hardware in the most efficient manner possible.
  */
 /*
  * m0:  head of the chain to process.
  * how: allocation flag passed to m_getcl() for replacement clusters.
  *
  * Returns the (possibly new) head of the chain.  On allocation failure
  * the whole chain is freed and NULL is returned.
  */
 struct mbuf *
 m_unshare(struct mbuf *m0, int how)
 {
 	struct mbuf *m, *mprev;
 	struct mbuf *n, *mfirst, *mlast;
 	int len, off;
 
 	mprev = NULL;
 	/*
 	 * NB: the loop step reads mprev->m_next, so every path through the
 	 * body (including each 'continue') must leave mprev non-NULL.
 	 */
 	for (m = m0; m != NULL; m = mprev->m_next) {
 		/*
 		 * Regular mbufs are ignored unless there's a cluster
 		 * in front of it that we can use to coalesce.  We do
 		 * the latter mainly so later clusters can be coalesced
 		 * also w/o having to handle them specially (i.e. convert
 		 * mbuf+cluster -> cluster).  This optimization is heavily
 		 * influenced by the assumption that we're running over
 		 * Ethernet where MCLBYTES is large enough that the max
 		 * packet size will permit lots of coalescing into a
 		 * single cluster.  This in turn permits efficient
 		 * crypto operations, especially when using hardware.
 		 */
 		if ((m->m_flags & M_EXT) == 0) {
 			if (mprev && (mprev->m_flags & M_EXT) &&
 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
 				/* XXX: this ignores mbuf types */
 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 				    mtod(m, caddr_t), m->m_len);
 				mprev->m_len += m->m_len;
 				mprev->m_next = m->m_next;	/* unlink from chain */
 				m_free(m);			/* reclaim mbuf */
 			} else {
 				mprev = m;
 			}
 			continue;
 		}
 		/*
 		 * Writable mbufs are left alone (for now).
 		 */
 		if (M_WRITABLE(m)) {
 			mprev = m;
 			continue;
 		}
 
 		/*
 		 * Not writable, replace with a copy or coalesce with
 		 * the previous mbuf if possible (since we have to copy
 		 * it anyway, we try to reduce the number of mbufs and
 		 * clusters so that future work is easier).
 		 */
 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
 		/* NB: we only coalesce into a cluster or larger */
 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
 			/* XXX: this ignores mbuf types */
 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 			    mtod(m, caddr_t), m->m_len);
 			mprev->m_len += m->m_len;
 			mprev->m_next = m->m_next;	/* unlink from chain */
 			m_free(m);			/* reclaim mbuf */
 			continue;
 		}
 
 		/*
 		 * Allocate new space to hold the copy and copy the data.
 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
 		 * splitting them into clusters.  We could just malloc a
 		 * buffer and make it external but too many device drivers
 		 * don't know how to break up the non-contiguous memory when
 		 * doing DMA.
 		 */
 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
 		}
 		/* A packet header can only live on the first mbuf; carry it over. */
 		if (m->m_flags & M_PKTHDR) {
 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
 			    __func__, m0, m));
 			m_move_pkthdr(n, m);
 		}
 		len = m->m_len;
 		off = 0;
 		mfirst = n;
 		mlast = NULL;
 		/*
 		 * Copy m's data into one or more clusters, MCLBYTES at a
 		 * time.  mfirst..mlast is a private chain that is not linked
 		 * into m0 until the copy has fully succeeded.
 		 */
 		for (;;) {
 			int cc = min(len, MCLBYTES);
 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
 			n->m_len = cc;
 			if (mlast != NULL)
 				mlast->m_next = n;
 			mlast = n;
 #if 0
 			newipsecstat.ips_clcopied++;
 #endif
 
 			len -= cc;
 			if (len <= 0)
 				break;
 			off += cc;
 
 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 			if (n == NULL) {
 				/* Free both the partial copy and the original chain. */
 				m_freem(mfirst);
 				m_freem(m0);
 				return (NULL);
 			}
 		}
 		/* Splice the copy in place of m. */
 		n->m_next = m->m_next;
 		if (mprev == NULL)
 			m0 = mfirst;		/* new head of chain */
 		else
 			mprev->m_next = mfirst;	/* replace old mbuf */
 		m_free(m);			/* release old mbuf */
 		mprev = mfirst;
 	}
 	return (m0);
 }
 
 #ifdef MBUF_PROFILING
 
 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
 /*
  * Histogram counters updated by m_profile().  Each array has MP_BUCKETS
  * slots; 'mbprof' is the single instance.
  */
 struct mbufprofile {
 	uintmax_t wasted[MP_BUCKETS];	/* indexed by fls(wasted bytes) */
 	uintmax_t used[MP_BUCKETS];	/* indexed by fls(used bytes) */
 	uintmax_t segments[MP_BUCKETS];	/* indexed by mbuf count in chain */
 } mbprof;
 
 /*
  * Account one mbuf chain in the profiling histograms: the number of
  * mbufs in the chain, the bytes of data it carries, and the bytes of
  * buffer space left unused.
  */
 void
 m_profile(struct mbuf *m)
 {
 	int segments = 0;
 	int used = 0;
 	int wasted = 0;
 
 	while (m) {
 		segments++;
 		used += m->m_len;
 		if (m->m_flags & M_EXT) {
 			wasted += MHLEN - sizeof(m->m_ext) +
 			    m->m_ext.ext_size - m->m_len;
 		} else {
 			if (m->m_flags & M_PKTHDR)
 				wasted += MHLEN - m->m_len;
 			else
 				wasted += MLEN - m->m_len;
 		}
 		m = m->m_next;
 	}
 	/* be paranoid.. it helps */
 	if (segments > MP_BUCKETS - 1)
 		segments = MP_BUCKETS - 1;
 	/*
 	 * Clamp both ends.  A negative value would make fls() return 32,
 	 * which is one past the last valid bucket index.
 	 */
 	if (used < 0)
 		used = 0;
 	else if (used > 100000)
 		used = 100000;
 	if (wasted < 0)
 		wasted = 0;
 	else if (wasted > 100000)
 		wasted = 100000;
 	/* store in the appropriate bucket */
 	/* don't bother locking. if it's slightly off, so what? */
 	mbprof.segments[segments]++;
 	mbprof.used[fls(used)]++;
 	mbprof.wasted[fls(wasted)]++;
 }
 
 static int
 mbprof_handler(SYSCTL_HANDLER_ARGS)
 {
 	char buf[256];
 	struct sbuf sb;
 	int error;
 	uint64_t *p;
 
 	sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
 
 	p = &mbprof.wasted[0];
 	sbuf_printf(&sb,
 	    "wasted:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.wasted[16];
 	sbuf_printf(&sb,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.used[0];
 	sbuf_printf(&sb,
 	    "used:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.used[16];
 	sbuf_printf(&sb,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.segments[0];
 	sbuf_printf(&sb,
 	    "segments:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.segments[16];
 	sbuf_printf(&sb,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error);
 }
 
 static int
 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
 {
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (clear) {
 		bzero(&mbprof, sizeof(mbprof));
 	}
 
 	return (error);
 }
 
 /* Read-only string node under _kern_ipc, served by mbprof_handler(). */
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     mbprof_handler, "A",
     "mbuf profiling statistics");
 
 /* Read-write integer node under _kern_ipc, served by mbprof_clr_handler(). */
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     mbprof_clr_handler, "I",
     "clear mbuf profiling statistics");
 #endif
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index ab494a76833e..434f29feddcf 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -1,1852 +1,1852 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <vm/uma.h>
 
 #include <sys/sdt.h>
 
 /*
  * Wrappers around SDT_PROBEn() that fill in the "sdt" provider and leave
  * the module and function fields empty; 'probe' is the probe name.
  */
 #define	MBUF_PROBE1(probe, arg0)					\
 	SDT_PROBE1(sdt, , , probe, arg0)
 #define	MBUF_PROBE2(probe, arg0, arg1)					\
 	SDT_PROBE2(sdt, , , probe, arg0, arg1)
 #define	MBUF_PROBE3(probe, arg0, arg1, arg2)				\
 	SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2)
 #define	MBUF_PROBE4(probe, arg0, arg1, arg2, arg3)			\
 	SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3)
 #define	MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4)		\
 	SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4)
 
 /* Declarations of the probes usable with the MBUF_PROBEn() wrappers. */
 SDT_PROBE_DECLARE(sdt, , , m__init);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr_raw);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr);
 SDT_PROBE_DECLARE(sdt, , , m__get_raw);
 SDT_PROBE_DECLARE(sdt, , , m__get);
 SDT_PROBE_DECLARE(sdt, , , m__getcl);
 SDT_PROBE_DECLARE(sdt, , , m__getjcl);
 SDT_PROBE_DECLARE(sdt, , , m__clget);
 SDT_PROBE_DECLARE(sdt, , , m__cljget);
 SDT_PROBE_DECLARE(sdt, , , m__cljset);
 SDT_PROBE_DECLARE(sdt, , , m__free);
 SDT_PROBE_DECLARE(sdt, , , m__freem);
 SDT_PROBE_DECLARE(sdt, , , m__freemp);
 
 #endif /* _KERNEL */
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculations do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MLEN is data length in a normal mbuf.
  * MHLEN is data length in an mbuf with pktheader.
  * MINCLSIZE is a smallest amount of data that should be put into cluster.
  *
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are sensible.
  */
 struct mbuf;
 #define	MHSIZE		offsetof(struct mbuf, m_dat)
 #define	MPKTHSIZE	offsetof(struct mbuf, m_pktdat)
 #define	MLEN		((int)(MSIZE - MHSIZE))
 #define	MHLEN		((int)(MSIZE - MPKTHSIZE))
 #define	MINCLSIZE	(MHLEN + 1)
 #define	M_NODOM		255
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  */
 /* mtod() casts m_data to type t; mtodo() yields a void * at m_data + o. */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 /* Carries the requested mbuf flags and type for one allocation. */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Packet tag structure (see below for details).
  */
 /* Fixed header of a packet tag, linked on the pkthdr 'tags' list. */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	/* Per-tag release routine; it receives the tag itself. */
 	void			(*m_tag_free)(struct m_tag *);
 };
 
 /*
  * Static network interface owned tag.
  * Allocated through ifp->if_snd_tag_alloc().
  */
 struct if_snd_tag_sw;
 
 /* Send tag; mbufs reference one through pkthdr.snd_tag. */
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
 	/* NOTE(review): presumably the tag's method table - confirm against if_var.h. */
 	const struct if_snd_tag_sw *sw;
 	volatile u_int refcount;	/* reference count on this tag */
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 56
  *	 LP64: 64
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 struct pkthdr {
 	/* First word: send tag, receive interface, or its index/generation. */
 	union {
 		struct m_snd_tag *snd_tag;	/* send tag, if any */
 		struct ifnet	*rcvif;		/* rcv interface */
 		struct {
 			uint16_t rcvidx;	/* rcv interface index ... */
 			uint16_t rcvgen;	/* ... and generation count */
 		};
 	};
 	/* Same pointer-or-index/generation encoding for the leaf interface. */
 	union {
 		struct ifnet	*leaf_rcvif;	/* leaf rcv interface */
 		struct {
 			uint16_t leaf_rcvidx;	/* leaf rcv interface index ... */
 			uint16_t leaf_rcvgen;	/* ... and generation count */
 		};
 	};
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 numa_domain;	/* NUMA domain of recvd pkt */
 	uint8_t		 rsstype;	/* hash type */
 #if !defined(__LP64__)
 	uint32_t	 pad;		/* pad for 64bit alignment */
 #endif
 	/* The receive timestamp shares storage with the header lengths. */
 	union {
 		uint64_t	rcv_tstmp;	/* timestamp in ns */
 		struct {
 			uint8_t		 l2hlen;	/* layer 2 hdr len */
 			uint8_t		 l3hlen;	/* layer 3 hdr len */
 			uint8_t		 l4hlen;	/* layer 4 hdr len */
 			uint8_t		 l5hlen;	/* layer 5 hdr len */
 			/*
 			 * NOTE(review): presumably the same lengths for an
 			 * encapsulated (inner) packet - confirm with users.
 			 */
 			uint8_t		 inner_l2hlen;
 			uint8_t		 inner_l3hlen;
 			uint8_t		 inner_l4hlen;
 			uint8_t		 inner_l5hlen;
 		};
 	};
 	/* 8 bytes of persistent storage, viewable at several widths. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		union {
 			uint8_t  eight[8];
 			uint16_t sixteen[4];
 			uint32_t thirtytwo[2];
 			uint64_t sixtyfour[1];
 			uintptr_t unintptr[1];
 			void 	*ptr;
 		} PH_loc;
 		/* Upon allocation: total packet memory consumption. */
 		u_int	memlen;
 	};
 };
 /*
  * Shorthand names for slots of the pkthdr PH_per and PH_loc unions.
  * Several names map to the same slot, so they share storage.
  */
 #define	ether_vtag	PH_per.sixteen[0]
 #define tcp_tun_port	PH_per.sixteen[0] /* outbound */
 #define	vt_nrecs	PH_per.sixteen[0]	  /* mld and v6-ND */
 #define	tso_segsz	PH_per.sixteen[1] /* inbound after LRO */
 #define	lro_nsegs	tso_segsz	  /* inbound after LRO */
 #define	csum_data	PH_per.thirtytwo[1] /* inbound from hardware up */
 #define	lro_tcp_d_len	PH_loc.sixteen[0] /* inbound during LRO (no reassembly) */
 #define	lro_tcp_d_csum	PH_loc.sixteen[1] /* inbound during LRO (no reassembly) */
 #define	lro_tcp_h_off	PH_loc.sixteen[2] /* inbound during LRO (no reassembly) */
 #define	lro_etype	PH_loc.sixteen[3] /* inbound during LRO (no reassembly) */
 /* Note PH_loc is used during IP reassembly (all 8 bytes as a ptr) */
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following header lengths:
  * - 5 (AES-CBC with implicit IV)
  * - 21 (AES-CBC with explicit IV)
  * - 13 (AES-GCM with 8 byte explicit IV)
  */
 /* Size of the extpg_hdr[] array in struct m_ext. */
 #define	MBUF_PEXT_HDR_LEN	23
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following maximum trailer
  * lengths:
  * - 16 (AES-GCM)
  * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
  * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
  * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
  */
 /* Size of the extpg_trail[] array in struct m_ext. */
 #define	MBUF_PEXT_TRAIL_LEN	64
 
 /*
  * Number of page addresses in extpg_pa[]: a byte budget (40 on LP64,
  * 64 otherwise) divided by the size of one vm_paddr_t.
  */
 #if defined(__LP64__)
 #define MBUF_PEXT_MAX_PGS (40 / sizeof(vm_paddr_t))
 #else
 #define MBUF_PEXT_MAX_PGS (64 / sizeof(vm_paddr_t))
 #endif
 
 /* Most bytes one M_EXTPG mbuf can describe: all pages plus header and trailer. */
 #define	MBUF_PEXT_MAX_BYTES						\
     (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
 
 struct ktls_session;
 struct socket;
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 /* Signature of the routine stored in m_ext.ext_free; it receives the mbuf. */
 typedef	void m_ext_free_t(struct mbuf *);
 struct m_ext {
 	union {
 		/*
 		 * If EXT_FLAG_EMBREF is set, then we use refcount in the
 		 * mbuf, the 'ext_count' member.  Otherwise, we have a
 		 * shadow copy and we use pointer 'ext_cnt'.  The original
 		 * mbuf is responsible to carry the pointer to free routine
 		 * and its arguments.  They aren't copied into shadows in
 		 * mb_dupcl() to avoid dereferencing next cachelines.
 		 */
 		volatile u_int	 ext_count;
 		volatile u_int	*ext_cnt;
 	};
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	/* The two layouts below share storage: regular M_EXT vs. M_EXTPG. */
 	union {
 		struct {
 			/*
 			 * Regular M_EXT mbuf:
 			 * o ext_buf always points to the external buffer.
 			 * o ext_free (below) and two optional arguments
 			 *   ext_arg1 and ext_arg2 store the free context for
 			 *   the external storage.  They are set only in the
 			 *   refcount carrying mbuf, the one with
 			 *   EXT_FLAG_EMBREF flag, with exclusion for
 			 *   EXT_EXTREF type, where the free context is copied
 			 *   into all mbufs that use same external storage.
 			 */
 			char 	*ext_buf;	/* start of buffer */
 /* Byte count of the m_ext fields that precede ext_arg2. */
 #define	m_ext_copylen	offsetof(struct m_ext, ext_arg2)
 			void	*ext_arg2;
 		};
 		struct {
 			/*
 			 * Multi-page M_EXTPG mbuf:
 			 * o extpg_pa - page vector.
 			 * o extpg_trail and extpg_hdr - TLS trailer and
 			 *   header.
 			 * Uses ext_free and may also use ext_arg1.
 			 */
 			vm_paddr_t	extpg_pa[MBUF_PEXT_MAX_PGS];
 			char		extpg_trail[MBUF_PEXT_TRAIL_LEN];
 			char		extpg_hdr[MBUF_PEXT_HDR_LEN];
 			/* Pretend these 3 fields are part of mbuf itself. */
 #define	m_epg_pa	m_ext.extpg_pa
 #define	m_epg_trail	m_ext.extpg_trail
 #define	m_epg_hdr	m_ext.extpg_hdr
 /* Byte count of the m_ext fields that precede ext_free. */
 #define	m_epg_ext_copylen	offsetof(struct m_ext, ext_free)
 		};
 	};
 	/*
 	 * Free method and optional argument pointer, both
 	 * used by M_EXT and M_EXTPG.
 	 */
 	m_ext_free_t	*ext_free;
 	void		*ext_arg1;
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  */
 struct mbuf {
 	/*
 	 * Header present at the beginning of every mbuf.
 	 * Size ILP32: 24
 	 *      LP64: 32
 	 * Compile-time assertions in uipc_mbuf.c test these values to ensure
 	 * that they are correct.
 	 */
 	union {	/* next buffer in chain */
 		struct mbuf		*m_next;
 		SLIST_ENTRY(mbuf)	m_slist;
 		STAILQ_ENTRY(mbuf)	m_stailq;
 	};
 	union {	/* next chain in queue/record */
 		struct mbuf		*m_nextpkt;
 		SLIST_ENTRY(mbuf)	m_slistpkt;
 		STAILQ_ENTRY(mbuf)	m_stailqpkt;
 	};
 	caddr_t		 m_data;	/* location of data */
 	int32_t		 m_len;		/* amount of data in this mbuf */
 	uint32_t	 m_type:8,	/* type of data in this mbuf */
 			 m_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 m_pad;		/* pad for 64bit alignment */
 #endif
 
 	/*
 	 * A set of optional headers (packet header, external storage header)
 	 * and internal data storage.  Historically, these arrays were sized
 	 * to MHLEN (space left after a packet header) and MLEN (space left
 	 * after only a regular mbuf header); they are now variable size in
 	 * order to support future work on variable-size mbufs.
 	 */
 	union {
 		struct {
 			/* m_pkthdr and the M_EXTPG metadata share storage. */
 			union {
 				/* M_PKTHDR set. */
 				struct pkthdr	m_pkthdr;
 
 				/* M_EXTPG set.
 				 * Multi-page M_EXTPG mbuf has its meta data
 				 * split between the below anonymous structure
 				 * and m_ext.  It carries vector of pages,
 				 * optional header and trailer char vectors
 				 * and pointers to socket/TLS data.
 				 */
 /* Aliases for the first and last members of the structure below. */
 #define	m_epg_startcopy		m_epg_npgs
 #define	m_epg_endcopy		m_epg_stailq
 				struct {
 					/* Overall count of pages and count of
 					 * pages with I/O pending. */
 					uint8_t	m_epg_npgs;
 					uint8_t	m_epg_nrdy;
 					/* TLS header and trailer lengths.
 					 * The data itself resides in m_ext. */
 					uint8_t	m_epg_hdrlen;
 					uint8_t	m_epg_trllen;
 					/* Offset into 1st page and length of
 					 * data in the last page. */
 					uint16_t m_epg_1st_off;
 					uint16_t m_epg_last_len;
 					uint8_t	m_epg_flags;
 #define	EPG_FLAG_ANON	0x1	/* Data can be encrypted in place. */
 #define	EPG_FLAG_2FREE	0x2	/* Scheduled for free. */
 					/*
 					 * NOTE(review): the fields below look
 					 * like TLS record and session state -
 					 * confirm against the KTLS code.
 					 */
 					uint8_t	m_epg_record_type;
 					uint8_t	__spare[2];
 					int	m_epg_enc_cnt;
 					struct ktls_session *m_epg_tls;
 					struct socket	*m_epg_so;
 					uint64_t	m_epg_seqno;
 					STAILQ_ENTRY(mbuf) m_epg_stailq;
 				};
 			};
 			union {
 				/* M_EXT or M_EXTPG set. */
 				struct m_ext	m_ext;
 				/* M_PKTHDR set, neither M_EXT nor M_EXTPG. */
 				char		m_pktdat[0];
 			};
 		};
 		char	m_dat[0];			/* !M_PKTHDR, !M_EXT */
 	};
 };
 
 #ifdef _KERNEL
 static inline int
 m_epg_pagelen(const struct mbuf *m, int pidx, int pgoff)
 {
 
 	/* A non-zero offset is only accepted together with page index 0. */
 	KASSERT(pidx == 0 || pgoff == 0,
 	    ("page %d with non-zero offset %d in %p", pidx, pgoff, m));
 
 	/* Every page except the last one runs to the end of the page. */
 	if (pidx != m->m_epg_npgs - 1)
 		return (PAGE_SIZE - pgoff);
 
 	/* The last page carries m_epg_last_len bytes. */
 	return (m->m_epg_last_len);
 }
 
 #ifdef INVARIANTS
 /*
  * Assert "ex"; on failure the panic message carries "msg", the mbuf pointer
  * and the source location.  NB: the expansion names the variable "m"
  * literally, so it is only usable where the mbuf under test is called "m".
  */
 #define	MCHECK(ex, msg)	KASSERT((ex),				\
 	    ("Multi page mbuf %p with " #msg " at %s:%d",	\
 	    m, __FILE__, __LINE__))
 /*
  * Check the page-vector bookkeeping of an M_EXTPG mbuf: page counts,
  * first-page offset, last-page length and TLS header/trailer lengths.
  *
  * NB: This expects a non-empty buffer (m_epg_npgs > 0 and
  * m_epg_last_len > 0).
  */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {				\
 	MCHECK(m->m_epg_npgs > 0, "no valid pages");		\
 	MCHECK(m->m_epg_npgs <= nitems(m->m_epg_pa),		\
 	    "too many pages");						\
 	MCHECK(m->m_epg_nrdy <= m->m_epg_npgs,			\
 	    "too many ready pages");					\
 	MCHECK(m->m_epg_1st_off < PAGE_SIZE,			\
 		"too large page offset");				\
 	MCHECK(m->m_epg_last_len > 0, "zero last page length");	\
 	MCHECK(m->m_epg_last_len <= PAGE_SIZE,			\
 	    "too large last page length");				\
 	if (m->m_epg_npgs == 1)					\
 		MCHECK(m->m_epg_1st_off +			\
 		    m->m_epg_last_len <= PAGE_SIZE,		\
 		    "single page too large");				\
 	MCHECK(m->m_epg_hdrlen <= sizeof(m->m_epg_hdr),		\
 	    "too large header length");					\
 	MCHECK(m->m_epg_trllen <= sizeof(m->m_epg_trail),	\
 	    "too large trailer length");				\
 } while (0)
 #else
 /* Without INVARIANTS the check compiles away to an empty statement. */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {} while (0)
 #endif
 #endif
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-11] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_EXTPG		0x00000100 /* has array of unmapped pages and TLS */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
 #define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
 				      hw-stamped on port (useful for IEEE 1588
 				      and 802.1AS) */
 #define M_TSTMP_LRO	0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
 
 #define	M_PROTO1	0x00002000 /* protocol-specific */
 #define	M_PROTO2	0x00004000 /* protocol-specific */
 #define	M_PROTO3	0x00008000 /* protocol-specific */
 #define	M_PROTO4	0x00010000 /* protocol-specific */
 #define	M_PROTO5	0x00020000 /* protocol-specific */
 #define	M_PROTO6	0x00040000 /* protocol-specific */
 #define	M_PROTO7	0x00080000 /* protocol-specific */
 #define	M_PROTO8	0x00100000 /* protocol-specific */
 #define	M_PROTO9	0x00200000 /* protocol-specific */
 #define	M_PROTO10	0x00400000 /* protocol-specific */
 #define	M_PROTO11	0x00800000 /* protocol-specific */
 
 /*
  * Flags to purge when crossing layers.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11)
 
 /*
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
      M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS)
 
 /*
  * Flags preserved during demote.
  */
 #define	M_DEMOTEFLAGS \
     (M_EXT | M_RDONLY | M_NOFREE | M_EXTPG)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  *
  * The first character (\20 == 16) is the output base; each following entry
  * is a 1-based bit number written as an octal escape, then the flag name.
  * Bit numbers must be valid octal: bit 24 (M_PROTO11, 0x00800000) is \30.
  * Writing it as "\28" would be parsed as \2 followed by the text "8...".
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\11M_EXTPG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO"
 #define	M_FLAG_PROTOBITS \
     "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \
     "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \
     "\27M_PROTO10\30M_PROTO11"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) to classify packets into flows.  These flows
  * can then be used to maintain ordering while delivering packets to the OS
  * via parallel input queues, as well as to provide a stateless affinity
  * model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set
  * m_flag fields to indicate how the hash should be interpreted by the
  * network stack.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash m_flag bits to indicate what header fields were covered by
  * the hash.  M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non-
  * RSS cards or configurations that provide an opaque flow identifier, allowing
  * for ordering and distribution without explicit affinity.  Additionally,
  * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash
  * properties.
  *
  * The meaning of the IPV6_EX suffix:
  * "o  Home address from the home address option in the IPv6 destination
  *     options header.  If the extension header is not present, use the Source
  *     IPv6 Address.
  *  o  IPv6 address that is contained in the Routing-Header-Type-2 from the
  *     associated extension header.  If the extension header is not present,
  *     use the Destination IPv6 Address."
  * Quoted from:
  * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex
  */
 #define	M_HASHTYPE_HASHPROP		0x80	/* has hash properties */
 #define	M_HASHTYPE_INNER		0x40	/* calculated from inner headers */
 #define	M_HASHTYPE_HASH(t)		(M_HASHTYPE_HASHPROP | (t))
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		M_HASHTYPE_HASH(1) /* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		M_HASHTYPE_HASH(3) /* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		M_HASHTYPE_HASH(5) /* IPv6 2-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	M_HASHTYPE_HASH(6) /* TCPv6 4-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6		M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		0x3f	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
 
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype & ~M_HASHTYPE_INNER)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 #define	M_HASHTYPE_ISHASH(m)	\
     (((m)->m_pkthdr.rsstype & M_HASHTYPE_HASHPROP) != 0)
 #define	M_HASHTYPE_SETINNER(m)	do {			\
 	(m)->m_pkthdr.rsstype |= M_HASHTYPE_INNER;	\
     } while (0)
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
 #define	EXT_JUMBOP	3	/* jumbo cluster page sized */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16384 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 #define	EXT_RXRING	8	/* data in NIC receive ring */
 #define	EXT_CTL		9	/* buffer from a ctl(4) backend */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
 #define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
 #define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
 #define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.  Note that the outbound and the inbound flags do
  * not collide right now but they could be allowed to (as long as the flags are
  * scrubbed appropriately when the direction of an mbuf changes).  CSUM_BITS
  * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX.
  *
  * CSUM_INNER_<x> is the same as CSUM_<x> but it applies to the inner frame.
  * The CSUM_ENCAP_<x> bits identify the outer encapsulation.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_INNER_IP6_UDP	0x00000040
 #define	CSUM_INNER_IP6_TCP	0x00000080
 #define	CSUM_INNER_IP6_TSO	0x00000100
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 #define	CSUM_INNER_IP		0x00004000
 #define	CSUM_INNER_IP_UDP	0x00008000
 #define	CSUM_INNER_IP_TCP	0x00010000
 #define	CSUM_INNER_IP_TSO	0x00020000
 
 #define	CSUM_ENCAP_VXLAN	0x00040000	/* VXLAN outer encapsulation */
 #define	CSUM_ENCAP_RSVD1	0x00080000
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_INNER_L3_CALC	0x00100000
 #define	CSUM_INNER_L3_VALID	0x00200000
 #define	CSUM_INNER_L4_CALC	0x00400000
 #define	CSUM_INNER_L4_VALID	0x00800000
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
 #define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
 
 #define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \
     CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \
     CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \
     CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \
     CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \
     CSUM_ENCAP_RSVD1 | CSUM_SND_TAG)
 
 #define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \
     CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \
     CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \
     CSUM_COALESCED)
 
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \
     "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \
     "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \
     "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \
     "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \
     "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \
     "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \
     "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_INNER_TSO		(CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 #define	CSUM_TLS_MASK		(CSUM_L5_CALC|CSUM_L5_VALID)
 #define	CSUM_TLS_DECRYPTED	CSUM_L5_CALC
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_EXTCONTROL	15	/* control message with externalized contents */
 #define	MT_OOBDATA	16	/* expedited data  */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 #define	MBUF_EXTPGS_MEM_NAME	"mbuf_extpgs"
 
 #ifdef _KERNEL
 union if_snd_tag_alloc_params;
 
 /*
  * If "how" is M_WAITOK, invoke WITNESS_WARN with WARN_GIANTOK | WARN_SLEEPOK
  * and the name of the calling function.  The argument is parenthesized so
  * that a compound expression compares as a whole against M_WAITOK.
  */
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if ((how) == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 extern uma_zone_t	zone_extpgs;
 
 void		 mb_dupcl(struct mbuf *, struct mbuf *);
 void		 mb_free_ext(struct mbuf *);
 void		 mb_free_extpg(struct mbuf *);
 void		 mb_free_mext_pgs(struct mbuf *);
-struct mbuf	*mb_alloc_ext_pgs(int, m_ext_free_t);
+struct mbuf	*mb_alloc_ext_pgs(int, m_ext_free_t, int);
 struct mbuf	*mb_alloc_ext_plus_pages(int, int);
 struct mbuf	*mb_mapped_to_unmapped(struct mbuf *, int, int, int,
 		    struct mbuf **);
 int		 mb_unmapped_compress(struct mbuf *m);
 struct mbuf 	*mb_unmapped_to_ext(struct mbuf *m);
 void		 mb_free_notready(struct mbuf *m, int count);
 void		 m_adj(struct mbuf *, int);
 void		 m_adj_decap(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_clget(struct mbuf *m, int how);
 void 		*m_cljget(struct mbuf *m, int how, int size);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote_pkthdr(struct mbuf *);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 void		 m_dispose_extcontrolm(struct mbuf *m);
 struct mbuf	*m_dup(const struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, const struct mbuf *, int);
 void		 m_extadd(struct mbuf *, char *, u_int, m_ext_free_t,
 		    void *, void *, int, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 void		 m_freemp(struct mbuf *);
 void		 m_free_raw(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_get3(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, const struct mbuf *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 int		 m_unmapped_uiomove(const struct mbuf *, int, struct uio *,
 		    int);
 struct mbuf	*m_unshare(struct mbuf *, int);
 int		 m_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *, struct m_snd_tag **);
 void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *,
 		    const struct if_snd_tag_sw *);
 void		 m_snd_tag_destroy(struct m_snd_tag *);
 void		 m_rcvif_serialize(struct mbuf *);
 struct ifnet	*m_rcvif_restore(struct mbuf *);
 
 /*
  * Translate a buffer size into the matching EXT_* storage type.
  * Any size other than the ones listed below panics.
  */
 static __inline int
 m_gettype(int size)
 {
 
 	switch (size) {
 	case MSIZE:
 		return (EXT_MBUF);
 	case MCLBYTES:
 		return (EXT_CLUSTER);
 	case MJUMPAGESIZE:
 		return (EXT_JUMBOP);
 	case MJUM9BYTES:
 		return (EXT_JUMBO9);
 	case MJUM16BYTES:
 		return (EXT_JUMBO16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Attach an externally reference-counted buffer to an mbuf.
  *
  * The caller-supplied counter is incremented by one, the mbuf is marked
  * M_EXT with type EXT_EXTREF, and m_data is pointed at the start of buf.
  */
 static __inline void
 m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt,
     m_ext_free_t freef, void *arg1, void *arg2)
 {
 	struct m_ext *ext;
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	/* Take our reference before publishing the buffer in the mbuf. */
 	atomic_add_int(ref_cnt, 1);
 
 	ext = &m->m_ext;
 	ext->ext_type = EXT_EXTREF;
 	ext->ext_flags = 0;
 	ext->ext_cnt = ref_cnt;
 	ext->ext_buf = buf;
 	ext->ext_size = size;
 	ext->ext_free = freef;
 	ext->ext_arg1 = arg1;
 	ext->ext_arg2 = arg2;
 
 	m->m_data = ext->ext_buf;
 	m->m_flags |= M_EXT;
 }
 
 /*
  * Select the cluster zone that corresponds to a cluster size.
  * Any size other than the ones listed below panics.
  */
 static __inline uma_zone_t
 m_getzone(int size)
 {
 
 	switch (size) {
 	case MCLBYTES:
 		return (zone_clust);
 	case MJUMPAGESIZE:
 		return (zone_jumbop);
 	case MJUM9BYTES:
 		return (zone_jumbo9);
 	case MJUM16BYTES:
 		return (zone_jumbo16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Set up an mbuf that uses its own linear storage.
  *
  * Kept inline: the text cost at each call site is about the same as a call
  * with this many arguments, and the M_PKTHDR branch should disappear via
  * constant propagation for !MGETHDR.
  *
  * Returns the result of m_pkthdr_init() when M_PKTHDR is requested,
  * otherwise 0.
  */
 static __inline int
 m_init(struct mbuf *m, int how, short type, int flags)
 {
 	int error = 0;
 
 	m->m_type = type;
 	m->m_flags = flags;
 	m->m_len = 0;
 	m->m_data = m->m_dat;
 	m->m_nextpkt = NULL;
 	m->m_next = NULL;
 
 	if ((flags & M_PKTHDR) != 0)
 		error = m_pkthdr_init(m, how);
 
 	MBUF_PROBE5(m__init, m, how, type, flags, error);
 	return (error);
 }
 
 static __inline struct mbuf *
 m_get_raw(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type | MT_NOINIT;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get_raw, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr_raw(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type | MT_NOINIT;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr_raw, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	m = uma_zalloc_arg(zone_pack, &args, how);
 	MBUF_PROBE4(m__getcl, how, type, flags, m);
 	return (m);
 }
 
 /*
  * XXX: m_cljset() is a dangerous API.  Only a new, unreferenced cluster
  * may be attached to an mbuf(9) with it.  That cannot be asserted here,
  * so getting it right is entirely up to the users of the API.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	int size;
 
 	/* Derive the buffer size from the cluster type. */
 	if (type == EXT_CLUSTER)
 		size = MCLBYTES;
 	else if (type == EXT_JUMBOP)
 		size = MJUMPAGESIZE;
 	else if (type == EXT_JUMBO9)
 		size = MJUM9BYTES;
 	else if (type == EXT_JUMBO16)
 		size = MJUM16BYTES;
 	else
 		panic("%s: unknown cluster type %d", __func__, type);
 
 	/* Point both the external buffer and the data pointer at the cluster. */
 	m->m_ext.ext_buf = cl;
 	m->m_data = m->m_ext.ext_buf;
 
 	/* No free routine or arguments; one embedded reference. */
 	m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_arg1 = NULL;
 	m->m_ext.ext_free = NULL;
 	m->m_ext.ext_count = 1;
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_type = type;
 	m->m_ext.ext_size = size;
 
 	m->m_flags |= M_EXT;
 	MBUF_PROBE3(m__cljset, m, cl, type);
 }
 
 /* Overwrite the type field of an mbuf with new_type. */
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 	m->m_type = new_type;
 }
 
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 
 	while (m) {
 		m->m_flags &= ~M_PROTOFLAGS;
 		m = m->m_next;
 	}
 }
 
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 
 	while (m->m_next)
 		m = m->m_next;
 	return (m);
 }
 
 /*
  * Return the current reference count of an M_EXT mbuf's external storage:
  * the embedded counter when EXT_FLAG_EMBREF is set, otherwise the counter
  * that ext_cnt points to.
  */
 static inline u_int
 m_extrefcnt(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__));
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) != 0)
 		return (m->m_ext.ext_count);
 	return (*m->m_ext.ext_cnt);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2),	\
     (flags), (type))
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can
  * be both the local data payload, or an external buffer area, depending on
  * whether M_EXT is set).
  */
 #define	M_WRITABLE(m)	(((m)->m_flags & (M_RDONLY | M_EXTPG)) == 0 &&	\
 			 (!(((m)->m_flags & M_EXT)) ||			\
 			 (m_extrefcnt(m) == 1)))
 
 /* Check if the supplied mbuf has a packet header, or else panic. */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR,			\
 	    ("%s: no mbuf packet header!", __func__))
 
 /* Check if the supplied mbuf has no send tag, or else panic. */
 #define	M_ASSERT_NO_SND_TAG(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR &&		\
 	       ((m)->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0,		\
 	    ("%s: receive mbuf has send tag!", __func__))
 
 /*
  * Check if mbuf is multipage: M_EXTPG must be set and M_PKTHDR clear.
  * The argument is parenthesized everywhere it is expanded.
  */
 #define M_ASSERTEXTPG(m)						\
 	KASSERT(((m)->m_flags & (M_EXTPG|M_PKTHDR)) == M_EXTPG,		\
 	    ("%s: m %p is not multipage!", __func__, (m)))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  * (The tested expression masks m_flags with 0, so it is always true.)
  *
  * The argument is parenthesized so the cast applies to the whole
  * expression rather than to its first operand.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)(m))->m_flags & 0) == 0,		\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /* Check whether any mbuf in the chain is unmapped. */
 #ifdef INVARIANTS
 #define	M_ASSERTMAPPED(m) do {						\
 	for (struct mbuf *__m = (m); __m != NULL; __m = __m->m_next)	\
 		KASSERT((__m->m_flags & M_EXTPG) == 0,			\
 		    ("%s: chain %p contains an unmapped mbuf", __func__, (m)));\
 } while (0)
 #else
 #define	M_ASSERTMAPPED(m) do {} while (0)
 #endif
 
 /*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_EXTPG) ? NULL :				\
 	 ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
 
 /*
  * Set the m_data pointer of a newly allocated mbuf to place an object of the
  * specified size at the end of the mbuf, longword aligned.
  *
  * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
  * separate macros, each asserting that it was called at the proper moment.
  * This required callers to themselves test the storage type and call the
  * right one.  Rather than require callers to be aware of those layout
  * decisions, we centralize here.
  */
 static __inline void
 m_align(struct mbuf *m, int len)
 {
 #ifdef INVARIANTS
 	const char *msg = "%s: not a virgin mbuf";
 #endif
 	int adjust;
 
 	KASSERT(m->m_data == M_START(m), (msg, __func__));
 
 	adjust = M_SIZE(m) - len;
 	m->m_data += adjust &~ (sizeof(long)-1);
 }
 
 #define	M_ALIGN(m, len)		m_align(m, len)
 #define	MH_ALIGN(m, len)	m_align(m, len)
 #define	MEXT_ALIGN(m, len)	m_align(m, len)
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * So M_TRAILINGROOM() is for when you want to know how much space
  * would be there if it was writable. This can be used to
  * detect changes in mbufs by knowing the value at one point
  * and then being able to compare it later to the current M_TRAILINGROOM().
  * The TRAILINGSPACE() macro is not suitable for this since an mbuf
  * at one point might not be writable and then later it becomes writable
  * even though the space at the back of it has not changed.
  */
 #define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len))
 /*
  * Compute the amount of space available after the end of data in an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  *
  * plen and how are each evaluated exactly once: they are captured in
  * locals first and only the locals are used afterwards.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	MBUF_CHECKSLEEP(__mhow);					\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && _mm->m_flags & M_PKTHDR)			\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  This is a relatively expensive operation and
  * should be avoided.
  *
  * Forwards both arguments unchanged to m_chtype().
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
 /*
  * Return the receive interface recorded in a packet header mbuf, or NULL
  * when CSUM_SND_TAG is set in the header's csum_flags.  The mbuf must have
  * a packet header (asserted).
  */
 static __inline struct ifnet *
 m_rcvif(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0 ? NULL :
 	    m->m_pkthdr.rcvif);
 }
 
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 extern u_int		max_linkhdr;	/* Largest link-level header */
 extern u_int		max_hdr;	/* Largest link + protocol header */
 extern u_int		max_protohdr;	/* Largest protocol header */
 /*
  * NOTE(review): presumably these raise max_linkhdr / max_protohdr to at
  * least the given size; the definitions are not in this header -- confirm.
  */
 void max_linkhdr_grow(u_int);
 void max_protohdr_grow(u_int);
 
 extern int		nmbclusters;	/* Maximum number of clusters */
 extern bool		mb_use_ext_pgs;	/* Use ext_pgs for sendfile */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * MTAG_ABI_COMPAT is used to implement the m_tag_get and m_tag_find
  * compatibility shim functions, and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to ensure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is free'd.  However in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent bit-or this flag in when defining the tag id.
  * The tag will then be treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nada */
 
 /*
  * Packet tags for use with MTAG_ABI_COMPAT.
  *
  * The numbering is not contiguous: values 16, 20 and 22-26 are not
  * assigned in this list.
  */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF				21 /* PF/ALTQ information */
 /* was	PACKET_TAG_RTSOCKFAM			25    rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 /* NOTE(review): undocumented; presumably set by pf after reassembly -- confirm. */
 #define	PACKET_TAG_PF_REASSEMBLED		31
 #define	PACKET_TAG_IPSEC_ACCEL_OUT		32  /* IPSEC accel out */
 #define	PACKET_TAG_IPSEC_ACCEL_IN		33  /* IPSEC accel in */
 
 /* Specific cookies and tags. */
 
 /*
  * Packet tag routines.
  *
  * Argument order, as used by the inline wrappers in this header:
  *   m_tag_alloc(cookie, type, length, wait)
  *   m_tag_locate(mbuf, cookie, type, start)
  */
 struct m_tag	*m_tag_alloc(uint32_t, uint16_t, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, uint32_t, uint16_t,
     struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Initialize the list of tags associated with an mbuf
  * (m->m_pkthdr.tags).  Tags that may already be linked on the list are
  * not freed here.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 
 	SLIST_INIT(&m->m_pkthdr.tags);
 }
 
 /*
  * Fill in the identifying fields of a tag: the cookie, the tag type and
  * the length.  The free method is deliberately left untouched; the caller
  * is expected to set it.
  *
  * XXX probably should be called m_tag_init, but that was already taken.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, uint32_t cookie, uint16_t type, int len)
 {
 
 	t->m_tag_cookie = cookie;
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 }
 
 /*
  * Release a tag by invoking the free method stored in the tag itself.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 
 	t->m_tag_free(t);
 }
 
 /*
  * Return the tag at the head of the mbuf's tag list (m->m_pkthdr.tags).
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 	struct m_tag *head;
 
 	head = SLIST_FIRST(&m->m_pkthdr.tags);
 	return (head);
 }
 
 /*
  * Return the tag that follows t on its tag list.  The mbuf argument is
  * unused; only the link inside t is consulted.
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m __unused, struct m_tag *t)
 {
 	struct m_tag *succ;
 
 	succ = SLIST_NEXT(t, m_tag_link);
 	return (succ);
 }
 
 /*
  * Prepend a tag to the list of tags associated with an mbuf: t becomes the
  * new head of m->m_pkthdr.tags.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
 /*
  * Unlink a tag from the list of tags associated with an mbuf.  The tag is
  * only removed from m->m_pkthdr.tags; it is not freed here.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link);
 }
 
 /*
  * These are for OpenBSD compatibility.  MTAG_ABI_COMPAT is the cookie that
  * m_tag_get() and m_tag_find() pass on behalf of their callers.
  */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 /*
  * Allocate a tag of the given type and length under the MTAG_ABI_COMPAT
  * cookie.  Returns whatever m_tag_alloc() returns.
  */
 static __inline struct m_tag *
 m_tag_get(uint16_t type, int length, int wait)
 {
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait);
 	return (tag);
 }
 
 /*
  * Look up a tag of the given type under the MTAG_ABI_COMPAT cookie.
  * Returns NULL right away when the mbuf has no tags at all; otherwise the
  * search is delegated to m_tag_locate(), to which start is passed through.
  */
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, uint16_t type, struct m_tag *start)
 {
 
 	if (SLIST_EMPTY(&m->m_pkthdr.tags))
 		return ((struct m_tag *)NULL);
 	return (m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
 /*
  * Take an additional reference on a send tag and hand the same tag back,
  * so the call can be used inline in an assignment.
  */
 static inline struct m_snd_tag *
 m_snd_tag_ref(struct m_snd_tag *mst)
 {
 
 	(void)refcount_acquire(&mst->refcount);
 	return (mst);
 }
 
 /*
  * Drop a reference on a send tag.  The tag is destroyed when
  * refcount_release() reports true.
  */
 static inline void
 m_snd_tag_rele(struct m_snd_tag *mst)
 {
 
 	if (!refcount_release(&mst->refcount))
 		return;
 	m_snd_tag_destroy(mst);
 }
 
 /*
  * Free a single mbuf and return the mbuf that followed it (m->m_next).
  *
  * - If both M_PKTHDR and M_NOFREE are set, the tag chain is deleted here.
  * - If the packet header carries CSUM_SND_TAG, the send tag reference is
  *   released.
  * - Storage is released via mb_free_extpg() for M_EXTPG mbufs, via
  *   mb_free_ext() for M_EXT mbufs, and otherwise returned to zone_mbuf
  *   unless M_NOFREE is set.
  */
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *next;
 
 	/* Capture the successor before any of m's storage is released. */
 	next = m->m_next;
 	MBUF_PROBE1(m__free, m);
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
 	if ((m->m_flags & M_PKTHDR) != 0 &&
 	    (m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0)
 		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 	if ((m->m_flags & M_EXTPG) != 0) {
 		mb_free_extpg(m);
 	} else if ((m->m_flags & M_EXT) != 0) {
 		mb_free_ext(m);
 	} else if ((m->m_flags & M_NOFREE) == 0) {
 		uma_zfree(zone_mbuf, m);
 	}
 	return (next);
 }
 
 /*
  * Return the FIB number stored in the packet header of m.  The mbuf must
  * have M_PKTHDR set (asserted).
  */
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0,
 	    ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 /* Read the FIB number from a packet header mbuf; see rt_m_getfib(). */
 #define M_GETFIB(_m)   rt_m_getfib(_m)
 
 /*
  * Store _fib as the FIB number in the packet header of _m.  The mbuf must
  * have M_PKTHDR set (asserted).
  *
  * _m is captured in a local so that it is evaluated exactly once whether
  * or not KASSERT() expands to code.
  */
 #define M_SETFIB(_m, _fib) do {						\
 	struct mbuf *_sfm = (_m);					\
 									\
 	KASSERT(_sfm->m_flags & M_PKTHDR,				\
 	    ("Attempt to set FIB on non header mbuf."));		\
 	_sfm->m_pkthdr.fibnum = (_fib);					\
 } while (0)
 
 /*
  * flags passed as first argument for "m_xxx_tcpip_hash()"
  *
  * Each flag is a distinct bit (bits 2, 3 and 4), so they can be or'ed
  * together.
  * NOTE(review): presumably L2/L3/L4 select which protocol layers feed the
  * hash -- confirm against the implementation.
  */
 #define	MBUF_HASHFLAG_L2	(1 << 2)
 #define	MBUF_HASHFLAG_L3	(1 << 3)
 #define	MBUF_HASHFLAG_L4	(1 << 4)
 
 /*
  * mbuf hashing helper routines
  *
  * NOTE(review): the *_hash_init() routines presumably produce the initial
  * value passed as the last argument of the matching hash routine -- confirm.
  */
 uint32_t	m_ether_tcpip_hash_init(void);
 uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t);
 uint32_t	m_infiniband_tcpip_hash_init(void);
 uint32_t	m_infiniband_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t);
 
 /*
  * M_PROFILE(m) calls m_profile() when MBUF_PROFILING is defined and
  * expands to nothing otherwise.
  */
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 /*
  * Structure describing a packet queue: mbufs linked by m_stailqpkt.
  * Does accounting of number of packets and has a cap.
  */
 struct mbufq {
 	/* Queue of packets. */
 	STAILQ_HEAD(, mbuf)	mq_head;
 	/* Number of packets currently queued. */
 	int			mq_len;
 	/* Cap on mq_len; only honored when > 0 (see mbufq_full()). */
 	int			mq_maxlen;
 };
 
 /*
  * Set up an empty packet queue with the given cap.
  */
 static inline void
 mbufq_init(struct mbufq *mq, int maxlen)
 {
 
 	mq->mq_len = 0;
 	mq->mq_maxlen = maxlen;
 	STAILQ_INIT(&mq->mq_head);
 }
 
 /*
  * Detach every packet from the queue and return the first one (NULL if
  * the queue was empty).  The queue is left empty; the packets are not
  * freed and ownership passes to the caller.
  */
 static inline struct mbuf *
 mbufq_flush(struct mbufq *mq)
 {
 	struct mbuf *chain = STAILQ_FIRST(&mq->mq_head);
 
 	mq->mq_len = 0;
 	STAILQ_INIT(&mq->mq_head);
 	return (chain);
 }
 
 /*
  * Empty the queue and free every packet that was on it with m_freem().
  */
 static inline void
 mbufq_drain(struct mbufq *mq)
 {
 	struct mbuf *cur, *next;
 
 	for (cur = mbufq_flush(mq); cur != NULL; cur = next) {
 		/* Fetch the link before the packet is freed. */
 		next = STAILQ_NEXT(cur, m_stailqpkt);
 		m_freem(cur);
 	}
 }
 
 /*
  * Return the packet at the head of the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_first(const struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	return (head);
 }
 
 /*
  * Return the packet at the tail of the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_last(const struct mbufq *mq)
 {
 	struct mbuf *tail;
 
 	tail = STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt);
 	return (tail);
 }
 
 /*
  * Report whether the queue holds no packets, judged by its packet count.
  */
 static inline bool
 mbufq_empty(const struct mbufq *mq)
 {
 	const int queued = mq->mq_len;
 
 	return (queued == 0);
 }
 
 /*
  * Return non-zero when the queue has reached its cap.  A cap that is not
  * positive disables the check, so such a queue is never reported full.
  */
 static inline int
 mbufq_full(const struct mbufq *mq)
 {
 
 	if (mq->mq_maxlen <= 0)
 		return (0);
 	return (mq->mq_len >= mq->mq_maxlen);
 }
 
 /*
  * Return the number of packets currently accounted for on the queue.
  */
 static inline int
 mbufq_len(const struct mbufq *mq)
 {
 	const int queued = mq->mq_len;
 
 	return (queued);
 }
 
 /*
  * Append a packet to the tail of the queue.  Returns 0 on success, or
  * ENOBUFS (leaving the queue and the packet untouched) when the queue is
  * full.
  */
 static inline int
 mbufq_enqueue(struct mbufq *mq, struct mbuf *m)
 {
 
 	if (!mbufq_full(mq)) {
 		STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt);
 		mq->mq_len++;
 		return (0);
 	}
 	return (ENOBUFS);
 }
 
 /*
  * Remove and return the packet at the head of the queue, or NULL when the
  * queue is empty.  The returned packet has its m_nextpkt link cleared.
  */
 static inline struct mbuf *
 mbufq_dequeue(struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	if (head == NULL)
 		return (NULL);
 	STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt);
 	head->m_nextpkt = NULL;
 	mq->mq_len--;
 	return (head);
 }
 
 /*
  * Insert a packet at the head of the queue.  Unlike mbufq_enqueue(), the
  * cap is not checked here.
  */
 static inline void
 mbufq_prepend(struct mbufq *mq, struct mbuf *m)
 {
 
 	mq->mq_len++;
 	STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt);
 }
 
 /*
  * Move every packet of mq_src to the tail of mq_dst, leaving mq_src empty.
  *
  * Note: this doesn't enforce the maximum list size for dst.
  */
 static inline void
 mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src)
 {
 	const int moved = mq_src->mq_len;
 
 	mq_dst->mq_len += moved;
 	STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head);
 	mq_src->mq_len = 0;
 }
 
 /*
  * Structure describing a chain of mbufs linked by m_stailq, also tracking
  * the pointer to the last.  Also does accounting of data length and memory
  * usage.
  * To be used as an argument to mbuf chain allocation and manipulation KPIs,
  * and can be allocated on the stack of a caller.  Kernel facilities may use
  * it internally as a most simple implementation of a stream data buffer.
  */
 struct mchain {
 	/* The mbufs of the chain. */
 	STAILQ_HEAD(, mbuf) mc_q;
 	/* Sum of m_len over the chain (see mc_inc()/mc_dec()). */
 	u_int mc_len;
 	/* Memory accounted: MSIZE per mbuf plus ext_size for M_EXT mbufs. */
 	u_int mc_mlen;
 };
 
 /*
  * Compound literal that initializes *mc to an empty chain.  Only mc_q is
  * named in the initializer; mc_len and mc_mlen are therefore zero.
  */
 #define	MCHAIN_INITIALIZER(mc)	\
 	(struct mchain){ .mc_q = STAILQ_HEAD_INITIALIZER((mc)->mc_q) }
 
 /* Return the first mbuf of the chain. */
 static inline struct mbuf *
 mc_first(struct mchain *mc)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mc->mc_q);
 	return (head);
 }
 
 /* Return the last mbuf of the chain. */
 static inline struct mbuf *
 mc_last(struct mchain *mc)
 {
 	struct mbuf *tail;
 
 	tail = STAILQ_LAST(&mc->mc_q, mbuf, m_stailq);
 	return (tail);
 }
 
 /* Report whether the chain contains no mbufs. */
 static inline bool
 mc_empty(struct mchain *mc)
 {
 	const bool none = STAILQ_EMPTY(&mc->mc_q);
 
 	return (none);
 }
 
 /* Account addition of m to mc. */
 static inline void
 mc_inc(struct mchain *mc, struct mbuf *m)
 {
 	mc->mc_len += m->m_len;
 	mc->mc_mlen += MSIZE;
 	if (m->m_flags & M_EXT)
 		mc->mc_mlen += m->m_ext.ext_size;
 }
 
 /*
  * Account removal of m from mc: the inverse of mc_inc().  Each subtraction
  * is preceded by an assertion that the counter will not underflow.  Only
  * the counters are touched; m is not unlinked here.
  */
 static inline void
 mc_dec(struct mchain *mc, struct mbuf *m)
 {
 
 	MPASS(mc->mc_len >= m->m_len);
 	mc->mc_len -= m->m_len;
 	MPASS(mc->mc_mlen >= MSIZE);
 	mc->mc_mlen -= MSIZE;
 	if ((m->m_flags & M_EXT) == 0)
 		return;
 	MPASS(mc->mc_mlen >= m->m_ext.ext_size);
 	mc->mc_mlen -= m->m_ext.ext_size;
 }
 
 /*
  * Get mchain from a classic mbuf chain linked by m_next.  Two hacks here:
  * we use the fact that m_next is alias to m_stailq, we use internal queue(3)
  * fields.
  */
 static inline void
 mc_init_m(struct mchain *mc, struct mbuf *m)
 {
 	struct mbuf *last;
 
 	STAILQ_FIRST(&mc->mc_q) = m;
 	mc->mc_len = mc->mc_mlen = 0;
 	STAILQ_FOREACH(m, &mc->mc_q, m_stailq) {
 		mc_inc(mc, m);
 		last = m;
 	}
 	mc->mc_q.stqh_last = &STAILQ_NEXT(last, m_stailq);
 }
 
 /*
  * Free all mbufs of the chain with m_freem().  Does nothing for an empty
  * chain.  The mchain itself is not reinitialized.
  */
 static inline void
 mc_freem(struct mchain *mc)
 {
 
 	if (mc_empty(mc))
 		return;
 	m_freem(mc_first(mc));
 }
 
 /* Insert m at the head of the chain and account for it. */
 static inline void
 mc_prepend(struct mchain *mc, struct mbuf *m)
 {
 
 	mc_inc(mc, m);
 	STAILQ_INSERT_HEAD(&mc->mc_q, m, m_stailq);
 }
 
 /* Insert m at the tail of the chain and account for it. */
 static inline void
 mc_append(struct mchain *mc, struct mbuf *m)
 {
 
 	mc_inc(mc, m);
 	STAILQ_INSERT_TAIL(&mc->mc_q, m, m_stailq);
 }
 
 /*
  * Move all mbufs of tail to the end of head, transferring the length and
  * memory accounting along with them.  tail is left with zeroed counters.
  */
 static inline void
 mc_concat(struct mchain *head, struct mchain *tail)
 {
 
 	STAILQ_CONCAT(&head->mc_q, &tail->mc_q);
 	head->mc_len += tail->mc_len;
 	head->mc_mlen += tail->mc_mlen;
 	tail->mc_len = 0;
 	tail->mc_mlen = 0;
 }
 
 /*
  * Unlink m from the chain and subtract it from the accounting.
  *
  * Note: STAILQ_REMOVE() is expensive.  mc_remove_after() needs to be
  * provided as soon as there are consumers that would benefit from it.
  */
 static inline void
 mc_remove(struct mchain *mc, struct mbuf *m)
 {
 	STAILQ_REMOVE(&mc->mc_q, m, mbuf, m_stailq);
 	mc_dec(mc, m);
 }
 
 /*
  * mchain routines implemented outside this header.
  * NOTE(review): parameter meanings are not visible here -- see the
  * definitions before relying on argument order.
  */
 int mc_get(struct mchain *, u_int, int, short, int);
 int mc_split(struct mchain *, struct mchain *, u_int, int);
 int mc_uiotomc(struct mchain *, struct uio *, u_int, u_int, int, int);
 
 #ifdef _SYS_TIMESPEC_H_
 /*
  * Split the receive timestamp of a packet header mbuf into the seconds and
  * nanoseconds of a struct timespec.  The mbuf must have M_PKTHDR and one of
  * M_TSTMP / M_TSTMP_LRO set (asserted).
  */
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
 {
 	const int nsec_per_sec = 1000000000;
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0,
 	    ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m));
 	ts->tv_sec = m->m_pkthdr.rcv_tstmp / nsec_per_sec;
 	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % nsec_per_sec;
 }
 #endif
 
 /*
  * Split the receive timestamp of a packet header mbuf into the seconds and
  * microseconds of a struct timeval; the sub-second part is divided by 1000.
  * The mbuf must have M_PKTHDR and one of M_TSTMP / M_TSTMP_LRO set
  * (asserted).
  */
 static inline void
 mbuf_tstmp2timeval(struct mbuf *m, struct timeval *tv)
 {
 	const int nsec_per_sec = 1000000000;
 	const int nsec_per_usec = 1000;
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0,
 	    ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m));
 	tv->tv_sec = m->m_pkthdr.rcv_tstmp / nsec_per_sec;
 	tv->tv_usec = (m->m_pkthdr.rcv_tstmp % nsec_per_sec) / nsec_per_usec;
 }
 
 #ifdef DEBUGNET
 /*
  * Invoked from the debugnet client code.  Only declared when DEBUGNET is
  * defined; the implementations are outside this header.
  */
 void	debugnet_mbuf_drain(void);
 void	debugnet_mbuf_start(void);
 void	debugnet_mbuf_finish(void);
 void	debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize);
 #endif
 
 /*
  * Return true when m is an M_EXTPG mbuf whose m_epg_tls pointer is set,
  * false otherwise.
  */
 static inline bool
 mbuf_has_tls_session(struct mbuf *m)
 {
 
 	return ((m->m_flags & M_EXTPG) != 0 && m->m_epg_tls != NULL);
 }
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */