Index: head/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
===================================================================
--- head/sys/dev/cxgbe/cxgbei/icl_cxgbei.c	(revision 331648)
+++ head/sys/dev/cxgbe/cxgbei/icl_cxgbei.c	(revision 331649)
@@ -1,1096 +1,1096 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2015 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <machine/bus.h>
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
 /* Parent node for the tunables declared below; attached under _kern_icl. */
 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload");
 /* Not referenced by any function visible in this part of the file. */
 static int coalesce = 1;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
 	&coalesce, 0, "Try to coalesce PDUs before sending");
 /* Defaults to 128KB; not referenced by any function visible here. */
 static int partial_receive_len = 128 * 1024;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
     &partial_receive_len, 0, "Minimum read size for partially received "
     "data segment");
 /* Lower bound for the send reservation in icl_cxgbei_setsockopt(). */
 static int sendspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 /* Lower bound for the receive reservation in icl_cxgbei_setsockopt(). */
 static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 /*
  * UMA zone for struct ppod_reservation; created in icl_cxgbei_mod_load() and
  * destroyed in icl_cxgbei_mod_unload().
  */
 static uma_zone_t prsv_zone;
 /*
  * Number of live connections: acquired in icl_cxgbei_new_conn(), released in
  * icl_cxgbei_conn_free(), and checked by icl_cxgbei_mod_unload().
  */
 static volatile u_int icl_cxgbei_ncons;
 
 /*
  * Helpers for the mutex referenced by the connection's ic_lock pointer.  The
  * argument is parenthesized so that any pointer-valued expression (not only a
  * plain identifier) can be passed.
  */
 #define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
 #define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
 #define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
 #define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
 
 /* PDU allocation helpers with external linkage. */
 struct icl_pdu *icl_cxgbei_new_pdu(int);
 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
 
 /*
  * Implementations of the icl_conn methods listed in icl_cxgbei_methods.  All
  * are file-local except icl_cxgbei_conn_pdu_free, which is declared without
  * static.
  */
 static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
 icl_conn_pdu_free_t	icl_cxgbei_conn_pdu_free;
 static icl_conn_pdu_data_segment_length_t
 				    icl_cxgbei_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
 static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
 static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
 static icl_conn_free_t		icl_cxgbei_conn_free;
 static icl_conn_close_t		icl_cxgbei_conn_close;
 static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
 static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
 static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;
 
 /* Maps each icl_conn kobj method to its cxgbei implementation. */
 static kobj_method_t icl_cxgbei_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_cxgbei_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
 	{ 0, 0 }	/* table terminator */
 };
 
 /* Instances of this class are sized to hold a struct icl_cxgbei_conn. */
 DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
 
 void
 icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 #ifdef INVARIANTS
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 #endif
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	m_freem(ip->ip_bhs_mbuf);	/* storage for icl_cxgbei_pdu itself */
 
 #ifdef DIAGNOSTIC
 	if (__predict_true(ic != NULL))
 		refcount_release(&ic->ic_outstanding_pdus);
 #endif
 }
 
 struct icl_pdu *
 icl_cxgbei_new_pdu(int flags)
 {
 	struct icl_cxgbei_pdu *icp;
 	struct icl_pdu *ip;
 	struct mbuf *m;
 	uintptr_t a;
 
 	m = m_gethdr(flags, MT_DATA);
 	if (__predict_false(m == NULL))
 		return (NULL);
 
 	a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
 	icp = (struct icl_cxgbei_pdu *)a;
 	bzero(icp, sizeof(*icp));
 
 	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
 	ip = &icp->ip;
 	ip->ip_bhs_mbuf = m;
 
 	a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
 	ip->ip_bhs = (struct iscsi_bhs *)a;
 #ifdef INVARIANTS
 	/* Everything must fit entirely in the mbuf. */
 	a = (uintptr_t)(ip->ip_bhs + 1);
 	MPASS(a <= (uintptr_t)m + MSIZE);
 #endif
 	bzero(ip->ip_bhs, sizeof(*ip->ip_bhs));
 
 	m->m_data = (void *)ip->ip_bhs;
 	m->m_len = sizeof(struct iscsi_bhs);
 	m->m_pkthdr.len = m->m_len;
 
 	return (ip);
 }
 
 /*
  * Associate a PDU with connection ic.  With DIAGNOSTIC the connection's
  * outstanding PDU count is bumped as well.
  */
 void
 icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
 {
 
 	ip->ip_conn = ic;
 #ifdef DIAGNOSTIC
 	/* Balanced by the release in pdu_free/finalize_pdu. */
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 }
 
 /*
  * icl_conn_new_pdu method: hand back a PDU bound to ic whose BHS is zeroed and
  * ready for the caller to fill in.  Returns NULL if the allocation fails.
  */
 static struct icl_pdu *
 icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_pdu *pdu = icl_cxgbei_new_pdu(flags);
 
 	if (__predict_true(pdu != NULL))
 		icl_cxgbei_new_pdu_set_conn(pdu, ic);
 
 	return (pdu);
 }
 
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 
 	len += request->ip_bhs->bhs_data_segment_len[0];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[1];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[2];
 
 	return (len);
 }
 
 /*
  * icl_conn_pdu_data_segment_length method.  The connection is not consulted;
  * the length is read straight from the PDU's BHS.
  */
 size_t
 icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 static struct mbuf *
 finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 {
 	struct icl_pdu *ip = &icp->ip;
 	uint8_t ulp_submode, padding;
 	struct mbuf *m, *last;
 	struct iscsi_bhs *bhs;
 
 	/*
 	 * Fix up the data segment mbuf first.
 	 */
 	m = ip->ip_data_mbuf;
 	ulp_submode = icc->ulp_submode;
 	if (m) {
 		last = m_last(m);
 
 		/*
 		 * Round up the data segment to a 4B boundary.  Pad with 0 if
 		 * necessary.  There will definitely be room in the mbuf.
 		 */
 		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
 		if (padding) {
 			bzero(mtod(last, uint8_t *) + last->m_len, padding);
 			last->m_len += padding;
 		}
 	} else {
 		MPASS(ip->ip_data_len == 0);
 		ulp_submode &= ~ULP_CRC_DATA;
 		padding = 0;
 	}
 
 	/*
 	 * Now the header mbuf that has the BHS.
 	 */
 	m = ip->ip_bhs_mbuf;
 	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
 	MPASS(m->m_len == sizeof(struct iscsi_bhs));
 
 	bhs = ip->ip_bhs;
 	bhs->bhs_data_segment_len[2] = ip->ip_data_len;
 	bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
 	bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;
 
 	/* "Convert" PDU to mbuf chain.  Do not use icp/ip after this. */
 	m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
 	m->m_next = ip->ip_data_mbuf;
 	set_mbuf_ulp_submode(m, ulp_submode);
 #ifdef INVARIANTS
 	bzero(icp, sizeof(*icp));
 #endif
 #ifdef DIAGNOSTIC
 	refcount_release(&icc->ic.ic_outstanding_pdus);
 #endif
 
 	return (m);
 }
 
 /*
  * icl_conn_pdu_append_data method: copy len bytes from addr to the end of the
  * PDU's data segment.
  *
  * The first call on a PDU allocates an MJUM16BYTES cluster mbuf (M_NOWAIT)
  * to start the data chain; m_append() extends the chain as needed.
  *
  * flags is only looked at on the failure path: a caller that passed M_WAITOK
  * hits CXGBE_UNIMPLEMENTED because a can't-fail append is not implemented.
  *
  * Returns 0 on success and ENOMEM if mbufs could not be allocated.  If
  * m_append() fails part-way, ip_data_len is resynchronized with whatever
  * actually made it into the chain before ENOMEM is returned.
  */
 int
 icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
     const void *addr, size_t len, int flags)
 {
 	struct mbuf *m;
 #ifdef INVARIANTS
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 #endif
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	m = ip->ip_data_mbuf;
 	if (m == NULL) {
 		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
 		if (__predict_false(m == NULL))
 			return (ENOMEM);
 
 		ip->ip_data_mbuf = m;
 	}
 
 	/* m_append() returns 0 when it runs out of mbufs. */
 	if (__predict_false(m_append(m, len, addr) == 0)) {
 		if (flags & M_WAITOK) {
 			CXGBE_UNIMPLEMENTED("fail safe append");
 		}
 		ip->ip_data_len = m_length(m, NULL);
 		/* Same errno as the allocation failure above, not a bare 1. */
 		return (ENOMEM);
 	}
 
 	ip->ip_data_len += len;
 	MPASS(ip->ip_data_len <= ic->ic_max_data_segment_length);
 	return (0);
 }
 
 void
 icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	if (icp->icp_flags & ICPF_RX_DDP)
 		return; /* data is DDP'ed, no need to copy */
 	m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /*
  * icl_conn_pdu_queue method: finalize a PDU and queue it for transmission on
  * the offloaded connection.
  *
  * Must be called with the connection lock held.  The PDU is consumed in all
  * cases: it is either freed here or converted into an mbuf chain that is
  * enqueued on (or dropped instead of reaching) the toepcb's PDU queue.
  */
 void
 icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct socket *so = ic->ic_socket;
 	struct toepcb *toep = icc->toep;
 	struct inpcb *inp;
 	struct mbuf *m;
 
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 	/* The kernel doesn't generate PDUs with AHS. */
 	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
 
 	ICL_CONN_LOCK_ASSERT(ic);
 	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
 	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
 		/* Connection is going away or can't be written to: drop. */
 		icl_cxgbei_conn_pdu_free(ic, ip);
 		return;
 	}
 
 	/* ip and icp must not be touched once finalize_pdu() has run. */
 	m = finalize_pdu(icc, icp);
 	M_ASSERTPKTHDR(m);
 	MPASS((m->m_pkthdr.len & 3) == 0);
 
 	/*
 	 * Do not get inp from toep->inp as the toepcb might have detached
 	 * already.
 	 */
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	/* Discard the chain if the inp is dead or the toepcb is detached. */
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
 	    __predict_false((toep->flags & TPF_ATTACHED) == 0))
 		m_freem(m);
 	else {
 		mbufq_enqueue(&toep->ulp_pduq, m);
 		t4_push_pdus(icc->sc, toep, 0);
 	}
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Connection constructor registered with ICL in icl_cxgbei_mod_load().
  *
  * Creates a zeroed icl_cxgbei_conn kobj (sleeping allocation), initializes
  * the driver-private and generic icl_conn fields, and returns the embedded
  * icl_conn.  name and lock are stored by reference, not copied.
  */
 static struct icl_conn *
 icl_cxgbei_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 
 	/* Counted up front; icl_cxgbei_conn_free() drops it again. */
 	refcount_acquire(&icl_cxgbei_ncons);
 
 	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	ic = &icc->ic;
 
 	/* Driver-private state. */
 	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
 	STAILQ_INIT(&icc->rcvd_pdus);
 
 	/* Generic icl_conn state. */
 	ic->ic_name = name;
 	ic->ic_lock = lock;
 	ic->ic_offload = "cxgbei";
 	ic->ic_unmapped = false;
 	/* This is a stop-gap value that will be corrected during handoff. */
 	ic->ic_max_data_segment_length = 16384;
 
 	/* XXXNP: review.  Most of these icl_conn fields aren't really used */
 	STAILQ_INIT(&ic->ic_to_send);
 	cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
 	cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
 #ifdef DIAGNOSTIC
 	refcount_init(&ic->ic_outstanding_pdus, 0);
 #endif
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	return (ic);
 }
 
 /*
  * icl_conn_free method: destroy the connection's condition variables, delete
  * the kobj backing the connection, and drop the module's connection count.
  */
 void
 icl_cxgbei_conn_free(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	cv_destroy(&ic->ic_send_cv);
 	cv_destroy(&ic->ic_receive_cv);
 
 	/* ic lives inside icc, so neither may be used after this. */
 	kobj_delete((struct kobj *)icc, M_CXGBE);
 	/* Pairs with the acquire in icl_cxgbei_new_conn(). */
 	refcount_release(&icl_cxgbei_ncons);
 }
 
 static int
 icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
     int rspace)
 {
 	struct sockopt opt;
 	int error, one = 1, ss, rs;
 
 	ss = max(sendspace, sspace);
 	rs = max(recvspace, rspace);
 
 	error = soreserve(so, ss, rs);
 	if (error != 0) {
 		icl_cxgbei_conn_close(ic);
 		return (error);
 	}
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Disable Nagle.
 	 */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(so, &opt);
 	if (error != 0) {
 		icl_cxgbei_conn_close(ic);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Request/response structure used to find out the adapter offloading a socket.
  * It is passed as the opaque argument to find_offload_adapter() through
  * t4_iterate().
  */
 struct find_ofld_adapter_rr {
 	struct socket *so;	/* request: socket to look up */
 	struct adapter *sc;	/* result */
 };
 
 /*
  * t4_iterate() callback: record sc in the request (arg, a struct
  * find_ofld_adapter_rr) if sc is the adapter offloading the request's socket.
  *
  * An adapter matches when the socket's tcpcb has TF_TOE set and its tod is
  * this adapter's TOE device.  Adapters without TOE state are skipped, as are
  * all adapters once a match has been recorded.
  */
 static void
 find_offload_adapter(struct adapter *sc, void *arg)
 {
 	struct find_ofld_adapter_rr *fa = arg;
 	struct socket *so = fa->so;
 	struct tom_data *td = sc->tom_softc;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* Non-TCP were filtered out earlier. */
 	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
 
 	/* Already found, or TOE not enabled on this adapter. */
 	if (fa->sc != NULL || td == NULL)
 		return;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		INP_WUNLOCK(inp);
 		return;
 	}
 
 	tp = intotcpcb(inp);
 	if ((tp->t_flags & TF_TOE) && tp->tod == &td->tod)
 		fa->sc = sc;	/* Found. */
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Send a FLOWC work request carrying a single parameter,
  * FW_FLOWC_MNEM_TXDATAPLEN_MAX = maxlen, for the connection's tid.
  *
  * The request consumes one tx descriptor slot and the matching number of
  * tx credits from the toepcb.  Panics if the work request can't be allocated.
  *
  * XXXNP: move this to t4_tom.
  */
 static void
 send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
 {
 	const u_int nparams = 1;
 	const u_int flowclen = sizeof(struct fw_flowc_wr) +
 	    nparams * sizeof(struct fw_flowc_mnemval);
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct fw_flowc_wr *flowc;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 
 	/* Build the work request in the zeroed buffer. */
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
 	flowc->mnemval[0].val = htobe32(maxlen);
 
 	/* Account for the credits and the descriptor slot being used. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	toep->txsd_pidx++;
 	if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;	/* wrap around */
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Write the connection's TCB ULP fields for iSCSI.  The value written is
  * ULP_MODE_ISCSI combined with ULP_CRC_HEADER and/or ULP_CRC_DATA (each
  * shifted left by 4) according to hcrc and dcrc; the mask covers both the
  * ULP_TYPE and ULP_RAW fields of the W_TCB_ULP_TYPE word.
  */
 static void
 set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
 {
 	uint64_t val = ULP_MODE_ISCSI;
 
 	if (hcrc)
 		val |= ULP_CRC_HEADER << 4;
 	if (dcrc)
 		val |= ULP_CRC_DATA << 4;
 
 	CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
 	    __func__, toep->tid, hcrc, dcrc);
 
-	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_ULP_TYPE,
+	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE,
 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val,
-	    0, 0, toep->ofld_rxq->iq.abs_id);
+	    0, 0);
 }
 
 /*
  * icl_conn_handoff method: take over the TCP socket behind file descriptor fd
  * and switch the offloaded connection into iSCSI ULP mode.
  *
  * Must be called without the connection lock held.  Returns 0 on success or
  * an errno: the fget() error, EINVAL if fd is not a TCP stream socket or no
  * adapter is offloading it, EBUSY if the connection already has a socket or
  * the inp has been dropped, or whatever icl_cxgbei_setsockopt() returns.
  *
  * XXXNP: Who is responsible for cleaning up the socket if this returns with an
  * error?  Review all error paths.
  *
  * XXXNP: What happens to the socket's fd reference if the operation is
  * successful, and how does that affect the socket's life cycle?
  */
 int
 icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct cxgbei_data *ci;
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	cap_rights_t rights;
 	int error;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
 	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 	ic->ic_disconnecting = false;
 	ic->ic_socket = so;
 	/* Detach the socket from the file; the connection refers to it now. */
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	/* Find the adapter offloading this socket. */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	/*
 	 * NOTE(review): ic->ic_socket stays set on this error path;
 	 * presumably the caller closes the connection -- confirm.
 	 */
 	if (fa.sc == NULL)
 		return (EINVAL);
 	icc->sc = fa.sc;
 	ci = icc->sc->iscsi_ulp_softc;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
 		error = EBUSY;
 	else {
 		/*
 		 * socket could not have been "unoffloaded" if here.
 		 */
 		MPASS(tp->t_flags & TF_TOE);
 		MPASS(tp->tod != NULL);
 		MPASS(tp->t_toe != NULL);
 		toep = tp->t_toe;
 		MPASS(toep->vi->pi->adapter == icc->sc);
 		icc->toep = toep;
 		icc->cwt = cxgbei_select_worker_thread(icc);
 
 		/*
 		 * We maintain the _send_ DSL in this field just to have a
 		 * convenient way to assert that the kernel never sends
 		 * oversized PDUs.  This field is otherwise unused in the driver
 		 * or the kernel.
 		 */
 		ic->ic_max_data_segment_length = ci->max_tx_pdu_len -
 		    ISCSI_BHS_SIZE;
 
 		/* Each enabled digest shrinks the room left for data. */
 		icc->ulp_submode = 0;
 		if (ic->ic_header_crc32c) {
 			icc->ulp_submode |= ULP_CRC_HEADER;
 			ic->ic_max_data_segment_length -=
 			    ISCSI_HEADER_DIGEST_SIZE;
 		}
 		if (ic->ic_data_crc32c) {
 			icc->ulp_submode |= ULP_CRC_DATA;
 			ic->ic_max_data_segment_length -=
 			    ISCSI_DATA_DIGEST_SIZE;
 		}
 		so->so_options |= SO_NO_DDP;
 		toep->ulp_mode = ULP_MODE_ISCSI;
 		toep->ulpcb = icc;
 
 		/* Tell the hardware: max tx PDU length, then the ULP mode. */
 		send_iscsi_flowc_wr(icc->sc, toep, ci->max_tx_pdu_len);
 		set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
 		    ic->ic_data_crc32c);
 		error = 0;
 	}
 	INP_WUNLOCK(inp);
 
 	/* Closes the connection itself if it fails. */
 	if (error == 0) {
 		error = icl_cxgbei_setsockopt(ic, so, ci->max_tx_pdu_len,
 		    ci->max_rx_pdu_len);
 	}
 
 	return (error);
 }
 
 /*
  * icl_conn_close method: tear down the connection's socket.
  *
  * Must be called without the connection lock held.  Does nothing if the
  * connection is already disconnecting or has no socket.  Otherwise it marks
  * the connection as disconnecting, detaches it from the toepcb, discards
  * queued tx PDUs and received PDUs, waits for any active receive processing
  * to finish, clears ic_socket, and closes the socket.
  */
 void
 icl_cxgbei_conn_close(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_pdu *ip;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct toepcb *toep = icc->toep;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	ICL_CONN_LOCK(ic);
 	so = ic->ic_socket;
 	if (ic->ic_disconnecting || so == NULL) {
 		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
 		    __func__, icc, ic->ic_disconnecting, so);
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_disconnecting = true;
 
 	/* These are unused in this driver right now. */
 	MPASS(STAILQ_EMPTY(&ic->ic_to_send));
 	MPASS(ic->ic_receive_pdu == NULL);
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 	ICL_CONN_UNLOCK(ic);
 
 	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
 	    icc);
 	inp = sotoinpcb(so);
 	sb = &so->so_rcv;
 	INP_WLOCK(inp);
 	if (toep != NULL) {	/* NULL if connection was never offloaded. */
 		toep->ulpcb = NULL;
 		/* Throw away PDUs that were queued but not yet sent. */
 		mbufq_drain(&toep->ulp_pduq);
 		SOCKBUF_LOCK(sb);
 		if (icc->rx_flags & RXF_ACTIVE) {
 			volatile u_int *p = &icc->rx_flags;
 
 			/*
 			 * Drop both locks and poll until RXF_ACTIVE is
 			 * cleared, then take the locks again.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			INP_WUNLOCK(inp);
 
 			while (*p & RXF_ACTIVE)
 				pause("conclo", 1);
 
 			INP_WLOCK(inp);
 			SOCKBUF_LOCK(sb);
 		}
 
 		/* Free every received PDU still on the connection's list. */
 		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
 			ip = STAILQ_FIRST(&icc->rcvd_pdus);
 			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
 			icl_cxgbei_conn_pdu_free(ic, ip);
 		}
 		SOCKBUF_UNLOCK(sb);
 	}
 	INP_WUNLOCK(inp);
 
 	ICL_CONN_LOCK(ic);
 	ic->ic_socket = NULL;
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * XXXNP: we should send RST instead of FIN when PDUs held in various
 	 * queues were purged instead of delivered reliably but soabort isn't
 	 * really general purpose and wouldn't do the right thing here.
 	 */
 	soclose(so);
 }
 
 /*
  * icl_conn_task_setup method: choose the ITT for a new I/O and, when
  * possible, set up DDP for it.
  *
  * DDP is attempted only for CAM_DIR_IN requests of at least ddp_threshold
  * bytes.  On success *ittp is the page pod reservation's tag and *arg is the
  * reservation (to be released by icl_cxgbei_conn_task_done()).  Otherwise
  * *ittp is rewritten into a value that cannot be a valid DDP tag and *arg
  * stays NULL.
  *
  * Always returns 0; DDP setup failures only bump the ddp_setup_error counter.
  */
 int
 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct ppod_reservation *prsv;
 	uint32_t itt;
 	int rc = 0;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
 	    csio->dxfer_len < ci->ddp_threshold) {
 no_ddp:
 		/*
 		 * No DDP for this I/O.  Allocate an ITT (based on the one
 		 * passed in) that cannot be a valid hardware DDP tag in the
 		 * iSCSI region.
 		 */
 		itt = *ittp & M_PPOD_TAG;
 		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
 		*ittp = htobe32(itt);
 		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
 		/* rc is non-zero only when arriving here via a failed setup. */
 		if (rc != 0)
 			counter_u64_add(ci->ddp_setup_error, 1);
 		return (0);
 	}
 
 	/*
 	 * Reserve resources for DDP, update the itt that should be used in the
 	 * PDU, and save DDP specific state for this I/O in *arg.
 	 */
 
 	prsv = uma_zalloc(prsv_zone, M_NOWAIT);
 	if (prsv == NULL) {
 		rc = ENOMEM;
 		goto no_ddp;
 	}
 
 	/* XXX add support for all CAM_DATA_ types */
 	MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR);
 	rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
 	    csio->dxfer_len, prsv);
 	if (rc != 0) {
 		uma_zfree(prsv_zone, prsv);
 		goto no_ddp;
 	}
 
 	rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, prsv,
 	    (vm_offset_t)csio->data_ptr, csio->dxfer_len);
 	if (rc != 0) {
 		/* Undo the allocation above before falling back. */
 		t4_free_page_pods(prsv);
 		uma_zfree(prsv_zone, prsv);
 		goto no_ddp;
 	}
 
 	*ittp = htobe32(prsv->prsv_tag);
 	*arg = prsv;
 	counter_u64_add(ci->ddp_setup_ok, 1);
 	return (0);
 }
 
 /*
  * icl_conn_task_done method: release the page pod reservation that
  * icl_cxgbei_conn_task_setup() stored in arg.  A NULL arg means no DDP state
  * was set up for the I/O and there is nothing to do.
  */
 void
 icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 {
 	struct ppod_reservation *prsv = arg;
 
 	if (prsv == NULL)
 		return;
 
 	t4_free_page_pods(prsv);
 	uma_zfree(prsv_zone, prsv);
 }
 
 /*
  * Accessors for the per-I/O private slots used by this file:
  * CTL_PRIV_FRONTEND is read as the request PDU and CTL_PRIV_FRONTEND2 holds
  * the page pod reservation.  Both only need an argument that has an io_hdr
  * member.
  *
  * XXXNP: PDU should be passed in as parameter, like on the initiator.
  */
 #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr)
 #define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
 
 /*
  * icl_conn_transfer_setup method: choose the TTT for a transfer and, when
  * possible, set up (or keep using) DDP for the I/O.
  *
  * At the start of an I/O (ext_data_filled == 0) DDP is attempted if enough
  * data remains after the first burst and the buffer is a single contiguous
  * region; on success the reservation is stored in the I/O, *tttp is its tag
  * and *arg is the ctl_scsiio.  In the middle of an I/O that already has a
  * reservation, the alias bits of the tag are advanced and the updated tag is
  * returned.  In every other case *tttp is rewritten into a value that cannot
  * be a valid DDP tag.
  *
  * Always returns 0; DDP setup failures only bump the ddp_setup_error counter.
  */
 int
 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
     uint32_t *tttp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct ctl_scsiio *ctsio = &io->scsiio;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct ppod_reservation *prsv;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if (ctsio->ext_data_filled == 0) {
 		int first_burst;
 		struct icl_pdu *ip = io_to_request_pdu(io);
 		vm_offset_t buf;
 #ifdef INVARIANTS
 		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 		MPASS(ic == ip->ip_conn);
 		MPASS(ip->ip_bhs_mbuf != NULL);
 #endif
 		/* Data that already arrived with the request PDU. */
 		first_burst = icl_pdu_data_segment_length(ip);
 
 		/*
 		 * Note that ICL calls conn_transfer_setup even if the first
 		 * burst had everything and there's nothing left to transfer.
 		 */
 		MPASS(ctsio->kern_data_len >= first_burst);
 		xferlen = ctsio->kern_data_len;
 		if (xferlen - first_burst < ci->ddp_threshold) {
 no_ddp:
 			/*
 			 * No DDP for this transfer.  Allocate a TTT (based on
 			 * the one passed in) that cannot be a valid hardware
 			 * DDP tag in the iSCSI region.
 			 */
 			ttt = *tttp & M_PPOD_TAG;
 			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
 			*tttp = htobe32(ttt);
 			MPASS(io_to_ppod_reservation(io) == NULL);
 			/* rc is non-zero only after a failed DDP setup. */
 			if (rc != 0)
 				counter_u64_add(ci->ddp_setup_error, 1);
 			return (0);
 		}
 
 		/* Only a flat buffer or a one-entry S/G list is handled. */
 		if (ctsio->kern_sg_entries == 0)
 			buf = (vm_offset_t)ctsio->kern_data_ptr;
 		else if (ctsio->kern_sg_entries == 1) {
 			struct ctl_sg_entry *sgl = (void *)ctsio->kern_data_ptr;
 
 			MPASS(sgl->len == xferlen);
 			buf = (vm_offset_t)sgl->addr;
 		} else {
 			rc = EAGAIN;	/* XXX implement */
 			goto no_ddp;
 		}
 
 
 		/*
 		 * Reserve resources for DDP, update the ttt that should be used
 		 * in the PDU, and save DDP specific state for this I/O.
 		 */
 
 		MPASS(io_to_ppod_reservation(io) == NULL);
 		prsv = uma_zalloc(prsv_zone, M_NOWAIT);
 		if (prsv == NULL) {
 			rc = ENOMEM;
 			goto no_ddp;
 		}
 
 		rc = t4_alloc_page_pods_for_buf(pr, buf, xferlen, prsv);
 		if (rc != 0) {
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid,
 		    prsv, buf, xferlen);
 		if (rc != 0) {
 			/* Undo the allocation above before falling back. */
 			t4_free_page_pods(prsv);
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
 		}
 
 		*tttp = htobe32(prsv->prsv_tag);
 		io_to_ppod_reservation(io) = prsv;
 		*arg = ctsio;
 		counter_u64_add(ci->ddp_setup_ok, 1);
 		return (0);
 	}
 
 	/*
 	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
 	 * that a DDP buffer is being used for the I/O.
 	 */
 
 	prsv = io_to_ppod_reservation(ctsio);
 	if (prsv == NULL)
 		goto no_ddp;
 
 	/* Bump the alias field of the tag, wrapping within its mask. */
 	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
 	alias++;
 	prsv->prsv_tag &= ~pr->pr_alias_mask;
 	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;
 
 	*tttp = htobe32(prsv->prsv_tag);
 	*arg = ctsio;
 
 	return (0);
 }
 
 /*
  * icl_conn_transfer_done method.  arg is the ctl_scsiio that
  * icl_cxgbei_conn_transfer_setup() stored, or NULL if DDP was not in use.
  * The page pod reservation is released only once all of the I/O's data has
  * been filled in (kern_data_len == ext_data_filled).
  */
 void
 icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
 {
 	struct ctl_scsiio *ctsio = arg;
 	struct ppod_reservation *prsv;
 
 	if (ctsio == NULL || ctsio->kern_data_len != ctsio->ext_data_filled)
 		return;
 
 	prsv = io_to_ppod_reservation(ctsio);
 	MPASS(prsv != NULL);
 
 	t4_free_page_pods(prsv);
 	uma_zfree(prsv_zone, prsv);
 }
 
 /*
  * t4_iterate() callback: clip the data segment limits in arg (a struct
  * icl_drv_limits) to what adapter sc can handle.
  *
  * Adapters on which the iSCSI ULD is not active, or for which the
  * synchronized op can't be started, leave the limits untouched.
  */
 static void
 cxgbei_limits(struct adapter *sc, void *arg)
 {
 	struct icl_drv_limits *idl = arg;
 	struct cxgbei_data *ci;
 	int rx_dsl, tx_dsl;
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
 		return;
 
 	if (!uld_active(sc, ULD_ISCSI))
 		goto done;
 
 	ci = sc->iscsi_ulp_softc;
 	MPASS(ci != NULL);
 
 	/*
 	 * AHS is not supported by the kernel so we'll not account for
 	 * it either in our PDU len -> data segment len conversions.
 	 */
 
 	rx_dsl = ci->max_rx_pdu_len - ISCSI_BHS_SIZE -
 	    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
 	if (idl->idl_max_recv_data_segment_length > rx_dsl)
 		idl->idl_max_recv_data_segment_length = rx_dsl;
 
 	tx_dsl = ci->max_tx_pdu_len - ISCSI_BHS_SIZE -
 	    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
 	if (idl->idl_max_send_data_segment_length > tx_dsl)
 		idl->idl_max_send_data_segment_length = tx_dsl;
 
 done:
 	end_synchronized_op(sc, LOCK_HELD);
 }
 
 /*
  * Limits callback registered with ICL in icl_cxgbei_mod_load().  Starts from
  * the largest data segment length a 24-bit field can express and lets every
  * adapter clip it via cxgbei_limits().  Always returns 0.
  */
 static int
 icl_cxgbei_limits(struct icl_drv_limits *idl)
 {
 	/* Maximum allowed by the RFC.  cxgbei_limits will clip them. */
 	const int rfc_max_dsl = (1 << 24) - 1;
 
 	idl->idl_max_recv_data_segment_length = rfc_max_dsl;
 	idl->idl_max_send_data_segment_length = rfc_max_dsl;
 
 	/* These are somewhat arbitrary. */
 	idl->idl_first_burst_length = 8192;
 	idl->idl_max_burst_length = 2 * 1024 * 1024;
 
 	t4_iterate(cxgbei_limits, idl);
 
 	return (0);
 }
 
 /*
  * Module load hook: create the zone used to track page pod reservations,
  * reset the connection count, and register the "cxgbei" offload with ICL.
  * Returns whatever icl_register() returns.
  */
 int
 icl_cxgbei_mod_load(void)
 {
 
 	/*
 	 * Space to track pagepod reservations.
 	 */
 	prsv_zone = uma_zcreate("Pagepod reservations",
 	    sizeof(struct ppod_reservation), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	refcount_init(&icl_cxgbei_ncons, 0);
 
 	return (icl_register("cxgbei", false, -100, icl_cxgbei_limits,
 	    icl_cxgbei_new_conn));
 }
 
 /*
  * Module unload hook.  Refuses with EBUSY while any connection created by
  * icl_cxgbei_new_conn() still exists; otherwise unregisters the "cxgbei"
  * offload from ICL, destroys the reservation zone, and returns 0.
  */
 int
 icl_cxgbei_mod_unload(void)
 {
 
 	if (icl_cxgbei_ncons != 0)
 		return (EBUSY);
 
 	icl_unregister("cxgbei", false);
 
 	uma_zdestroy(prsv_zone);
 
 	return (0);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 331648)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 331649)
@@ -1,2383 +1,2396 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct toepcb *toep);
 
 static size_t
 aiotx_mbuf_pgoff(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
 }
 
 static vm_page_t *
 aiotx_mbuf_pages(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 	int npages;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
 	return (ab->ps.pages + npages);
 }
 
 /*
  * Build and send the FW_FLOWC_WR work request for this connection's tid.
  *
  * The WR carries a list of (mnemonic, value) parameters.  Six are always
  * present (PFNVFN, CH, PORT, IQID, SNDBUF, MSS).  When ftxp is not NULL the
  * SNDNXT and RCVNXT parameters are added and SNDBUF/MSS come from ftxp
  * (SNDBUF is capped at sc->tt.sndbuf); when ftxp is NULL, SNDBUF and MSS are
  * both sent as 512.  ULP_MODE is appended for TLS connections and
  * TXDATAPLEN_MAX whenever toep->tls.fcplenmax is non-zero.
  *
  * The WR consumes tx credits and one tx descriptor from the toep, and
  * TPF_FLOWC_WR_SENT is set before the WR is handed to t4_wrq_tx().  It is an
  * error (KASSERT) to call this when the flag is already set.  Panics if the
  * WR cannot be allocated.
  */
 void
 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	/*
 	 * Count the parameters first; this must match the FLOWC_PARAM calls
 	 * below exactly (checked by the KASSERT after the list is written).
 	 */
 	if (ftxp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	/* The allocation is rounded up to a multiple of 16 bytes. */
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 /* Append one (mnemonic, big-endian value) pair and advance paramidx. */
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	if (ftxp) {
 		/* Never advertise more send buffer than the adapter tunable. */
 		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
 
 		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
 		FLOWC_PARAM(SNDBUF, sndbuf);
 		FLOWC_PARAM(MSS, ftxp->mss);
 
 		CTR6(KTR_CXGBE,
 		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
 		    ftxp->rcv_nxt);
 	} else {
 		/* No tx parameters supplied: fixed values are sent instead. */
 		FLOWC_PARAM(SNDBUF, 512);
 		FLOWC_PARAM(MSS, 512);
 
 		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
 	}
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	/*
 	 * Account for the WR in the current tx descriptor: it uses one credit
 	 * per 16 bytes and carries no payload.
 	 */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* The tx descriptor array is used as a ring. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  *
  * Binds the connection to a scheduling class that matches the requested rate
  * (or unbinds it when the rate converts to 0 Kbps).  If the class differs from
  * the one currently recorded in toep->tc_idx a FW_FLOWC_WR with a single
  * SCHEDCLASS parameter is sent; this consumes tx credits and a tx descriptor.
  *
  * Returns 0 on success, the error from t4_reserve_cl_rl_kbps() if no class
  * could be reserved, or ENOMEM if the connection lacks the credits, the tx
  * descriptor, or the memory needed for the WR (the newly reserved class is
  * released again in that case).
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	/*
 	 * The casts apply to Bps only.  The multiplication is still carried
 	 * out in 64 bits because of the 8ULL operand, and the quotient is
 	 * narrowed to u_int by the initialization.
 	 */
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}
 
 	/* The firmware only needs to be told when the class actually changes. */
 	if (toep->tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		/*
 		 * Need enough credits, a free tx descriptor, and the WR itself.
 		 * On any shortfall give back the class reserved above.
 		 */
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl_kbps(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		/* 0xff is sent as the class value when unbinding. */
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		/* Charge the WR to the current tx descriptor (no payload). */
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	/* Drop the reference on the previously bound class, if there was one. */
 	if (toep->tc_idx >= 0)
 		t4_release_cl_rl_kbps(sc, port_id, toep->tc_idx);
 	toep->tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 /*
  * Abort the connection by sending a CPL_ABORT_REQ (CPL_ABORT_SEND_RST) for the
  * toep's tid.  Does nothing if an abort is already in progress
  * (TPF_ABORT_SHUTDOWN); otherwise that flag is set here.
  *
  * snd_nxt is only used when the inp has been dropped, in which case the tcpcb
  * must not be consulted; otherwise tp->snd_nxt is used.  The inp must be
  * write-locked by the caller.  Panics if the WR cannot be allocated.
  */
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	/* rsvd0 carries the send sequence number, from whichever source is safe. */
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	/* rsvd1 is 1 when no tx data has been sent on this connection yet. */
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Called when a connection is established to translate the TCP options
  * reported by HW to FreeBSD's native format.
  */
 static void
 assign_rxopt(struct tcpcb *tp, unsigned int opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 	int n;
 
 	INP_LOCK_ASSERT(inp);
 
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 	if (V_tcp_do_rfc1323)
 		n += TCPOLEN_TSTAMP_APPA;
 	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;
 
 	CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
 	    G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);
 
 	if (G_TCPOPT_TSTAMP(opt)) {
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 
 	if (G_TCPOPT_SACK(opt))
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	else
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Doing window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	}
 }
 
 /*
  * Completes some final bits of initialization for just established connections
  * and changes their state to TCPS_ESTABLISHED.
  *
  * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
  *
  * snd_isn, rcv_isn, and opt are converted from big-endian here.  The inp must
  * be write-locked and the connection must be in SYN_SENT or SYN_RECEIVED.
  * The flowc WR for the connection is sent from here and the socket is marked
  * connected at the end.
  */
 void
 make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
     uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	long bufsize;
 	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
 	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
 	uint16_t tcpopt = be16toh(opt);
 	struct flowc_tx_params ftxp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tp->t_state = TCPS_ESTABLISHED;
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	/* Receive side sequence state. */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	/*
 	 * NOTE(review): rx_credits is scaled by 1024 here, so at this point it
 	 * is presumably the opt0 window in 1KB units -- confirm against the
 	 * code that initializes toep->rx_credits.
 	 */
 	tp->rcv_wnd = toep->rx_credits << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
 	 * If we were unable to send all rx credits via opt0, save the remainder
 	 * in rx_credits so that they can be handed over with the next credit
 	 * update.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	bufsize = select_rcv_wnd(so);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	toep->rx_credits = bufsize - tp->rcv_wnd;
 
 	/* Send side sequence state; nothing has been sent past the SYN. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 
 	/*
 	 * Send buffer space to report in the flowc WR: the autosizing maximum
 	 * if the buffer can grow, otherwise the space currently available.
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
 		bufsize = V_tcp_autosndbuf_max;
 	else
 		bufsize = sbspace(&so->so_snd);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	ftxp.snd_nxt = tp->snd_nxt;
 	ftxp.rcv_nxt = tp->rcv_nxt;
 	ftxp.snd_space = bufsize;
 	ftxp.mss = tp->t_maxseg;
 	send_flowc_wr(toep, &ftxp);
 
 	soisconnected(so);
 }
 
 /*
  * Return rx credits to the hardware with a CPL_RX_DATA_ACK sent on the
  * connection's control queue.  The message also requests a delayed-ack mode
  * change (mode 1).
  *
  * Returns the number of credits handed over: 'credits' on success, 0 if the
  * work request could not be allocated.
  */
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 	uint32_t credit_dack;
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	if ((wr = alloc_wrqe(sizeof(*ack), toep->ctrlq)) == NULL)
 		return (0);
 	ack = wrtod(wr);
 
 	credit_dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) |
 	    V_RX_CREDITS(credits);
 
 	INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 	ack->credit_dack = htobe32(credit_dack);
 	t4_wrq_tx(sc, wr);
 
 	return (credits);
 }
 
 /*
  * Send a CPL_RX_DATA_ACK that carries only the F_RX_MODULATE_RX flag (no
  * credits) on the connection's control queue.  Silently does nothing if the
  * work request cannot be allocated.
  */
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*ack), toep->ctrlq);
 	if (wr != NULL) {
 		ack = wrtod(wr);
 
 		INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 		ack->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 		t4_wrq_tx(sc, wr);
 	}
 }
 
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
 	int credits;
 
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 
 	credits = toep->sb_cc - sbused(sb);
 	toep->sb_cc = sbused(sb);
 	if (toep->ulp_mode == ULP_MODE_TLS) {
 		if (toep->tls.rcv_over >= credits) {
 			toep->tls.rcv_over -= credits;
 			credits = 0;
 		} else {
 			credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		}
 	}
 	toep->rx_credits += credits;
 
 	if (toep->rx_credits > 0 &&
 	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
 	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
 	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 /*
  * Wrapper around t4_rcvd_locked() that takes the receive socket buffer lock
  * for the duration of the call.
  */
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct sockbuf *rcvbuf = &tp->t_inpcb->inp_socket->so_rcv;
 
 	SOCKBUF_LOCK(rcvbuf);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(rcvbuf);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_close_con_req *req;
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
         req->wr.wr_lo = cpu_to_be64(0);
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags |= TPF_FIN_SENT;
 	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 /* Most credits one offload tx WR can use: the max WR length in 16B units. */
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 /* Credits needed for the tx data WR header plus one more byte. */
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 
 /*
  * Maximum amount of immediate data we could stuff in a WR.
  *
  * Returns 0 when tx_credits is below MIN_OFLD_TX_CREDITS.  Otherwise the WR
  * is limited to the smaller of two tx descriptors (EQ_ESIZE bytes each) and
  * what the credits pay for (16 bytes per credit), and the result is that
  * size minus the fw_ofld_tx_data_wr header.
  */
 static inline int
 max_imm_payload(int tx_credits)
 {
 	const int max_desc = 2;	/* Use only up to 2 desc for imm. data WR */
 	int wr_bytes;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	wr_bytes = tx_credits >= (max_desc * EQ_ESIZE) / 16 ?
 	    max_desc * EQ_ESIZE : tx_credits * 16;
 
 	return (wr_bytes - sizeof(struct fw_ofld_tx_data_wr));
 }
 
 /*
  * Maximum number of SGL entries we could stuff in a WR.
  *
  * Returns 0 when tx_credits is below MIN_OFLD_TX_CREDITS.  Otherwise one
  * entry is always available (ulptx_sgl has room for 1) and the credits beyond
  * MIN_OFLD_TX_CREDITS are spent on ulp_tx_sge_pair structures: every 24 bytes
  * holds two more entries, and a 16 byte remainder holds one more.
  */
 static inline int
 max_dsgl_nsegs(int tx_credits)
 {
 	int sge_pair_credits, pair_bytes, nseg;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
 	pair_bytes = sge_pair_credits * 16;
 
 	nseg = 1 + 2 * (pair_bytes / 24);
 	if (pair_bytes % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 /*
  * Fill in the fw_ofld_tx_data_wr header at dst.
  *
  * immdlen is the amount of immediate data in the WR, plen the payload length
  * reported to the firmware, and credits the WR's length in 16 byte units.
  * shove and ulp_submode are placed in the flags word together with the
  * connection's ULP mode.  When txalign is positive, either LSO is disabled
  * (small payloads or 10G ports) or payload alignment is requested.
  */
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
     unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
 {
 	struct fw_ofld_tx_data_wr *hdr = dst;
 	struct tcpcb *tp;
 	uint32_t align_flags;
 
 	hdr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	hdr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	hdr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
 	hdr->plen = htobe32(plen);
 
 	if (txalign <= 0)
 		return;
 
 	tp = intotcpcb(toep->inp);
 	if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi)) {
 		align_flags = F_FW_OFLD_TX_DATA_WR_LSODISABLE;
 	} else {
 		/* Align the payload; also shove unless TF_NODELAY is set. */
 		align_flags = F_FW_OFLD_TX_DATA_WR_ALIGNPLD;
 		if (!(tp->t_flags & TF_NODELAY))
 			align_flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE;
 	}
 	hdr->lsodisable_to_flags |= htobe32(align_flags);
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  *
  * dst is where the ulptx_sgl is written.  The mbufs from start up to (but not
  * including) stop are walked via m_next.  nsegs is the total segment count
  * for the whole chain and n is the largest segment count of any single mbuf;
  * n sizes the on-stack scratch segment array.  Panics if an mbuf cannot be
  * appended to the scratch sglist.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];	/* scratch space, reused for every mbuf */
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/*
 	 * i indexes the flattened sge[] pair array; -1 denotes the very first
 	 * segment, which lives in len0/addr0 of the ulptx_sgl itself.
 	 */
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		/* Break this one mbuf into physical segments. */
 		if (IS_AIOTX_MBUF(m))
 			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
 			    aiotx_mbuf_pgoff(m), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		/* Copy the segments out; i keeps counting across mbufs. */
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	/*
 	 * An odd i means the last pair written has only its first half in use;
 	 * zero the length of the unused second half.
 	 */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	/* nsegs is only decremented (and checked) under INVARIANTS. */
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  *
  * Each pass of the main loop builds one work request from as many unsent
  * mbufs as the available tx credits allow, either as immediate data (small
  * payloads without AIO tx mbufs) or as a DSGL.  Transmission is suspended
  * (TPF_TX_SUSPENDED) when credits run out or a work request cannot be
  * allocated.  The inp must be write-locked and the flowc WR must have been
  * sent already.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool aiotx_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP ||
 	    toep->ulp_mode == ULP_MODE_TLS ||
 	    toep->ulp_mode == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 #ifdef VERBOSE_TRACES
 	/* Five values are formatted, so this needs CTR5 and must pass drop. */
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	/* Apply a pending pacing rate change; retried later if it fails. */
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		/* What the credits on hand allow for a single WR. */
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		SOCKBUF_LOCK(sb);
 		/* Dropping data frees up space, so writers must be woken up. */
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		/* Start with the first mbuf that hasn't been transmitted yet. */
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		aiotx_mbuf_seen = false;
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			if (IS_AIOTX_MBUF(m))
 				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
 				    aiotx_mbuf_pgoff(m), m->m_len);
 			else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (IS_AIOTX_MBUF(m))
 				aiotx_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		/*
 		 * Ask for a completion when the send buffer is fairly full and
 		 * enough payload has gone out without one.
 		 */
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		/* Grow an autosized send buffer that is nearly full. */
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		/* Either path below releases the sockbuf lock. */
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL,
 			    ("%s: nothing to send, but m != NULL", __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/* Shove only if this WR empties the buffer and no more is coming. */
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !aiotx_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			/* The payload is copied in right after the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			/*
 			 * Header, the ulptx_sgl (which holds the first
 			 * segment), and 8 byte units for the remaining
 			 * nsegs - 1 segments.
 			 */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/* Zero the 8 bytes of padding up to the 16B boundary. */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/* Also ask for a completion when credits are running low. */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		/* Record how far into the send buffer we have transmitted. */
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Charge the WR to a tx descriptor; the array is a ring. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Dequeue and free whole packets from q until plen bytes (by m_pkthdr.len)
  * have been released.  plen is expected to cover an exact number of packets
  * that are actually on the queue; both conditions are asserted.
  */
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *m;
 	int len;
 
 	for (; plen > 0; plen -= len) {
 		m = mbufq_dequeue(q);
 
 		/* Too many credits. */
 		MPASS(m != NULL);
 		M_ASSERTPKTHDR(m);
 
 		/* Partial credits. */
 		MPASS(plen >= m->m_pkthdr.len);
 
 		/* Note the length before the packet goes away. */
 		len = m->m_pkthdr.len;
 		m_freem(m);
 	}
 }
 
 /*
  * Send queued iSCSI PDUs (and a FIN, if one is pending) to the peer.
  *
  * PDUs are taken from toep->ulp_pduq one at a time.  Each PDU goes out whole
  * in a single work request, as immediate data if it is small enough and as a
  * DSGL otherwise, and is then moved to toep->ulp_pdu_reclaimq.  drop is the
  * number of payload bytes to release from the reclaim queue before any new
  * PDUs are sent.
  *
  * Transmission is suspended (TPF_TX_SUSPENDED) when the next PDU doesn't fit
  * in the credits available, when a work request can't be allocated, or when
  * credits fall below MIN_OFLD_TX_CREDITS.  The inp must be write-locked, the
  * flowc WR must have been sent, and the connection must be in iSCSI mode.
  */
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 	/* Extra bytes of sequence space per PDU, indexed by ULP submode. */
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	/* Free the PDUs that the drop count accounts for. */
 	if (drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		M_ASSERTPKTHDR(sndptr);
 
 		/* What the credits on hand allow for a single WR. */
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		/* Measure the whole PDU: payload bytes and SGL segments. */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/*
 			 * This mbuf would send us _over_ the nsegs limit.
 			 * Suspend tx because the PDU can't be sent out.
 			 */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
 		 * is NULL.  A PDU's length is always a multiple of 4.
 		 */
 		MPASS(m == NULL);
 		MPASS((plen & 3) == 0);
 		MPASS(sndptr->m_pkthdr.len == plen);
 
 		shove = !(tp->t_flags & TF_MORETOCOME);
 		ulp_submode = mbuf_ulp_submode(sndptr);
 		MPASS(ulp_submode < nitems(ulp_extra_len));
 
 		/*
 		 * plen doesn't include header and data digests, which are
 		 * generated and inserted in the right places by the TOE, but
 		 * they do occupy TCP sequence space and need to be accounted
 		 * for.
 		 */
 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
 		if (plen <= max_imm) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			/* The payload is copied in right after the WR header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 			/*
 			 * Header, the ulptx_sgl (which holds the first
 			 * segment), and 8 byte units for the remaining
 			 * nsegs - 1 segments.
 			 */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/* Zero the 8 bytes of padding up to the 16B boundary. */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/*
 		 * The PDU is committed: move it to the reclaim queue, where it
 		 * stays until a later call releases it via the drop count.
 		 */
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		/* Ask for a completion when credits are running low. */
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		/* Sequence space advances by the payload plus the digests. */
 		tp->snd_nxt += adjusted_plen;
 		tp->snd_max += adjusted_plen;
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Charge the WR to a tx descriptor; the array is a ring. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Run the transmit push routine that matches the connection's mode: PDUs
  * for ULP_MODE_ISCSI, TLS records when a TLS tx key is present, plain
  * frames otherwise.  Asserts that the inp is write-locked and not dropped.
  * Always returns 0.
  */
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, 0);
 		return (0);
 	}
 	if (tls_tx_key(toep)) {
 		t4_push_tls_records(sc, toep, 0);
 		return (0);
 	}
 
 	t4_push_frames(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Record that a FIN must be sent (TPF_SEND_FIN) and, when the connection
  * has reached at least TCPS_ESTABLISHED, run the mode-specific push
  * routine.  Asserts that the inp is write-locked and not dropped.
  * Always returns 0.
  */
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 
 	/* Nothing is pushed before the connection is established. */
 	if (tp->t_state < TCPS_ESTABLISHED)
 		return (0);
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, 0);
 	} else if (tls_tx_key(toep)) {
 		t4_push_tls_records(sc, toep, 0);
 	} else {
 		t4_push_frames(sc, toep, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Reset the connection by calling send_reset().  Asserts that the inp is
  * write-locked and not dropped, and that the flowc work request has
  * already been sent for this tid.  Always returns 0.
  */
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* A reset is only expected once TPF_FLOWC_WR_SENT is set. */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  *
  * Handler for CPL_PEER_CLOSE.  Looks up the toepcb for the tid carried in
  * the CPL, accounts for the FIN in rcv_nxt, marks the socket as unable to
  * receive more data and advances the TCP state machine.  No payload mbuf is
  * expected (m must be NULL).  Always returns 0.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	/*
 	 * The tid still refers to a synq entry rather than a full toepcb.
 	 * Apart from the INVARIANTS sanity check and the trace there is
 	 * nothing to do here.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 #ifdef INVARIANTS
 		struct synq_entry *synqe = (void *)toep;
 
 		INP_WLOCK(synqe->lctx->inp);
 		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
 			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 			    ("%s: listen socket closed but tid %u not aborted.",
 			    __func__, tid));
 		} else {
 			/*
 			 * do_pass_accept_req is still running and will
 			 * eventually take care of this tid.
 			 */
 		}
 		INP_WUNLOCK(synqe->lctx->inp);
 #endif
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
 
 	/* An abort is already in progress; leave the connection alone. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	tp->rcv_nxt++;	/* FIN */
 
 	so = inp->inp_socket;
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		/* Only involve DDP if one of its buffers is still active. */
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 	socantrcvmore(so);
 
 	/* The rcv_nxt cross-check against the CPL is skipped for RDMA. */
 	if (toep->ulp_mode != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    		be32toh(cpl->rcv_nxt)));
 	}
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_CLOSE_WAIT;
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tp->t_state = TCPS_CLOSING;
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * The inp lock is asserted to be released once tcp_twstart
 		 * returns.  It is taken again for final_cpl_received, which
 		 * is not followed by an unlock here -- presumably it drops
 		 * the lock itself (confirm against its definition).
 		 */
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  *
  * Handler for CPL_CLOSE_CON_RPL.  Updates snd_una from the CPL (excluding
  * the FIN) and advances the TCP state machine.  No payload mbuf is expected
  * (m must be NULL).  Always returns 0.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	/* An abort is already in progress; leave the connection alone. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		tcp_twstart(tp);
 		/*
 		 * Shared exit for the states that finish the connection.  The
 		 * inp lock must not be held on arrival here; it is taken again
 		 * just for final_cpl_received.
 		 */
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		/*
 		 * A non-NULL return from tcp_close means the inp is still
 		 * locked, so drop the lock before joining the release path.
 		 */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		/* The receive side is already shut; nothing left to wait for. */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tp->t_state = TCPS_FIN_WAIT_2;
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Build a CPL_ABORT_RPL for the given tid, carrying rst_status in its cmd
  * field, and transmit it on ofld_txq.  Panics if the work request cannot
  * be allocated.
  */
 void
 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
     int rst_status)
 {
 	struct cpl_abort_rpl *rpl;
 	struct wrqe *abort_wr;
 
 	abort_wr = alloc_wrqe(sizeof(*rpl), ofld_txq);
 	if (abort_wr == NULL)
 		panic("%s: allocation failure.", __func__);	/* XXX */
 
 	rpl = wrtod(abort_wr);
 	INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid);
 	rpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, abort_wr);
 }
 
 /*
  * Translate the abort reason reported by the hardware into an errno value.
  * Resets map to EPIPE when the connection is in CLOSE_WAIT and ECONNRESET
  * otherwise, the timeout reasons map to ETIMEDOUT, anything else to EIO.
  */
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 
 	if (abort_reason == CPL_ERR_BAD_SYN ||
 	    abort_reason == CPL_ERR_CONN_RESET) {
 		if (tp->t_state == TCPS_CLOSE_WAIT)
 			return (EPIPE);
 		return (ECONNRESET);
 	}
 
 	if (abort_reason == CPL_ERR_XMIT_TIMEDOUT ||
 	    abort_reason == CPL_ERR_PERSIST_TIMEDOUT ||
 	    abort_reason == CPL_ERR_FINWAIT2_TIMEDOUT ||
 	    abort_reason == CPL_ERR_KEEPALIVE_TIMEDOUT)
 		return (ETIMEDOUT);
 
 	return (EIO);
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  *
  * Handler for CPL_ABORT_REQ_RSS.  Synq entries are handed to
  * do_abort_req_synqe and negative advice is ignored.  Otherwise the socket
  * error is set from the abort status, the tcpcb is closed and a
  * CPL_ABORT_RPL is sent back.  No payload mbuf is expected (m must be
  * NULL).  Returns 0, or whatever do_abort_req_synqe returns.
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	/* Saved up front; used for the reply after final_cpl_received(). */
 	struct sge_wrq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	/* Skip the socket/tcpcb teardown if the inp is already gone. */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		/*
 		 * A NULL return from tcp_close means the inp lock was
 		 * released; take it again because final_cpl_received is
 		 * called with the lock held.
 		 */
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	/* Not followed by an unlock; presumably drops the inp lock itself. */
 	final_cpl_received(toep);
 done:
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  *
  * Handler for CPL_ABORT_RPL_RSS.  Synq entries are handed to
  * do_abort_rpl_synqe.  For a regular toepcb the abort must already have
  * been initiated (TPF_ABORT_SHUTDOWN set) and this reply is treated as the
  * final CPL for the tid.  No payload mbuf is expected (m must be NULL).
  * Returns 0, or whatever do_abort_rpl_synqe returns.
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	/*
 	 * The inp lock is taken for final_cpl_received and not released
 	 * here; presumably final_cpl_received drops it (confirm against its
 	 * definition).
 	 */
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_DATA: payload received on an offloaded connection.
  *
  * The CPL header sits at the front of mbuf m and is stripped before the
  * payload is appended to the socket's receive buffer.  Updates rcv_nxt and
  * rcv_wnd, performs receive buffer autosizing, handles DDP mode changes for
  * ULP_MODE_TCPDDP connections and returns rx credits to the hardware when
  * appropriate.  The mbuf is either appended to the socket buffer or freed
  * on every path.  Always returns 0.
  */
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	int len;
 	uint32_t ddp_placed = 0;
 
 	/* Data for a tid that is still a synq entry is discarded. */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 #ifdef INVARIANTS
 		struct synq_entry *synqe = (void *)toep;
 
 		INP_WLOCK(synqe->lctx->inp);
 		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
 			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 			    ("%s: listen socket closed but tid %u not aborted.",
 			    __func__, tid));
 		} else {
 			/*
 			 * do_pass_accept_req is still running and will
 			 * eventually take care of this tid.
 			 */
 		}
 		INP_WUNLOCK(synqe->lctx->inp);
 #endif
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	/* The connection is already gone or in TIME_WAIT; drop the data. */
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * A sequence number ahead of rcv_nxt is recorded as ddp_placed and
 	 * later handed to insert_ddp_data if the connection fell out of DDP
 	 * mode.
 	 */
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	/* Receiving more than the window is only tolerated for RDMA. */
 	if (tp->rcv_wnd < len) {
 		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	/* Lock order used here: inp, then DDP (if applicable), then so_rcv. */
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/*
 	 * The receive side of the socket has been shut down.  Free the data,
 	 * release every lock, then drop the connection with ECONNRESET under
 	 * the tcbinfo read lock and a freshly acquired inp lock.
 	 */
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ulp_mode == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		/* Unlock only if tcp_drop handed back a tcpcb. */
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/*
 		 * Stop autosizing if the reservation fails; otherwise the
 		 * growth in buffer size is added to the pending rx credits.
 		 */
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		/* Nonzero when the CPL's ddp_off disagrees with DDP_ON. */
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			/*
 			 * A pending state-change request (DDP_SC_REQ) explains
 			 * the change: toggle DDP_ON and clear the request.
 			 */
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	/*
 	 * sb_cc holds the socket buffer usage recorded the last time through
 	 * here.  Whatever has been consumed since then becomes rx credits.
 	 */
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
 	toep->sb_cc = sbused(sb);
 	/* Return credits once buffered data plus window falls below lowat. */
 	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
 		int credits;
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	/* sorwakeup_locked releases the so_rcv lock (asserted just below). */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Field accessors for CPL_FW4_ACK.  For each field FOO: S_FOO is the bit
  * offset, M_FOO is the mask (applied after shifting), V_FOO(x) shifts a
  * value into position and G_FOO(x) extracts the field from a word.  F_FOO
  * is the single-bit flag form, V_FOO(1U).
  */
 #define S_CPL_FW4_ACK_OPCODE    24
 #define M_CPL_FW4_ACK_OPCODE    0xff
 #define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE)
 #define G_CPL_FW4_ACK_OPCODE(x) \
     (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE)
 
 #define S_CPL_FW4_ACK_FLOWID    0
 #define M_CPL_FW4_ACK_FLOWID    0xffffff
 #define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID)
 #define G_CPL_FW4_ACK_FLOWID(x) \
     (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID)
 
 #define S_CPL_FW4_ACK_CR        24
 #define M_CPL_FW4_ACK_CR        0xff
 #define V_CPL_FW4_ACK_CR(x)     ((x) << S_CPL_FW4_ACK_CR)
 #define G_CPL_FW4_ACK_CR(x)     (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR)
 
 #define S_CPL_FW4_ACK_SEQVAL    0
 #define M_CPL_FW4_ACK_SEQVAL    0x1
 #define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL)
 #define G_CPL_FW4_ACK_SEQVAL(x) \
     (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL)
 #define F_CPL_FW4_ACK_SEQVAL    V_CPL_FW4_ACK_SEQVAL(1U)
 
 /*
  * Handler for CPL_FW4_ACK: the firmware is returning tx credits for work
  * requests sent earlier on this tid.
  *
  * Optionally updates snd_una from the CPL, walks the tx descriptor ring
  * from txsd_cidx giving back the credits and totalling the payload bytes
  * (plen) they covered, then either resumes a suspended transmit path or
  * drops the acknowledged bytes from the send buffer / PDU reclaim queue.
  * No payload mbuf is expected (m must be NULL).  Always returns 0.
  */
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	/* The connection is being aborted; the credits are ignored. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	/* The CPL carries a valid snd_una only when the SEQVAL flag is set. */
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	/*
 	 * Consume tx descriptors in order until the returned credits are used
 	 * up.  Credits are expected to cover whole descriptors only.
 	 */
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		if (txsd->iv_buffer) {
 			free(txsd->iv_buffer, M_CXGBE);
 			txsd->iv_buffer = NULL;
 		}
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		/* Wrap the consumer index at the end of the ring. */
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	/* Every credit is back, so nothing is awaiting a completion. */
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	/*
 	 * Resume a suspended transmit path once at least a quarter of the
 	 * credits are available.  The push routine is given plen in this
 	 * case; otherwise the acknowledged bytes are dropped here.
 	 */
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		if (toep->ulp_mode == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, plen);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, plen);
 		else
 			t4_push_frames(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (toep->ulp_mode == ULP_MODE_ISCSI) {
 
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the tid's ULP
 				 * mode changed to ISCSI is still in so_snd.
 				 * Incoming credits should account for so_snd
 				 * first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			/* The remainder is reclaimed from the PDU queue. */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			/* Keep the TLS send buffer offset in step. */
 			if (tls_tx_key(toep)) {
 				struct tls_ofld_info *tls_ofld = &toep->tls;
 
 				MPASS(tls_ofld->sb_off >= plen);
 				tls_ofld->sb_off -= plen;
 			}
 			/* Space was freed; let queued AIO writes make progress. */
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_SET_TCB_RPL.  Replies for ULP_MODE_TCPDDP connections
  * are passed to handle_ddp_tcb_rpl.  Any other reply is unexpected: it
  * panics under INVARIANTS and is logged otherwise.  No payload mbuf is
  * expected (m must be NULL) and the CPL must not arrive on the firmware
  * queue.  Returns 0.
  */
 int
 do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_SET_TCB_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	MPASS(iq != &sc->sge.fwq);
 
 	toep = lookup_tid(sc, tid);
 	if (toep->ulp_mode != ULP_MODE_TCPDDP) {
 		/*
 		 * Only DDP requests replies to CPL_SET_TCB or
 		 * CPL_SET_TCB_FIELD today.  If TOM or another ULP starts
 		 * asking for them, their dispatch belongs here.
 		 */
 #ifdef INVARIANTS
 		panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p",
 		    __func__, tid, iq);
 #else
 		log(LOG_ERR,
 		    "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n",
 		    __func__, tid, iq);
 #endif
 		return (0);
 	}
 
 	handle_ddp_tcb_rpl(toep, cpl);
 	return (0);
 }
 
 /*
  * Build a CPL_SET_TCB_FIELD work request that sets the bits selected by
  * mask in the given TCB word to val, and transmit it on wrq.  A reply is
  * suppressed (F_NO_REPLY) when reply is 0; cookie must fit in M_COOKIE.
  * Panics if the work request cannot be allocated.
  *
  * In the revised version the tid and the reply queue are taken from toep,
  * and when wrq is an offload queue (EQ_OFLD) the request also consumes tx
  * credits and a tx descriptor with plen 0 -- presumably so that the credits
  * returned for it in do_fw4_ack match a descriptor.
  */
 void
-t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, int tid,
-    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie, int iqid)
+t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
+    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct wrqe *wr;
 	struct cpl_set_tcb_field *req;
+	struct ofld_tx_sdesc *txsd;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
-	MPASS((iqid & ~M_QUEUENO) == 0);
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
-	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid);
-	req->reply_ctrl = htobe16(V_QUEUENO(iqid));
+	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
+	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (reply == 0)
 		req->reply_ctrl |= htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
+	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
+		txsd = &toep->txsd[toep->txsd_pidx];
+		txsd->tx_credits = howmany(sizeof(*req), 16);
+		txsd->plen = 0;
+		KASSERT(toep->tx_credits >= txsd->tx_credits &&
+		    toep->txsd_avail > 0,
+		    ("%s: not enough credits (%d)", __func__,
+		    toep->tx_credits));
+		toep->tx_credits -= txsd->tx_credits;
+		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+			toep->txsd_pidx = 0;
+		toep->txsd_avail--;
+	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack);
 }
 
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, NULL);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_cpl_handler(CPL_FW4_ACK, NULL);
 }
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * sent by the AIO job so far and the 'backend4' field to hold an
  * error that should be reported when the job is completed.
  */
 #define	aio_sent	backend3
 #define	aio_error	backend4
 
 /*
  * tid of the offloaded connection behind an AIO job.  The job's file data
  * (fd_file->f_data) is used as the socket, whose tcpcb's t_toe is the
  * toepcb.
  */
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 	
 static void
 free_aiotx_buffer(struct aiotx_buffer *ab)
 {
 	struct kaiocb *job;
 	long status;
 	int error;
 
 	if (refcount_release(&ab->refcount) == 0)
 		return;
 
 	job = ab->job;
 	error = job->aio_error;
 	status = job->aio_sent;
 	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
 	free(ab, M_CXGBE);
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, status, error);
 #endif
 	if (error == ECANCELED && status != 0)
 		error = 0;
 	if (error == ECANCELED)
 		aio_cancel(job);
 	else if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, status, 0);
 }
 
 /*
  * External-storage free routine for AIO transmit mbufs: releases the
  * reference on the aiotx buffer stored in the mbuf's ext_arg1.
  */
 static void
 t4_aiotx_mbuf_free(struct mbuf *m)
 {
 	struct aiotx_buffer *buf;
 
 	buf = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(buf->job));
 #endif
 	free_aiotx_buffer(buf);
 }
 
 /*
  * Hold the user pages backing an AIO request and attach a freshly
  * allocated AIO transmit buffer describing them to the job (backend1).
  * Returns 0 on success or EFAULT if the pages could not be held.
  */
 static int
 hold_aio(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	vm_map_t map;
 	vm_offset_t uva, first, last, pgoff;
 	int npages;
 
 	MPASS(job->backend1 == NULL);
 
 	/*
 	 * p_vmspace should be stable here: the AIO subsystem cancels and
 	 * drains all requests before it lets a process exit or exec.
 	 */
 	map = &job->userproc->p_vmspace->vm_map;
 
 	/* Page-align the user buffer and count the pages it spans. */
 	uva = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = uva & PAGE_MASK;
 	first = trunc_page(uva);
 	last = round_page(uva + job->uaiocb.aio_nbytes);
 	npages = atop(last - first);
 
 	/* The page pointer array lives directly after the buffer header. */
 	ab = malloc(sizeof(*ab) + npages * sizeof(vm_page_t), M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	refcount_init(&ab->refcount, 1);
 	ab->ps.pages = (vm_page_t *)(ab + 1);
 	ab->ps.npages = vm_fault_quick_hold_pages(map, first, last - first,
 	    VM_PROT_WRITE, ab->ps.pages, npages);
 	if (ab->ps.npages < 0) {
 		free(ab, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ab->ps.npages == npages,
 	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages,
 	    npages));
 
 	ab->ps.offset = pgoff;
 	ab->ps.len = job->uaiocb.aio_nbytes;
 	ab->job = job;
 	job->backend1 = ab;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
 #endif
 	return (0);
 }
 
 /*
  * Send as much of an AIO write job as the socket's send buffer permits.
  *
  * Entered with the so_snd sockbuf lock held; the lock is released for the
  * duration of the work and is held again on every return path.  The job's
  * user pages are held on first use (hold_aio) and handed to the socket as
  * external mbuf storage.  A job that cannot proceed is put back at the head
  * of toep->aiotx_jobq; otherwise it is completed, failed or cancelled
  * through free_aiotx_buffer / aio_complete.
  */
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct adapter *sc;
 	struct sockbuf *sb;
 	struct file *fp;
 	struct aiotx_buffer *ab;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error;
 	bool moretocome, sendmore;
 
 	sc = td_adapter(toep->td);
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	fp = job->fd_file;
 	ab = job->backend1;
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(fp->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* First pass for this job: hold its pages and create the buffer. */
 	if (ab == NULL) {
 		error = hold_aio(job);
 		if (error != 0)
 			goto out;
 		ab = job->backend1;
 	}
 
 	/* Inline sosend_generic(). */
 
 	job->msgsnd = 1;
 
 	error = sblock(sb, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	/* Allocated up front; freed at 'out' if it ends up unused. */
 	m = m_get(M_WAITOK, MT_DATA);
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			sbunlock(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than a
 	 * single sndbuf at a time.
 	 */
 	m->m_len = sbspace(sb);
 	if (m->m_len > ab->ps.len - job->aio_sent) {
 		m->m_len = ab->ps.len - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (m->m_len > sc->tt.sndbuf) {
 		m->m_len = sc->tt.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	/* Other jobs are waiting, so more data will follow this one. */
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(m->m_len != 0);
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		sbunlock(sb);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	/*
 	 * The mbuf takes its own reference on the aiotx buffer; it is
 	 * released by t4_aiotx_mbuf_free.  The job's current offset
 	 * (aio_sent) is passed as the second external argument.
 	 */
 	refcount_acquire(&ab->refcount);
 	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
 	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
 	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
 	job->aio_sent += m->m_len;
 	
 	sbappendstream(sb, m, 0);
 	/* The socket buffer owns the mbuf now. */
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tp->t_fb->tfb_tcp_output(tp);
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	sbunlock(sb);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If the request has not been fully completed, requeue it until
 	 * the socket is ready again.
 	 *
 	 * NOTE(review): the original comment said "non-blocking socket",
 	 * but the condition below requires SS_NBIO to be clear, i.e. it
 	 * requeues only for a blocking socket.  Confirm which is intended.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		/* Return with the so_snd lock held, as on the 'out' path. */
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop a reference on
 	 * the aiotx buffer.  Any mbufs in flight should still
 	 * contain a reference, but this drops the reference that the
 	 * job owns while it is waiting to queue mbufs to the socket.
 	 */
 	free_aiotx_buffer(ab);
 
 out:
 	if (error) {
 		if (ab != NULL) {
 			/* Reported when the last buffer reference is dropped. */
 			job->aio_error = error;
 			free_aiotx_buffer(ab);
 		} else {
 			MPASS(job->aio_sent == 0);
 			aio_complete(job, -1, error);
 		}
 	}
 	if (m != NULL)
 		m_free(m);
 	SOCKBUF_LOCK(sb);
 }
 
 /*
  * Task handler that drains the toepcb's queue of AIO write jobs while the
  * socket is writeable.  context is the toepcb.  The so_snd lock is held
  * around the loop; t4_aiotx_process_job drops and retakes it internally.
  */
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct kaiocb *job;
 
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		/*
 		 * Skip the job if its cancel function could not be cleared;
 		 * presumably a cancellation is in progress and will finish it.
 		 */
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_task_active = false;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	CURVNET_RESTORE();
 
 	/* Pairs with the hold_toepcb() done when the task was queued. */
 	free_toepcb(toep);
 }
 
 /*
  * Schedule the AIO transmit task for this toepcb unless it is already
  * marked active.  Must be called with the so_snd lock held.  A reference
  * on the toepcb is taken for the queued task.
  */
 static void
 t4_aiotx_queue_toep(struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
 #endif
 	if (!toep->aiotx_task_active) {
 		toep->aiotx_task_active = true;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->aiotx_task);
 	}
 }
 
 /*
  * Cancel routine installed on queued AIO write jobs.  Removes the job from
  * the toepcb's job queue (unless its cancel function was already cleared)
  * and then finishes it: through free_aiotx_buffer if the job has an aiotx
  * buffer, or with aio_cancel otherwise.
  */
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	/* The job queue is manipulated under the so_snd lock. */
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * With a buffer attached, drop the job's reference and let
 	 * free_aiotx_buffer finish the job once the last reference is gone.
 	 */
 	ab = job->backend1;
 	if (ab != NULL)
 		free_aiotx_buffer(ab);
 	else
 		aio_cancel(job);
 }
 
 /*
  * Queue an AIO job for zero-copy transmit on an offloaded socket.
  * Returns EOPNOTSUPP for anything other than a write, when tx_zcopy is
  * disabled on the adapter, or when the connection has a TLS tx key;
  * returns 0 once the job has been queued.
  */
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* Only plain (non-TLS) writes with zero-copy tx enabled qualify. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE || !sc->tt.tx_zcopy ||
 	    tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 
 	/* Start the task right away if the socket can take data now. */
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	return (0);
 }
 
 /*
  * Set up the AIO transmit state of a toepcb: the task that runs
  * t4_aiotx_task on it and its (empty) queue of pending jobs.
  */
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 	TAILQ_INIT(&toep->aiotx_jobq);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 331648)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 331649)
@@ -1,1963 +1,1961 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 /*
  * Release the resources held by a page set and hand the structure itself
  * to the orphan task for final destruction.
  *
  * 'td' is not referenced by this function.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	/* Give back any page pods reserved for this buffer. */
 	if (ps->prsv.prsv_nppods > 0)
 		t4_free_page_pods(&ps->prsv);
 
 	/* Wired page sets are unwired page by page; others are unheld. */
 	if (ps->flags & PS_WIRED) {
 		for (i = 0; i < ps->npages; i++) {
 			p = ps->pages[i];
 			vm_page_lock(p);
 			vm_page_unwire(p, PQ_INACTIVE);
 			vm_page_unlock(p);
 		}
 	} else
 		vm_page_unhold_pages(ps->pages, ps->npages);
 	/*
 	 * The vmspace reference and the memory are released later by
 	 * ddp_free_orphan_pagesets(), which runs from taskqueue_thread.
 	 */
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 /*
  * Dispose of a page set that a DDP buffer no longer needs.
  *
  * A wired page set on a connection that is not marked DDP_DEAD goes to
  * the head of the connection's cache of page sets; anything else is
  * released with free_pageset().  The caller must hold the DDP lock.
  */
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/* Not cacheable: the connection is going away or ps is not wired. */
 	if ((toep->ddp.flags & DDP_DEAD) || !(ps->flags & PS_WIRED)) {
 		free_pageset(toep->td, ps);
 		return;
 	}
 
 	KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
 	    nitems(toep->ddp.db), ("too many wired pagesets"));
 	TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
 	toep->ddp.cached_count++;
 }
 
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 /*
  * Release whatever is still attached to one DDP buffer slot: complete the
  * AIO job bound to it (if this code still owns the job) and free the
  * slot's page set.
  *
  * 'td' is passed through to free_pageset().
  */
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		/*
 		 * Complete the job only when the cancel function was
 		 * successfully cleared.  When clearing fails a cancel is
 		 * already in flight and the cancel routine is responsible
 		 * for finishing the job (the same convention is used by
 		 * handle_ddp_data() and handle_ddp_close()); completing it
 		 * here as well would finish the job twice.
 		 */
 		if (aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 /*
  * Set up the per-connection DDP state: the DDP mutex, the initial flags
  * (DDP_OK) with no active buffer (active_id of -1), the queue of waiting
  * AIO jobs and the task that runs aio_ddp_requeue_task() for this toep.
  */
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
 	toep->ddp.flags = DDP_OK;
 	toep->ddp.active_id = -1;
 	TAILQ_INIT(&toep->ddp.aiojobq);
 	TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
 }
 
 /*
  * Tear down the per-connection DDP state set up by ddp_init_toep().  The
  * only resource that needs explicit destruction here is the DDP mutex.
  */
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp.lock);
 }
 
 /*
  * Drop every DDP resource held by a connection.
  *
  * Under the DDP lock this marks the connection DDP_DEAD, releases both DDP
  * buffer slots, frees every cached page set and finally hands all
  * remaining jobs to ddp_complete_all() with an error of 0.
  */
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	/*
 	 * DDP_DEAD is a DDP flag and lives in toep->ddp.flags; that is the
 	 * word recycle_pageset() tests to decide whether a page set may be
 	 * cached.  Setting it in toep->flags would leave the connection
 	 * looking alive to the DDP code and corrupt the generic toep flags.
 	 */
 	toep->ddp.flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		free_ddp_buffer(toep->td, &toep->ddp.db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only sanity check that a connection holds no DDP state: the
  * DDP task is not active, neither buffer slot has a job or a page set, and
  * both the page set cache and the queue of waiting AIO jobs are empty.
  */
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		MPASS(toep->ddp.db[i].job == NULL);
 		MPASS(toep->ddp.db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
 }
 #endif
 
 /*
  * Retire DDP buffer 'db' (slot 'db_idx') once its job has been dealt with.
  *
  * Updates the active buffer bookkeeping, detaches the job and page set
  * from the slot (the page set is recycled or freed) and clears the slot's
  * DDP_BUFx_ACTIVE flag.  The job itself is not completed here; callers do
  * that before calling this function.
  */
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp.active_count--;
 	if (toep->ddp.active_id == db_idx) {
 		/*
 		 * The buffer being retired was the active one: either no
 		 * buffer remains active, or the other slot takes over.
 		 */
 		if (toep->ddp.active_count == 0) {
 			KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp.active_id = -1;
 		} else
 			toep->ddp.active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 #endif
 	} else {
 		/* Retiring the non-active slot; the other must still be live. */
 		KASSERT(toep->ddp.active_count != 0 &&
 		    toep->ddp.active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp.flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 	toep->ddp.flags &= ~db_flag;
 }
 
 /* XXX: handle_ddp_data code duplication */
 /*
  * Account for 'n' bytes that were placed into DDP buffers before the
  * connection fell out of DDP mode.
  *
  * Advances rcv_nxt (and, without USE_DDP_RX_FLOW_CONTROL, shrinks the
  * receive window and adds rx credits), then walks the active DDP buffers
  * in order, attributing as much of 'n' to each job as the job can still
  * hold, and retires every active buffer.  The caller must hold both the
  * inpcb write lock and the DDP lock.
  */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_flag, db_idx;
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += n;
 #endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* Clamp to the room left in this job's buffer. */
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			/* No data at all: put the job back on the wait queue. */
 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 			toep->ddp.waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	/* Every placed byte must have been attributed to some job. */
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 /*
  * Write one SET_TCB_FIELD command, wrapped as a ULP_TX_PKT master command
  * with immediate data, at 'ulpmc'.
  *
  * 'word', 'mask' and 'val' select the TCB word and the bits to update.
  * The request asks for no reply and uses a cookie of 0.  Returns a pointer
  * just past the command, after any NOOP sub-command emitted as padding
  * when the command length is not a multiple of 16 bytes.
  */
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	/* Immediate-data sub-command header announcing the CPL that follows. */
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
         req->mask = htobe64(mask);
         req->val = htobe64(val);
 
 	/* Pad with a NOOP sub-command if the command is not 16B aligned. */
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 /*
  * Write one RX_DATA_ACK command (with F_RX_MODULATE_RX set), wrapped as a
  * ULP_TX_PKT master command with immediate data, at 'ulpmc'.
  *
  * Returns a pointer just past the command, after any NOOP sub-command
  * emitted as padding when the command length is not a multiple of 16
  * bytes.
  */
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_rx_data_ack_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	/* Immediate-data sub-command header announcing the CPL that follows. */
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	/* Pad with a NOOP sub-command if the command is not 16B aligned. */
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 /*
  * Build (but do not send) the work request that programs DDP buffer
  * 'db_idx' (0 or 1) of a connection from page set 'ps'.
  *
  * The request updates the buffer's tag, its starting 'offset' and total
  * length, and the DDP flags selected by 'ddp_flags_mask' to 'ddp_flags',
  * followed by an RX_DATA_ACK.  Returns the allocated work request, or NULL
  * if alloc_wrqe() fails.  'sc' is not referenced by this function.
  */
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 /*
  * Common handler for DDP completion reports (RX_DATA_DDP and
  * RX_DDP_COMPLETE) on a non-iSCSI connection.
  *
  * 'ddp_report' and 'rcv_nxt' are the big-endian fields from the CPL and
  * 'len' is the length reported by the CPL (0 for RX_DDP_COMPLETE).  The
  * function takes the inpcb write lock and the DDP lock itself, updates the
  * TCP receive state, credits the data to the job bound to the reported
  * buffer and, unless a cancel is pending on that buffer, retires the
  * buffer.  Always returns 0.
  */
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	/* Only reports that invalidate the buffer are supported. */
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp.active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp.active_id, toep->tid));
 	db = &toep->ddp.db[db_idx];
 	job = db->job;
 
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: DDP[%d] placed %d bytes (%#x)", __func__, db_idx,
 	    len, report);
 #endif
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop autosizing on failure; otherwise credit the growth. */
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %d)",
 		    __func__, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	/* Retire the buffer and restart queued jobs if any are waiting. */
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp.waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * React to a DDP "indicate" for a connection that has no active DDP
  * buffers.  If AIO jobs are waiting, the connection is queued so that they
  * get processed.  The caller must hold the DDP lock.
  */
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp.active_count == 0);
 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 
 	if (toep->ddp.waiting_count != 0) {
 		CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 		    toep->tid, toep->ddp.waiting_count);
 		ddp_queue_toep(toep);
 	}
 
 	/*
 	 * Otherwise the pending requests that triggered the request for an
 	 * indicate were cancelled.  Those cancels should have already
 	 * disabled DDP, and the data is going into the socket buffer
 	 * anyway, so there is nothing to do.
 	 */
 }
 
 /*
  * Cookies recognised by handle_ddp_tcb_rpl() in replies for the
  * W_TCB_RX_DDP_FLAGS word.  The two values must stay consecutive: the
  * handler derives the buffer index as (cookie - DDP_BUF0_INVALIDATED).
  */
 enum {
 	DDP_BUF0_INVALIDATED = 0x2,
 	DDP_BUF1_INVALIDATED
 };
 
 /*
  * Handle a SET_TCB_FIELD reply that reports a DDP buffer invalidation
  * requested as part of cancelling an active job.
  *
  * Panics on a non-success status and on any cookie other than the two
  * DDP_BUFx_INVALIDATED values for W_TCB_RX_DDP_FLAGS.  Otherwise, with the
  * inpcb write lock and the DDP lock held, the job on the invalidated
  * buffer is cancelled (no data received) or completed as a short read,
  * and the buffer is retired.
  */
 void
 handle_ddp_tcb_rpl(struct toepcb *toep, const struct cpl_set_tcb_rpl *cpl)
 {
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		/* The cookie encodes which of the two buffers was invalidated. */
 		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->ddp.db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp.waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 }
 
 /*
  * Flush DDP state when the connection's receive side is closing.
  *
  * 'rcv_nxt' is the big-endian final receive sequence number; the
  * difference from tp->rcv_nxt is the amount of data placed by DDP that has
  * not been accounted for yet.  That data is attributed to the active
  * buffers in order, every active buffer is retired, and all remaining
  * jobs are handed to ddp_complete_all() with an error of 0.  The caller
  * must hold the inpcb write lock and the DDP lock.
  */
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_flag, db_idx;
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
 	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* Clamp to the room left in this job's buffer. */
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	/* Every placed byte must have been attributed to some job. */
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 extern cpl_handler_t t4_cpl_handler[];
 
 /*
  * CPL handler for RX_DATA_DDP.
  *
  * Looks up the connection by tid, panics if the message reports any of
  * the DDP_ERR conditions, forwards the message to the CPL_RX_ISCSI_DDP
  * handler for iSCSI connections and otherwise passes the report, sequence
  * number and length to handle_ddp_data().  Always returns 0.
  */
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	uint32_t vld;
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false(vld & DDP_ERR)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	/* iSCSI connections have their own handler for this message. */
 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 		return (0);
 	}
 
 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
 
 	return (0);
 }
 
 /*
  * CPL handler for RX_DDP_COMPLETE.
  *
  * Looks up the connection by tid and passes the report and rcv_nxt to
  * handle_ddp_data() with a length of 0, so the amount of data is derived
  * from the change in rcv_nxt.  Always returns 0.
  */
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 /*
  * Ask the hardware to switch DDP on for a connection.
  *
  * Marks a state-change request as outstanding (DDP_SC_REQ) and issues two
  * TCB updates on the control queue: one rewriting the DDP flags word and
  * one clearing TF_RCV_COALESCE_ENABLE in the T_FLAGS word.  The connection
  * must be in the DDP_OK state with DDP neither on nor already being
  * changed, and the caller must hold the DDP lock.
  */
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp.flags |= DDP_SC_REQ;
-	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
+	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
-	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
-	    toep->ofld_rxq->iq.abs_id);
-	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
-	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
+	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
+	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
+	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
 }
 
 /*
  * Return the highest common factor of n1 and n2, computed with Euclid's
  * algorithm.  If one argument is 0 the other is returned, which lets
  * callers seed a running HCF with 0.
  */
 static int
 calculate_hcf(int n1, int n2)
 {
 	int lo, hi, rem;
 
 	/* Start with the smaller operand as the divisor. */
 	lo = (n1 <= n2) ? n1 : n2;
 	hi = (n1 <= n2) ? n2 : n1;
 
 	for (; lo != 0; lo = rem) {
 		rem = hi % lo;
 		hi = lo;
 	}
 
 	return (hi);
 }
 
 /*
  * Number of page pods needed to describe 'npages' VM pages when the DDP
  * page size is (1 << ddp_page_shift).  The VM page count is first scaled
  * down to DDP pages, then rounded up to whole page pods of PPOD_PAGES
  * entries each.
  */
 static inline int
 pages_to_nppods(int npages, int ddp_page_shift)
 {
 	int ddp_pages;
 
 	MPASS(ddp_page_shift >= PAGE_SHIFT);
 
 	ddp_pages = npages >> (ddp_page_shift - PAGE_SHIFT);
 
 	return (howmany(ddp_pages, PPOD_PAGES));
 }
 
 /*
  * Reserve 'nppods' page pods from region 'pr' and record the reservation
  * in 'prsv'.
  *
  * The allocation is non-sleeping (M_NOWAIT).  On success the reservation
  * holds the region, the pod count and a tag that combines the page size
  * index 'pgsz_idx' with the allocated offset, and 0 is returned.  Returns
  * ENOMEM if the arena cannot satisfy the request.
  */
 static int
 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
     struct ppod_reservation *prsv)
 {
 	vmem_addr_t addr;       /* relative to start of region */
 
 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (ENOMEM);
 
 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
 
 	/*
 	 * The hardware tagmask includes an extra invalid bit but the arena was
 	 * seeded with valid values only.  An allocation out of this arena will
 	 * fit inside the tagmask but won't have the invalid bit set.
 	 */
 	MPASS((addr & pr->pr_tag_mask) == addr);
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	prsv->prsv_pr = pr;
 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
 	prsv->prsv_nppods = nppods;
 
 	return (0);
 }
 
 /*
  * Reserve page pods for page set 'ps' out of region 'pr', choosing the
  * largest DDP page size the physical layout of the pages allows.
  *
  * Returns 1 on success and 0 if the page pods could not be allocated.
  * Note that this is the opposite convention from
  * t4_alloc_page_pods_for_buf(), which returns 0 or an errno.
  */
 int
 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
 {
 	int i, hcf, seglen, idx, nppods;
 	struct ppod_reservation *prsv = &ps->prsv;
 
 	KASSERT(prsv->prsv_nppods == 0,
 	    ("%s: page pods already allocated", __func__));
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < ps->npages; i++) {
 		/* Extend the segment across physically contiguous pages. */
 		seglen = PAGE_SIZE;
 		while (i < ps->npages - 1 &&
 		    ps->pages[i]->phys_addr + PAGE_SIZE ==
 		    ps->pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	/* Scan from the last page size index down to the first that divides hcf. */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (0);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (1);
 }
 
 /*
  * Reserve page pods for the 'len'-byte buffer at 'buf' out of region 'pr',
  * choosing the largest DDP page size the physical layout allows, and
  * record the reservation in 'prsv'.
  *
  * Physical addresses are obtained with pmap_kextract(), so 'buf' is
  * presumably a kernel virtual address -- confirm against callers.
  * Returns 0 on success or ENOMEM if the page pods could not be allocated.
  */
 int
 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
     struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx, npages, nppods;
 	uintptr_t start_pva, end_pva, pva, p1;
 
 	MPASS(buf > 0);
 	MPASS(len > 0);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	start_pva = trunc_page(buf);
 	end_pva = trunc_page(buf + len - 1);
 	pva = start_pva;
 	while (pva <= end_pva) {
 		/* Extend the segment across physically contiguous pages. */
 		seglen = PAGE_SIZE;
 		p1 = pmap_kextract(pva);
 		pva += PAGE_SIZE;
 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
 			seglen += PAGE_SIZE;
 			pva += PAGE_SIZE;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	/* Scan from the last page size index down to the first that divides hcf. */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	/* Count DDP pages spanned by the buffer, first page included. */
 	npages = 1;
 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 /*
  * Return the page pods described by 'prsv' to the arena of the region
  * they were allocated from and mark the reservation empty
  * (prsv_nppods = 0).  The reservation must currently hold page pods.
  */
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
 	struct ppod_region *pr;
 	vmem_addr_t addr;
 
 	MPASS(prsv != NULL);
 	MPASS(prsv->prsv_nppods != 0);
 
 	/*
 	 * Only look inside the reservation after the assertions above, so
 	 * that a NULL argument trips the MPASS instead of faulting first.
 	 */
 	pr = prsv->prsv_pr;
 
 	/* The tag's low bits are the offset of the pods within the region. */
 	addr = prsv->prsv_tag & pr->pr_tag_mask;
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
 	    pr->pr_arena, addr, prsv->prsv_nppods);
 
 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
 	prsv->prsv_nppods = 0;
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 /*
  * Write the page pods reserved for page set 'ps' into adapter memory.
  *
  * The pods are sent on 'wrq' as a series of ULP_TX memory-write work
  * requests with immediate data, at most NUM_ULP_TX_SC_IMM_PPODS pods per
  * request.  Each pod carries the connection's 'tid', the reservation tag,
  * the buffer length and offset, and the physical addresses of the DDP
  * pages it covers.  Returns 0 and sets PS_PPODS_WRITTEN on success, or
  * ENOMEM if a work request cannot be allocated (requests already sent are
  * not undone).
  */
 int
 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_reservation *prsv = &ps->prsv;
 	struct ppod_region *pr = prsv->prsv_pr;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
 	MPASS(prsv->prsv_nppods > 0);
 
 	/* T4 and later chips need different flags on the memory write. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	/* 'i' counts pods written so far; it advances in the inner loop. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			/* Index of the first VM page described by pod 'i'. */
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				/* Slots past the end of the page list are zeroed. */
 				if (idx < ps->npages) {
 					ppod->addr[k] =
 					    htobe64(ps->pages[idx]->phys_addr);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 /*
  * Write the page pods of reservation 'prsv' so that they describe the
  * kernel virtual buffer [buf, buf + buflen) for connection 'tid'.
  *
  * The pods are sent as ULP_TX_MEM_WRITE work requests on 'wrq', with at
  * most NUM_ULP_TX_SC_IMM_PPODS pods carried as immediate data in each
  * request.
  *
  * Returns 0 once every request has been handed to t4_wrq_tx(), or ENOMEM
  * if a work request could not be allocated.  Requests that were already
  * transmitted are not undone in the ENOMEM case.
  */
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
 
 	/* The command word is the same for every request; build it once. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	/* Segment size selected by the page-size index stored in the tag. */
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	/* Byte offset of the buffer within its first page. */
 	offset = buf & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	/* First and last page-aligned addresses touched by the buffer. */
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
 	/* 'i' counts pods written so far and is advanced by the inner loop. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		/* The immediate-data header directly follows the mem-io header. */
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		/* The pods themselves follow the immediate-data header. */
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			/* The page-size field of the tag is cleared here. */
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			/*
 			 * Fill each address slot with the physical address of
 			 * the next segment; slots past the end of the buffer
 			 * are zeroed.
 			 */
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (pva > end_pva)
 					ppod->addr[k] = 0;
 				else {
 					pa = pmap_kextract(pva);
 					ppod->addr[k] = htobe64(pa);
 					pva += ddp_pgsz;
 				}
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 			/*
 			 * Walk back 1 segment so that the first address in the
 			 * next pod is the same as the last one in the current
 			 * pod.
 			 */
 			pva -= ddp_pgsz;
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 
 	/* After the final walk-back pva must not be past the last page. */
 	MPASS(pva <= end_pva);
 
 	return (0);
 }
 
 /*
  * Wire every page of a pageset and drop one hold on each of them.  The
  * pageset must not already be marked PS_WIRED; it is marked so on return.
  */
 static void
 wire_pageset(struct pageset *ps)
 {
 	int idx;
 
 	KASSERT(!(ps->flags & PS_WIRED), ("pageset already wired"));
 
 	for (idx = 0; idx < ps->npages; idx++) {
 		vm_page_t pg = ps->pages[idx];
 
 		/* Trade the hold for a wiring while the page is locked. */
 		vm_page_lock(pg);
 		vm_page_wire(pg);
 		vm_page_unhold(pg);
 		vm_page_unlock(pg);
 	}
 
 	ps->flags |= PS_WIRED;
 }
 
 /*
  * Get a pageset ready for DDP: wire its pages, reserve page pods for it
  * and write those pods, skipping any step that has already been done.
  *
  * Returns 1 when the pageset is fully prepared and 0 when reserving or
  * writing the page pods failed.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if ((ps->flags & PS_WIRED) == 0)
 		wire_pageset(ps);
 
 	/* No pods reserved yet: a zero return from the allocator is failure. */
 	if (ps->prsv.prsv_nppods == 0) {
 		if (t4_alloc_page_pods_for_ps(&td->pr, ps) == 0)
 			return (0);
 	}
 
 	/* Pods reserved but not written: a non-zero return is failure. */
 	if ((ps->flags & PS_PPODS_WRITTEN) == 0) {
 		if (t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid,
 		    ps) != 0)
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Initialize the page pod region 'pr' from the range 'r' and the
  * page-size value 'psz', and create the vmem arena (named 'name') that
  * pods are allocated from.
  *
  * Returns 0 on success, ENXIO if the four page sizes are not strictly
  * increasing or if either the tag mask or the alias mask comes out empty,
  * and ENOMEM if the arena cannot be created.
  */
 int
 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
     const char *name)
 {
 	int idx;
 
 	MPASS(pr != NULL);
 	MPASS(r->size > 0);
 
 	pr->pr_start = r->start;
 	pr->pr_len = r->size;
 
 	/* Each HPZ field is added to a base shift of 12. */
 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
 
 	/*
 	 * The SGL -> page pod algorithm requires the sizes to be in order,
 	 * so reject any neighbouring pair that is not strictly increasing.
 	 */
 	for (idx = 0; idx + 1 < nitems(pr->pr_page_shift); idx++) {
 		if (pr->pr_page_shift[idx + 1] <= pr->pr_page_shift[idx])
 			return (ENXIO);
 	}
 
 	/* Tag bits cover the region size; the remaining tag bits alias. */
 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
 	if (pr->pr_tag_mask == 0)
 		return (ENXIO);
 	if (pr->pr_alias_mask == 0)
 		return (ENXIO);
 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
 
 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
 	    M_FIRSTFIT | M_NOWAIT);
 
 	return (pr->pr_arena == NULL ? ENOMEM : 0);
 }
 
 /*
  * Tear down a page pod region: destroy its arena if one was created and
  * zero the whole structure.
  */
 void
 t4_free_ppod_region(struct ppod_region *pr)
 {
 
 	MPASS(pr != NULL);
 
 	if (pr->pr_arena != NULL) {
 		vmem_destroy(pr->pr_arena);
 	}
 
 	bzero(pr, sizeof(*pr));
 }
 
 /*
  * Compare a cached pageset against the description of a buffer.
  *
  * Returns 0 when the pageset covers exactly the same range (start, page
  * count, page offset, length) of the same vmspace and the map timestamp
  * recorded in the pageset still equals the map's current one; returns 1
  * otherwise.
  */
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	/* Geometry of the buffer. */
 	if (ps->start != start)
 		return (1);
 	if (ps->npages != npages)
 		return (1);
 	if (ps->offset != pgoff)
 		return (1);
 	if (ps->len != len)
 		return (1);
 
 	/* Identity and freshness of the address space. */
 	if (ps->vm != vm)
 		return (1);
 	if (ps->vm_timestamp != vm->vm_map.timestamp)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Obtain a pageset describing the user buffer of an AIO read job.
  *
  * The buffer is extended outward to page boundaries and, if it is larger
  * than MAX_DDP_BUFFER_SIZE, the request is truncated to a short read.  A
  * matching pageset from the connection's cache is reused when available;
  * otherwise a new one is allocated (evicting the last cached pageset if
  * the limit on wired pagesets has been reached) and its pages are held
  * with vm_fault_quick_hold_pages().
  *
  * Must be called with the DDP lock held.  The lock is dropped and
  * reacquired while a new pageset is being created.
  *
  * Returns 0 and stores the pageset in *pps on success, or EFAULT if the
  * user pages could not be held.
  */
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 #endif
 		/*
 		 * The truncation itself must not depend on tracing being
 		 * compiled in: the pageset below only covers [start, end),
 		 * and its length is taken from aio_nbytes.
 		 */
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 			toep->ddp.cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp.active_count + toep->ddp.cached_count ==
 	    nitems(toep->ddp.db)) {
 		KASSERT(toep->ddp.cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	DDP_UNLOCK(toep);
 
 	/*
 	 * Create a new pageset.  The page pointer array lives in the same
 	 * allocation, immediately after the pageset structure.
 	 */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	ps->pages = (vm_page_t *)(ps + 1);
 	/* Remember the map timestamp so pscmp() can detect a stale pageset. */
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
 	/* The pageset keeps its own reference on the vmspace. */
 	atomic_add_int(&vm->vm_refcnt, 1);
 	ps->vm = vm;
 	ps->start = start;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 /*
  * Finish a job that is being cancelled.  A job that already received
  * data completes as a short read of that many bytes; only a job that
  * received nothing is reported as cancelled.
  */
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long nbytes;
 
 	nbytes = job->aio_received;
 	if (nbytes == 0) {
 		aio_cancel(job);
 		return;
 	}
 
 	aio_complete(job, nbytes, 0);
 }
 
 /*
  * Put a job back at the head of the wait queue so that it can be retried
  * later.  If the connection's DDP state is dead, or the job was cancelled
  * while it was being examined (the cancel function can no longer be
  * installed), the job is finished via aio_ddp_cancel_one() instead.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 
 	if ((toep->ddp.flags & DDP_DEAD) != 0 ||
 	    !aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 }
 
 /*
  * Main loop that moves waiting AIO read jobs onto DDP buffers.
  *
  * Each pass (re-entered through the 'restart' label) takes the job at
  * the head of the wait queue and does one of the following:
  *  - completes it, or all waiting jobs, when the socket was never
  *    connected, has a pending error, or can no longer receive;
  *  - asks the card to enable DDP and returns when DDP is off and the
  *    socket buffer is empty;
  *  - copies data already sitting in the socket buffer into the job's
  *    pages (the 'sbcopy' label);
  *  - prepares the job's pageset and posts a work request that hands one
  *    of the DDP buffers to the card.
  *
  * The loop ends when no job is waiting, every DDP buffer is in use,
  * another thread is already queueing a buffer, or a resource shortage
  * forces the job to be requeued.
  *
  * Must be called with the DDP lock held.  hold_aio() drops and retakes
  * that lock, and the sbcopy path may drop it to acquire the inpcb lock.
  */
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp.flags & DDP_DEAD) {
 		MPASS(toep->ddp.waiting_count == 0);
 		MPASS(toep->ddp.active_count == 0);
 		return;
 	}
 
 	/* Nothing to do without a waiting job and a free DDP buffer. */
 	if (toep->ddp.waiting_count == 0 ||
 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp.waiting_count--;
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		/* The job is already being cancelled; leave it alone. */
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp.active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp.queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp.waiting_count--;
 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp.queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	/*
 	 * The DDP lock was dropped inside hold_aio(), so the error and
 	 * end-of-stream checks made above are repeated here.
 	 */
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp.active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp.queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp.flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp.active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 		/* NOTE: this shadows the function-scope 'error'. */
 		int error;
 
 		/* Describe at most 'resid' bytes of this mbuf as the source. */
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		/* From here on 'copied' is the job's running total. */
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->ddp.db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->ddp.db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	/* Build the flag values and mask for the chosen buffer. */
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp.flags & buf_flag) == 0);
 	/* With no buffer active, the active-buffer bit is updated as well. */
 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp.active_id == -1);
 		MPASS(toep->ddp.active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	/* The job was cancelled while it was being prepared: undo the work. */
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: scheduling %p for DDP[%d] (flags %#lx/%#lx)",
 	    __func__, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	/* Record the job and pageset now owned by this DDP buffer. */
 	db = &toep->ddp.db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp.queueing = NULL;
 	toep->ddp.flags |= buf_flag;
 	toep->ddp.active_count++;
 	if (toep->ddp.active_count == 1) {
 		MPASS(toep->ddp.active_id == -1);
 		toep->ddp.active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 	}
 	goto restart;
 }
 
 /*
  * Schedule the connection's requeue task unless it is already marked
  * active.  A reference on the toepcb is taken before the task is
  * enqueued.  Requires the DDP lock.
  */
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 
 	if ((toep->ddp.flags & DDP_TASK_ACTIVE) == 0) {
 		toep->ddp.flags |= DDP_TASK_ACTIVE;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->ddp.requeue_task);
 	}
 }
 
 /*
  * Task handler: run the requeue loop for the toepcb passed as 'context'
  * under the DDP lock, clear DDP_TASK_ACTIVE, then release a toepcb
  * reference.  'pending' is not used.
  */
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep;
 
 	toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	/* Presumably pairs with hold_toepcb() in ddp_queue_toep(). */
 	free_toepcb(toep);
 }
 
 /*
  * Cancel handler for a job that has been handed to a DDP buffer.
  *
  * If the cancel was cleared in the meantime the job is finished right
  * away.  Otherwise the DDP buffer holding the job is located and a TCB
  * update clearing that buffer's VALID flag is sent; the buffer is marked
  * cancel_pending and the job is completed later, once the card replies.
  */
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	/* Find the DDP buffer that currently owns this job. */
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		if (toep->ddp.db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->ddp.db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
-			t4_set_tcb_field(sc, toep->ctrlq, toep->tid,
+			t4_set_tcb_field(sc, toep->ctrlq, toep,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
-			    i + DDP_BUF0_INVALIDATED,
-			    toep->ofld_rxq->iq.abs_id);
+			    i + DDP_BUF0_INVALIDATED);
 			toep->ddp.db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 /*
  * Cancel handler for a job that is still on the wait queue.  Unless the
  * cancel was cleared, the job is unlinked from the queue, and if that
  * leaves the queue empty the requeue task is scheduled.  The job is then
  * finished through aio_ddp_cancel_one() with the DDP lock released.
  */
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (--toep->ddp.waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 /*
  * Queue an AIO job on an offloaded socket for DDP.
  *
  * Only LIO_READ jobs are accepted; anything else gets EOPNOTSUPP.  An
  * accepted job is appended to the connection's wait queue, DDP_OK is set
  * on the connection, and the requeue loop is run once before returning 0.
  * Panics if the cancel function cannot be installed on the new job.
  */
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 
 	/* Writes are not handled here. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: It may be worth returning errors such as ENOTCONN here;
 	 * perhaps the caller would then only queue the request itself
 	 * when this fails with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		panic("new job was cancelled");
 	}
 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 	toep->ddp.flags |= DDP_OK;
 
 	/*
 	 * Attempt to service the request right away.  When that would
 	 * have to block because the task is running, the call simply
 	 * bails and the task handles the request instead.
 	 */
 	aio_ddp_requeue(toep);
 
 	DDP_UNLOCK(toep);
 	return (0);
 }
 
 /*
  * Module load hook for DDP: registers the handlers for the
  * CPL_RX_DATA_DDP and CPL_RX_DDP_COMPLETE messages and initializes the
  * orphan pageset list, the mutex protecting it and the task that frees
  * orphaned pagesets.
  */
 void
 t4_ddp_mod_load(void)
 {
 
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 }
 
 /*
  * Module unload hook for DDP: waits for the orphan task to finish,
  * asserts that no orphaned pagesets remain, destroys the list mutex and
  * unregisters both CPL handlers by passing NULL.
  */
 void
 t4_ddp_mod_unload(void)
 {
 
 	/* Drain first so the task cannot run against a destroyed mutex. */
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tls.c	(revision 331648)
+++ head/sys/dev/cxgbe/tom/t4_tls.c	(revision 331649)
@@ -1,1650 +1,1649 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017-2018 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while
  * the mbuf is in the ulp_pdu_reclaimq.
  */
 #define	tls_tcp_seq	PH_loc.thirtytwo[0]
 
 /*
  * Handshake lock used for the handshake timer.  Having a global lock
  * is perhaps not ideal, but it avoids having to use callout_drain()
  * in tls_uninit_toep() which can't block.  Also, the timer shouldn't
  * actually fire for most connections.
  */
 static struct mtx tls_handshake_lock;
 
 /*
  * Update one word of this connection's TCB: 'word', 'mask' and 'val' are
  * forwarded to t4_set_tcb_field() and the two trailing arguments are
  * fixed at 0.  This revision issues the request on the connection's
  * offload tx queue (previously the control queue) and passes the toepcb
  * itself rather than its tid.
  */
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
     uint64_t val)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
-	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, word, mask, val, 0, 0,
-	    toep->ofld_rxq->iq.abs_id);
+	t4_set_tcb_field(sc, toep->ofld_txq, toep, word, mask, val, 0, 0);
 }
 
 /* TLS and DTLS common routines */
 /*
  * Report whether TLS offload can be used on this adapter: the TOE TLS
  * setting must be non-zero and the crypto capabilities must include
  * FW_CAPS_CONFIG_TLSKEYS.
  */
 bool
 can_tls_offload(struct adapter *sc)
 {
 
 	return (sc->tt.tls != 0 &&
 	    (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) != 0);
 }
 
 /* Non-zero when the connection's transmit key address is not negative. */
 int
 tls_tx_key(struct toepcb *toep)
 {
 
 	return (toep->tls.tx_key_addr >= 0);
 }
 
 /* Non-zero when the connection's receive key address is not negative. */
 int
 tls_rx_key(struct toepcb *toep)
 {
 
 	return (toep->tls.rx_key_addr >= 0);
 }
 
 /*
  * Size of the key portion of a work request: the transmit key info size
  * when the key is carried as immediate data, KEY_IN_DDR_SIZE otherwise.
  */
 static int
 key_size(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 		return (tls_ofld->k_ctx.tx_key_info_size);
 
 	return (KEY_IN_DDR_SIZE);
 }
 
 /*
  * Set TLS Key-Id in TCB: writes 'key_id' into the RX TLS key tag field of
  * the W_TCB_RX_TLS_KEY_TAG word.
  *
  * NOTE(review): the mask is built from M_TCB_RX_TLS_BUF_TAG rather than
  * a KEY_TAG mask -- confirm that this is intended.
  */
 static void
 t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
 			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
 			 V_TCB_RX_TLS_KEY_TAG(key_id));
 }
 
 /*
  * Re-enable receive on the connection by clearing the TF_RX_QUIESCE bit
  * in the TCB flags word.
  */
 static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 	const uint64_t quiesce_mask = V_TF_RX_QUIESCE(1);
 
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, quiesce_mask, 0);
 }
 
 /*
  * Leave TLS offload mode: stop the handshake timer, rewrite the ULP_RAW
  * field of the TCB so that only the TLS enable bit is set, and clear the
  * receive quiesce.
  */
 static void
 tls_clr_ofld_mode(struct toepcb *toep)
 {
 	const uint64_t raw_mask = V_TCB_ULP_RAW(M_TCB_ULP_RAW);
 	const uint64_t raw_val = V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1));
 
 	tls_stop_handshake_timer(toep);
 
 	/* Operate in PDU extraction mode only. */
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, raw_mask, raw_val);
 
 	t4_clear_rx_quiesce(toep);
 }
 
 /*
  * Stop the handshake timer and clear the receive quiesce on the
  * connection, leaving the rest of the TLS state in the TCB untouched.
  */
 static void
 tls_clr_quiesce(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 	t4_clear_rx_quiesce(toep);
 }
 
 /*
  * Calculate the TLS data expansion size, i.e. the number of bytes that
  * TLS framing adds to 'data_len' bytes for this connection's cipher mode
  * and fragment size.
  *
  * AES-GCM: every record adds GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
  * TLS_HEADER_LENGTH bytes.
  * Any other mode except NOP: every record adds padding up to the cipher
  * block size (a full block when already aligned), the MAC, the header
  * and one more CIPHER_BLOCK_SIZE.
  * NOP mode: the result is 0.
  *
  * When 'full_pdus_only' is non-zero, *pdus_per_ulp is set to the number
  * of whole records that fit (clamped to the range 1..32) and the
  * expansion for that many records is returned.  *pdus_per_ulp is not
  * written otherwise.
  */
 static int
 tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only,
     unsigned short *pdus_per_ulp)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_scmd *scmd = &tls_ofld->scmd0;
 	int expn_size = 0, frag_count = 0, pad_per_pdu = 0,
 	    pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0;
 	int exp_per_pdu = 0;
 	int hdr_len = TLS_HEADER_LENGTH;
 
 	/* Single-pass block; 'break' is used as an early exit. */
 	do {
 		max_frag_size = tls_ofld->k_ctx.frag_size;
 		if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) ==
 		   SCMD_CIPH_MODE_AES_GCM) {
 			frag_count = (data_len / max_frag_size);
 			exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
 				hdr_len;
 			expn_size =  frag_count * exp_per_pdu;
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if(!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* A trailing partial fragment is one more record. */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				frag_count += 1;
 				expn_size += exp_per_pdu;
 			}
 			break;
 		} else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) !=
 			   SCMD_CIPH_MODE_NOP) {
 			/* Calculate the number of fragments we can make */
 			frag_count  = (data_len / max_frag_size);
 			if (frag_count > 0) {
 				/* Padding that block-aligns payload plus MAC. */
 				pad_per_pdu = (((howmany((max_frag_size +
 						       tls_ofld->mac_length),
 						      CIPHER_BLOCK_SIZE)) *
 						CIPHER_BLOCK_SIZE) -
 					       (max_frag_size +
 						tls_ofld->mac_length));
 				if (!pad_per_pdu)
 					pad_per_pdu = CIPHER_BLOCK_SIZE;
 				exp_per_pdu = pad_per_pdu +
 				       	tls_ofld->mac_length +
 					hdr_len + CIPHER_BLOCK_SIZE;
 				expn_size = frag_count * exp_per_pdu;
 			}
 			/*
 			 * NOTE(review): exp_per_pdu is still 0 here when
 			 * frag_count is 0 -- confirm that callers never
 			 * rely on the result in that case.
 			 */
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if (!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* Consider the last fragment */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				pad_last_pdu = (((howmany((last_frag_size +
 							tls_ofld->mac_length),
 						       CIPHER_BLOCK_SIZE)) *
 						 CIPHER_BLOCK_SIZE) -
 						(last_frag_size +
 						 tls_ofld->mac_length));
 				if (!pad_last_pdu)
 					pad_last_pdu = CIPHER_BLOCK_SIZE;
 				expn_size += (pad_last_pdu +
 					      tls_ofld->mac_length + hdr_len +
 					      CIPHER_BLOCK_SIZE);
 			}
 		}
 	} while (0);
 
 	return (expn_size);
 }
 
 /*
  * Place the transmit key into a work request at 'dst'.
  *
  * For a key kept in DDR, a NOOP sub-command followed by a memory-read
  * sub-command referencing the key address is written.  For an immediate
  * key, the key context itself is copied.  Nothing is written when the
  * key info size is not positive or the key location is neither of those.
  */
 static void
 tls_copy_tx_key(struct toepcb *toep, void *dst)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct ulptx_sc_memrd *memrd;
 	struct ulptx_idata *noop;
 
 	if (tls_ofld->k_ctx.tx_key_info_size <= 0)
 		return;
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR) {
 		/* Zero-length NOOP, then the read of the key from memory. */
 		noop = dst;
 		noop->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		noop->len = htobe32(0);
 
 		memrd = (struct ulptx_sc_memrd *)(noop + 1);
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4));
 		memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5);
 		return;
 	}
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) {
 		memcpy(dst, &tls_ofld->k_ctx.tx,
 		    tls_ofld->k_ctx.tx_key_info_size);
 	}
 }
 
 /*
  * Map a TLS/DTLS record content type to the matching CPL SFO type.
  * Content types without an explicit mapping are reported as DATA.
  */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
 {
 	/*
 	 * XXX: Should CONTENT_TYPE_APP_DATA map to DATA, with every
 	 * other type (heartbeat included) defaulting to "CUSTOM"?
 	 */
 	switch (content_type) {
 	case CONTENT_TYPE_CCS:
 		return (CPL_TX_TLS_SFO_TYPE_CCS);
 	case CONTENT_TYPE_ALERT:
 		return (CPL_TX_TLS_SFO_TYPE_ALERT);
 	case CONTENT_TYPE_HANDSHAKE:
 		return (CPL_TX_TLS_SFO_TYPE_HANDSHAKE);
 	case CONTENT_TYPE_HEARTBEAT:
 		return (CPL_TX_TLS_SFO_TYPE_HEARTBEAT);
 	default:
 		return (CPL_TX_TLS_SFO_TYPE_DATA);
 	}
 }
 
 /*
  * Map a cipher key size selector (AES_*) to the CH_CK_SIZE_* code placed
  * in key context headers.  AES_NOP yields 15; any unrecognised selector
  * is treated like AES_256.
  */
 static unsigned char
 get_cipher_key_size(unsigned int ck_size)
 {
 	if (ck_size == AES_NOP)		/* NOP */
 		return (15);
 	if (ck_size == AES_128)		/* AES128 */
 		return (CH_CK_SIZE_128);
 	if (ck_size == AES_192)		/* AES192 */
 		return (CH_CK_SIZE_192);
 
 	/* AES256 and everything else. */
 	return (CH_CK_SIZE_256);
 }
 
 /*
  * Map a MAC selector (SHA_*) to the CH_MK_SIZE_* code placed in key
  * context headers.  SHA1 and any unrecognised selector map to
  * CH_MK_SIZE_160.
  */
 static unsigned char
 get_mac_key_size(unsigned int mk_size)
 {
 	if (mk_size == SHA_NOP)			/* NOP */
 		return (CH_MK_SIZE_128);
 	if (mk_size == SHA_GHASH ||		/* GHASH */
 	    mk_size == SHA_512 ||		/* SHA512 */
 	    mk_size == SHA_384)			/* SHA384 */
 		return (CH_MK_SIZE_512);
 	if (mk_size == SHA_224)			/* SHA2-224 */
 		return (CH_MK_SIZE_192);
 	if (mk_size == SHA_256)			/* SHA2-256 */
 		return (CH_MK_SIZE_256);
 
 	/* SHA1 and everything else. */
 	return (CH_MK_SIZE_160);
 }
 
 /*
  * Translate a wire protocol version into the driver's version code.
  * Versions other than TLS 1.1, TLS 1.2 and DTLS 1.2 yield
  * TLS_VERSION_MAX.
  */
 static unsigned int
 get_proto_ver(int proto_ver)
 {
 	if (proto_ver == TLS1_2_VERSION)
 		return (TLS_1_2_VERSION);
 	if (proto_ver == TLS1_1_VERSION)
 		return (TLS_1_1_VERSION);
 	if (proto_ver == DTLS1_2_VERSION)
 		return (DTLS_1_2_VERSION);
 
 	return (TLS_VERSION_MAX);
 }
 
 /*
  * Fill in the offsets flit (ivinsert_to_authinsrt) of an RX key context
  * header.  For GCM the RX opad-present and cipher/auth sequence control
  * bits, which the caller set beforehand, are also cleared.
  */
 static void
 tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	uint64_t flit;
 
 	/* Fields that are identical for both cipher modes. */
 	flit = V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 	    V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 	    V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 	    V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL);
 
 	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
 		flit |= V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) |
 		    V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) |
 		    V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) |
 		    V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL);
 	} else {
 		flit |= V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) |
 		    V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) |
 		    V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) |
 		    V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL);
 
 		kwr->u.rxhdr.ivpresent_to_rxmk_size &=
 		    ~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1));
 		kwr->u.rxhdr.authmode_to_rxvalid &=
 		    ~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1));
 	}
 
 	kwr->u.rxhdr.ivinsert_to_authinsrt = htobe64(flit);
 }
 
 /* Rx key */
 static void
 prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 	int proto_ver = kctx->proto_ver;
 
 	kwr->u.rxhdr.flitcnt_hmacctrl =
 		((kctx->tx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;
 
 	kwr->u.rxhdr.protover_ciphmode =
 		V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
 		V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode);
 
 	kwr->u.rxhdr.authmode_to_rxvalid =
 		V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) |
 		V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) |
 		V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) |
 		V_TLS_KEYCTX_TX_WR_RXVALID(1);
 
 	kwr->u.rxhdr.ivpresent_to_rxmk_size =
 		V_TLS_KEYCTX_TX_WR_IVPRESENT(0) |
 		V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size));
 
 	tls_rxkey_flit1(kwr, kctx);
 
 	/* No key reversal for GCM */
 	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
 		t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key,
 				 (kctx->cipher_secret_size << 3));
 		memcpy(kwr->keys.edkey + kctx->cipher_secret_size,
 		       kctx->rx.key + kctx->cipher_secret_size,
 		       (IPAD_SIZE + OPAD_SIZE));
 	} else {
 		memcpy(kwr->keys.edkey, kctx->rx.key,
 		       (kctx->tx_key_info_size - SALT_SIZE));
 		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
 	}
 }
 
 /*
  * Build the TX key context 'kwr' from the key material in 'kctx'.
  *
  * The flags word is assembled in host order and converted with htons()
  * once complete.  For GCM the salt is copied into the header and the
  * opad-present flag is dropped.
  */
 static void
 prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	uint16_t flags;
 
 	kwr->u.txhdr.ctxlen = (kctx->tx_key_info_size >> 4);
 
 	flags = V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) |
 	    V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 	    V_TLS_KEYCTX_TX_WR_TXCK_SIZE(
 		get_cipher_key_size(kctx->cipher_secret_size)) |
 	    V_TLS_KEYCTX_TX_WR_TXMK_SIZE(
 		get_mac_key_size(kctx->mac_secret_size)) |
 	    V_TLS_KEYCTX_TX_WR_TXVALID(1);
 
 	memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE);
 
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE);
 		flags &= ~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1));
 	}
 
 	kwr->u.txhdr.dualck_to_txvalid = htons(flags);
 }
 
 /* TLS Key memory management */
 /*
  * Create the vmem arena used to hand out TLS key slots, spanning the
  * adapter's key region (sc->vres.key).  Returns 0 on success or ENOMEM
  * if the arena could not be created.
  */
 int
 tls_init_kmap(struct adapter *sc, struct tom_data *td)
 {
 
 	td->key_map = vmem_create("T4TLS key map", sc->vres.key.start,
 	    sc->vres.key.size, 8, 0, M_FIRSTFIT | M_NOWAIT);
 
 	return (td->key_map == NULL ? ENOMEM : 0);
 }
 
 /* Destroy the TLS key slot arena, if one was created. */
 void
 tls_free_kmap(struct tom_data *td)
 {
 
 	if (td->key_map == NULL)
 		return;
 
 	vmem_destroy(td->key_map);
 }
 
 static int
 get_new_keyid(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct tom_data *td = toep->td;
 	vmem_addr_t addr;
 
 	if (vmem_alloc(td->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (-1);
 
 	return (addr);
 }
 
 /* Return the key slot at address 'keyid' to the key arena. */
 static void
 free_keyid(struct toepcb *toep, int keyid)
 {
 
 	vmem_free(toep->td->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 /*
  * Release whichever RX/TX key slots the connection holds and mark both
  * addresses as unset (-1).
  */
 static void
 clear_tls_keyid(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 
 	if (tls->rx_key_addr >= 0) {
 		free_keyid(toep, tls->rx_key_addr);
 		tls->rx_key_addr = -1;
 	}
 
 	if (tls->tx_key_addr >= 0) {
 		free_keyid(toep, tls->tx_key_addr);
 		tls->tx_key_addr = -1;
 	}
 }
 
 /*
  * Return the key slot address already recorded for the direction
  * selected by 'ops': rx_key_addr for KEY_WRITE_RX, tx_key_addr for
  * KEY_WRITE_TX, or -1 if neither bit is set.  RX takes precedence when
  * both bits are set.
  */
 static int
 get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops)
 {
 
 	if (ops & KEY_WRITE_RX)
 		return (tls_ofld->rx_key_addr);
 	/* A TX request must reuse the TX slot, not the RX one. */
 	if (ops & KEY_WRITE_TX)
 		return (tls_ofld->tx_key_addr);
 	return (-1);
 }
 
 /*
  * Pick the maximum payload length handed to the TP for a TLS work
  * request.  For fragment sizes up to 8192 this is the largest multiple
  * of 1448 that fits in min(3 * 4096, TP_TX_PG_SZ); larger fragment sizes
  * use FC_TP_PLEN_MAX.
  */
 static int
 get_tp_plen_max(struct tls_ofld_info *tls_ofld)
 {
 	int small_plen;
 
 	small_plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448;
 
 	if (tls_ofld->k_ctx.frag_size <= 8192)
 		return (small_plen);
 	return (FC_TP_PLEN_MAX);
 }
 
 /*
  * Send a work request that writes a key context into the adapter's key
  * memory.
  *
  * A fresh slot is allocated from the key arena unless the request is a
  * renegotiation (G_KEY_CLR_LOC(l_p_key) is non-zero), in which case the
  * slot already recorded for the connection is reused.  The request
  * consumes one TX descriptor and DIV_ROUND_UP(len, 16) TX credits.
  *
  * Returns 0 on success, EAGAIN if no TX descriptor is available, ENOSPC
  * if no key slot can be allocated, or ENOMEM if the work request cannot
  * be allocated.
  */
 static int
 tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 	struct ofld_tx_sdesc *txsd;
 	int kwrlen, kctxlen, keyid, len;
 	struct wrqe *wr;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	bool new_key;
 
 	kwrlen = roundup2(sizeof(*kwr), 16);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = kwrlen + kctxlen;
 
 	if (toep->txsd_avail == 0)
 		return (EAGAIN);
 
 	/* Dont initialize key for re-neg */
 	new_key = !G_KEY_CLR_LOC(k_ctx->l_p_key);
 	if (new_key) {
 		if ((keyid = get_new_keyid(toep, k_ctx)) < 0) {
 			return (ENOSPC);
 		}
 	} else {
 		keyid = get_keyid(tls_ofld, k_ctx->l_p_key);
 	}
 
 	wr = alloc_wrqe(len, toep->ofld_txq);
 	if (wr == NULL) {
 		/*
 		 * Only release a slot allocated above.  A slot reused
 		 * for renegotiation is still recorded in tls_ofld and
 		 * is released by clear_tls_keyid(); freeing it here
 		 * would lead to a second free of the same slot later.
 		 */
 		if (new_key)
 			free_keyid(toep, keyid);
 		return (ENOMEM);
 	}
 	kwr = wrtod(wr);
 	memset(kwr, 0, kwrlen);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	kwr->protocol = get_proto_ver(k_ctx->proto_ver);
 	kwr->mfs = htons(k_ctx->frag_size);
 	kwr->reneg_to_write_rx = k_ctx->l_p_key;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((toep->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	/* XXX: This assumes that kwrlen == sizeof(*kwr). */
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memset(kctx, 0, kctxlen);
 
 	/* Record the slot for the direction being programmed. */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		tls_ofld->tx_key_addr = keyid;
 		prepare_txkey_wr(kctx, k_ctx);
 	} else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		tls_ofld->rx_key_addr = keyid;
 		prepare_rxkey_wr(kctx, k_ctx);
 	}
 
 	/* Account for the descriptor and credits used by this request. */
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = DIV_ROUND_UP(len, 16);
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /*
  * Store a key received from SSL in DDR.
  *
  * Installs the key context 'uk_ctx' for the direction selected by its
  * l_p_key field and updates the connection's TLS offload state to
  * match.
  *
  * Returns 0 on success (including when the protocol version is not one
  * that get_proto_ver() recognises, in which case only the key address
  * is reset), ENOENT if the connection is not in TCPS_ESTABLISHED,
  * EOPNOTSUPP for an RX key on a connection that is not in
  * ULP_MODE_TLS, or the error returned by tls_program_key_id().
  */
 static int
 program_key_context(struct tcpcb *tp, struct toepcb *toep,
     struct tls_key_context *uk_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_key_context *k_ctx;
 	int error, key_offset;
 
 	if (tp->t_state != TCPS_ESTABLISHED) {
 		/*
 		 * XXX: Matches Linux driver, but not sure this is a
 		 * very appropriate error.
 		 */
 		return (ENOENT);
 	}
 
 	/* Stop timer on handshake completion */
 	tls_stop_handshake_timer(toep);
 
 	toep->flags &= ~TPF_FORCE_CREDITS;
 
 	CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid,
 	    G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" :
 	    "KEY_WRITE_TX", uk_ctx->proto_ver);
 
 	/* RX keys can only be installed on connections in TLS ULP mode. */
 	if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX &&
 	    toep->ulp_mode != ULP_MODE_TLS)
 		return (EOPNOTSUPP);
 
 	/* Don't copy the 'tx' and 'rx' fields. */
 	k_ctx = &tls_ofld->k_ctx;
 	memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key,
 	    sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key));
 
 	/* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */
 	if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) {
 		if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 			tls_ofld->rx_key_addr = -1;
 			t4_clear_rx_quiesce(toep);
 		} else {
 			tls_ofld->tx_key_addr = -1;
 		}
 		return (0);
 	}
 
 	/* Cipher-mode dependent parameters. */
 	if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		k_ctx->iv_size = 4;
 		k_ctx->mac_first = 0;
 		k_ctx->hmac_ctrl = 0;
 	} else {
 		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		k_ctx->mac_first = 1;
 	}
 
 	/*
 	 * Cache the SCMD words; write_tlstx_cpl() copies them into each
 	 * CPL_TX_TLS_SFO.
 	 */
 	tls_ofld->scmd0.seqno_numivs =
 		(V_SCMD_SEQ_NO_CTRL(3) |
 		 V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) |
 		 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 		 V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) |
 		 V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) |
 		 V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) |
 		 V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) |
 		 V_SCMD_IV_SIZE(k_ctx->iv_size));
 
 	tls_ofld->scmd0.ivgen_hdrlen =
 		(V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) |
 		 V_SCMD_KEY_CTX_INLINE(0) |
 		 V_SCMD_TLS_FRAG_ENABLE(1));
 
 	tls_ofld->mac_length = k_ctx->mac_secret_size;
 
 	/* Copy only the key material for the direction being programmed. */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		k_ctx->rx = uk_ctx->rx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->rx_key_addr = -1;
 	} else {
 		k_ctx->tx = uk_ctx->tx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->tx_key_addr = -1;
 	}
 
 	/* Flush pending data before new Tx key becomes active */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		struct sockbuf *sb;
 
 		/* XXX: This might not drain everything. */
 		t4_push_frames(sc, toep, 0);
 		sb = &toep->inp->inp_socket->so_snd;
 		SOCKBUF_LOCK(sb);
 
 		/* XXX: This asserts that everything has been pushed. */
 		MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL);
 		sb->sb_sndptr = NULL;
 		/* TLS records start after whatever is queued right now. */
 		tls_ofld->sb_off = sbavail(sb);
 		SOCKBUF_UNLOCK(sb);
 		tls_ofld->tx_seq_no = 0;
 	}
 
 	/*
 	 * RX keys are always written to adapter memory; TX keys only
 	 * when the key location is DDR.
 	 */
 	if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) ||
 	    (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) {
 		error = tls_program_key_id(toep, k_ctx);
 		if (error) {
 			/* XXX: Only clear quiesce for KEY_WRITE_RX? */
 			t4_clear_rx_quiesce(toep);
 			return (error);
 		}
 	}
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = tls_ofld->rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	} else {
 		unsigned short pdus_per_ulp;
 
 		/* An inline key has no slot; 1 just marks the key as set. */
 		if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 			tls_ofld->tx_key_addr = 1;
 
 		/* Precompute the per-WR values used by write_tlstx_wr(). */
 		tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld);
 		tls_ofld->expn_per_ulp = tls_expansion_size(toep,
 				tls_ofld->fcplenmax, 1, &pdus_per_ulp);
 		tls_ofld->pdus_per_ulp = pdus_per_ulp;
 		tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp *
 			((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) +
 			 tls_ofld->k_ctx.frag_size);
 	}
 
 	return (0);
 }
 
 /*
  * Handshake timer callback ('arg' is the toepcb).
  *
  * In some cases a client connection can hang without sending the
  * ServerHelloDone message from the NIC to the host.  Send a dummy
  * RX_DATA_ACK with RX_MODULATE to unstick the connection.  The timer
  * re-arms itself every TLS_SRV_HELLO_RD_TM seconds until it is stopped.
  */
 static void
 tls_send_handshake_ack(void *arg)
 {
 	struct toepcb *toep = arg;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/*
 	 * XXX: Does not have the t4_get_tcb() checks to refine the
 	 * workaround.
 	 */
 	callout_schedule(&toep->tls.handshake_timer,
 	    TLS_SRV_HELLO_RD_TM * hz);
 
 	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
 	send_rx_modulate(sc, toep);
 }
 
 /*
  * Arm the handshake timer so that tls_send_handshake_ack() runs after
  * TLS_SRV_HELLO_BKOFF_TM seconds.  Done under tls_handshake_lock.
  */
 static void
 tls_start_handshake_timer(struct toepcb *toep)
 {
 	struct callout *timer = &toep->tls.handshake_timer;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_reset(timer, TLS_SRV_HELLO_BKOFF_TM * hz,
 	    tls_send_handshake_ack, toep);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 /* Cancel the handshake timer.  Done under tls_handshake_lock. */
 void
 tls_stop_handshake_timer(struct toepcb *toep)
 {
 	struct callout *timer = &toep->tls.handshake_timer;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_stop(timer);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 /*
  * Socket option handler for the TLS offload options.
  *
  * SOPT_SET:
  *   TCP_TLSOM_SET_TLS_CONTEXT  install a key context (program_key_context)
  *   TCP_TLSOM_CLR_TLS_TOM      tls_clr_ofld_mode(); ULP_MODE_TLS only
  *   TCP_TLSOM_CLR_QUIES        tls_clr_quiesce(); ULP_MODE_TLS only
  * SOPT_GET:
  *   TCP_TLSOM_GET_TLS_TOM      report TLS_TOM_NONE/TXONLY/BOTH
  *
  * Returns 0 on success, ECONNRESET if the inpcb is dropped or in
  * timewait, EOPNOTSUPP for unknown options or options that need
  * ULP_MODE_TLS on a connection that is not in that mode, or an error
  * from the copyin/copyout or from program_key_context().
  *
  * The inpcb write lock is taken here and released on every path; each
  * switch arm does its own INP_WUNLOCK().
  */
 int
 t4_ctloutput_tls(struct socket *so, struct sockopt *sopt)
 {
 	struct tls_key_context uk_ctx;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	int error, optval;
 
 	error = 0;
 	/* Copy the key context in before the inpcb lock is taken. */
 	if (sopt->sopt_dir == SOPT_SET &&
 	    sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) {
 		error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx),
 		    sizeof(uk_ctx));
 		if (error)
 			return (error);
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_SET_TLS_CONTEXT:
 			error = program_key_context(tp, toep, &uk_ctx);
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_TLS_TOM:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM",
 				    __func__, toep->tid);
 				tls_clr_ofld_mode(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_QUIES:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES",
 				    __func__, toep->tid);
 				tls_clr_quiesce(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_GET_TLS_TOM:
 			/*
 			 * TLS TX is permitted on any TOE socket, but
 			 * TLS RX requires a TLS ULP mode.
 			 */
 			optval = TLS_TOM_NONE;
 			if (can_tls_offload(td_adapter(toep->td))) {
 				switch (toep->ulp_mode) {
 				case ULP_MODE_NONE:
 				case ULP_MODE_TCPDDP:
 					optval = TLS_TOM_TXONLY;
 					break;
 				case ULP_MODE_TLS:
 					optval = TLS_TOM_BOTH;
 					break;
 				}
 			}
 			CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d",
 			    __func__, toep->tid, optval);
 			/* Drop the lock before copying out to the caller. */
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Initialise the per-connection TLS offload state: keys default to DDR
  * and both key addresses start unset (-1).  The handshake timer is only
  * set up for connections in ULP_MODE_TLS.
  */
 void
 tls_init_toep(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 
 	tls->key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
 	tls->rx_key_addr = -1;
 	tls->tx_key_addr = -1;
 
 	if (toep->ulp_mode != ULP_MODE_TLS)
 		return;
 
 	callout_init_mtx(&tls->handshake_timer, &tls_handshake_lock, 0);
 }
 
 /*
  * Prepare a TLS connection for the handshake: set the TLS_ENABLE bit in
  * the TCB, set TPF_FORCE_CREDITS on the toepcb, and start the handshake
  * timer.
  */
 void
 tls_establish(struct toepcb *toep)
 {
 
 	/*
 	 * Enable PDU extraction.
 	 *
 	 * XXX: Supposedly this should be done by the firmware when
 	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
 	 * in practice this seems to be required.
 	 */
 	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 
 	/* Cleared again by program_key_context() once a key arrives. */
 	toep->flags |= TPF_FORCE_CREDITS;
 
 	tls_start_handshake_timer(toep);
 }
 
 /*
  * Tear down the per-connection TLS offload state: stop the handshake
  * timer (only armed for ULP_MODE_TLS) and release any key slots.
  */
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
 	if (toep->ulp_mode == ULP_MODE_TLS) {
 		tls_stop_handshake_timer(toep);
 	}
 
 	clear_tls_keyid(toep);
 }
 
 /* SGE_MAX_WR_LEN expressed in 16-byte TX credits. */
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 /*
  * Credits needed for the smallest TLS TX work request: the WR and CPL
  * headers, the key, one cipher block and one more byte, rounded up to
  * whole 16-byte credits.
  */
 #define	MIN_OFLD_TLSTX_CREDITS(toep)					\
 	(howmany(sizeof(struct fw_tlstx_data_wr) +			\
 	    sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) +		\
 	    CIPHER_BLOCK_SIZE + 1, 16))
 
 /*
  * Return the number of bytes usable for an immediate-data work request
  * given 'tx_credits' available credits (16 bytes each), capped at two
  * egress queue descriptors.
  */
 static inline u_int
 max_imm_tls_space(int tx_credits)
 {
 	const int imm_descs = 2; /* Use only up to 2 desc for imm. data WR */
 	const int imm_limit = imm_descs * EQ_ESIZE;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < imm_limit / 16)
 		return (tx_credits * 16);
 
 	return (imm_limit);
 }
 
 /*
  * Count the scatter/gather segments needed to describe 'len' bytes of
  * the mbuf chain 'm', starting 'skip' bytes into the chain.
  *
  * Returns the total segment count and stores the largest count needed
  * by any single mbuf in *max_nsegs_1mbufp.
  */
 static int
 count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp)
 {
 	int chunk, cnt, largest, total;
 
 	/* Advance to the mbuf that contains the first byte of interest. */
 	for (; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	total = 0;
 	largest = 0;
 	for (; len > 0; m = m->m_next, skip = 0) {
 		chunk = m->m_len - skip;
 		cnt = sglist_count(mtod(m, char *) + skip, chunk);
 		if (cnt > largest)
 			largest = cnt;
 		total += cnt;
 		len -= chunk;
 	}
 
 	*max_nsegs_1mbufp = largest;
 	return (total);
 }
 
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
     unsigned int immdlen, unsigned int plen, unsigned int expn,
     unsigned int pdus, uint8_t credits, int shove, int imm_ivs)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	unsigned int len = plen + expn;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) |
 	    V_FW_TLSTX_DATA_WR_COMPL(1) |
 	    V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
 	    V_FW_TLSTX_DATA_WR_LEN16(credits));
 	txwr->plen = htobe32(len);
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
 	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
 	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(pdus) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) |
 	    V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->k_ctx.tx_key_info_size >> 4));
 	txwr->mfs = htobe16(tls_ofld->k_ctx.frag_size);
 	txwr->adjustedplen_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen));
 	txwr->expinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp));
 	txwr->pdusinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp));
 }
 
 /*
  * Fill in the CPL_TX_TLS_SFO for one TLS record of 'plen' payload bytes
  * split into 'pdus' PDUs, and advance the connection's TX sequence
  * number by 'pdus'.  The segment length is the smaller of 'plen' and
  * the fragment size.
  */
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
     struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 	int rec_type, seg;
 
 	seg = tls->k_ctx.frag_size;
 	if (plen < tls->k_ctx.frag_size)
 		seg = plen;
 
 	rec_type = tls_content_type(tls_hdr->type);
 
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(rec_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seg));
 	cpl->pld_len = htobe32(plen);
 
 	/* Only heartbeat records carry the original content type. */
 	if (rec_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT) {
 		cpl->type_protover = htobe32(
 		    V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
 	}
 
 	cpl->seqno_numivs = htobe32(tls->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(pdus));
 	cpl->ivgen_hdrlen = htobe32(tls->scmd0.ivgen_hdrlen);
 	cpl->scmd1 = htobe64(tls->tx_seq_no);
 
 	tls->tx_seq_no += pdus;
 }
 
 /*
  * Similar to write_tx_sgl() except that it accepts an optional
  * trailer buffer for IVs.
  *
  * dst        where the ULPTX SGL is written
  * start      first mbuf of the chain holding the payload
  * skip       bytes to skip from the start of the chain
  * plen       payload bytes to describe
  * iv_buffer  optional buffer appended after the payload, or NULL
  * iv_len     length of iv_buffer
  * nsegs      total number of segments the SGL must contain
  * n          largest number of segments produced by any single buffer
  */
 static void
 write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen,
     void *iv_buffer, int iv_len, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/* Advance to the mbuf that contains the first payload byte. */
 	for (m = start; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	/*
 	 * Continue from the mbuf found above.  Restarting at 'start'
 	 * would pair the already-reduced 'skip' with the wrong mbuf and
 	 * describe data that is not part of the payload.
 	 *
 	 * 'i' is -1 for the first segment (len0/addr0) and indexes the
 	 * packed sge[] pairs afterwards.
 	 */
 	i = -1;
 	for (; plen > 0; m = m->m_next) {
 		rc = sglist_append(&sg, mtod(m, char *) + skip,
 		    m->m_len - skip);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 		plen -= m->m_len - skip;
 		skip = 0;
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (iv_buffer != NULL) {
 		rc = sglist_append(&sg, iv_buffer, iv_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 	}
 	/* Zero the unused second length of a half-filled sge pair. */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p",
 	    __func__, nsegs, start, iv_buffer));
 }
 
 /*
  * Similar to t4_push_frames() but handles TLS sockets when TLS offload
  * is enabled.  Rather than transmitting bulk data, the socket buffer
  * contains TLS records.  The work request requires a full TLS record,
  * so batch mbufs up until a full TLS record is seen.  This requires
  * reading the TLS header out of the start of each record to determine
  * its length.
  *
  * 'drop' is the number of bytes to remove from the front of the socket
  * send buffer before looking for new records.
  *
  * One work request is sent per complete record until the send buffer
  * holds no further complete record, TX credits run short, or an
  * allocation fails.  The inpcb must be write-locked by the caller.
  */
 void
 t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr thdr;
 	struct mbuf *sndptr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len;
 	u_int expn_size, iv_len, pdus, sndptroff;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, /* compl,*/ sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool imm_ivs, imm_payload;
 	void *iv_buffer, *iv_dst, *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
 	/* Five arguments follow the format, so this has to be CTR5. */
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		space = max_imm_tls_space(tx_credits);
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) + key_size(toep);
 		if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) {
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 			    "%s: tid %d tx_credits %d min_wr %d space %d",
 			    __func__, toep->tid, tx_credits, wr_len +
 			    CIPHER_BLOCK_SIZE + 1, space);
 #endif
 			return;
 		}
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			MPASS(tls_ofld->sb_off >= drop);
 			tls_ofld->sb_off -= drop;
 			drop = 0;
 		}
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (sbavail(sb) == tls_ofld->sb_off &&
 		    toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) {
 			/*
 			 * A full TLS header is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR4(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off);
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			return;
 		}
 
 		/* Read the header of the next TLS record. */
 		sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff);
 		MPASS(!IS_AIOTX_MBUF(sndptr));
 		m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr);
 		tls_size = htons(thdr.length);
 		plen = TLS_HEADER_LENGTH + tls_size;
 		pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size);
 		iv_len = pdus * CIPHER_BLOCK_SIZE;
 
 		if (sbavail(sb) < tls_ofld->sb_off + plen) {
 			/*
 			 * The full TLS record is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 			    "%s: tid %d sbavail %d sb_off %d plen %d",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    plen);
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			return;
 		}
 
 		/* Shove if there is no additional data pending. */
 		shove = (sbavail(sb) == tls_ofld->sb_off + plen) &&
 		    !(tp->t_flags & TF_MORETOCOME);
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/* Determine whether to use immediate vs SGL. */
 		imm_payload = false;
 		imm_ivs = false;
 		if (wr_len + iv_len <= space) {
 			imm_ivs = true;
 			wr_len += iv_len;
 			if (wr_len + tls_size <= space) {
 				wr_len += tls_size;
 				imm_payload = true;
 			}
 		}
 
 		/* Allocate space for IVs if needed. */
 		if (!imm_ivs) {
 			iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT);
 			if (iv_buffer == NULL) {
 				/*
 				 * XXX: How to restart this?
 				 *
 				 * The socket buffer was already
 				 * unlocked (and any wakeup issued)
 				 * above, so it must not be unlocked
 				 * again here.
 				 */
 				CTR3(KTR_CXGBE,
 			    "%s: tid %d failed to alloc IV space len %d",
 				    __func__, toep->tid, iv_len);
 				return;
 			}
 		} else
 			iv_buffer = NULL;
 
 		/* Determine size of SGL. */
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		if (!imm_payload) {
 			nsegs = count_mbuf_segs(sndptr, sndptroff +
 			    TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf);
 			if (!imm_ivs) {
 				int n = sglist_count(iv_buffer, iv_len);
 				nsegs += n;
 				if (n > max_nsegs_1mbuf)
 					max_nsegs_1mbuf = n;
 			}
 
 			/* Account for SGL in work request length. */
 			wr_len += sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 		}
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 			toep->flags |= TPF_TX_SUSPENDED;
 			/* Nothing references the IV buffer yet. */
 			free(iv_buffer, M_CXGBE);
 			return;
 		}
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d",
 		    __func__, toep->tid, thdr.type, tls_size, pdus);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
 		expn_size = tls_expansion_size(toep, tls_size, 0, NULL);
 		write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0,
 		    tls_size, expn_size, pdus, credits, shove, imm_ivs ? 1 : 0);
 		write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus);
 		tls_copy_tx_key(toep, cpl + 1);
 
 		/* Generate random IVs */
 		buf = (char *)(cpl + 1) + key_size(toep);
 		if (imm_ivs) {
 			MPASS(iv_buffer == NULL);
 			iv_dst = buf;
 			buf = (char *)iv_dst + iv_len;
 		} else
 			iv_dst = iv_buffer;
 		arc4rand(iv_dst, iv_len, 0);
 
 		if (imm_payload) {
 			m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH,
 			    tls_size, buf);
 		} else {
 			write_tlstx_sgl(buf, sndptr,
 			    sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer,
 			    iv_len, nsegs, max_nsegs_1mbuf);
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		sbsndptr(sb, tls_ofld->sb_off, plen, &sndptroff);
 		tls_ofld->sb_off += plen;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Record what this descriptor carries; it keeps the IVs. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd->iv_buffer = iv_buffer;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		atomic_add_long(&toep->vi->pi->tx_tls_records, 1);
 		atomic_add_long(&toep->vi->pi->tx_tls_octets, plen);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 
 /*
  * For TLS data we place received mbufs received via CPL_TLS_DATA into
  * an mbufq in the TLS offload state.  When CPL_RX_TLS_CMP is
  * received, the completed PDUs are placed into the socket receive
  * buffer.
  *
  * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs.
  *
  * Handler for CPL_TLS_DATA.  Always returns 0; the mbuf is either
  * queued on the connection or freed.
  */
 static int
 do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_tls_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	int len;
 
 	/* XXX: Should this match do_rx_data instead? */
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_octets, len);
 
 	KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	/* Discard data for connections that are going away. */
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Save TCP sequence number. */
 	m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq);
 
 	/* Hold the payload on the reclaim queue (see the comment above). */
 	if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) {
 #ifdef INVARIANTS
 		panic("Failed to queue TLS data packet");
 #else
 		printf("%s: Failed to queue TLS data packet\n", __func__);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 #endif
 	}
 
 	tp = intotcpcb(inp);
 	tp->t_rcvtime = ticks;
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
 	    be32toh(cpl->seq));
 #endif
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *);
 	struct tlsrx_hdr_pkt *tls_hdr_pkt;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
 	int len, pdu_length, pdu_overhead, sb_length;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_records, 1);
 
 	KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length));
 
 	tp = intotcpcb(inp);
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u",
 	    __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt);
 #endif
 
 	tp->rcv_nxt += pdu_length;
 	if (tp->rcv_wnd < pdu_length) {
 		toep->tls.rcv_over += pdu_length - tp->rcv_wnd;
 		tp->rcv_wnd = 0;
 	} else
 		tp->rcv_wnd -= pdu_length;
 
 	/* XXX: Not sure what to do about urgent data. */
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
 	 * additional fields.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
 	tls_hdr_pkt = mtod(m, void *);
 
 	/*
 	 * Only the TLS header is sent to OpenSSL, so report errors by
 	 * altering the record type.
 	 */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0)
 		tls_hdr_pkt->type = CONTENT_TYPE_ERROR;
 
 	/* Trim this CPL's mbuf to only include the TLS header. */
 	KASSERT(m->m_len == len && m->m_next == NULL,
 	    ("%s: CPL spans multiple mbufs", __func__));
 	m->m_len = TLS_HEADER_LENGTH;
 	m->m_pkthdr.len = TLS_HEADER_LENGTH;
 
 	tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq);
 	if (tls_data != NULL) {
 		KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq,
 		    ("%s: sequence mismatch", __func__));
 
 		/*
 		 * Update the TLS header length to be the length of
 		 * the payload data.
 		 */
 		tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len);
 
 		m->m_next = tls_data;
 		m->m_pkthdr.len += tls_data->m_len;
 	}
 
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/*
 	 * Not all of the bytes on the wire are included in the socket
 	 * buffer (e.g. the MAC of the TLS record).  However, those
 	 * bytes are included in the TCP sequence space.  To handle
 	 * this, compute the delta for this TLS record in
 	 * 'pdu_overhead' and treat those bytes as having already been
 	 * "read" by the application for the purposes of expanding the
 	 * window.  The meat of the TLS record passed to the
 	 * application ('sb_length') will still not be counted as
 	 * "read" until userland actually reads the bytes.
 	 *
 	 * XXX: Some of the calculations below are probably still not
 	 * really correct.
 	 */
 	sb_length = m->m_pkthdr.len;
 	pdu_overhead = pdu_length - sb_length;
 	toep->rx_credits += pdu_overhead;
 	tp->rcv_wnd += pdu_overhead;
 	tp->rcv_adv += pdu_overhead;
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    sb_length > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
 	toep->sb_cc = sbused(sb);
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
 	    __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
 #endif
 	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
 		int credits;
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
 
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Module load hook for the TLS offload code: initializes the handshake
  * mutex and installs the CPL_TLS_DATA and CPL_RX_TLS_CMP handlers.
  */
 void
 t4_tls_mod_load(void)
 {
 
 	mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF);
 	t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp);
 }
 
 /*
  * Module unload hook for the TLS offload code: clears both CPL handlers
  * (by registering NULL) before destroying the handshake mutex.
  */
 void
 t4_tls_mod_unload(void)
 {
 
 	t4_register_cpl_handler(CPL_TLS_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL);
 	mtx_destroy(&tls_handshake_lock);
 }
 #endif	/* TCP_OFFLOAD */
Index: head/sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.c	(revision 331648)
+++ head/sys/dev/cxgbe/tom/t4_tom.c	(revision 331649)
@@ -1,1342 +1,1342 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/scope6_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 #include "tom/t4_tls.h"
 
 /*
  * Protocol switch entries installed as so->so_proto by offload_socket()
  * for offloaded IPv4 and IPv6 sockets respectively.
  * NOTE(review): the *_usrreqs tables are presumably referenced by the
  * matching protosw; they are set up outside this part of the file.
  */
 static struct protosw toe_protosw;
 static struct pr_usrreqs toe_usrreqs;
 
 static struct protosw toe6_protosw;
 static struct pr_usrreqs toe6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
 static int t4_tom_modevent(module_t, int, void *);
 
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
 
 /* ULD descriptor for the TOM: identifies it and names its hooks. */
 static struct uld_info tom_uld_info = {
 	.uld_id = ULD_TOM,
 	.activate = t4_tom_activate,
 	.deactivate = t4_tom_deactivate,
 };
 
 /* Forward declarations for file-local helpers. */
 static void queue_tid_release(struct adapter *, int);
 static void release_offload_resources(struct toepcb *);
 static int alloc_tid_tabs(struct tid_info *);
 static void free_tid_tabs(struct tid_info *);
 static int add_lip(struct adapter *, struct in6_addr *);
 static int delete_lip(struct adapter *, struct in6_addr *);
 static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
 static void init_clip_table(struct adapter *, struct tom_data *);
 static void update_clip(struct adapter *, void *);
 static void t4_clip_task(void *, int);
 static void update_clip_table(struct adapter *, struct tom_data *);
 static void destroy_clip_table(struct adapter *, struct tom_data *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
 
 /*
  * in6_ifaddr_gen is compared against td->clip_gen in update_clip_table()
  * to decide whether the CLIP table has to be rebuilt.
  * NOTE(review): ifaddr_evhandler and clip_task are presumably the event
  * handler and deferred task that bump the generation and run
  * t4_clip_task(); confirm against the code that registers them.
  */
 static int in6_ifaddr_gen;
 static eventhandler_tag ifaddr_evhandler;
 static struct timeout_task clip_task;
 
 /*
  * Allocate and initialize a toepcb for a connection on the given VI.
  *
  * A negative txqid or rxqid means "pick one of the VI's offload queues at
  * random".  'flags' is combined with M_ZERO and handed to malloc(9).
  * Returns the new toepcb holding a single reference, or NULL if the
  * allocation failed.
  */
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
 {
 	struct port_info *port = vi->pi;
 	struct adapter *sc = port->adapter;
 	struct toepcb *toep;
 	int credits, nsdesc, alloc_len;
 
 	/*
 	 * Tx work request credits are counted by the firmware in units of
 	 * 16 bytes.  Hold back enough of them for an ABORT_REQ so that
 	 * aborting a connection never has to wait for credits.
 	 */
 	credits = sc->params.ofldq_wr_cred;
 	credits -= howmany(sizeof(struct cpl_abort_req), 16);
 
 	/*
 	 * The smallest tx work request is a fw_ofld_tx_data_wr carrying a
 	 * single byte of immediate payload; that bounds how many work
 	 * requests (and so tx descriptors) can ever be outstanding.
 	 */
 	nsdesc = credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
 
 	/* Pick the offload tx queue. */
 	if (txqid < 0)
 		txqid = vi->first_ofld_txq + (arc4random() % vi->nofldtxq);
 	KASSERT(txqid >= vi->first_ofld_txq &&
 	    txqid < vi->first_ofld_txq + vi->nofldtxq,
 	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
 		vi->first_ofld_txq, vi->nofldtxq));
 
 	/* Pick the offload rx queue. */
 	if (rxqid < 0)
 		rxqid = vi->first_ofld_rxq + (arc4random() % vi->nofldrxq);
 	KASSERT(rxqid >= vi->first_ofld_rxq &&
 	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
 	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
 		vi->first_ofld_rxq, vi->nofldrxq));
 
 	/* The tx descriptor array trails the fixed part of the toepcb. */
 	alloc_len = offsetof(struct toepcb, txsd) +
 	    nsdesc * sizeof(struct ofld_tx_sdesc);
 	toep = malloc(alloc_len, M_CXGBE, M_ZERO | flags);
 	if (toep == NULL)
 		return (NULL);
 
 	refcount_init(&toep->refcount, 1);
 
 	/* Back pointers and queue selection. */
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
 	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
 	toep->ctrlq = &sc->sge.ctrlq[port->port_id];
 	toep->tc_idx = -1;
 
 	/* Tx credit and descriptor accounting. */
 	toep->tx_total = credits;
 	toep->tx_credits = credits;
 	toep->txsd_total = nsdesc;
 	toep->txsd_avail = nsdesc;
 	toep->txsd_pidx = 0;
 	toep->txsd_cidx = 0;
 
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 	aiotx_init_toep(toep);
 
 	return (toep);
 }
 
 /*
  * Take an additional reference on the toepcb and return it, so the call
  * can be used inline where the pointer is stored.
  */
 struct toepcb *
 hold_toepcb(struct toepcb *toep)
 {
 
 	refcount_acquire(&toep->refcount);
 	return (toep);
 }
 
 /*
  * Drop a reference on the toepcb.  When the last reference goes away the
  * DDP state (TCPDDP mode only) and the TLS state are torn down and the
  * toepcb itself is freed.  The toepcb must no longer be attached to an
  * inpcb or have a CPL pending at that point.
  */
 void
 free_toepcb(struct toepcb *toep)
 {
 
 	/* Somebody else still holds a reference; nothing more to do. */
 	if (!refcount_release(&toep->refcount))
 		return;
 
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: attached to an inpcb", __func__));
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: CPL pending", __func__));
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		ddp_uninit_toep(toep);
 	}
 	tls_uninit_toep(toep);
 
 	free(toep, M_CXGBE);
 }
 
 /*
  * Set up the socket for TCP offload.
  *
  * Marks both socket buffers SB_NOCOALESCE, switches the socket to the TOE
  * protosw matching its address family, hooks the toepcb into the tcpcb,
  * takes a reference on the inpcb on behalf of the toepcb and adds the
  * toepcb to the tom_data's list.  The caller must hold the inpcb write
  * lock.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Update socket */
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	/* so_proto is switched while the receive sockbuf lock is held. */
 	if (inp->inp_vflag & INP_IPV6)
 		so->so_proto = &toe6_protosw;
 	else
 		so->so_proto = &toe_protosw;
 	SOCKBUF_UNLOCK(sb);
 
 	/* Update TCP PCB */
 	tp->tod = &td->tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* Install an extra hold on inp */
 	toep->inp = inp;
 	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Add the TOE PCB to the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * This is _not_ the normal way to "unoffload" a socket.
  *
  * Reverses offload_socket(): clears SB_NOCOALESCE on both socket buffers,
  * unhooks the toepcb from the tcpcb, drops the toepcb's reference on the
  * inpcb and removes the toepcb from the tom_data's list.  so_proto is not
  * restored here.  The caller must hold the inpcb write lock.
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = toep->td;
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_ATTACHED;
 	/* Only the toepcb's extra hold is dropped; inp must survive it. */
 	if (in_pcbrele_wlocked(inp))
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Release everything the toepcb holds on to: its L2T entry, its tid, its
  * CLIP entry and (with RATELIMIT) its traffic class, then take it off the
  * tom_data's list and drop a reference on it.  The toepcb must be
  * detached from its inpcb and must not have a CPL pending.
  */
 static void
 release_offload_resources(struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
 	    __func__, toep, tid, toep->l2te, toep->ce);
 
 	/*
 	 * These queues should have been emptied at approximately the same time
 	 * that a normal connection's socket's so_snd would have been purged or
 	 * drained.  Do _not_ clean up here.
 	 */
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 #ifdef INVARIANTS
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		ddp_assert_empty(toep);
 #endif
 
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
 	/* A toepcb with a CLIP entry accounts for two tids instead of one. */
 	if (tid >= 0) {
 		remove_tid(sc, tid, toep->ce ? 2 : 1);
 		release_tid(sc, tid, toep->ctrlq);
 	}
 
 	if (toep->ce)
 		release_lip(td, toep->ce);
 
 #ifdef RATELIMIT
 	/* tc_idx is -1 when no traffic class was ever assigned. */
 	if (toep->tc_idx != -1)
 		t4_release_cl_rl_kbps(sc, toep->vi->pi->port_id, toep->tc_idx);
 #endif
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * The kernel is done with the TCP PCB and this is our opportunity to unhook the
  * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
  * pending CPL) then it is time to release all resources tied to the toepcb.
  *
  * Also gets called when an offloaded active open fails and the TOM wants the
  * kernel to take the TCP PCB back.
  *
  * The inpcb write lock must be held by the caller.
  */
 static void
 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	/* inp is only needed by the tracing and assertion code below. */
 #if defined(KTR) || defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
 	/* In SYN_SENT toep->tid is traced as an atid rather than a tid. */
 	if (tp->t_state == TCPS_SYN_SENT) {
 		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
 		    __func__, toep->tid, toep, toep->flags, inp,
 		    inp->inp_flags);
 	} else {
 		CTR6(KTR_CXGBE,
 		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
 		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
 		    inp->inp_flags);
 	}
 #endif
 
 	/* Sever the tcpcb <-> toepcb link. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->flags &= ~TPF_ATTACHED;
 
 	/* Otherwise final_cpl_received() releases the resources later. */
 	if (!(toep->flags & TPF_CPL_PENDING))
 		release_offload_resources(toep);
 }
 
 /*
  * setsockopt handler.
  *
  * Socket option reads (SOPT_GET) are ignored.  The only option acted on
  * is TCP_NODELAY, and only for established connections: the TF_NAGLE bit
  * in the connection's TCB flags is set to the inverse of TF_NODELAY.
  */
 static void
 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	if (dir == SOPT_GET)
 		return;
 
 	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
 
 	switch (name) {
 	/* Nagle is enabled in hardware exactly when TF_NODELAY is clear. */
 	case TCP_NODELAY:
 		if (tp->t_state != TCPS_ESTABLISHED)
 			break;
-		t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
+		t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 		    V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1),
-		    0, 0, toep->ofld_rxq->iq.abs_id);
+		    0, 0);
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * The TOE driver will not receive any more CPLs for the tid associated with the
  * toepcb; release the hold on the inpcb.
  *
  * Called with the inpcb write lock held.  The lock is not held on return:
  * it is released here unless dropping the reference freed the inpcb.
  */
 void
 final_cpl_received(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->inp;
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 	toep->inp = NULL;
 	toep->flags &= ~TPF_CPL_PENDING;
 	/* Discard anything still parked on the reclaim queue. */
 	mbufq_drain(&toep->ulp_pdu_reclaimq);
 
 	/* If the kernel already detached the tcpcb we are the last user. */
 	if (!(toep->flags & TPF_ATTACHED))
 		release_offload_resources(toep);
 
 	/* toep must not be touched past this point; use the saved inp. */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 
 /*
  * Record 'ctx' as the context for 'tid' in the adapter's tid table and
  * add 'ntids' to the count of tids in use.
  */
 void
 insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 	atomic_add_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Return the context stored for 'tid' in the adapter's tid table.
  */
 void *
 lookup_tid(struct adapter *sc, int tid)
 {
 
 	return (sc->tids.tid_tab[tid]);
 }
 
 /*
  * Replace the context stored for 'tid' without touching the in-use count.
  */
 void
 update_tid(struct adapter *sc, int tid, void *ctx)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 }
 
 /*
  * Clear the tid table slot for 'tid' and subtract 'ntids' from the count
  * of tids in use.
  */
 void
 remove_tid(struct adapter *sc, int tid, int ntids)
 {
 
 	sc->tids.tid_tab[tid] = NULL;
 	atomic_subtract_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Tell the hardware that 'tid' is no longer in use by sending a
  * CPL_TID_RELEASE on the given control queue.  If no work request can be
  * allocated the release is handed to queue_tid_release() instead.
  */
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct cpl_tid_release *rel;
 	struct wrqe *wr = alloc_wrqe(sizeof(*rel), ctrlq);
 
 	if (wr != NULL) {
 		rel = wrtod(wr);
 		INIT_TP_WR_MIT_CPL(rel, CPL_TID_RELEASE, tid);
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	queue_tid_release(sc, tid);	/* defer */
 }
 
 /*
  * Fallback for release_tid() when no work request could be allocated.
  * Deferred release is not implemented; this only invokes
  * CXGBE_UNIMPLEMENTED.
  */
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 /*
  * What mtu_idx to use, given a 4-tuple and/or an MSS cap
  *
  * At least one of 'inc' and 'pmss' must be supplied.  With 'inc' the MSS
  * comes from tcp_mssopt() and is capped by 'pmss' when that is positive;
  * without it 'pmss' is used as is.  Returns the index of the largest
  * entry in the adapter's MTU table that does not exceed the MSS plus IP
  * and TCP header overhead (index 0 if every entry is larger).
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int i, mss, n;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	mss = inc ? tcp_mssopt(inc) : pmss;
 	if (pmss > 0 && mss > pmss)
 		mss = pmss;
 
 	/*
 	 * 'inc' is optional, so it must not be dereferenced blindly.  With
 	 * no connection info to go by, assume IPv4 header overhead.
 	 */
 	if (inc != NULL && (inc->inc_flags & INC_ISIPV6))
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 
 	/* Stop at the last table entry that still fits mss + headers. */
 	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
 		continue;
 
 	return (i);
 }
 
 /*
  * Pick the receive window for a socket: the space available in its
  * receive buffer, raised to MIN_RCV_WND if smaller and then limited to
  * MAX_RCV_WND.  The receive sockbuf lock must be held.
  */
 u_long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long space;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	space = sbspace(&so->so_rcv);
 	if (!(space >= MIN_RCV_WND)) {
 		space = MIN_RCV_WND;
 	}
 
 	return min(space, MAX_RCV_WND);
 }
 
 /*
  * Pick the receive window scale: the smallest shift (at most
  * TCP_MAX_WINSHIFT) for which TCP_MAXWIN << shift covers
  * min(sb_max, MAX_RCV_WND).
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long limit = sb_max;
 	int shift;
 
 	if (limit > MAX_RCV_WND)
 		limit = MAX_RCV_WND;
 
 	for (shift = 0;
 	    shift < TCP_MAX_WINSHIFT && (TCP_MAXWIN << shift) < limit;
 	    shift++)
 		continue;
 
 	return (shift);
 }
 
 /*
  * Build the opt0 word for a connection and return it in big-endian form.
  *
  * socket so could be a listening socket too.  'so', 'e' and 'vi' are all
  * optional; the fields derived from each one are simply left out when it
  * is NULL.
  */
 uint64_t
 calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
     int mtu_idx, int rscale, int rx_credits, int ulp_mode)
 {
 	uint64_t opt0;
 
 	KASSERT(rx_credits <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsiz too high", __func__));
 
 	/* Fields that only depend on the scalar arguments. */
 	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
 	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
 
 	/* Source MAC selection and tx channel come from the VI. */
 	if (vi != NULL) {
 		opt0 |= V_SMAC_SEL(vi->smt_idx);
 		opt0 |= V_TX_CHAN(vi->pi->tx_chan);
 	}
 
 	/* L2 table index for the next hop. */
 	if (e != NULL)
 		opt0 |= V_L2T_IDX(e->idx);
 
 	/* Nagle and keepalive follow the socket's current settings. */
 	if (so != NULL) {
 		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
 		int nagle = (tp->t_flags & TF_NODELAY) == 0;
 		int keepalive = tcp_always_keepalive ||
 		    so_options_get(so) & SO_KEEPALIVE;
 
 		opt0 |= V_NAGLE(nagle);
 		opt0 |= V_KEEP_ALIVE(keepalive != 0);
 	}
 
 	return htobe64(opt0);
 }
 
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct tp_params *tp = &sc->params.tp;
 	uint16_t viid = vi->viid;
 	uint64_t ntuple = 0;
 
 	/*
 	 * Initialize each of the fields which we care about which are present
 	 * in the Compressed Filter Tuple.
 	 */
 	if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE)
 		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;
 
 	if (tp->port_shift >= 0)
 		ntuple |= (uint64_t)e->lport << tp->port_shift;
 
 	if (tp->protocol_shift >= 0)
 		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
 
 	if (tp->vnic_shift >= 0) {
 		uint32_t vf = G_FW_VIID_VIN(viid);
 		uint32_t pf = G_FW_VIID_PFN(viid);
 		uint32_t vld = G_FW_VIID_VIVLD(viid);
 
 		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) |
 		    V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift;
 	}
 
 	if (is_t4(sc))
 		return (htobe32((uint32_t)ntuple));
 	else
 		return (htobe64(V_FILTER_TUPLE(ntuple)));
 }
 
 /*
  * Return 1 if the socket's local or foreign port matches one of the
  * adapter's configured TLS rx ports, 0 otherwise.  The port list is
  * walked with the adapter lock held.
  */
 static int
 is_tls_sock(struct socket *so, struct adapter *sc)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int idx, found = 0;
 
 	/* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */
 	ADAPTER_LOCK(sc);
 	for (idx = 0; found == 0 && idx < sc->tt.num_tls_rx_ports; idx++) {
 		if (inp->inp_lport == htons(sc->tt.tls_rx_ports[idx]) ||
 		    inp->inp_fport == htons(sc->tt.tls_rx_ports[idx]))
 			found = 1;
 	}
 	ADAPTER_UNLOCK(sc);
 
 	return (found);
 }
 
 /*
  * Choose the ULP mode for a socket: TLS if the adapter can offload TLS
  * and the socket uses a TLS port, otherwise TCPDDP if DDP is enabled on
  * the adapter and not disabled on the socket, otherwise none.
  */
 int
 select_ulp_mode(struct socket *so, struct adapter *sc)
 {
 
 	if (can_tls_offload(sc) && is_tls_sock(so, sc))
 		return (ULP_MODE_TLS);
 
 	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0)
 		return (ULP_MODE_TCPDDP);
 
 	return (ULP_MODE_NONE);
 }
 
 /*
  * Record the ULP mode in the toepcb and initialize the per-mode state:
  * tls_init_toep() is called unconditionally, ddp_init_toep() only when
  * the toepcb is in TCPDDP mode.
  */
 void
 set_ulp_mode(struct toepcb *toep, int ulp_mode)
 {
 
 	CTR4(KTR_CXGBE, "%s: toep %p (tid %d) ulp_mode %d",
 	    __func__, toep, toep->tid, ulp_mode);
 	toep->ulp_mode = ulp_mode;
 	tls_init_toep(toep);
 	/* Checks toep->ulp_mode as it stands after tls_init_toep(). */
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		ddp_init_toep(toep);
 }
 
 /*
  * Return non-zero if 'status' is one of the "negative advice" CPL error
  * codes (retransmit, persist or keepalive), zero otherwise.
  */
 int
 negative_advice(int status)
 {
 
 	switch (status) {
 	case CPL_ERR_RTX_NEG_ADVICE:
 	case CPL_ERR_PERSIST_NEG_ADVICE:
 	case CPL_ERR_KEEPALV_NEG_ADVICE:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Allocate the tid, atid and stid tables as one zeroed chunk and set up
  * the bookkeeping for each of them.  Returns 0 on success or ENOMEM if
  * the allocation fails.
  */
 static int
 alloc_tid_tabs(struct tid_info *t)
 {
 	size_t bytes;
 	unsigned int n;
 
 	bytes = t->ntids * sizeof(*t->tid_tab) +
 	    t->natids * sizeof(*t->atid_tab) +
 	    t->nstids * sizeof(*t->stid_tab);
 	t->tid_tab = malloc(bytes, M_CXGBE, M_ZERO | M_NOWAIT);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 
 	/*
 	 * The atid table follows the tid table.  Chain all of its entries
 	 * into a free list headed by t->afree.
 	 */
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	t->atids_in_use = 0;
 	for (n = 0; n + 1 < t->natids; n++)
 		t->atid_tab[n].next = &t->atid_tab[n + 1];
 	t->atid_tab[t->natids - 1].next = NULL;
 	t->afree = t->atid_tab;
 
 	/* The stid table follows the atid table. */
 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 	t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids];
 	t->stids_in_use = 0;
 	t->nstids_free_head = t->nstids;
 	TAILQ_INIT(&t->stids);
 
 	atomic_store_rel_int(&t->tids_in_use, 0);
 
 	return (0);
 }
 
 /*
  * Free the combined tid/atid/stid table and destroy the atid and stid
  * locks if they were initialized.  No tid, atid or stid may be in use.
  */
 static void
 free_tid_tabs(struct tid_info *t)
 {
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 	KASSERT(t->stids_in_use == 0,
 	    ("%s: %d stids still in use.", __func__, t->stids_in_use));
 
 	/* All three tables live in the single allocation behind tid_tab. */
 	free(t->tid_tab, M_CXGBE);
 	t->tid_tab = NULL;
 
 	/* The locks only exist if alloc_tid_tabs() got past its malloc. */
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	if (mtx_initialized(&t->stid_lock))
 		mtx_destroy(&t->stid_lock);
 }
 
 static int
 add_lip(struct adapter *sc, struct in6_addr *lip)
 {
         struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
         memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 static int
 delete_lip(struct adapter *sc, struct in6_addr *lip)
 {
 	struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
 	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_READ);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 /*
  * Find the CLIP table entry for the given IPv6 address.  Returns NULL if
  * there is none.  The CLIP table lock must be held.
  */
 static struct clip_entry *
 search_lip(struct tom_data *td, struct in6_addr *lip)
 {
 	struct clip_entry *entry;
 
 	mtx_assert(&td->clip_table_lock, MA_OWNED);
 
 	/* 'entry' is NULL after the loop if nothing matched. */
 	TAILQ_FOREACH(entry, &td->clip_table, link) {
 		if (IN6_ARE_ADDR_EQUAL(&entry->lip, lip))
 			break;
 	}
 
 	return (entry);
 }
 
 /*
  * Take a reference on a CLIP entry.  If 'ce' is NULL the entry is looked
  * up by address first.  Returns the entry that was referenced, or NULL if
  * the lookup found nothing.
  */
 struct clip_entry *
 hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce)
 {
 	struct clip_entry *entry = ce;
 
 	mtx_lock(&td->clip_table_lock);
 	if (entry == NULL)
 		entry = search_lip(td, lip);
 	if (entry != NULL)
 		entry->refcount++;
 	mtx_unlock(&td->clip_table_lock);
 
 	return (entry);
 }
 
 /*
  * Drop a reference on a CLIP entry.  The entry stays in the table even
  * when its reference count reaches zero; unreferenced stale entries are
  * removed by update_clip_table().
  */
 void
 release_lip(struct tom_data *td, struct clip_entry *ce)
 {
 
 	mtx_lock(&td->clip_table_lock);
 	KASSERT(search_lip(td, &ce->lip) == ce,
 	    ("%s: CLIP entry %p not in CLIP table.", __func__, ce));
 	KASSERT(ce->refcount > 0,
 	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
 	--ce->refcount;
 	mtx_unlock(&td->clip_table_lock);
 }
 
 /*
  * Initialize the CLIP table lock and list, then populate the table.
  * Must be called from within a synchronized op.
  */
 static void
 init_clip_table(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->clip_table);
 	/*
 	 * update_clip_table() skips its work when clip_gen equals the
 	 * current in6_ifaddr_gen.
 	 * NOTE(review): -1 is presumably chosen so the first pass never
 	 * matches; confirm how in6_ifaddr_gen is advanced.
 	 */
 	td->clip_gen = -1;
 
 	update_clip_table(sc, td);
 }
 
 /*
  * Per-adapter callback used by t4_clip_task(): refresh the adapter's CLIP
  * table inside a synchronized op, provided the TOM is active on it.  Does
  * nothing if the synchronized op cannot be started.
  */
 static void
 update_clip(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc") == 0) {
 		if (uld_active(sc, ULD_TOM))
 			update_clip_table(sc, sc->tom_softc);
 
 		end_synchronized_op(sc, LOCK_HELD);
 	}
 }
 
 /*
  * Task handler: runs update_clip() via t4_iterate().  Both arguments are
  * unused.
  */
 static void
 t4_clip_task(void *arg, int count)
 {
 
 	t4_iterate(update_clip, NULL);
 }
 
 /*
  * Bring the adapter's CLIP table in line with the IPv6 addresses that are
  * currently configured in the vnets of the adapter's VIs.
  *
  * Nothing is done if the table is already at the current in6_ifaddr_gen.
  * Otherwise every existing entry is first moved to a local "stale" list;
  * addresses that are still configured are moved back (or added to the
  * hardware if they are new), and whatever remains stale is deleted from
  * the hardware unless the driver still holds a reference on it.
  *
  * Must be called from within a synchronized op.
  */
 static void
 update_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	struct in6_addr *lip, tlip;
 	struct clip_head stale;
 	struct clip_entry *ce, *ce_temp;
 	struct vi_info *vi;
 	int rc, gen, i, j;
 	uintptr_t last_vnet;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	mtx_lock(&td->clip_table_lock);
 
 	gen = atomic_load_acq_int(&in6_ifaddr_gen);
 	if (gen == td->clip_gen)
 		goto done;
 
 	/* Assume every entry is stale until it is seen again below. */
 	TAILQ_INIT(&stale);
 	TAILQ_CONCAT(&stale, &td->clip_table, link);
 
 	/*
 	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
 	 * VIMAGE) or all if_vnet = vnet0.
 	 */
 	last_vnet = (uintptr_t)(-1);
 	for_each_port(sc, i)
 	for_each_vi(sc->port[i], j, vi) {
 		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
 			continue;
 
 		/* XXX: races with if_vmove */
 		CURVNET_SET(vi->ifp->if_vnet);
 		TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 			lip = &ia->ia_addr.sin6_addr;
 
 			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
 			    ("%s: mcast address in in6_ifaddr list", __func__));
 
 			if (IN6_IS_ADDR_LOOPBACK(lip))
 				continue;
 			if (IN6_IS_SCOPE_EMBED(lip)) {
 				/* Remove the embedded scope */
 				tlip = *lip;
 				lip = &tlip;
 				in6_clearscope(lip);
 			}
 			/*
 			 * XXX: how to weed out the link local address for the
 			 * loopback interface?  It's fe80::1 usually (always?).
 			 */
 
 			/*
 			 * If it's in the main list then we already know it's
 			 * not stale.
 			 */
 			TAILQ_FOREACH(ce, &td->clip_table, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 					goto next;
 			}
 
 			/*
 			 * If it's in the stale list we should move it to the
 			 * main list.
 			 */
 			TAILQ_FOREACH(ce, &stale, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
 					TAILQ_REMOVE(&stale, ce, link);
 					TAILQ_INSERT_TAIL(&td->clip_table, ce,
 					    link);
 					goto next;
 				}
 			}
 
 			/* A new IP6 address; add it to the CLIP table */
 			ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
 			if (ce == NULL) {
 				/*
 				 * M_NOWAIT allocations can fail.  Treat this
 				 * like any other failure to add the address:
 				 * report it and move on to the next one.
 				 */
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, lip, &ip[0], sizeof(ip));
 				log(LOG_ERR, "%s: could not add %s (%d)\n",
 				    __func__, ip, ENOMEM);
 				goto next;
 			}
 			memcpy(&ce->lip, lip, sizeof(ce->lip));
 			ce->refcount = 0;
 			rc = add_lip(sc, lip);
 			if (rc == 0)
 				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
 			else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not add %s (%d)\n",
 				    __func__, ip, rc);
 				free(ce, M_CXGBE);
 			}
 next:
 			continue;
 		}
 		CURVNET_RESTORE();
 		last_vnet = (uintptr_t)vi->ifp->if_vnet;
 	}
 
 	/*
 	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
 	 * no longer referenced by the driver.
 	 */
 	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
 		if (ce->refcount == 0) {
 			rc = delete_lip(sc, &ce->lip);
 			if (rc == 0) {
 				TAILQ_REMOVE(&stale, ce, link);
 				free(ce, M_CXGBE);
 			} else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not delete %s (%d)\n",
 				    __func__, ip, rc);
 			}
 		}
 	}
 	/* The ones that are still referenced need to stay in the CLIP table */
 	TAILQ_CONCAT(&td->clip_table, &stale, link);
 
 	td->clip_gen = gen;
 done:
 	mtx_unlock(&td->clip_table_lock);
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 }
 
 /*
  * Empty and destroy the CLIP table of 'td'.  Each entry is unlinked, handed
  * to delete_lip() and freed; afterwards the table lock itself is destroyed.
  * Nothing is done if the table lock was never initialized.
  */
 static void
 destroy_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct clip_entry *entry, *next;
 
 	if (!mtx_initialized(&td->clip_table_lock))
 		return;
 
 	mtx_lock(&td->clip_table_lock);
 	TAILQ_FOREACH_SAFE(entry, &td->clip_table, link, next) {
 		/* No entry may still be referenced when the table goes away. */
 		KASSERT(entry->refcount == 0,
 		    ("%s: CLIP entry %p still in use (%d)", __func__,
 		    entry, entry->refcount));
 		TAILQ_REMOVE(&td->clip_table, entry, link);
 		delete_lip(sc, &entry->lip);
 		free(entry, M_CXGBE);
 	}
 	mtx_unlock(&td->clip_table_lock);
 	mtx_destroy(&td->clip_table_lock);
 }
 
 /*
  * Release everything hanging off the per-adapter TOM softc 'td' and then td
  * itself.  Must be called inside a synchronized operation on 'sc'.  It is
  * used both for a normal deactivate and to unwind a failed activation, which
  * is why every lock and the listen hash are checked before being destroyed.
  */
 static void
 free_tom_data(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* The caller must already have made sure that nothing is in use. */
 	KASSERT(TAILQ_EMPTY(&td->toep_list),
 	    ("%s: TOE PCB list is not empty.", __func__));
 	KASSERT(td->lctx_count == 0,
 	    ("%s: lctx hash table is not empty.", __func__));
 
 	tls_free_kmap(td);
 	t4_free_ppod_region(&td->pr);
 	destroy_clip_table(sc, td);
 
 	/* listen_mask is still 0 (td is M_ZERO'd) if the hash was never set up. */
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 
 	if (mtx_initialized(&td->unsent_wr_lock))
 		mtx_destroy(&td->unsent_wr_lock);
 	if (mtx_initialized(&td->lctx_hash_lock))
 		mtx_destroy(&td->lctx_hash_lock);
 	if (mtx_initialized(&td->toep_list_lock))
 		mtx_destroy(&td->toep_list_lock);
 
 	free_tid_tabs(&sc->tids);
 	free(td, M_CXGBE);
 }
 
 static void
 reclaim_wr_resources(void *arg, int count)
 {
 	struct tom_data *td = arg;
 	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 	struct cpl_act_open_req *cpl;
 	u_int opcode, atid;
 	struct wrqe *wr;
 	struct adapter *sc;
 
 	mtx_lock(&td->unsent_wr_lock);
 	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 	mtx_unlock(&td->unsent_wr_lock);
 
 	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&twr_list, link);
 
 		cpl = wrtod(wr);
 		opcode = GET_OPCODE(cpl);
 
 		switch (opcode) {
 		case CPL_ACT_OPEN_REQ:
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			sc = td_adapter(td);
 
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		default:
 			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 			/* WR not freed here; go look at it with a debugger.  */
 		}
 	}
 }
 
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
  */
 static int
 t4_tom_activate(struct adapter *sc)
 {
 	struct tom_data *td;
 	struct toedev *tod;
 	struct vi_info *vi;
 	struct sge_ofld_rxq *ofld_rxq;
 	int i, j, rc, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* per-adapter softc for TOM */
 	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (td == NULL)
 		return (ENOMEM);
 
 	/* List of TOE PCBs and associated lock */
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 
 	/* Listen context */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 	    &td->listen_mask, HASH_NOWAIT);
 
 	/* List of WRs for which L2 resolution failed */
 	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 	STAILQ_INIT(&td->unsent_wr_list);
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
 	rc = alloc_tid_tabs(&sc->tids);
 	if (rc != 0)
 		goto done;
 
 	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
 	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
 	if (rc != 0)
 		goto done;
 	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
 	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 
 	/* CLIP table for IPv6 offload */
 	init_clip_table(sc, td);
 
 	if (sc->vres.key.size != 0) {
 		rc = tls_init_kmap(sc, td);
 		if (rc != 0)
 			goto done;
 	}
 
 	/* toedev ops */
 	tod = &td->tod;
 	init_toedev(tod);
 	tod->tod_softc = sc;
 	tod->tod_connect = t4_connect;
 	tod->tod_listen_start = t4_listen_start;
 	tod->tod_listen_stop = t4_listen_stop;
 	tod->tod_rcvd = t4_rcvd;
 	tod->tod_output = t4_tod_output;
 	tod->tod_send_rst = t4_send_rst;
 	tod->tod_send_fin = t4_send_fin;
 	tod->tod_pcb_detach = t4_pcb_detach;
 	tod->tod_l2_update = t4_l2_update;
 	tod->tod_syncache_added = t4_syncache_added;
 	tod->tod_syncache_removed = t4_syncache_removed;
 	tod->tod_syncache_respond = t4_syncache_respond;
 	tod->tod_offload_socket = t4_offload_socket;
 	tod->tod_ctloutput = t4_ctloutput;
 
 	for_each_port(sc, i) {
 		for_each_vi(sc->port[i], v, vi) {
 			TOEDEV(vi->ifp) = &td->tod;
 			for_each_ofld_rxq(vi, j, ofld_rxq) {
 				ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl;
 				ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2;
 			}
 		}
 	}
 
 	sc->tom_softc = td;
 	register_toedev(sc->tom_softc);
 
 done:
 	if (rc != 0)
 		free_tom_data(sc, td);
 	return (rc);
 }
 
 /*
  * Undo t4_tom_activate() for 'sc'.  Must be called inside a synchronized
  * operation.  Returns 0 if TOM was not active or has been torn down, and
  * EBUSY if anything still depends on it, in which case nothing is freed.
  */
 static int
 t4_tom_deactivate(struct adapter *sc)
 {
 	struct tom_data *td = sc->tom_softc;
 	int in_use = 0;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
 	/* At least one port has IFCAP_TOE enabled. */
 	if (sc->offload_map != 0)
 		return (EBUSY);
 
 	/* Both iWARP and iSCSI rely on the TOE. */
 	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 		return (EBUSY);
 
 	/* Offloaded connections still around? */
 	mtx_lock(&td->toep_list_lock);
 	if (!TAILQ_EMPTY(&td->toep_list))
 		in_use = 1;
 	mtx_unlock(&td->toep_list_lock);
 
 	/* Listen contexts still around? */
 	mtx_lock(&td->lctx_hash_lock);
 	if (td->lctx_count > 0)
 		in_use = 1;
 	mtx_unlock(&td->lctx_hash_lock);
 
 	/* Let a queued reclaim task finish, then see if WRs are left over. */
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		in_use = 1;
 	mtx_unlock(&td->unsent_wr_lock);
 
 	if (in_use)
 		return (EBUSY);
 
 	unregister_toedev(sc->tom_softc);
 	free_tom_data(sc, td);
 	sc->tom_softc = NULL;
 
 	return (0);
 }
 
 /*
  * ifaddr_event handler (registered in t4_tom_mod_load).  Bumps the global
  * in6_ifaddr_gen counter and schedules clip_task on taskqueue_thread; both
  * arguments are unused.
  */
 static void
 t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
 {
 
 	atomic_add_rel_int(&in6_ifaddr_gen, 1);
 	/*
 	 * NOTE(review): the negative tick count presumably asks for a hz/4
 	 * delay without pushing back a timeout that is already pending --
 	 * confirm against taskqueue(9).
 	 */
 	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
 }
 
 static int
 t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	int error;
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		error = t4_aio_queue_ddp(so, job);
 		if (error != EOPNOTSUPP)
 			return (error);
 	}
 
 	return (t4_aio_queue_aiotx(so, job));
 }
 
 /*
  * pr_ctloutput method installed in the TOE protosw.  The TCP_TLSOM_* options
  * at level IPPROTO_TCP are handled by t4_ctloutput_tls(); every other request
  * goes to the regular tcp_ctloutput().
  */
 static int
 t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
 {
 
 	if (sopt->sopt_level == IPPROTO_TCP) {
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_SET_TLS_CONTEXT:
 		case TCP_TLSOM_GET_TLS_TOM:
 		case TCP_TLSOM_CLR_TLS_TOM:
 		case TCP_TLSOM_CLR_QUIES:
 			return (t4_ctloutput_tls(so, sopt));
 		default:
 			break;
 		}
 	}
 
 	return (tcp_ctloutput(so, sopt));
 }
 
 /*
  * MOD_LOAD handler: installs the CPL handlers, loads the DDP and TLS
  * sub-modules, builds the TOE variants of the TCP and TCP6 protosw, hooks the
  * ifaddr event and finally registers the TOM ULD.
  *
  * Returns ENOPROTOOPT if a TCP protosw cannot be found, otherwise whatever
  * t4_register_uld() returns.
  *
  * NOTE(review): none of the failure returns undo the work done before them.
  * Presumably the module framework calls the MOD_UNLOAD handler after a failed
  * MOD_LOAD (t4_tom_mod_unload checks ifaddr_evhandler before deregistering
  * it) -- confirm before adding cleanup here, to avoid a double teardown.
  */
 static int
 t4_tom_mod_load(void)
 {
 	struct protosw *tcp_protosw, *tcp6_protosw;
 
 	/* CPL handlers */
 	t4_init_connect_cpl_handlers();
 	t4_init_listen_cpl_handlers();
 	t4_init_cpl_io_handlers();
 
 	t4_ddp_mod_load();
 	t4_tls_mod_load();
 
 	/*
 	 * toe_protosw/toe_usrreqs start as copies of the stock TCP ones and
 	 * differ only in the aio_queue and ctloutput methods.
 	 */
 	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp_protosw == NULL)
 		return (ENOPROTOOPT);
 	bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
 	bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
 	toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe_protosw.pr_ctloutput = t4_ctloutput_tom;
 	toe_protosw.pr_usrreqs = &toe_usrreqs;
 
 	/* Same again for TCP over IPv6. */
 	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp6_protosw == NULL)
 		return (ENOPROTOOPT);
 	bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
 	bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
 	toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
 	toe6_protosw.pr_usrreqs = &toe6_usrreqs;
 
 	/* clip_task is scheduled from t4_tom_ifaddr_event. */
 	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
 	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
 	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
 
 	return (t4_register_uld(&tom_uld_info));
 }
 
 /*
  * Per-adapter callback for t4_iterate() (see t4_tom_mod_unload): attempts to
  * deactivate TOM on 'sc'.  The adapter is skipped if a synchronized
  * operation cannot be started on it.
  */
 static void
 tom_uninit(struct adapter *sc, void *arg __unused)
 {
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun");
 	if (rc != 0)
 		return;
 
 	/* Try to free resources (works only if no port has IFCAP_TOE) */
 	if (uld_active(sc, ULD_TOM))
 		(void)t4_deactivate_uld(sc, ULD_TOM);
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * MOD_UNLOAD handler: tries to deactivate TOM on every adapter, unregisters
  * the ULD, removes the ifaddr event hook together with its task, and undoes
  * the sub-module and CPL handler setup done by t4_tom_mod_load().
  *
  * Returns EBUSY (and leaves everything else in place) if the ULD is still in
  * use, 0 otherwise.
  */
 static int
 t4_tom_mod_unload(void)
 {
 	t4_iterate(tom_uninit, NULL);
 
 	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
 		return (EBUSY);
 
 	if (ifaddr_evhandler) {
 		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
 		/*
 		 * The event handler is gone, so nothing can re-enqueue the
 		 * task.  Cancelling fails while the task is executing; wait
 		 * for it in that case so that t4_clip_task is not still
 		 * running when the module text goes away.
 		 */
 		while (taskqueue_cancel_timeout(taskqueue_thread, &clip_task,
 		    NULL) != 0)
 			taskqueue_drain_timeout(taskqueue_thread, &clip_task);
 	}
 
 	t4_tls_mod_unload();
 	t4_ddp_mod_unload();
 
 	t4_uninit_connect_cpl_handlers();
 	t4_uninit_listen_cpl_handlers();
 	t4_uninit_cpl_io_handlers();
 
 	return (0);
 }
 #endif	/* TCP_OFFLOAD */
 
 /*
  * Module event handler for t4_tom.  With TCP_OFFLOAD compiled in, MOD_LOAD
  * and MOD_UNLOAD are dispatched to their handlers and any other command gets
  * EINVAL.  Without TCP_OFFLOAD every command fails with EOPNOTSUPP after a
  * message is printed.
  */
 static int
 t4_tom_modevent(module_t mod, int cmd, void *arg)
 {
 #ifdef TCP_OFFLOAD
 	if (cmd == MOD_LOAD)
 		return (t4_tom_mod_load());
 	if (cmd == MOD_UNLOAD)
 		return (t4_tom_mod_unload());
 
 	return (EINVAL);
 #else
 	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Module description handed to DECLARE_MODULE below: the module name, its
  * event handler and a third member that is left 0.
  */
 static moduledata_t t4_tom_moddata= {
 	"t4_tom",
 	t4_tom_modevent,
 	0
 };
 
 /* t4_tom is version 1 and depends on version 1 of toecore and t4nex. */
 MODULE_VERSION(t4_tom, 1);
 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
Index: head/sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h	(revision 331648)
+++ head/sys/dev/cxgbe/tom/t4_tom.h	(revision 331649)
@@ -1,435 +1,435 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_TOM_H__
 #define __T4_TOM_H__
 #include <sys/vmem.h>
 #include "tom/t4_tls.h"
 
 /* Size requested for the listen context hash table (tom_data.listen_hash). */
 #define LISTEN_HASH_SIZE 32
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 /* NOTE(review): presumably a byte count reserved for DDP -- confirm at its users. */
 #define	DDP_RSVD_WIN (16 * 1024U)
 #define	SB_DDP_INDICATE	SB_IN_TOE	/* soreceive must respond to indicate */
 
 /* Feature switch; carries no value, it is only tested for being defined. */
 #define USE_DDP_RX_FLOW_CONTROL
 
 /* Size in bytes of n page pods, and of a single page pod. */
 #define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
 #define PPOD_SIZE	(PPOD_SZ(1))
 
 /*
  * TOE PCB flags.  Each constant is a distinct bit so that they can be
  * combined in toepcb.flags; the TPF_SYNQE* ones describe a synq_entry, whose
  * flags member shares this flag space.
  */
 enum {
 	TPF_ATTACHED	   = (1 << 0),	/* a tcpcb refers to this toepcb */
 	TPF_FLOWC_WR_SENT  = (1 << 1),	/* firmware flow context WR sent */
 	TPF_TX_DATA_SENT   = (1 << 2),	/* some data sent */
 	TPF_TX_SUSPENDED   = (1 << 3),	/* tx suspended for lack of resources */
 	TPF_SEND_FIN	   = (1 << 4),	/* send FIN after all pending data */
 	TPF_FIN_SENT	   = (1 << 5),	/* FIN has been sent */
 	TPF_ABORT_SHUTDOWN = (1 << 6),	/* connection abort is in progress */
 	TPF_CPL_PENDING    = (1 << 7),	/* haven't received the last CPL */
 	TPF_SYNQE	   = (1 << 8),	/* synq_entry, not really a toepcb */
 	TPF_SYNQE_NEEDFREE = (1 << 9),	/* synq_entry was malloc'd separately */
 	TPF_SYNQE_TCPDDP   = (1 << 10),	/* ulp_mode TCPDDP in toepcb */
 	TPF_SYNQE_EXPANDED = (1 << 11),	/* toepcb ready, tid context updated */
 	TPF_SYNQE_HAS_L2TE = (1 << 12),	/* we've replied to PASS_ACCEPT_REQ */
 	TPF_SYNQE_TLS      = (1 << 13), /* ulp_mode TLS in toepcb */
 	TPF_FORCE_CREDITS  = (1 << 14), /* always send credits */
 };
 
 /*
  * DDP state bits, one bit per constant.
  * NOTE(review): presumably these are kept in ddp_pcb.flags -- confirm.
  */
 enum {
 	DDP_OK		= (1 << 0),	/* OK to turn on DDP */
 	DDP_SC_REQ	= (1 << 1),	/* state change (on/off) requested */
 	DDP_ON		= (1 << 2),	/* DDP is turned on */
 	DDP_BUF0_ACTIVE	= (1 << 3),	/* buffer 0 in use (not invalidated) */
 	DDP_BUF1_ACTIVE	= (1 << 4),	/* buffer 1 in use (not invalidated) */
 	DDP_TASK_ACTIVE = (1 << 5),	/* requeue task is queued / running */
 	DDP_DEAD	= (1 << 6),	/* toepcb is shutting down */
 };
 
 struct sockopt;
 
 /* Tx software descriptor; toepcb ends with an array of these (txsd[]). */
 struct ofld_tx_sdesc {
 	uint32_t plen;		/* payload length */
 	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
 	void *iv_buffer;	/* optional buffer holding IVs for TLS */
 };
 
 /*
  * A region of page pods and the parameters needed to build tags for it.  Set
  * up by t4_init_ppod_region() and released by t4_free_ppod_region().
  */
 struct ppod_region {
 	u_int pr_start;
 	u_int pr_len;
 	u_int pr_page_shift[4];
 	uint32_t pr_tag_mask;		/* hardware tagmask for this region. */
 	uint32_t pr_invalid_bit;	/* OR with this to invalidate tag. */
 	uint32_t pr_alias_mask;		/* AND with tag to get alias bits. */
 	u_int pr_alias_shift;		/* shift this much for first alias bit. */
 	vmem_t *pr_arena;	/* NOTE(review): presumably the page pod allocator -- confirm */
 };
 
 /* A reservation of page pods, tied to the region it refers to. */
 struct ppod_reservation {
 	struct ppod_region *prsv_pr;	/* region this reservation refers to */
 	uint32_t prsv_tag;		/* Full tag: pgsz, alias, tag, color */
 	u_int prsv_nppods;
 };
 
 /*
  * A set of VM pages together with its page pod reservation.  Page sets are
  * linked into tail queues through 'link'; 'flags' takes the PS_* bits defined
  * below.
  */
 struct pageset {
 	TAILQ_ENTRY(pageset) link;
 	vm_page_t *pages;
 	int npages;
 	int flags;
 	int offset;		/* offset in first page */
 	int len;
 	struct ppod_reservation prsv;
 	struct vmspace *vm;
 	vm_offset_t start;
 	u_int vm_timestamp;
 };
 
 /* Head type for a tail queue of page sets. */
 TAILQ_HEAD(pagesetq, pageset);
 
 #define	PS_WIRED		0x0001	/* Pages wired rather than held. */
 #define	PS_PPODS_WRITTEN	0x0002	/* Page pods written to the card. */
 
 /* ext_flags bit that marks an mbuf as AIOTX; an alias of EXT_FLAG_VENDOR1. */
 #define	EXT_FLAG_AIOTX		EXT_FLAG_VENDOR1
 
 /* True iff m has M_EXT set in m_flags and EXT_FLAG_AIOTX set in its ext_flags. */
 #define	IS_AIOTX_MBUF(m)						\
 	((((m)->m_flags & M_EXT) != 0) &&				\
 	 (((m)->m_ext.ext_flags & EXT_FLAG_AIOTX) != 0))
 
 /* One DDP buffer slot; ddp_pcb holds two of these (db[2]). */
 struct ddp_buffer {
 	struct pageset *ps;
 
 	struct kaiocb *job;
 	int cancel_pending;
 };
 
 /*
  * Per-connection DDP state, embedded in toepcb as 'ddp'.  'lock' is the
  * mutex taken by the DDP_LOCK()/DDP_UNLOCK() macros.
  */
 struct ddp_pcb {
 	u_int flags;
 	struct ddp_buffer db[2];
 	TAILQ_HEAD(, pageset) cached_pagesets;
 	TAILQ_HEAD(, kaiocb) aiojobq;
 	u_int waiting_count;
 	u_int active_count;
 	u_int cached_count;
 	int active_id;	/* the currently active DDP buffer */
 	struct task requeue_task;
 	struct kaiocb *queueing;
 	struct mtx lock;
 };
 
 /* A page set paired with the AIO job it belongs to, plus a reference count. */
 struct aiotx_buffer {
 	struct pageset ps;
 	struct kaiocb *job;
 	int refcount;
 };
 
 /*
  * TOE protocol control block: the driver's per-connection state.  It is kept
  * on tom_data.toep_list through 'link' and ends with a flexible array of tx
  * software descriptors, so it has to be allocated with room for txsd[].
  */
 struct toepcb {
 	TAILQ_ENTRY(toepcb) link; /* toep_list */
 	u_int flags;		/* miscellaneous flags */
 	int refcount;
 	struct tom_data *td;
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct vnet *vnet;
 	struct vi_info *vi;	/* virtual interface */
 	struct sge_wrq *ofld_txq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ctrlq;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 	int tid;		/* Connection identifier */
 	int tc_idx;		/* traffic class that this tid is bound to */
 
 	/* tx credit handling */
 	u_int tx_total;		/* total tx WR credits (in 16B units) */
 	u_int tx_credits;	/* tx WR credits (in 16B units) available */
 	u_int tx_nocompl;	/* tx WR credits since last compl request */
 	u_int plen_nocompl;	/* payload since last compl request */
 
 	/* rx credit handling */
 	u_int sb_cc;		/* last noted value of so_rcv->sb_cc */
 	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
 
 	u_int ulp_mode;	/* ULP mode */
 	void *ulpcb;
 	void *ulpcb2;
 	struct mbufq ulp_pduq;	/* PDUs waiting to be sent out. */
 	struct mbufq ulp_pdu_reclaimq;
 
 	struct ddp_pcb ddp;
 	struct tls_ofld_info tls;
 
 	TAILQ_HEAD(, kaiocb) aiotx_jobq;
 	struct task aiotx_task;
 	bool aiotx_task_active;
 
 	/* Tx software descriptor */
 	uint8_t txsd_total;
 	uint8_t txsd_pidx;
 	uint8_t txsd_cidx;
 	uint8_t txsd_avail;
 	struct ofld_tx_sdesc txsd[];
 };
 
 /* Lock, unlock, and assert ownership of the DDP mutex of a toepcb. */
 #define	DDP_LOCK(toep)		mtx_lock(&(toep)->ddp.lock)
 #define	DDP_UNLOCK(toep)	mtx_unlock(&(toep)->ddp.lock)
 #define	DDP_ASSERT_LOCKED(toep)	mtx_assert(&(toep)->ddp.lock, MA_OWNED)
 
 /* Parameter bundle accepted by send_flowc_wr(). */
 struct flowc_tx_params {
 	uint32_t snd_nxt;
 	uint32_t rcv_nxt;
 	unsigned int snd_space;
 	unsigned int mss;
 };
 
 #define	DDP_RETRY_WAIT	5	/* seconds to wait before re-enabling DDP */
 /* NOTE(review): presumably thresholds for a DDP scoring heuristic -- confirm at their users. */
 #define	DDP_LOW_SCORE	1
 #define	DDP_HIGH_SCORE	3
 
 /*
  * Compressed state for embryonic connections for a listener.  Barely fits in
  * 64B, try not to grow it further.
  */
 struct synq_entry {
 	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
 	int flags;			/* TPF_* bits, same as toepcb's flags */
 	int tid;
 	struct listen_ctx *lctx;	/* backpointer to listen ctx */
 	struct mbuf *syn;
 	uint32_t iss;
 	uint32_t ts;
 	volatile uintptr_t wr;
 	volatile u_int refcnt;
 	uint16_t l2e_idx;
 	uint16_t rcv_bufsize;
 };
 
 /* listen_ctx flags */
 #define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
 
 /*
  * Driver state for one listening socket.  Contexts are hashed into
  * tom_data.listen_hash through 'link', and each one owns the queue of
  * synq_entry structures for its embryonic connections.
  */
 struct listen_ctx {
 	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
 	volatile int refcount;
 	int stid;
 	struct stid_region stid_region;
 	int flags;			/* LCTX_* */
 	struct inpcb *inp;		/* listening socket's inp */
 	struct vnet *vnet;
 	struct sge_wrq *ctrlq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct clip_entry *ce;
 	TAILQ_HEAD(, synq_entry) synq;
 };
 
 /*
  * One local IPv6 address in the CLIP table.  Entries live on
  * tom_data.clip_table; an entry whose refcount is 0 is not in use by the
  * driver.
  */
 struct clip_entry {
 	TAILQ_ENTRY(clip_entry) link;
 	struct in6_addr lip;	/* local IPv6 address */
 	u_int refcount;
 };
 
 /* Head type for a tail queue of CLIP entries. */
 TAILQ_HEAD(clip_head, clip_entry);
 /*
  * Per-adapter softc for TOM.  The toedev is embedded as 'tod', which lets
  * tod_td() recover the tom_data from a toedev pointer.
  */
 struct tom_data {
 	struct toedev tod;
 
 	/* toepcb's associated with this TOE device */
 	struct mtx toep_list_lock;
 	TAILQ_HEAD(, toepcb) toep_list;
 
 	/* Hash table of listen contexts */
 	struct mtx lctx_hash_lock;
 	LIST_HEAD(, listen_ctx) *listen_hash;
 	u_long listen_mask;
 	int lctx_count;		/* # of lctx in the hash table */
 
 	struct ppod_region pr;
 
 	/* Set up by tls_init_kmap(), released by tls_free_kmap() */
 	vmem_t *key_map;
 	
 	/* CLIP table: local IPv6 addresses */
 	struct mtx clip_table_lock;
 	struct clip_head clip_table;
 	int clip_gen;
 
 	/* WRs that will not be sent to the chip because L2 resolution failed */
 	struct mtx unsent_wr_lock;
 	STAILQ_HEAD(, wrqe) unsent_wr_list;
 	struct task reclaim_wr_resources;
 };
 
 /* Map a toedev back to the tom_data that embeds it as its 'tod' member. */
 static inline struct tom_data *
 tod_td(struct toedev *tod)
 {
 	struct tom_data *td;
 
 	td = __containerof(tod, struct tom_data, tod);
 	return (td);
 }
 
 static inline struct adapter *
 td_adapter(struct tom_data *td)
 {
 
 	return (td->tod.tod_softc);
 }
 
 /*
  * Store ulp_submode in the first byte of the mbuf's PH_per packet header
  * field.  'm' must carry a packet header (asserted).
  */
 static inline void
 set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_per.eight[0] = ulp_submode;
 }
 
 /*
  * Return the ULP submode stored by set_mbuf_ulp_submode().  'm' must carry a
  * packet header (asserted).
  */
 static inline uint8_t
 mbuf_ulp_submode(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_per.eight[0]);
 }
 
 /* t4_tom.c */
 struct toepcb *alloc_toepcb(struct vi_info *, int, int, int);
 struct toepcb *hold_toepcb(struct toepcb *);
 void free_toepcb(struct toepcb *);
 void offload_socket(struct socket *, struct toepcb *);
 void undo_offload_socket(struct socket *);
 void final_cpl_received(struct toepcb *);
 void insert_tid(struct adapter *, int, void *, int);
 void *lookup_tid(struct adapter *, int);
 void update_tid(struct adapter *, int, void *);
 void remove_tid(struct adapter *, int, int);
 void release_tid(struct adapter *, int, struct sge_wrq *);
 int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
 u_long select_rcv_wnd(struct socket *);
 int select_rcv_wscale(void);
 uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *,
     int, int, int, int);
 uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
 int select_ulp_mode(struct socket *, struct adapter *);
 void set_ulp_mode(struct toepcb *, int);
 int negative_advice(int);
 struct clip_entry *hold_lip(struct tom_data *, struct in6_addr *,
     struct clip_entry *);
 void release_lip(struct tom_data *, struct clip_entry *);
 
 /* t4_connect.c */
 void t4_init_connect_cpl_handlers(void);
 void t4_uninit_connect_cpl_handlers(void);
 int t4_connect(struct toedev *, struct socket *, struct rtentry *,
     struct sockaddr *);
 void act_open_failure_cleanup(struct adapter *, u_int, u_int);
 
 /* t4_listen.c */
 void t4_init_listen_cpl_handlers(void);
 void t4_uninit_listen_cpl_handlers(void);
 int t4_listen_start(struct toedev *, struct tcpcb *);
 int t4_listen_stop(struct toedev *, struct tcpcb *);
 void t4_syncache_added(struct toedev *, void *);
 void t4_syncache_removed(struct toedev *, void *);
 int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
 int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 void t4_offload_socket(struct toedev *, void *, struct socket *);
 
 /* t4_cpl_io.c */
 void aiotx_init_toep(struct toepcb *);
 int t4_aio_queue_aiotx(struct socket *, struct kaiocb *);
 void t4_init_cpl_io_handlers(void);
 void t4_uninit_cpl_io_handlers(void);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
 void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
 void send_reset(struct adapter *, struct toepcb *, uint32_t);
 int send_rx_credits(struct adapter *, struct toepcb *, int);
 void send_rx_modulate(struct adapter *, struct toepcb *);
 void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
 int t4_close_conn(struct adapter *, struct toepcb *);
 void t4_rcvd(struct toedev *, struct tcpcb *);
 void t4_rcvd_locked(struct toedev *, struct tcpcb *);
 int t4_tod_output(struct toedev *, struct tcpcb *);
 int t4_send_fin(struct toedev *, struct tcpcb *);
 int t4_send_rst(struct toedev *, struct tcpcb *);
-void t4_set_tcb_field(struct adapter *, struct sge_wrq *, int, uint16_t,
-    uint64_t, uint64_t, int, int, int);
+void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *,
+    uint16_t, uint64_t, uint64_t, int, int);
 void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop);
 void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop);
 int do_set_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 
 /* t4_ddp.c */
 int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
     const char *);
 void t4_free_ppod_region(struct ppod_region *);
 int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *);
 int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int,
     struct ppod_reservation *);
 int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
     struct pageset *);
 int t4_write_page_pods_for_buf(struct adapter *, struct sge_wrq *, int tid,
     struct ppod_reservation *, vm_offset_t, int);
 void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
     struct mbuf **, struct mbuf **, int *);
 int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
 void t4_ddp_mod_load(void);
 void t4_ddp_mod_unload(void);
 void ddp_assert_empty(struct toepcb *);
 void ddp_init_toep(struct toepcb *);
 void ddp_uninit_toep(struct toepcb *);
 void ddp_queue_toep(struct toepcb *);
 void release_ddp_resources(struct toepcb *toep);
 void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t);
 void handle_ddp_indicate(struct toepcb *);
 void handle_ddp_tcb_rpl(struct toepcb *, const struct cpl_set_tcb_rpl *);
 void insert_ddp_data(struct toepcb *, uint32_t);
 
 /* t4_tls.c */
 bool can_tls_offload(struct adapter *);
 int t4_ctloutput_tls(struct socket *, struct sockopt *);
 void t4_push_tls_records(struct adapter *, struct toepcb *, int);
 void t4_tls_mod_load(void);
 void t4_tls_mod_unload(void);
 void tls_establish(struct toepcb *);
 void tls_free_kmap(struct tom_data *);
 int tls_init_kmap(struct adapter *, struct tom_data *);
 void tls_init_toep(struct toepcb *);
 int tls_rx_key(struct toepcb *);
 void tls_stop_handshake_timer(struct toepcb *);
 int tls_tx_key(struct toepcb *);
 void tls_uninit_toep(struct toepcb *);
 
 #endif