Index: stable/11/sys/dev/cxgbe/cxgbei/cxgbei.c
===================================================================
--- stable/11/sys/dev/cxgbe/cxgbei/cxgbei.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/cxgbei/cxgbei.c	(revision 348704)
@@ -1,1118 +1,1117 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Chelsio T5xx iSCSI driver
  *
  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 
 #ifdef TCP_OFFLOAD
 #include <sys/errno.h>
 #include <sys/kthread.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/toecore.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_fsm.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <dev/iscsi/iscsi_ioctl.h>
 #include <dev/iscsi/iscsi.h>
 #include <cam/ctl/ctl_frontend_iscsi.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 #include "cxgbei_ulp2_ddp.h"
 
 static int worker_thread_count;
 static struct cxgbei_worker_thread_softc *cwt_softc;
 static struct proc *cxgbei_proc;
 
 /* XXXNP some header instead. */
 struct icl_pdu *icl_cxgbei_new_pdu(int);
 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
 void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
 
 /*
  * Direct Data Placement -
  * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
  * final destination host-memory buffers based on the Initiator Task Tag (ITT)
  * in Data-In or Target Task Tag (TTT) in Data-Out PDUs.
  * The host memory address is programmed into h/w in the format of pagepod
  * entries.
  * The location of the pagepod entry is encoded into ddp tag which is used as
  * the base for ITT/TTT.
  */
 
 /*
  * functions to program the pagepod in h/w
  */
 static void inline
 ppod_set(struct pagepod *ppod,
 	struct cxgbei_ulp2_pagepod_hdr *hdr,
 	struct cxgbei_ulp2_gather_list *gl,
 	unsigned int pidx)
 {
 	int i;
 
 	memcpy(ppod, hdr, sizeof(*hdr));
 
 	for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) {
 		ppod->addr[i] = pidx < gl->nelem ?
 			cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL;
 	}
 }
 
 static void inline
 ppod_clear(struct pagepod *ppod)
 {
 	memset(ppod, 0, sizeof(*ppod));
 }
 
 static inline void
 ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
 		unsigned int wr_len, unsigned int dlen,
 		unsigned int pm_addr)
 {
 	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);
 
 	INIT_ULPTX_WR(req, wr_len, 0, 0);
 	req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 				V_ULP_MEMIO_ORDER(is_t4(sc)) |
 				V_T5_ULP_MEMIO_IMM(is_t5(sc)));
 	req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
 	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
 				| V_FW_WR_FLOWID(tid));
 	req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));
 
 	idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htonl(dlen);
 }
 
 #define ULPMEM_IDATA_MAX_NPPODS 1	/* 256/PPOD_SIZE */
 #define PCIE_MEMWIN_MAX_NPPODS 16	/* 1024/PPOD_SIZE */
 
 static int
 ppod_write_idata(struct cxgbei_data *ci,
 			struct cxgbei_ulp2_pagepod_hdr *hdr,
 			unsigned int idx, unsigned int npods,
 			struct cxgbei_ulp2_gather_list *gl,
 			unsigned int gl_pidx, struct toepcb *toep)
 {
 	u_int dlen = PPOD_SIZE * npods;
 	u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
 	u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
 	    sizeof(struct ulptx_idata) + dlen, 16);
 	struct ulp_mem_io *req;
 	struct ulptx_idata *idata;
 	struct pagepod *ppod;
 	u_int i;
 	struct wrqe *wr;
 	struct adapter *sc = toep->vi->pi->adapter;
 
 	wr = alloc_wrqe(wr_len, toep->ctrlq);
 	if (wr == NULL) {
 		CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
 		return (ENOMEM);
 	}
 
 	req = wrtod(wr);
 	memset(req, 0, wr_len);
 	ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
 	idata = (struct ulptx_idata *)(req + 1);
 
 	ppod = (struct pagepod *)(idata + 1);
 	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
 		if (!hdr) /* clear the pagepod */
 			ppod_clear(ppod);
 		else /* set the pagepod */
 			ppod_set(ppod, hdr, gl, gl_pidx);
 	}
 
 	t4_wrq_tx(sc, wr);
 	return 0;
 }
 
 int
 t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
     struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
     struct cxgbei_ulp2_gather_list *gl, int reply)
 {
 	struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
 	struct toepcb *toep = icc->toep;
 	int err;
 	unsigned int pidx = 0, w_npods = 0, cnt;
 
 	/*
 	 * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
 	 * the order would not be guaranteed, so we will stick with IMMD
 	 */
 	gl->tid = toep->tid;
 	gl->port_id = toep->vi->pi->port_id;
 	gl->egress_dev = (void *)toep->vi->ifp;
 
 	/* send via immediate data */
 	for (; w_npods < npods; idx += cnt, w_npods += cnt,
 		pidx += PPOD_PAGES) {
 		cnt = npods - w_npods;
 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
 			cnt = ULPMEM_IDATA_MAX_NPPODS;
 		err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
 		if (err) {
 			printf("%s: ppod_write_idata failed\n", __func__);
 			break;
 		}
 	}
 	return err;
 }
 
 void
 t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
     u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
 {
 	struct toepcb *toep = icc->toep;
 	int err = -1;
 	u_int pidx = 0;
 	u_int w_npods = 0;
 	u_int cnt;
 
 	for (; w_npods < npods; idx += cnt, w_npods += cnt,
 		pidx += PPOD_PAGES) {
 		cnt = npods - w_npods;
 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
 			cnt = ULPMEM_IDATA_MAX_NPPODS;
 		err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
 		if (err)
 			break;
 	}
 }
 
 static int
 cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
 {
 	unsigned int data_len = csio->dxfer_len;
 	unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
 	unsigned int nsge;
 	unsigned char *sgaddr = csio->data_ptr;
 	unsigned int len = 0;
 
 	nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	sgl->sg_addr = sgaddr;
 	sgl->sg_offset = sgoffset;
 	if (data_len <  (PAGE_SIZE - sgoffset))
 		len = data_len;
 	else
 		len = PAGE_SIZE - sgoffset;
 
 	sgl->sg_length = len;
 
 	data_len -= len;
 	sgaddr += len;
 	sgl = sgl+1;
 
 	while (data_len > 0) {
 		sgl->sg_addr = sgaddr;
 		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
 		sgl->sg_length = len;
 	        sgaddr += len;
 		data_len -= len;
 		sgl = sgl + 1;
 	}
 
 	return nsge;
 }
 
 static int
 cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
 {
 	unsigned int data_len, sgoffset, nsge;
 	unsigned char *sgaddr;
 	unsigned int len = 0, index = 0, ctl_sg_count, i;
 	struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
 
 	if (io->scsiio.kern_sg_entries > 0) {
 		ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
 		ctl_sg_count = io->scsiio.kern_sg_entries;
 	} else {
 		ctl_sglist = &ctl_sg_entry;
 		ctl_sglist->addr = io->scsiio.kern_data_ptr;
 		ctl_sglist->len = io->scsiio.kern_data_len;
 		ctl_sg_count = 1;
 	}
 
 	sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
 	sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
 	data_len = ctl_sglist[index].len;
 
 	if (data_len <  (PAGE_SIZE - sgoffset))
 		len = data_len;
 	else
 		len = PAGE_SIZE - sgoffset;
 
 	sgl->sg_length = len;
 
 	data_len -= len;
 	sgaddr += len;
 	sgl = sgl+1;
 
 	len = 0;
 	for (i = 0;  i< ctl_sg_count; i++)
 		len += ctl_sglist[i].len;
 	nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT;
 	while (data_len > 0) {
 		sgl->sg_addr = sgaddr;
 		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
 		sgl->sg_length = len;
 		sgaddr += len;
 		data_len -= len;
 		sgl = sgl + 1;
 		if (data_len == 0) {
 			if (index == ctl_sg_count - 1)
 				break;
 			index++;
 			sgaddr = ctl_sglist[index].addr;
 			data_len = ctl_sglist[index].len;
 		}
 	}
 
 	return nsge;
 }
 
 static int
 t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
     u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
 {
 	struct cxgbei_ulp2_gather_list *gl;
 	int err = -EINVAL;
 	struct toepcb *toep = icc->toep;
 
 	gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
 	if (gl) {
 		err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid,
 		    &ci->tag_format, ddp_tag, gl, 0, 0);
 		if (err) {
 			cxgbei_ulp2_ddp_release_gl(ci, gl);
 		}
 	}
 
 	return err;
 }
 
 static unsigned int
 cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
 			struct ccb_scsiio *scmd, unsigned int *itt)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	int xferlen = scmd->dxfer_len;
 	struct cxgbei_task_data *tdata = NULL;
 	struct cxgbei_sgl *sge = NULL;
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = td_adapter(toep->td);
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	int err = -1;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	tdata = (struct cxgbei_task_data *)*prv;
 	if (xferlen == 0 || tdata == NULL)
 		goto out;
 	if (xferlen < DDP_THRESHOLD)
 		goto out;
 
 	if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 		tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
 		if (tdata->nsge == 0) {
 			CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
 			return 0;
 		}
 		sge = tdata->sgl;
 
 		tdata->sc_ddp_tag = *itt;
 
 		CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
 				__func__, *itt, tdata->sc_ddp_tag);
 		if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
 							tdata->sc_ddp_tag)) {
 			err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
 			    sge, tdata->nsge, &tdata->sc_ddp_tag);
 		} else {
 			CTR3(KTR_CXGBE,
 				"%s: itt:0x%x sc_ddp_tag:0x%x not usable",
 				__func__, *itt, tdata->sc_ddp_tag);
 		}
 	}
 out:
 	if (err < 0)
 		tdata->sc_ddp_tag =
 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);
 
 	return tdata->sc_ddp_tag;
 }
 
 static unsigned int
 cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
 				unsigned int *ttt)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = td_adapter(toep->td);
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct cxgbei_task_data *tdata = NULL;
 	int xferlen, err = -1;
 	struct cxgbei_sgl *sge = NULL;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
 	tdata = (struct cxgbei_task_data *)*prv;
 	if ((xferlen == 0) || (tdata == NULL))
 		goto out;
 	if (xferlen < DDP_THRESHOLD)
 		goto out;
 	tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
 	if (tdata->nsge == 0) {
 		CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
 		return 0;
 	}
 	sge = tdata->sgl;
 
 	tdata->sc_ddp_tag = *ttt;
 	if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
 		err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
 		    tdata->nsge, &tdata->sc_ddp_tag);
 	} else {
 		CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
 				__func__, tdata->sc_ddp_tag);
 	}
 out:
 	if (err < 0)
 		tdata->sc_ddp_tag =
 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
 	return tdata->sc_ddp_tag;
 }
 
 static int
 t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
 {
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = td_adapter(toep->td);
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 
 	cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc);
 
 	return (0);
 }
 
 static int
 cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci)
 {
 	int nppods, bits, max_sz, rc;
 	static const u_int pgsz_order[] = {0, 1, 2, 3};
 
 	MPASS(sc->vres.iscsi.size > 0);
 
 	ci->llimit = sc->vres.iscsi.start;
 	ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
 	max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2));
 
 	nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
 	if (nppods <= 1024)
 		return (ENXIO);
 
 	bits = fls(nppods);
 	if (bits > IPPOD_IDX_MAX_SIZE)
 		bits = IPPOD_IDX_MAX_SIZE;
 	nppods = (1 << (bits - 1)) - 1;
 
 	rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE,
 	    BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
 		    __func__, rc);
 		return (rc);
 	}
 
 	ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
 	ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
 	    M_CXGBE, M_NOWAIT | M_ZERO);
 	if (ci->colors == NULL || ci->gl_map == NULL) {
 		bus_dma_tag_destroy(ci->ulp_ddp_tag);
 		free(ci->colors, M_CXGBE);
 		free(ci->gl_map, M_CXGBE);
 		return (ENOMEM);
 	}
 
 	mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
 	ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE);
 	ci->nppods = nppods;
 	ci->idx_last = nppods;
 	ci->idx_bits = bits;
 	ci->idx_mask = (1 << bits) - 1;
 	ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;
 
 	ci->tag_format.sw_bits = bits;
 	ci->tag_format.rsvd_bits = bits;
 	ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
 	ci->tag_format.rsvd_mask = ci->idx_mask;
 
 	t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);
 
 	return (rc);
 }
 
 static int
 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_pdu *ip;
 	struct icl_cxgbei_pdu *icp;
 
 	M_ASSERTPKTHDR(m);
 
 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
 	if (ip == NULL)
 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
 	icp = ip_to_icp(ip);
 	bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
 	    iscsi_bhs));
 	icp->pdu_seq = ntohl(cpl->seq);
 	icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD;
 
 	/* This is the start of a new PDU.  There should be no old state. */
 	MPASS(toep->ulpcb2 == NULL);
 	toep->ulpcb2 = icp;
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
 	    __func__, tid, ntohs(cpl->len), m->m_len);
 #endif
 
 	m_freem(m);
 	return (0);
 }
 
 static int
 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 
 	M_ASSERTPKTHDR(m);
 
 	/* Must already have received the header (but not the data). */
 	MPASS(icp != NULL);
 	MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD);
 	MPASS(icp->ip.ip_data_mbuf == NULL);
 	MPASS(icp->ip.ip_data_len == 0);
 
 	m_adj(m, sizeof(*cpl));
 
 	icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD;
 	icp->ip.ip_data_mbuf = m;
 	icp->ip.ip_data_len = m->m_pkthdr.len;
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
 	    __func__, tid, ntohs(cpl->len), m->m_len);
 #endif
 
 	return (0);
 }
 
 static int
 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 	struct icl_pdu *ip;
 	u_int pdu_len, val;
 
 	MPASS(m == NULL);
 
 	/* Must already be assembling a PDU. */
 	MPASS(icp != NULL);
 	MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD);	/* Data is optional. */
 	ip = &icp->ip;
 	icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD;
 	val = ntohl(cpl->ddpvld);
 	if (val & F_DDP_PADDING_ERR)
 		icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR;
 	if (val & F_DDP_HDRCRC_ERR)
 		icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR;
 	if (val & F_DDP_DATACRC_ERR)
 		icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR;
 	if (ip->ip_data_mbuf == NULL) {
 		/* XXXNP: what should ip->ip_data_len be, and why? */
 		icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED;
 	}
 	pdu_len = ntohs(cpl->len);	/* includes everything. */
 
 	INP_WLOCK(inp);
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, pdu_len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 #ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
 #endif
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(icp->pdu_seq == tp->rcv_nxt);
 	MPASS(tp->rcv_wnd >= pdu_len);
 	tp->rcv_nxt += pdu_len;
 	tp->rcv_wnd -= pdu_len;
 	tp->t_rcvtime = ticks;
 
 	/* update rx credits */
-	toep->rx_credits += pdu_len;
 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
 
 	so = inp->inp_socket;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	icc = toep->ulpcb;
 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
 		    __func__, tid, pdu_len, icc, sb->sb_state);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 #ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
 #endif
 		return (0);
 	}
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ic = &icc->ic;
 	icl_cxgbei_new_pdu_set_conn(ip, ic);
 
 	MPASS(m == NULL); /* was unused, we'll use it now. */
 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
 	if (__predict_false(m != NULL)) {
 		int len = m_length(m, NULL);
 
 		/*
 		 * PDUs were received before the tid transitioned to ULP mode.
 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
 		 * the PDU in icp/ip.
 		 */
 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
 		    len);
 
 		/* XXXNP: needs to be rewritten. */
 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
 		    iscsi_bhs)) {
 			struct icl_cxgbei_pdu *icp0;
 			struct icl_pdu *ip0;
 
 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
 			if (ip0 == NULL)
 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
 			icp0 = ip_to_icp(ip0);
 			icp0->pdu_seq = 0; /* XXX */
 			icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD |
 			    SBUF_ULP_FLAG_STATUS_RCVD;
 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
 		}
 		m_freem(m);
 	}
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
 	    __func__, tid, pdu_len, icp->pdu_flags);
 #endif
 
 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
 
 		mtx_lock(&cwt->cwt_lock);
 		icc->rx_flags |= RXF_ACTIVE;
 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
 		if (cwt->cwt_state == CWT_SLEEPING) {
 			cwt->cwt_state = CWT_RUNNING;
 			cv_signal(&cwt->cwt_cv);
 		}
 		mtx_unlock(&cwt->cwt_lock);
 	}
 	SOCKBUF_UNLOCK(sb);
 	INP_WUNLOCK(inp);
 
 #ifdef INVARIANTS
 	toep->ulpcb2 = NULL;
 #endif
 
 	return (0);
 }
 
 /* initiator */
 void
 cxgbei_conn_task_reserve_itt(void *conn, void **prv,
 				void *scmd, unsigned int *itt)
 {
 	unsigned int tag;
 	tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
 	if (tag)
 		*itt = htonl(tag);
 	return;
 }
 
 /* target */
 void
 cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
 				void *scmd, unsigned int *ttt)
 {
 	unsigned int tag;
 	tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
 	if (tag)
 		*ttt = htonl(tag);
 	return;
 }
 
 void
 cxgbei_cleanup_task(void *conn, void *ofld_priv)
 {
 	struct icl_conn *ic = (struct icl_conn *)conn;
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct cxgbei_task_data *tdata = ofld_priv;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	MPASS(tdata != NULL);
 
 	if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag))
 		t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
 	memset(tdata, 0, sizeof(*tdata));
 }
 
 static int
 cxgbei_activate(struct adapter *sc)
 {
 	struct cxgbei_data *ci;
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
 		    __func__, sc));
 		return (0);
 	}
 
 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
 		device_printf(sc->dev,
 		    "not iSCSI offload capable, or capability disabled.\n");
 		return (ENOSYS);
 	}
 
 	/* per-adapter softc for iSCSI */
 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (ci == NULL)
 		return (ENOMEM);
 
 	rc = cxgbei_ddp_init(sc, ci);
 	if (rc != 0) {
 		free(ci, M_CXGBE);
 		return (rc);
 	}
 
 	sc->iscsi_ulp_softc = ci;
 
 	return (0);
 }
 
 static int
 cxgbei_deactivate(struct adapter *sc)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (sc->iscsi_ulp_softc != NULL) {
 		cxgbei_ddp_cleanup(sc->iscsi_ulp_softc);
 		free(sc->iscsi_ulp_softc, M_CXGBE);
 		sc->iscsi_ulp_softc = NULL;
 	}
 
 	return (0);
 }
 
 static void
 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
 		return;
 
 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
 		(void) t4_activate_uld(sc, ULD_ISCSI);
 
 	end_synchronized_op(sc, 0);
 }
 
 static void
 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
 		return;
 
 	if (uld_active(sc, ULD_ISCSI))
 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
 
 	end_synchronized_op(sc, 0);
 }
 
 static struct uld_info cxgbei_uld_info = {
 	.uld_id = ULD_ISCSI,
 	.activate = cxgbei_activate,
 	.deactivate = cxgbei_deactivate,
 };
 
 static void
 cwt_main(void *arg)
 {
 	struct cxgbei_worker_thread_softc *cwt = arg;
 	struct icl_cxgbei_conn *icc = NULL;
 	struct icl_conn *ic;
 	struct icl_pdu *ip;
 	struct sockbuf *sb;
 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
 
 	MPASS(cwt != NULL);
 
 	mtx_lock(&cwt->cwt_lock);
 	MPASS(cwt->cwt_state == 0);
 	cwt->cwt_state = CWT_RUNNING;
 	cv_signal(&cwt->cwt_cv);
 
 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
 		cwt->cwt_state = CWT_RUNNING;
 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
 			mtx_unlock(&cwt->cwt_lock);
 
 			ic = &icc->ic;
 			sb = &ic->ic_socket->so_rcv;
 
 			SOCKBUF_LOCK(sb);
 			MPASS(icc->rx_flags & RXF_ACTIVE);
 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
 				MPASS(STAILQ_EMPTY(&rx_pdus));
 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
 				SOCKBUF_UNLOCK(sb);
 
 				/* Hand over PDUs to ICL. */
 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
 					ic->ic_receive(ip);
 				}
 
 				SOCKBUF_LOCK(sb);
 				MPASS(STAILQ_EMPTY(&rx_pdus));
 			}
 			MPASS(icc->rx_flags & RXF_ACTIVE);
 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 				icc->rx_flags &= ~RXF_ACTIVE;
 			} else {
 				/*
 				 * More PDUs were received while we were busy
 				 * handing over the previous batch to ICL.
 				 * Re-add this connection to the end of the
 				 * queue.
 				 */
 				mtx_lock(&cwt->cwt_lock);
 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
 				    rx_link);
 				mtx_unlock(&cwt->cwt_lock);
 			}
 			SOCKBUF_UNLOCK(sb);
 
 			mtx_lock(&cwt->cwt_lock);
 		}
 
 		/* Inner loop doesn't check for CWT_STOP, do that first. */
 		if (__predict_false(cwt->cwt_state == CWT_STOP))
 			break;
 		cwt->cwt_state = CWT_SLEEPING;
 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 	}
 
 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
 	cwt->cwt_state = CWT_STOPPED;
 	cv_signal(&cwt->cwt_cv);
 	mtx_unlock(&cwt->cwt_lock);
 	kthread_exit();
 }
 
 static int
 start_worker_threads(void)
 {
 	int i, rc;
 	struct cxgbei_worker_thread_softc *cwt;
 
 	worker_thread_count = min(mp_ncpus, 32);
 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
 	    M_WAITOK | M_ZERO);
 
 	MPASS(cxgbei_proc == NULL);
 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
 		cv_init(&cwt->cwt_cv, "cwt cv");
 		TAILQ_INIT(&cwt->rx_head);
 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
 		    "cxgbei", "%d", i);
 		if (rc != 0) {
 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
 			    i + 1, worker_thread_count, rc);
 			mtx_destroy(&cwt->cwt_lock);
 			cv_destroy(&cwt->cwt_cv);
 			bzero(cwt, sizeof(*cwt));
 			if (i == 0) {
 				free(cwt_softc, M_CXGBE);
 				worker_thread_count = 0;
 
 				return (rc);
 			}
 
 			/* Not fatal, carry on with fewer threads. */
 			worker_thread_count = i;
 			rc = 0;
 			break;
 		}
 
 		/* Wait for thread to start before moving on to the next one. */
 		mtx_lock(&cwt->cwt_lock);
 		while (cwt->cwt_state == 0)
 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 		mtx_unlock(&cwt->cwt_lock);
 	}
 
 	MPASS(cwt_softc != NULL);
 	MPASS(worker_thread_count > 0);
 	return (0);
 }
 
 static void
 stop_worker_threads(void)
 {
 	int i;
 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
 
 	MPASS(worker_thread_count >= 0);
 
 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
 		mtx_lock(&cwt->cwt_lock);
 		MPASS(cwt->cwt_state == CWT_RUNNING ||
 		    cwt->cwt_state == CWT_SLEEPING);
 		cwt->cwt_state = CWT_STOP;
 		cv_signal(&cwt->cwt_cv);
 		do {
 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 		} while (cwt->cwt_state != CWT_STOPPED);
 		mtx_unlock(&cwt->cwt_lock);
 	}
 	free(cwt_softc, M_CXGBE);
 }
 
 /* Select a worker thread for a connection. */
 u_int
 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
 {
 	struct adapter *sc = icc->sc;
 	struct toepcb *toep = icc->toep;
 	u_int i, n;
 
 	n = worker_thread_count / sc->sge.nofldrxq;
 	if (n > 0)
 		i = toep->vi->pi->port_id * n + arc4random() % n;
 	else
 		i = arc4random() % worker_thread_count;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
 
 	return (i);
 }
 
 static int
 cxgbei_mod_load(void)
 {
 	int rc;
 
 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
 
 	rc = start_worker_threads();
 	if (rc != 0)
 		return (rc);
 
 	rc = t4_register_uld(&cxgbei_uld_info);
 	if (rc != 0) {
 		stop_worker_threads();
 		return (rc);
 	}
 
 	t4_iterate(cxgbei_activate_all, NULL);
 
 	return (rc);
 }
 
 static int
 cxgbei_mod_unload(void)
 {
 
 	t4_iterate(cxgbei_deactivate_all, NULL);
 
 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
 		return (EBUSY);
 
 	stop_worker_threads();
 
 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
 
 	return (0);
 }
 #endif
 
 static int
 cxgbei_modevent(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 
 #ifdef TCP_OFFLOAD
 	switch (cmd) {
 	case MOD_LOAD:
 		rc = cxgbei_mod_load();
 		break;
 
 	case MOD_UNLOAD:
 		rc = cxgbei_mod_unload();
 		break;
 
 	default:
 		rc = EINVAL;
 	}
 #else
 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
 	rc = EOPNOTSUPP;
 #endif
 
 	return (rc);
 }
 
 static moduledata_t cxgbei_mod = {
 	"cxgbei",
 	cxgbei_modevent,
 	NULL,
 };
 
 MODULE_VERSION(cxgbei, 1);
 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
Index: stable/11/sys/dev/cxgbe/tom/t4_connect.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_connect.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_connect.c	(revision 348704)
@@ -1,501 +1,500 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <netinet/cc/cc.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "t4_clip.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * Active open succeeded.
  */
 static int
 do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_act_establish *cpl = (const void *)(rss + 1);
 	u_int tid = GET_TID(cpl);
 	u_int atid = G_TID_TID(ntohl(cpl->tos_atid));
 	struct toepcb *toep = lookup_atid(sc, atid);
 	struct inpcb *inp = toep->inp;
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
 
 	CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid);
 	free_atid(sc, atid);
 
 	CURVNET_SET(toep->vnet);
 	INP_WLOCK(inp);
 	toep->tid = tid;
 	insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1);
 	if (inp->inp_flags & INP_DROPPED) {
 
 		/* socket closed by the kernel before hw told us it connected */
 
 		send_flowc_wr(toep, NULL);
 		send_reset(sc, toep, be32toh(cpl->snd_isn));
 		goto done;
 	}
 
 	make_established(toep, be32toh(cpl->snd_isn) - 1,
 	    be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt);
 
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		tls_establish(toep);
 
 done:
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status)
 {
 	struct toepcb *toep = lookup_atid(sc, atid);
 	struct inpcb *inp = toep->inp;
 	struct toedev *tod = &toep->td->tod;
 
 	free_atid(sc, atid);
 	toep->tid = -1;
 
 	CURVNET_SET(toep->vnet);
 	if (status != EAGAIN)
 		INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	toe_connect_failed(tod, inp, status);
 	final_cpl_received(toep);	/* unlocks inp */
 	if (status != EAGAIN)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Active open failed.
  */
 static int
 do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 	u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
 	struct toepcb *toep = lookup_atid(sc, atid);
 	int rc;
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
 
 	CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status);
 
 	/* Ignore negative advice */
 	if (negative_advice(status))
 		return (0);
 
 	if (status && act_open_has_tid(status))
 		release_tid(sc, GET_TID(cpl), toep->ctrlq);
 
 	rc = act_open_rpl_status_to_errno(status);
 	act_open_failure_cleanup(sc, atid, rc);
 
 	return (0);
 }
 
 /*
  * Options2 for active open.
  */
 static uint32_t
 calc_opt2a(struct socket *so, struct toepcb *toep,
     const struct offload_settings *s)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct port_info *pi = toep->vi->pi;
 	struct adapter *sc = pi->adapter;
 	uint32_t opt2 = 0;
 
 	/*
 	 * rx flow control, rx coalesce, congestion control, and tx pace are all
 	 * explicitly set by the driver.  On T5+ the ISS is also set by the
 	 * driver to the value picked by the kernel.
 	 */
 	if (is_t4(sc)) {
 		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
 		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
 	} else {
 		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
 		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
 	}
 
 	if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
 		opt2 |= F_SACK_EN;
 
 	if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
 		opt2 |= F_TSTAMPS_EN;
 
 	if (tp->t_flags & TF_REQ_SCALE)
 		opt2 |= F_WND_SCALE_EN;
 
 	if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
 		opt2 |= F_CCTRL_ECN;
 
 	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
 
 	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
 
 	/* These defaults are subject to ULP specific fixups later. */
 	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
 
 	opt2 |= V_PACE(0);
 
 	if (s->cong_algo >= 0)
 		opt2 |= V_CONG_CNTRL(s->cong_algo);
 	else if (sc->tt.cong_algorithm >= 0)
 		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
 	else {
 		struct cc_algo *cc = CC_ALGO(tp);
 
 		if (strcasecmp(cc->name, "reno") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
 		else if (strcasecmp(cc->name, "tahoe") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
 		if (strcasecmp(cc->name, "newreno") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
 		if (strcasecmp(cc->name, "highspeed") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
 		else {
 			/*
 			 * Use newreno in case the algorithm selected by the
 			 * host stack is not supported by the hardware.
 			 */
 			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
 		}
 	}
 
 	if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
 		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
 
 	/* Note that ofld_rxq is already set according to s->rxq. */
 	opt2 |= F_RSS_QUEUE_VALID;
 	opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
 
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		opt2 |= F_RX_FC_DDP;
 #endif
 
 	if (toep->ulp_mode == ULP_MODE_TLS) {
 		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
 		opt2 |= F_RX_FC_DISABLE;
 	}
 
 	return (htobe32(opt2));
 }
 
 void
 t4_init_connect_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl,
 	    CPL_COOKIE_TOM);
 }
 
 void
 t4_uninit_connect_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL);
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM);
 }
 
 #define DONT_OFFLOAD_ACTIVE_OPEN(x)	do { \
 	reason = __LINE__; \
 	rc = (x); \
 	goto failed; \
 } while (0)
 
 static inline int
 act_open_cpl_size(struct adapter *sc, int isipv6)
 {
 	int idx;
 	static const int sz_table[3][2] = {
 		{
 			sizeof (struct cpl_act_open_req),
 			sizeof (struct cpl_act_open_req6)
 		},
 		{
 			sizeof (struct cpl_t5_act_open_req),
 			sizeof (struct cpl_t5_act_open_req6)
 		},
 		{
 			sizeof (struct cpl_t6_act_open_req),
 			sizeof (struct cpl_t6_act_open_req6)
 		},
 	};
 
 	MPASS(chip_id(sc) >= CHELSIO_T4);
 	idx = min(chip_id(sc) - CHELSIO_T4, 2);
 
 	return (sz_table[idx][!!isipv6]);
 }
 
 /*
  * active open (soconnect).
  *
  * State of affairs on entry:
  * soisconnecting (so_state |= SS_ISCONNECTING)
  * tcbinfo not locked (This has changed - used to be WLOCKed)
  * inp WLOCKed
  * tp->t_state = TCPS_SYN_SENT
  * rtalloc1, RT_UNLOCK on rt.
  */
 int
 t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
     struct sockaddr *nam)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = NULL;
 	struct wrqe *wr = NULL;
 	struct ifnet *rt_ifp = rt->rt_ifp;
 	struct vi_info *vi;
 	int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	int reason;
 	struct offload_settings settings;
 	uint16_t vid = 0xffff;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
 	    ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family));
 
 	if (rt_ifp->if_type == IFT_ETHER)
 		vi = rt_ifp->if_softc;
 	else if (rt_ifp->if_type == IFT_L2VLAN) {
 		struct ifnet *ifp = VLAN_TRUNKDEV(rt_ifp);
 
 		vi = ifp->if_softc;
 		VLAN_TAG(rt_ifp, &vid);
 	} else if (rt_ifp->if_type == IFT_IEEE8023ADLAG)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */
 	else
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
 
 	rw_rlock(&sc->policy_lock);
 	settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, vid, inp);
 	rw_runlock(&sc->policy_lock);
 	if (!settings.offload)
 		DONT_OFFLOAD_ACTIVE_OPEN(EPERM);
 
 	if (settings.txq >= 0 && settings.txq < vi->nofldtxq)
 		txqid = settings.txq;
 	else
 		txqid = arc4random() % vi->nofldtxq;
 	txqid += vi->first_ofld_txq;
 	if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq)
 		rxqid = settings.rxq;
 	else
 		rxqid = arc4random() % vi->nofldrxq;
 	rxqid += vi->first_ofld_rxq;
 
 	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO);
 	if (toep == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
 
 	toep->tid = alloc_atid(sc, toep);
 	if (toep->tid < 0)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
 
 	toep->l2te = t4_l2t_get(vi->pi, rt_ifp,
 	    rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam);
 	if (toep->l2te == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
 
 	isipv6 = nam->sa_family == AF_INET6;
 	wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq);
 	if (wr == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
 
 	toep->vnet = so->so_vnet;
 	set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
 	SOCKBUF_LOCK(&so->so_rcv);
-	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-	toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * The kernel sets request_r_scale based on sb_max whereas we need to
 	 * take hardware's MAX_RCV_WND into account too.  This is normally a
 	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
 	 */
 	if (tp->t_flags & TF_REQ_SCALE)
 		rscale = tp->request_r_scale = select_rcv_wscale();
 	else
 		rscale = 0;
 	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings);
 	qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) |
 	    V_TID_COOKIE(CPL_COOKIE_TOM);
 
 	if (isipv6) {
 		struct cpl_act_open_req6 *cpl = wrtod(wr);
 		struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl;
 		struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl;
 
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
 
 		toep->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
 		if (toep->ce == NULL)
 			DONT_OFFLOAD_ACTIVE_OPEN(ENOENT);
 
 		switch (chip_id(sc)) {
 		case CHELSIO_T4:
 			INIT_TP_WR(cpl, 0);
 			cpl->params = select_ntuple(vi, toep->l2te);
 			break;
 		case CHELSIO_T5:
 			INIT_TP_WR(cpl5, 0);
 			cpl5->iss = htobe32(tp->iss);
 			cpl5->params = select_ntuple(vi, toep->l2te);
 			break;
 		case CHELSIO_T6:
 		default:
 			INIT_TP_WR(cpl6, 0);
 			cpl6->iss = htobe32(tp->iss);
 			cpl6->params = select_ntuple(vi, toep->l2te);
 			break;
 		}
 		OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		    qid_atid));
 		cpl->local_port = inp->inp_lport;
 		cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 		cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 		cpl->peer_port = inp->inp_fport;
 		cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 		cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->rx_credits, toep->ulp_mode, &settings);
+		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
 		cpl->opt2 = calc_opt2a(so, toep, &settings);
 	} else {
 		struct cpl_act_open_req *cpl = wrtod(wr);
 		struct cpl_t5_act_open_req *cpl5 = (void *)cpl;
 		struct cpl_t6_act_open_req *cpl6 = (void *)cpl;
 
 		switch (chip_id(sc)) {
 		case CHELSIO_T4:
 			INIT_TP_WR(cpl, 0);
 			cpl->params = select_ntuple(vi, toep->l2te);
 			break;
 		case CHELSIO_T5:
 			INIT_TP_WR(cpl5, 0);
 			cpl5->iss = htobe32(tp->iss);
 			cpl5->params = select_ntuple(vi, toep->l2te);
 			break;
 		case CHELSIO_T6:
 		default:
 			INIT_TP_WR(cpl6, 0);
 			cpl6->iss = htobe32(tp->iss);
 			cpl6->params = select_ntuple(vi, toep->l2te);
 			break;
 		}
 		OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		    qid_atid));
 		inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 		    &cpl->peer_ip, &cpl->peer_port);
 		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->rx_credits, toep->ulp_mode, &settings);
+		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
 		cpl->opt2 = calc_opt2a(so, toep, &settings);
 	}
 
 	CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__,
 	    toep->tid, tcpstates[tp->t_state], toep, inp);
 
 	offload_socket(so, toep);
 	rc = t4_l2t_send(sc, wr, toep->l2te);
 	if (rc == 0) {
 		toep->flags |= TPF_CPL_PENDING;
 		return (0);
 	}
 
 	undo_offload_socket(so);
 	reason = __LINE__;
 failed:
 	CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc);
 
 	if (wr)
 		free_wrqe(wr);
 
 	if (toep) {
 		if (toep->tid >= 0)
 			free_atid(sc, toep->tid);
 		if (toep->l2te)
 			t4_l2t_release(toep->l2te);
 		if (toep->ce)
 			t4_release_lip(sc, toep->ce);
 		free_toepcb(toep);
 	}
 
 	return (rc);
 }
 #endif
Index: stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 348704)
@@ -1,2332 +1,2305 @@
 /*-
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 VNET_DECLARE(int, tcp_do_autosndbuf);
 #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
 VNET_DECLARE(int, tcp_autosndbuf_max);
 #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct toepcb *toep);
 
 static size_t
 aiotx_mbuf_pgoff(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
 }
 
 static vm_page_t *
 aiotx_mbuf_pages(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 	int npages;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
 	return (ab->ps.pages + npages);
 }
 
 void
 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	if (ftxp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 	if (toep->tc_idx != -1) {
 		MPASS(toep->tc_idx >= 0 &&
 		    toep->tc_idx < sc->chip_params->nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	if (ftxp) {
 		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
 
 		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
 		FLOWC_PARAM(SNDBUF, sndbuf);
 		FLOWC_PARAM(MSS, ftxp->mss);
 
 		CTR6(KTR_CXGBE,
 		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
 		    ftxp->rcv_nxt);
 	} else {
 		FLOWC_PARAM(SNDBUF, 512);
 		FLOWC_PARAM(MSS, 512);
 
 		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
 	}
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Input is Bytes/second (so_max_pacing-rate), chip counts in Kilobits/second.
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}
 
 	if (toep->tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	if (toep->tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->tc_idx);
 	toep->tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Called when a connection is established to translate the TCP options
  * reported by HW to FreeBSD's native format.
  */
 static void
 assign_rxopt(struct tcpcb *tp, unsigned int opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 	int n;
 
 	INP_LOCK_ASSERT(inp);
 
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;
 
 	if (G_TCPOPT_TSTAMP(opt)) {
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
 	}
 
 	CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__,
 	    toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)],
 	    tp->t_maxseg);
 
 	if (G_TCPOPT_SACK(opt))
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	else
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Doing window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	}
 }
 
 /*
  * Completes some final bits of initialization for just established connections
  * and changes their state to TCPS_ESTABLISHED.
  *
  * The ISNs are from the exchange of SYNs.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	long bufsize;
 	uint16_t tcpopt = be16toh(opt);
 	struct flowc_tx_params ftxp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tcp_state_change(tp, TCPS_ESTABLISHED);
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
-	tp->rcv_wnd = toep->rx_credits << 10;
+	tp->rcv_wnd = toep->opt0_rcv_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
-	/*
-	 * If we were unable to send all rx credits via opt0, save the remainder
-	 * in rx_credits so that they can be handed over with the next credit
-	 * update.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
-	bufsize = select_rcv_wnd(so);
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	toep->rx_credits = bufsize - tp->rcv_wnd;
-
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
 		bufsize = V_tcp_autosndbuf_max;
 	else
 		bufsize = sbspace(&so->so_snd);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	ftxp.snd_nxt = tp->snd_nxt;
 	ftxp.rcv_nxt = tp->rcv_nxt;
 	ftxp.snd_space = bufsize;
 	ftxp.mss = tp->t_maxseg;
 	send_flowc_wr(toep, &ftxp);
 
 	soisconnected(so);
 }
 
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return (0);
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
 	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
 
 	t4_wrq_tx(sc, wr);
 	return (credits);
 }
 
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return;
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	t4_wrq_tx(sc, wr);
 }
 
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
-	int credits;
+	int rx_credits;
 
 	INP_WLOCK_ASSERT(inp);
-
 	SOCKBUF_LOCK_ASSERT(sb);
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
 
-	credits = toep->sb_cc - sbused(sb);
-	toep->sb_cc = sbused(sb);
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (toep->ulp_mode == ULP_MODE_TLS) {
-		if (toep->tls.rcv_over >= credits) {
-			toep->tls.rcv_over -= credits;
-			credits = 0;
+		if (toep->tls.rcv_over >= rx_credits) {
+			toep->tls.rcv_over -= rx_credits;
+			rx_credits = 0;
 		} else {
-			credits -= toep->tls.rcv_over;
+			rx_credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		}
 	}
-	toep->rx_credits += credits;
 
-	if (toep->rx_credits > 0 &&
-	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
-	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
-	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	if (rx_credits > 0 &&
+	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
+	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
+	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 
 	SOCKBUF_LOCK(sb);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_close_con_req *req;
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
         req->wr.wr_lo = cpu_to_be64(0);
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags |= TPF_FIN_SENT;
 	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 
 /* Maximum amount of immediate data we could stuff in a WR */
 static inline int
 max_imm_payload(int tx_credits)
 {
 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
 	else
 		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
 }
 
 /* Maximum number of SGL entries we could stuff in a WR */
 static inline int
 max_dsgl_nsegs(int tx_credits)
 {
 	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
 	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	nseg += 2 * (sge_pair_credits * 16 / 24);
 	if ((sge_pair_credits * 16) % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
     unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
 	txwr->plen = htobe32(plen);
 
 	if (txalign > 0) {
 		struct tcpcb *tp = intotcpcb(toep->inp);
 
 		if (plen < 2 * tp->t_maxseg)
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
 		else
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
 				(tp->t_flags & TF_NODELAY ? 0 :
 				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
 	}
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		if (IS_AIOTX_MBUF(m))
 			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
 			    aiotx_mbuf_pgoff(m), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool aiotx_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP ||
 	    toep->ulp_mode == ULP_MODE_TLS ||
 	    toep->ulp_mode == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		aiotx_mbuf_seen = false;
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			if (IS_AIOTX_MBUF(m))
 				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
 				    aiotx_mbuf_pgoff(m), m->m_len);
 			else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (IS_AIOTX_MBUF(m))
 				aiotx_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL,
 			    ("%s: nothing to send, but m != NULL", __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !aiotx_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *m;
 
 	while (plen > 0) {
 		m = mbufq_dequeue(q);
 
 		/* Too many credits. */
 		MPASS(m != NULL);
 		M_ASSERTPKTHDR(m);
 
 		/* Partial credits. */
 		MPASS(plen >= m->m_pkthdr.len);
 
 		plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 }
 
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	if (drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		M_ASSERTPKTHDR(sndptr);
 
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/*
 			 * This mbuf would send us _over_ the nsegs limit.
 			 * Suspend tx because the PDU can't be sent out.
 			 */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
 		 * is NULL.  A PDU's length is always a multiple of 4.
 		 */
 		MPASS(m == NULL);
 		MPASS((plen & 3) == 0);
 		MPASS(sndptr->m_pkthdr.len == plen);
 
 		shove = !(tp->t_flags & TF_MORETOCOME);
 		ulp_submode = mbuf_ulp_submode(sndptr);
 		MPASS(ulp_submode < nitems(ulp_extra_len));
 
 		/*
 		 * plen doesn't include header and data digests, which are
 		 * generated and inserted in the right places by the TOE, but
 		 * they do occupy TCP sequence space and need to be accounted
 		 * for.
 		 */
 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
 		if (plen <= max_imm) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += adjusted_plen;
 		tp->snd_max += adjusted_plen;
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI)
 		t4_push_pdus(sc, toep, 0);
 	else if (tls_tx_key(toep))
 		t4_push_tls_records(sc, toep, 0);
 	else
 		t4_push_frames(sc, toep, 0);
 
 	return (0);
 }
 
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 	if (tp->t_state >= TCPS_ESTABLISHED) {
 		if (toep->ulp_mode == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, 0);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, 0);
 		else
 			t4_push_frames(sc, toep, 0);
 	}
 
 	return (0);
 }
 
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #if defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	tp->rcv_nxt++;	/* FIN */
 
 	so = inp->inp_socket;
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 	socantrcvmore(so);
 
 	if (toep->ulp_mode != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    		be32toh(cpl->rcv_nxt)));
 	}
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		tcp_twstart(tp);
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
     int rst_status)
 {
 	struct wrqe *wr;
 	struct cpl_abort_rpl *cpl;
 
 	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	cpl = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
 	cpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct sge_wrq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	final_cpl_received(toep);
 done:
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
-	int len;
+	int len, rx_credits;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	if (tp->rcv_wnd < len) {
 		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ulp_mode == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
-	toep->sb_cc = sbused(sb);
-	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-		int credits;
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
+	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		if (txsd->iv_buffer) {
 			free(txsd->iv_buffer, M_CXGBE);
 			txsd->iv_buffer = NULL;
 		}
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		if (toep->ulp_mode == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, plen);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, plen);
 		else
 			t4_push_frames(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (toep->ulp_mode == ULP_MODE_ISCSI) {
 
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data trasmitted before the tid's ULP mode
 				 * changed to ISCSI is still in so_snd.
 				 * Incoming credits should account for so_snd
 				 * first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			if (tls_tx_key(toep)) {
 				struct tls_ofld_info *tls_ofld = &toep->tls;
 
 				MPASS(tls_ofld->sb_off >= plen);
 				tls_ofld->sb_off -= plen;
 			}
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct wrqe *wr;
 	struct cpl_set_tcb_field *req;
 	struct ofld_tx_sdesc *txsd;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (reply == 0)
 		req->reply_ctrl |= htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
 		txsd = &toep->txsd[toep->txsd_pidx];
 		txsd->tx_credits = howmany(sizeof(*req), 16);
 		txsd->plen = 0;
 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
 		    toep->txsd_avail > 0,
 		    ("%s: not enough credits (%d)", __func__,
 		    toep->tx_credits));
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 }
 
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * sent by the AIO job so far and the 'backend4' field to hold an
  * error that should be reported when the job is completed.
  */
 #define	aio_sent	backend3
 #define	aio_error	backend4
 
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 	
 static void
 free_aiotx_buffer(struct aiotx_buffer *ab)
 {
 	struct kaiocb *job;
 	long status;
 	int error;
 
 	if (refcount_release(&ab->refcount) == 0)
 		return;
 
 	job = ab->job;
 	error = job->aio_error;
 	status = job->aio_sent;
 	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
 	free(ab, M_CXGBE);
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, status, error);
 #endif
 	if (error == ECANCELED && status != 0)
 		error = 0;
 	if (error == ECANCELED)
 		aio_cancel(job);
 	else if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, status, 0);
 }
 
 static void
 t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg)
 {
 	struct aiotx_buffer *ab = buffer;
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(ab->job));
 #endif
 	free_aiotx_buffer(ab);
 }
 
 /*
  * Hold the buffer backing an AIO request and return an AIO transmit
  * buffer.
  */
 static int
 hold_aio(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	int n;
 
 	MPASS(job->backend1 == NULL);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 	n = atop(end - start);
 
 	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	refcount_init(&ab->refcount, 1);
 	ab->ps.pages = (vm_page_t *)(ab + 1);
 	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ab->ps.pages, n);
 	if (ab->ps.npages < 0) {
 		free(ab, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ab->ps.npages == n,
 	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));
 
 	ab->ps.offset = pgoff;
 	ab->ps.len = job->uaiocb.aio_nbytes;
 	ab->job = job;
 	job->backend1 = ab;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
 #endif
 	return (0);
 }
 
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct adapter *sc;
 	struct sockbuf *sb;
 	struct file *fp;
 	struct aiotx_buffer *ab;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error;
 	bool moretocome, sendmore;
 
 	sc = td_adapter(toep->td);
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	fp = job->fd_file;
 	ab = job->backend1;
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(fp->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	if (ab == NULL) {
 		error = hold_aio(job);
 		if (error != 0)
 			goto out;
 		ab = job->backend1;
 	}
 
 	/* Inline sosend_generic(). */
 
 	job->msgsnd = 1;
 
 	error = sblock(sb, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	m = m_get(M_WAITOK, MT_DATA);
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			sbunlock(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than a
 	 * a single sndbuf at a time.
 	 */
 	m->m_len = sbspace(sb);
 	if (m->m_len > ab->ps.len - job->aio_sent) {
 		m->m_len = ab->ps.len - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (m->m_len > sc->tt.sndbuf) {
 		m->m_len = sc->tt.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(m->m_len != 0);
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		sbunlock(sb);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	refcount_acquire(&ab->refcount);
 	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
 	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
 	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
 	job->aio_sent += m->m_len;
 	
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tp->t_fb->tfb_tcp_output(tp);
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	sbunlock(sb);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a non-blocking socket and the request has not
 	 * been fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop a reference on
 	 * the the aiotx buffer.  Any mbufs in flight should still
 	 * contain a reference, but this drops the reference that the
 	 * job owns while it is waiting to queue mbufs to the socket.
 	 */
 	free_aiotx_buffer(ab);
 
 out:
 	if (error) {
 		if (ab != NULL) {
 			job->aio_error = error;
 			free_aiotx_buffer(ab);
 		} else {
 			MPASS(job->aio_sent == 0);
 			aio_complete(job, -1, error);
 		}
 	}
 	if (m != NULL)
 		m_free(m);
 	SOCKBUF_LOCK(sb);
 }
 
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct kaiocb *job;
 
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_task_active = false;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	CURVNET_RESTORE();
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aiotx_queue_toep(struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
 #endif
 	if (toep->aiotx_task_active)
 		return;
 	toep->aiotx_task_active = true;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->aiotx_task);
 }
 
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	ab = job->backend1;
 	if (ab != NULL)
 		free_aiotx_buffer(ab);
 	else
 		aio_cancel(job);
 }
 
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* This only handles writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
 		return (EOPNOTSUPP);
 
 	if (!sc->tt.tx_zcopy)
 		return (EOPNOTSUPP);
 
 	if (tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 }
 
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
Index: stable/11/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_ddp.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_ddp.c	(revision 348704)
@@ -1,1983 +1,1971 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	if (ps->prsv.prsv_nppods > 0)
 		t4_free_page_pods(&ps->prsv);
 
 	if (ps->flags & PS_WIRED) {
 		for (i = 0; i < ps->npages; i++) {
 			p = ps->pages[i];
 			vm_page_lock(p);
 			vm_page_unwire(p, PQ_INACTIVE);
 			vm_page_unlock(p);
 		}
 	} else
 		vm_page_unhold_pages(ps->pages, ps->npages);
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD) && ps->flags & PS_WIRED) {
 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
 		    nitems(toep->ddp.db), ("too many wired pagesets"));
 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count++;
 	} else
 		free_pageset(toep->td, ps);
 }
 
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		if (!aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->ddp.aiojobq);
 	TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
 	toep->ddp.flags = DDP_OK;
 	toep->ddp.active_id = -1;
 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
 }
 
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp.lock);
 }
 
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	toep->flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		free_ddp_buffer(toep->td, &toep->ddp.db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		MPASS(toep->ddp.db[i].job == NULL);
 		MPASS(toep->ddp.db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
 }
 #endif
 
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp.active_count--;
 	if (toep->ddp.active_id == db_idx) {
 		if (toep->ddp.active_count == 0) {
 			KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp.active_id = -1;
 		} else
 			toep->ddp.active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 #endif
 	} else {
 		KASSERT(toep->ddp.active_count != 0 &&
 		    toep->ddp.active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp.flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 	toep->ddp.flags &= ~db_flag;
 }
 
 /* XXX: handle_ddp_data code duplication */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_flag, db_idx;
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += n;
-#endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 			toep->ddp.waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
         req->mask = htobe64(mask);
         req->val = htobe64(val);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_rx_data_ack_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp.active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp.active_id, toep->tid));
 	db = &toep->ddp.db[db_idx];
 	job = db->job;
 
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: DDP[%d] placed %d bytes (%#x)", __func__, db_idx,
 	    len, report);
 #endif
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += len;
-#endif
-
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %d)",
 		    __func__, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp.waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp.active_count == 0);
 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 	if (toep->ddp.waiting_count == 0) {
 		/*
 		 * The pending requests that triggered the request for an
 		 * an indicate were cancelled.  Those cancels should have
 		 * already disabled DDP.  Just ignore this as the data is
 		 * going into the socket buffer anyway.
 		 */
 		return;
 	}
 	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 	    toep->tid, toep->ddp.waiting_count);
 	ddp_queue_toep(toep);
 }
 
 enum {
 	DDP_BUF0_INVALIDATED = 0x2,
 	DDP_BUF1_INVALIDATED
 };
 
 CTASSERT(DDP_BUF0_INVALIDATED == CPL_COOKIE_DDP0);
 
 static int
 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	unsigned int db_idx;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	toep = lookup_tid(sc, tid);
 	inp = toep->inp;
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
 		MPASS(db_idx < nitems(toep->ddp.db));
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->ddp.db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp.waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 
 	return (0);
 }
 
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_flag, db_idx;
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
-	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
+	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += len;
-#endif
 
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 extern cpl_handler_t t4_cpl_handler[];
 
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	uint32_t vld;
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false(vld & DDP_ERR)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 		return (0);
 	}
 
 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
 
 	return (0);
 }
 
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp.flags |= DDP_SC_REQ;
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
 }
 
 static int
 calculate_hcf(int n1, int n2)
 {
 	int a, b, t;
 
 	if (n1 <= n2) {
 		a = n1;
 		b = n2;
 	} else {
 		a = n2;
 		b = n1;
 	}
 
 	while (a != 0) {
 		t = a;
 		a = b % a;
 		b = t;
 	}
 
 	return (b);
 }
 
 static inline int
 pages_to_nppods(int npages, int ddp_page_shift)
 {
 
 	MPASS(ddp_page_shift >= PAGE_SHIFT);
 
 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
 }
 
 static int
 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
     struct ppod_reservation *prsv)
 {
 	vmem_addr_t addr;       /* relative to start of region */
 
 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (ENOMEM);
 
 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
 
 	/*
 	 * The hardware tagmask includes an extra invalid bit but the arena was
 	 * seeded with valid values only.  An allocation out of this arena will
 	 * fit inside the tagmask but won't have the invalid bit set.
 	 */
 	MPASS((addr & pr->pr_tag_mask) == addr);
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	prsv->prsv_pr = pr;
 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
 	prsv->prsv_nppods = nppods;
 
 	return (0);
 }
 
 int
 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
 {
 	int i, hcf, seglen, idx, nppods;
 	struct ppod_reservation *prsv = &ps->prsv;
 
 	KASSERT(prsv->prsv_nppods == 0,
 	    ("%s: page pods already allocated", __func__));
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < ps->npages; i++) {
 		seglen = PAGE_SIZE;
 		while (i < ps->npages - 1 &&
 		    ps->pages[i]->phys_addr + PAGE_SIZE ==
 		    ps->pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (0);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (1);
 }
 
 int
 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
     struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx, npages, nppods;
 	uintptr_t start_pva, end_pva, pva, p1;
 
 	MPASS(buf > 0);
 	MPASS(len > 0);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	start_pva = trunc_page(buf);
 	end_pva = trunc_page(buf + len - 1);
 	pva = start_pva;
 	while (pva <= end_pva) {
 		seglen = PAGE_SIZE;
 		p1 = pmap_kextract(pva);
 		pva += PAGE_SIZE;
 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
 			seglen += PAGE_SIZE;
 			pva += PAGE_SIZE;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	npages = 1;
 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
 	struct ppod_region *pr = prsv->prsv_pr;
 	vmem_addr_t addr;
 
 	MPASS(prsv != NULL);
 	MPASS(prsv->prsv_nppods != 0);
 
 	addr = prsv->prsv_tag & pr->pr_tag_mask;
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
 	    pr->pr_arena, addr, prsv->prsv_nppods);
 
 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
 	prsv->prsv_nppods = 0;
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 int
 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_reservation *prsv = &ps->prsv;
 	struct ppod_region *pr = prsv->prsv_pr;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
 	MPASS(prsv->prsv_nppods > 0);
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < ps->npages) {
 					ppod->addr[k] =
 					    htobe64(ps->pages[idx]->phys_addr);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	offset = buf & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (pva > end_pva)
 					ppod->addr[k] = 0;
 				else {
 					pa = pmap_kextract(pva);
 					ppod->addr[k] = htobe64(pa);
 					pva += ddp_pgsz;
 				}
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 			/*
 			 * Walk back 1 segment so that the first address in the
 			 * next pod is the same as the last one in the current
 			 * pod.
 			 */
 			pva -= ddp_pgsz;
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 
 	MPASS(pva <= end_pva);
 
 	return (0);
 }
 
 static void
 wire_pageset(struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	KASSERT(!(ps->flags & PS_WIRED), ("pageset already wired"));
 
 	for (i = 0; i < ps->npages; i++) {
 		p = ps->pages[i];
 		vm_page_lock(p);
 		vm_page_wire(p);
 		vm_page_unhold(p);
 		vm_page_unlock(p);
 	}
 	ps->flags |= PS_WIRED;
 }
 
 /*
  * Prepare a pageset for DDP.  This wires the pageset and sets up page
  * pods.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if (!(ps->flags & PS_WIRED))
 		wire_pageset(ps);
 	if (ps->prsv.prsv_nppods == 0 &&
 	    !t4_alloc_page_pods_for_ps(&td->pr, ps)) {
 		return (0);
 	}
 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
 	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
 		return (0);
 	}
 
 	return (1);
 }
 
 int
 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
     const char *name)
 {
 	int i;
 
 	MPASS(pr != NULL);
 	MPASS(r->size > 0);
 
 	pr->pr_start = r->start;
 	pr->pr_len = r->size;
 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
 
 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
 			return (ENXIO);
 	}
 
 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
 		return (ENXIO);
 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
 
 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
 	    M_FIRSTFIT | M_NOWAIT);
 	if (pr->pr_arena == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 void
 t4_free_ppod_region(struct ppod_region *pr)
 {
 
 	MPASS(pr != NULL);
 
 	if (pr->pr_arena)
 		vmem_destroy(pr->pr_arena);
 	bzero(pr, sizeof(*pr));
 }
 
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	if (ps->start != start || ps->npages != npages ||
 	    ps->offset != pgoff || ps->len != len)
 		return (1);
 
 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
 }
 
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 #endif
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 			toep->ddp.cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp.active_count + toep->ddp.cached_count ==
 	    nitems(toep->ddp.db)) {
 		KASSERT(toep->ddp.cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	DDP_UNLOCK(toep);
 
 	/* Create a new pageset. */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	ps->pages = (vm_page_t *)(ps + 1);
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
 	atomic_add_int(&vm->vm_refcnt, 1);
 	ps->vm = vm;
 	ps->start = start;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_cancel(job);
 }
 
 /*
  * Called when the main loop wants to requeue a job to retry it later.
  * Deals with the race of the job being cancelled while it was being
  * examined.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD) &&
 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count++;
 	} else
 		aio_ddp_cancel_one(job);
 }
 
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp.flags & DDP_DEAD) {
 		MPASS(toep->ddp.waiting_count == 0);
 		MPASS(toep->ddp.active_count == 0);
 		return;
 	}
 
 	if (toep->ddp.waiting_count == 0 ||
 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp.waiting_count--;
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp.active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp.queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp.waiting_count--;
 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp.queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp.active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp.queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp.flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp.active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 		int error;
 
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->ddp.db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->ddp.db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp.flags & buf_flag) == 0);
 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp.active_id == -1);
 		MPASS(toep->ddp.active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: scheduling %p for DDP[%d] (flags %#lx/%#lx)",
 	    __func__, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	db = &toep->ddp.db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp.queueing = NULL;
 	toep->ddp.flags |= buf_flag;
 	toep->ddp.active_count++;
 	if (toep->ddp.active_count == 1) {
 		MPASS(toep->ddp.active_id == -1);
 		toep->ddp.active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 	}
 	goto restart;
 }
 
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (toep->ddp.flags & DDP_TASK_ACTIVE)
 		return;
 	toep->ddp.flags |= DDP_TASK_ACTIVE;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->ddp.requeue_task);
 }
 
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		if (toep->ddp.db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->ddp.db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
 			t4_set_tcb_field(sc, toep->ctrlq, toep,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
 			    i + DDP_BUF0_INVALIDATED);
 			toep->ddp.db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (toep->ddp.waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 
 	/* Ignore writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: Think about possibly returning errors for ENOTCONN,
 	 * etc.  Perhaps the caller would only queue the request
 	 * if it failed with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 	toep->ddp.flags |= DDP_OK;
 
 	/*
 	 * Try to handle this request synchronously.  If this has
 	 * to block because the task is running, it will just bail
 	 * and let the task handle it instead.
 	 */
 	aio_ddp_requeue(toep);
 	DDP_UNLOCK(toep);
 	return (0);
 }
 
 void
 t4_ddp_mod_load(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 }
 
 void
 t4_ddp_mod_unload(void)
 {
 
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
Index: stable/11/sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_listen.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_listen.c	(revision 348704)
@@ -1,1615 +1,1613 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/domain.h>
 #include <sys/fnv_hash.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/tcp_timer.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <netinet/cc/cc.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "t4_clip.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /* stid services */
 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
 static struct listen_ctx *lookup_stid(struct adapter *, int);
 static void free_stid(struct adapter *, struct listen_ctx *);
 
 /* lctx services */
 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
     struct vi_info *);
 static int free_lctx(struct adapter *, struct listen_ctx *);
 static void hold_lctx(struct listen_ctx *);
 static void listen_hash_add(struct adapter *, struct listen_ctx *);
 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
 
 static void send_reset_synqe(struct toedev *, struct synq_entry *);
 
 static int
 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
 {
 	struct tid_info *t = &sc->tids;
 	u_int stid, n, f, mask;
 	struct stid_region *sr = &lctx->stid_region;
 
 	/*
 	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
 	 * the TCAM.  The start of the stid region is properly aligned (the chip
 	 * requires each region to be 128-cell aligned).
 	 */
 	n = isipv6 ? 2 : 1;
 	mask = n - 1;
 	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
 	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
 	    __func__, t->stid_base, t->nstids, n));
 
 	mtx_lock(&t->stid_lock);
 	if (n > t->nstids - t->stids_in_use) {
 		mtx_unlock(&t->stid_lock);
 		return (-1);
 	}
 
 	if (t->nstids_free_head >= n) {
 		/*
 		 * This allocation will definitely succeed because the region
 		 * starts at a good alignment and we just checked we have enough
 		 * stids free.
 		 */
 		f = t->nstids_free_head & mask;
 		t->nstids_free_head -= n + f;
 		stid = t->nstids_free_head;
 		TAILQ_INSERT_HEAD(&t->stids, sr, link);
 	} else {
 		struct stid_region *s;
 
 		stid = t->nstids_free_head;
 		TAILQ_FOREACH(s, &t->stids, link) {
 			stid += s->used + s->free;
 			f = stid & mask;
 			if (s->free >= n + f) {
 				stid -= n + f;
 				s->free -= n + f;
 				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
 				goto allocated;
 			}
 		}
 
 		if (__predict_false(stid != t->nstids)) {
 			panic("%s: stids TAILQ (%p) corrupt."
 			    "  At %d instead of %d at the end of the queue.",
 			    __func__, &t->stids, stid, t->nstids);
 		}
 
 		mtx_unlock(&t->stid_lock);
 		return (-1);
 	}
 
 allocated:
 	sr->used = n;
 	sr->free = f;
 	t->stids_in_use += n;
 	t->stid_tab[stid] = lctx;
 	mtx_unlock(&t->stid_lock);
 
 	KASSERT(((stid + t->stid_base) & mask) == 0,
 	    ("%s: EDOOFUS.", __func__));
 	return (stid + t->stid_base);
 }
 
 static struct listen_ctx *
 lookup_stid(struct adapter *sc, int stid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->stid_tab[stid - t->stid_base]);
 }
 
 static void
 free_stid(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct tid_info *t = &sc->tids;
 	struct stid_region *sr = &lctx->stid_region;
 	struct stid_region *s;
 
 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
 
 	mtx_lock(&t->stid_lock);
 	s = TAILQ_PREV(sr, stid_head, link);
 	if (s != NULL)
 		s->free += sr->used + sr->free;
 	else
 		t->nstids_free_head += sr->used + sr->free;
 	KASSERT(t->stids_in_use >= sr->used,
 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
 	    t->stids_in_use, sr->used));
 	t->stids_in_use -= sr->used;
 	TAILQ_REMOVE(&t->stids, sr, link);
 	mtx_unlock(&t->stid_lock);
 }
 
 static struct listen_ctx *
 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
 {
 	struct listen_ctx *lctx;
 
 	INP_WLOCK_ASSERT(inp);
 
 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
 	if (lctx == NULL)
 		return (NULL);
 
 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
 	if (lctx->stid < 0) {
 		free(lctx, M_CXGBE);
 		return (NULL);
 	}
 
 	if (inp->inp_vflag & INP_IPV6 &&
 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
 		lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
 		if (lctx->ce == NULL) {
 			free(lctx, M_CXGBE);
 			return (NULL);
 		}
 	}
 
 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
 	refcount_init(&lctx->refcount, 1);
 
 	lctx->inp = inp;
 	lctx->vnet = inp->inp_socket->so_vnet;
 	in_pcbref(inp);
 
 	return (lctx);
 }
 
 /* Don't call this directly, use release_lctx instead */
 static int
 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct inpcb *inp = lctx->inp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(lctx->refcount == 0,
 	    ("%s: refcount %d", __func__, lctx->refcount));
 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
 
 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
 	    __func__, lctx->stid, lctx, lctx->inp);
 
 	if (lctx->ce)
 		t4_release_lip(sc, lctx->ce);
 	free_stid(sc, lctx);
 	free(lctx, M_CXGBE);
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 static void
 hold_lctx(struct listen_ctx *lctx)
 {
 
 	refcount_acquire(&lctx->refcount);
 }
 
 static inline uint32_t
 listen_hashfn(void *key, u_long mask)
 {
 
 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
 }
 
 /*
  * Add a listen_ctx entry to the listen hash table.
  */
 static void
 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
 	td->lctx_count++;
 	mtx_unlock(&td->lctx_hash_lock);
 }
 
 /*
  * Look for the listening socket's context entry in the hash and return it.
  */
 static struct listen_ctx *
 listen_hash_find(struct adapter *sc, struct inpcb *inp)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(inp, td->listen_mask);
 	struct listen_ctx *lctx;
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
 		if (lctx->inp == inp)
 			break;
 	}
 	mtx_unlock(&td->lctx_hash_lock);
 
 	return (lctx);
 }
 
 /*
  * Removes the listen_ctx structure for inp from the hash and returns it.
  */
 static struct listen_ctx *
 listen_hash_del(struct adapter *sc, struct inpcb *inp)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(inp, td->listen_mask);
 	struct listen_ctx *lctx, *l;
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
 		if (lctx->inp == inp) {
 			LIST_REMOVE(lctx, link);
 			td->lctx_count--;
 			break;
 		}
 	}
 	mtx_unlock(&td->lctx_hash_lock);
 
 	return (lctx);
 }
 
 /*
  * Releases a hold on the lctx.  Must be called with the listening socket's inp
  * locked.  The inp may be freed by this function and it returns NULL to
  * indicate this.
  */
 static struct inpcb *
 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct inpcb *inp = lctx->inp;
 	int inp_freed = 0;
 
 	INP_WLOCK_ASSERT(inp);
 	if (refcount_release(&lctx->refcount))
 		inp_freed = free_lctx(sc, lctx);
 
 	return (inp_freed ? NULL : inp);
 }
 
 static void
 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m = synqe->syn;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	struct cpl_abort_req *req;
 	int flowclen;
 	struct sge_wrq *ofld_txq;
 	struct sge_ofld_rxq *ofld_rxq;
 	const int nparams = 6;
 	const u_int pfvf = sc->pf << S_FW_VIID_PFN;
 
 	INP_WLOCK_ASSERT(synqe->lctx->inp);
 
 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
 	    __func__, synqe, synqe->flags, synqe->tid,
 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 	synqe->flags |= TPF_ABORT_SHUTDOWN;
 
 	ofld_txq = &sc->sge.ofld_txq[synqe->txqid];
 	ofld_rxq = &sc->sge.ofld_rxq[synqe->rxqid];
 
 	/* The wrqe will have two WRs - a flowc followed by an abort_req */
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
 
 	/* First the flowc ... */
 	memset(flowc, 0, wr->wr_len);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(synqe->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
  	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
  	flowc->mnemval[4].val = htobe32(512);
  	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
  	flowc->mnemval[5].val = htobe32(512);
 	synqe->flags |= TPF_FLOWC_WR_SENT;
 
 	/* ... then ABORT request */
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
 	req->rsvd0 = 0;	/* don't have a snd_nxt */
 	req->rsvd1 = 1;	/* no data sent yet */
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	t4_l2t_send(sc, wr, e);
 }
 
 static int
 create_server(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_pass_open_req *req;
 	struct inpcb *inp = lctx->inp;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		log(LOG_ERR, "%s: allocation failure", __func__);
 		return (ENOMEM);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
 	req->local_port = inp->inp_lport;
 	req->peer_port = 0;
 	req->local_ip = inp->inp_laddr.s_addr;
 	req->peer_ip = 0;
 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 create_server6(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_pass_open_req6 *req;
 	struct inpcb *inp = lctx->inp;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		log(LOG_ERR, "%s: allocation failure", __func__);
 		return (ENOMEM);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
 	req->local_port = inp->inp_lport;
 	req->peer_port = 0;
 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 	req->peer_ip_hi = 0;
 	req->peer_ip_lo = 0;
 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_close_listsvr_req *req;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
 	    lctx->stid));
 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
 	req->rsvd = htobe16(0);
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 /*
  * Start a listening server by sending a passive open request to HW.
  *
  * Can't take adapter lock here and access to sc->flags,
  * sc->offload_map, if_capenable are all race prone.
  */
 int
 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct vi_info *vi;
 	struct port_info *pi;
 	struct inpcb *inp = tp->t_inpcb;
 	struct listen_ctx *lctx;
 	int i, rc, v;
 	struct offload_settings settings;
 
 	INP_WLOCK_ASSERT(inp);
 
 	rw_rlock(&sc->policy_lock);
 	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff,
 	    inp);
 	rw_runlock(&sc->policy_lock);
 	if (!settings.offload)
 		return (0);
 
 	/* Don't start a hardware listener for any loopback address. */
 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
 		return (0);
 	if (!(inp->inp_vflag & INP_IPV6) &&
 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
 		return (0);
 #if 0
 	ADAPTER_LOCK(sc);
 	if (IS_BUSY(sc)) {
 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
 		    __func__, device_get_nameunit(sc->dev));
 		goto done;
 	}
 
 	KASSERT(uld_active(sc, ULD_TOM),
 	    ("%s: TOM not initialized", __func__));
 #endif
 
 	/*
 	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
 	 * such VI's queues to send the passive open and receive the reply to
 	 * it.
 	 *
 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
 	 * then reject any attempt to bring down such a port (and maybe reject
 	 * attempts to disable IFCAP_TOE on that port too?).
 	 */
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE &&
 			    vi->ifp->if_capenable & IFCAP_TOE)
 				goto found;
 		}
 	}
 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
 found:
 
 	if (listen_hash_find(sc, inp) != NULL)
 		goto done;	/* already setup */
 
 	lctx = alloc_lctx(sc, inp, vi);
 	if (lctx == NULL) {
 		log(LOG_ERR,
 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
 		    __func__, device_get_nameunit(sc->dev));
 		goto done;
 	}
 	listen_hash_add(sc, lctx);
 
 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
 	    inp->inp_vflag);
 
 	if (inp->inp_vflag & INP_IPV6)
 		rc = create_server6(sc, lctx);
 	else
 		rc = create_server(sc, lctx);
 	if (rc != 0) {
 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
 		    __func__, device_get_nameunit(sc->dev), rc);
 		(void) listen_hash_del(sc, inp);
 		inp = release_lctx(sc, lctx);
 		/* can't be freed, host stack has a reference */
 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
 		goto done;
 	}
 	lctx->flags |= LCTX_RPL_PENDING;
 done:
 #if 0
 	ADAPTER_UNLOCK(sc);
 #endif
 	return (0);
 }
 
 int
 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
 {
 	struct listen_ctx *lctx;
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	lctx = listen_hash_del(sc, inp);
 	if (lctx == NULL)
 		return (ENOENT);	/* no hardware listener for this inp */
 
 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
 	    lctx, lctx->flags);
 
 	/*
 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
 	 * arrive and clean up when it does.
 	 */
 	if (lctx->flags & LCTX_RPL_PENDING) {
 		return (EINPROGRESS);
 	}
 
 	destroy_server(sc, lctx);
 	return (0);
 }
 
 static inline struct synq_entry *
 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
 {
 	struct synq_entry *synqe;
 
 	INP_WLOCK_ASSERT(lctx->inp);
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 
 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
 	if (__predict_true(synqe != NULL)) {
 		synqe->flags = TPF_SYNQE;
 		refcount_init(&synqe->refcnt, 1);
 		synqe->lctx = lctx;
 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
 		synqe->syn = NULL;
 	}
 
 	return (synqe);
 }
 
 static inline void
 hold_synqe(struct synq_entry *synqe)
 {
 
 	refcount_acquire(&synqe->refcnt);
 }
 
 static inline struct inpcb *
 release_synqe(struct adapter *sc, struct synq_entry *synqe)
 {
 	struct inpcb *inp;
 
 	MPASS(synqe->flags & TPF_SYNQE);
 	MPASS(synqe->lctx != NULL);
 
 	inp = synqe->lctx->inp;
 	MPASS(inp != NULL);
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&synqe->refcnt)) {
 		inp = release_lctx(sc, synqe->lctx);
 		m_freem(synqe->syn);
 		free(synqe, M_CXGBE);
 	}
 
 	return (inp);
 }
 
 void
 t4_syncache_added(struct toedev *tod __unused, void *arg)
 {
 	struct synq_entry *synqe = arg;
 
 	hold_synqe(synqe);
 }
 
 void
 t4_syncache_removed(struct toedev *tod, void *arg)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct synq_entry *synqe = arg;
 	struct inpcb *inp = synqe->lctx->inp;
 
 	/*
 	 * XXX: this is a LOR but harmless when running from the softclock.
 	 */
 	INP_WLOCK(inp);
 	inp = release_synqe(sc, synqe);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 int
 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
 {
 	struct synq_entry *synqe = arg;
 
 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
 		struct tcpopt to;
 		struct ip *ip = mtod(m, struct ip *);
 		struct tcphdr *th;
 
 		if (ip->ip_v == IPVERSION)
 			th = (void *)(ip + 1);
 		else
 			th = (void *)((struct ip6_hdr *)ip + 1);
 		bzero(&to, sizeof(to));
 		tcp_dooptions(&to, (void *)(th + 1),
 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
 
 		/* save these for later */
 		synqe->iss = be32toh(th->th_seq);
 		synqe->irs = be32toh(th->th_ack) - 1;
 		synqe->ts = to.to_tsval;
 	}
 
 	m_freem(m);	/* don't need this any more */
 	return (0);
 }
 
 static int
 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
 	int stid = GET_TID(cpl);
 	unsigned int status = cpl->status;
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
 	    __func__, stid, status, lctx->flags);
 
 	lctx->flags &= ~LCTX_RPL_PENDING;
 
 	if (status != CPL_ERR_NONE)
 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
 
 #ifdef INVARIANTS
 	/*
 	 * If the inp has been dropped (listening socket closed) then
 	 * listen_stop must have run and taken the inp out of the hash.
 	 */
 	if (inp->inp_flags & INP_DROPPED) {
 		KASSERT(listen_hash_del(sc, inp) == NULL,
 		    ("%s: inp %p still in listen hash", __func__, inp));
 	}
 #endif
 
 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
 		if (release_lctx(sc, lctx) != NULL)
 			INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/*
 	 * Listening socket stopped listening earlier and now the chip tells us
 	 * it has started the hardware listener.  Stop it; the lctx will be
 	 * released in do_close_server_rpl.
 	 */
 	if (inp->inp_flags & INP_DROPPED) {
 		destroy_server(sc, lctx);
 		INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/*
 	 * Failed to start hardware listener.  Take inp out of the hash and
 	 * release our reference on it.  An error message has been logged
 	 * already.
 	 */
 	if (status != CPL_ERR_NONE) {
 		listen_hash_del(sc, inp);
 		if (release_lctx(sc, lctx) != NULL)
 			INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/* hardware listener open for business */
 
 	INP_WUNLOCK(inp);
 	return (status);
 }
 
 static int
 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
 	int stid = GET_TID(cpl);
 	unsigned int status = cpl->status;
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
 
 	if (status != CPL_ERR_NONE) {
 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
 		    __func__, status, stid);
 		return (status);
 	}
 
 	INP_WLOCK(inp);
 	inp = release_lctx(sc, lctx);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 	return (status);
 }
 
 static void
 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
 {
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
 	int ntids;
 
 	INP_WLOCK_ASSERT(inp);
 	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
 
 	remove_tid(sc, synqe->tid, ntids);
 	release_tid(sc, synqe->tid, lctx->ctrlq);
 	t4_l2t_release(e);
 	inp = release_synqe(sc, synqe);
 	if (inp)
 		INP_WUNLOCK(inp);
 }
 
 void
 synack_failure_cleanup(struct adapter *sc, int tid)
 {
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 
 	INP_WLOCK(synqe->lctx->inp);
 	done_with_synqe(sc, synqe);
 }
 
 int
 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 	struct sge_wrq *ofld_txq;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
 
 	if (negative_advice(cpl->status))
 		return (0);	/* Ignore negative advice */
 
 	INP_WLOCK(inp);
 
 	ofld_txq = &sc->sge.ofld_txq[synqe->txqid];
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 
 	done_with_synqe(sc, synqe);
 	/* inp lock released by done_with_synqe */
 done:
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 int
 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
 
 	INP_WLOCK(inp);
 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
 	    __func__, synqe, synqe->flags));
 
 	done_with_synqe(sc, synqe);
 	/* inp lock released by done_with_synqe */
 
 	return (0);
 }
 
 void
 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct synq_entry *synqe = arg;
 #ifdef INVARIANTS
 	struct inpcb *inp = sotoinpcb(so);
 #endif
 	struct toepcb *toep = synqe->toep;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: %p not a synq_entry?", __func__, arg));
 	MPASS(toep->tid == synqe->tid);
 
 	offload_socket(so, toep);
 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
 	toep->flags |= TPF_CPL_PENDING;
 	update_tid(sc, synqe->tid, toep);
 	synqe->flags |= TPF_SYNQE_EXPANDED;
 }
 
 static inline void
 save_qids_in_synqe(struct synq_entry *synqe, struct vi_info *vi,
     struct offload_settings *s)
 {
 	uint32_t txqid, rxqid;
 
 	if (s->txq >= 0 && s->txq < vi->nofldtxq)
 		txqid = s->txq;
 	else
 		txqid = arc4random() % vi->nofldtxq;
 	txqid += vi->first_ofld_txq;
 
 	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
 		rxqid = s->rxq;
 	else
 		rxqid = arc4random() % vi->nofldrxq;
 	rxqid += vi->first_ofld_rxq;
 
 	synqe->txqid = txqid;
 	synqe->rxqid = rxqid;
 }
 
 static void
 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
 {
 	bzero(to, sizeof(*to));
 
 	if (t4opt->mss) {
 		to->to_flags |= TOF_MSS;
 		to->to_mss = be16toh(t4opt->mss);
 	}
 
 	if (t4opt->wsf) {
 		to->to_flags |= TOF_SCALE;
 		to->to_wscale = t4opt->wsf;
 	}
 
 	if (t4opt->tstamp)
 		to->to_flags |= TOF_TS;
 
 	if (t4opt->sack)
 		to->to_flags |= TOF_SACKPERM;
 }
 
 /*
  * Options2 for passive open.
  */
 static uint32_t
 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
 	const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
 	struct cc_algo *cc, const struct offload_settings *s)
 {
 	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
 	uint32_t opt2 = 0;
 
 	/*
 	 * rx flow control, rx coalesce, congestion control, and tx pace are all
 	 * explicitly set by the driver.  On T5+ the ISS is also set by the
 	 * driver to the value picked by the kernel.
 	 */
 	if (is_t4(sc)) {
 		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
 		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
 	} else {
 		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
 		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
 	}
 
 	if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323)))
 		opt2 |= F_SACK_EN;
 
 	if (tcpopt->tstamp &&
 	    (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
 		opt2 |= F_TSTAMPS_EN;
 
 	if (tcpopt->wsf < 15 && V_tcp_do_rfc1323)
 		opt2 |= F_WND_SCALE_EN;
 
 	if (th->th_flags & (TH_ECE | TH_CWR) &&
 	    (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
 		opt2 |= F_CCTRL_ECN;
 
 	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
 
 	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
 
 	/* These defaults are subject to ULP specific fixups later. */
 	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
 
 	opt2 |= V_PACE(0);
 
 	if (s->cong_algo >= 0)
 		opt2 |= V_CONG_CNTRL(s->cong_algo);
 	else if (sc->tt.cong_algorithm >= 0)
 		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
 	else {
 		if (strcasecmp(cc->name, "reno") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
 		else if (strcasecmp(cc->name, "tahoe") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
 		if (strcasecmp(cc->name, "newreno") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
 		if (strcasecmp(cc->name, "highspeed") == 0)
 			opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
 		else {
 			/*
 			 * Use newreno in case the algorithm selected by the
 			 * host stack is not supported by the hardware.
 			 */
 			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
 		}
 	}
 
 	if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
 		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
 
 	/* Note that ofld_rxq is already set according to s->rxq. */
 	opt2 |= F_RSS_QUEUE_VALID;
 	opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id);
 
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	if (ulp_mode == ULP_MODE_TCPDDP)
 		opt2 |= F_RX_FC_DDP;
 #endif
 
 	if (ulp_mode == ULP_MODE_TLS) {
 		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
 		opt2 |= F_RX_FC_DISABLE;
 	}
 
 	return (htobe32(opt2));
 }
 
 static void
 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
     struct in_conninfo *inc, struct tcphdr *th)
 {
 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
 	const struct ether_header *eh;
 	unsigned int hlen = be32toh(cpl->hdr_len);
 	uintptr_t l3hdr;
 	const struct tcphdr *tcp;
 
 	eh = (const void *)(cpl + 1);
 	if (chip_id(sc) >= CHELSIO_T6) {
 		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
 		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
 	} else {
 		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
 		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
 	}
 
 	if (inc) {
 		bzero(inc, sizeof(*inc));
 		inc->inc_fport = tcp->th_sport;
 		inc->inc_lport = tcp->th_dport;
 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
 			const struct ip *ip = (const void *)l3hdr;
 
 			inc->inc_faddr = ip->ip_src;
 			inc->inc_laddr = ip->ip_dst;
 		} else {
 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
 
 			inc->inc_flags |= INC_ISIPV6;
 			inc->inc6_faddr = ip6->ip6_src;
 			inc->inc6_laddr = ip6->ip6_dst;
 		}
 	}
 
 	if (th) {
 		bcopy(tcp, th, sizeof(*th));
 		tcp_fields_to_host(th);		/* just like tcp_input */
 	}
 }
 
 static struct l2t_entry *
 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
     struct in_conninfo *inc)
 {
 	struct l2t_entry *e;
 	struct sockaddr_in6 sin6;
 	struct sockaddr *dst = (void *)&sin6;
  
 	if (inc->inc_flags & INC_ISIPV6) {
 		struct nhop6_basic nh6;
 
 		bzero(dst, sizeof(struct sockaddr_in6));
 		dst->sa_len = sizeof(struct sockaddr_in6);
 		dst->sa_family = AF_INET6;
 
 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
 			/* no need for route lookup */
 			e = t4_l2t_get(pi, ifp, dst);
 			return (e);
 		}
 
 		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
 		    0, 0, 0, &nh6) != 0)
 			return (NULL);
 		if (nh6.nh_ifp != ifp)
 			return (NULL);
 		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
 	} else {
 		struct nhop4_basic nh4;
 
 		dst->sa_len = sizeof(struct sockaddr_in);
 		dst->sa_family = AF_INET;
 
 		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
 		    &nh4) != 0)
 			return (NULL);
 		if (nh4.nh_ifp != ifp)
 			return (NULL);
 		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
 	}
 
 	e = t4_l2t_get(pi, ifp, dst);
 	return (e);
 }
 
 static int
 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
     uint32_t opt2, int tid)
 {
 	struct wrqe *wr;
 	struct cpl_pass_accept_rpl *rpl;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
 
 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
 	if (wr == NULL)
 		return (ENOMEM);
 	rpl = wrtod(wr);
 
 	if (is_t4(sc))
 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
 	else {
 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
 
 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
 		rpl5->iss = htobe32(synqe->iss);
 	}
 	rpl->opt0 = opt0;
 	rpl->opt2 = opt2;
 
 	return (t4_l2t_send(sc, wr, e));
 }
 
 #define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
 	if (!tunnel) { \
 		m_freem(m); \
 		m = NULL; \
 	} \
 	reject_reason = __LINE__; \
 	goto reject; \
 } while (0)
 
 /*
  * The context associated with a tid entry via insert_tid could be a synq_entry
  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
  */
 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
 
 /*
  * Incoming SYN on a listening socket.
  *
  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
  * etc.
  */
 static int
 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct toedev *tod;
 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 	unsigned int tid = GET_TID(cpl);
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp;
 	struct socket *so;
 	struct in_conninfo inc;
 	struct tcphdr th;
 	struct tcpopt to;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct ifnet *hw_ifp, *ifp;
 	struct l2t_entry *e = NULL;
 	struct synq_entry *synqe = NULL;
 	int reject_reason, v, ntids;
 	uint16_t vid, l2info;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 	struct offload_settings settings;
 
 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
 	    lctx);
 
 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
 
 	/*
 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
 	 * match a perfect MAC filter, punt.
 	 */
 	l2info = be16toh(cpl->l2info);
 	pi = sc->port[G_SYN_INTF(l2info)];
 	if (!(l2info & F_SYN_XACT_MATCH)) {
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 	for_each_vi(pi, v, vi) {
 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
 			goto found;
 	}
 	REJECT_PASS_ACCEPT_REQ(false);
 found:
 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
 	m->m_pkthdr.rcvif = hw_ifp;
 	tod = TOEDEV(hw_ifp);
 
 	/*
 	 * Don't offload if the peer requested a TCP option that's not known to
 	 * the silicon.  Send the SYN to the kernel instead.
 	 */
 	if (__predict_false(cpl->tcpopt.unknown))
 		REJECT_PASS_ACCEPT_REQ(true);
 
 	/*
 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
 	 * doesn't match anything on this interface.
 	 *
 	 * XXX: lagg support, lagg + vlan support.
 	 */
 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
 	if (vid != 0xfff) {
 		ifp = VLAN_DEVAT(hw_ifp, vid);
 		if (ifp == NULL)
 			REJECT_PASS_ACCEPT_REQ(true);
 	} else
 		ifp = hw_ifp;
 
 	/*
 	 * Don't offload if the ifnet that the SYN came in on is not in the same
 	 * vnet as the listening socket.
 	 */
 	if (lctx->vnet != ifp->if_vnet)
 		REJECT_PASS_ACCEPT_REQ(true);
 
 	pass_accept_req_to_protohdrs(sc, m, &inc, &th);
 	if (inc.inc_flags & INC_ISIPV6) {
 
 		/* Don't offload if the ifcap isn't enabled */
 		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		/*
 		 * SYN must be directed to an IP6 address on this ifnet.  This
 		 * is more restrictive than in6_localip.
 		 */
 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		ntids = 2;
 	} else {
 
 		/* Don't offload if the ifcap isn't enabled */
 		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		/*
 		 * SYN must be directed to an IP address on this ifnet.  This
 		 * is more restrictive than in_localip.
 		 */
 		if (!in_ifhasaddr(ifp, inc.inc_laddr))
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		ntids = 1;
 	}
 
 	e = get_l2te_for_nexthop(pi, ifp, &inc);
 	if (e == NULL)
 		REJECT_PASS_ACCEPT_REQ(true);
 
 	/* Don't offload if the 4-tuple is already in use */
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for 4-tuple check */
 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	inp = lctx->inp;		/* listening socket, not owned by TOE */
 	INP_WLOCK(inp);
 
 	/* Don't offload if the listening socket has closed */
 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 	so = inp->inp_socket;
 	rw_rlock(&sc->policy_lock);
 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp);
 	rw_runlock(&sc->policy_lock);
 	if (!settings.offload) {
 		INP_WUNLOCK(inp);
 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
 	}
 
 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
 	if (synqe == NULL) {
 		INP_WUNLOCK(inp);
 		REJECT_PASS_ACCEPT_REQ(true);
 	}
 	atomic_store_int(&synqe->ok_to_respond, 0);
 
 	/*
 	 * If all goes well t4_syncache_respond will get called during
 	 * syncache_add.  Note that syncache_add releases the pcb lock.
 	 */
 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
 
 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
 		uint64_t opt0;
 		uint32_t opt2;
 		int rscale, mtu_idx, rx_credits;
 
 		mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
 		rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ?  select_rcv_wscale() : 0;
-		/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 		rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
 
 		save_qids_in_synqe(synqe, vi, &settings);
 		synqe->ulp_mode = select_ulp_mode(so, sc, &settings);
 
 		opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits,
 		    synqe->ulp_mode, &settings);
 		opt2 = calc_opt2p(sc, pi, synqe->rxqid, &cpl->tcpopt, &th,
 		    synqe->ulp_mode, CC_ALGO(intotcpcb(inp)), &settings);
 
 		insert_tid(sc, tid, synqe, ntids);
 		synqe->tid = tid;
 		synqe->l2e_idx = e->idx;
 		synqe->rcv_bufsize = rx_credits;
 		synqe->syn = m;
 		m = NULL;
 
 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
 			remove_tid(sc, tid, ntids);
 			m = synqe->syn;
 			synqe->syn = NULL;
 			REJECT_PASS_ACCEPT_REQ(true);
 		}
 
 		CTR6(KTR_CXGBE,
 		    "%s: stid %u, tid %u, lctx %p, synqe %p, mode %d, SYNACK",
 		    __func__, stid, tid, lctx, synqe, synqe->ulp_mode);
 	} else
 		REJECT_PASS_ACCEPT_REQ(false);
 
 	CURVNET_RESTORE();
 	return (0);
 reject:
 	CURVNET_RESTORE();
 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
 	    reject_reason);
 
 	if (e)
 		t4_l2t_release(e);
 	release_tid(sc, tid, lctx->ctrlq);
 	if (synqe) {
 		inp = synqe->lctx->inp;
 		INP_WLOCK(inp);
 		inp = release_synqe(sc, synqe);
 		if (inp)
 			INP_WUNLOCK(inp);
 	}
 
 	if (m) {
 		/*
 		 * The connection request hit a TOE listener but is being passed
 		 * on to the kernel sw stack instead of getting offloaded.
 		 */
 		m_adj(m, sizeof(*cpl));
 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 		m->m_pkthdr.csum_data = 0xffff;
 		hw_ifp->if_input(hw_ifp, m);
 	}
 
 	return (reject_reason);
 }
 
 static void
 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
     struct tcphdr *th, struct tcpopt *to)
 {
 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
 
 	/* start off with the original SYN */
 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);
 
 	/* modify parts to make it look like the ACK to our SYN|ACK */
 	th->th_flags = TH_ACK;
 	th->th_ack = synqe->iss + 1;
 	th->th_seq = be32toh(cpl->rcv_isn);
 	bzero(to, sizeof(*to));
 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
 		to->to_flags |= TOF_TS;
 		to->to_tsecr = synqe->ts;
 	}
 }
 
 static int
 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
 #if defined(KTR) || defined(INVARIANTS)
 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 #endif
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp, *new_inp;
 	struct socket *so;
 	struct tcphdr th;
 	struct tcpopt to;
 	struct in_conninfo inc;
 	struct toepcb *toep;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PASS_ESTABLISH,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
 
 	CURVNET_SET(lctx->vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for syncache_expand */
 	INP_WLOCK(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
 
 	ifp = synqe->syn->m_pkthdr.rcvif;
 	vi = ifp->if_softc;
 	KASSERT(vi->pi->adapter == sc,
 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
 
 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 reset:
 		send_reset_synqe(TOEDEV(ifp), synqe);
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	KASSERT(synqe->rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
 	    synqe->rxqid, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
 
 	toep = alloc_toepcb(vi, synqe->txqid, synqe->rxqid, M_NOWAIT);
 	if (toep == NULL)
 		goto reset;
 	toep->tid = tid;
 	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
 	toep->vnet = lctx->vnet;
 	set_ulp_mode(toep, synqe->ulp_mode);
-	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-	toep->rx_credits = synqe->rcv_bufsize;
+	toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
 
 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
 	synqe->tcp_opt = cpl->tcp_opt;
 	synqe->toep = toep;
 
 	/* Come up with something that syncache_expand should be ok with. */
 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
 	if (inc.inc_flags & INC_ISIPV6)
 		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
 
 	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
 		free_toepcb(toep);
 		goto reset;
 	}
 
 	/* New connection inpcb is already locked by syncache_expand(). */
 	new_inp = sotoinpcb(so);
 	INP_WLOCK_ASSERT(new_inp);
 	MPASS(so->so_vnet == lctx->vnet);
 
 	/*
 	 * This is for expansion from syncookies.
 	 *
 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
 	 * anyone accept'ing a connection before we've installed our hooks, but
 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
 	 */
 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
 		t4_offload_socket(TOEDEV(ifp), synqe, so);
 	}
 
 	INP_WUNLOCK(new_inp);
 
 	/* Done with the synqe */
 	inp = release_synqe(sc, synqe);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 
 	return (0);
 }
 
 void
 t4_init_listen_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
 }
 
 void
 t4_uninit_listen_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
 }
 #endif
Index: stable/11/sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_tls.c	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_tls.c	(revision 348704)
@@ -1,1654 +1,1631 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017-2018 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 VNET_DECLARE(int, tcp_do_autosndbuf);
 #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
 VNET_DECLARE(int, tcp_autosndbuf_max);
 #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
 
 /*
  * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while
  * the mbuf is in the ulp_pdu_reclaimq.
  */
 #define	tls_tcp_seq	PH_loc.thirtytwo[0]
 
 /*
  * Handshake lock used for the handshake timer.  Having a global lock
  * is perhaps not ideal, but it avoids having to use callout_drain()
  * in tls_uninit_toep() which can't block.  Also, the timer shouldn't
  * actually fire for most connections.
  */
 static struct mtx tls_handshake_lock;
 
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
     uint64_t val)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
 	t4_set_tcb_field(sc, toep->ofld_txq, toep, word, mask, val, 0, 0);
 }
 
 /* TLS and DTLS common routines */
 bool
 can_tls_offload(struct adapter *sc)
 {
 
 	return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS);
 }
 
 int
 tls_tx_key(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return (tls_ofld->tx_key_addr >= 0);
 }
 
 int
 tls_rx_key(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return (tls_ofld->rx_key_addr >= 0);
 }
 
 static int
 key_size(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return ((tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) ?
 		tls_ofld->k_ctx.tx_key_info_size : KEY_IN_DDR_SIZE);
 }
 
 /* Set TLS Key-Id in TCB */
 static void
 t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
 			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
 			 V_TCB_RX_TLS_KEY_TAG(key_id));
 }
 
 /* Clear TF_RX_QUIESCE to re-enable receive. */
 static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
 }
 
 static void
 tls_clr_ofld_mode(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 
 	/* Operate in PDU extraction mode only. */
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 	t4_clear_rx_quiesce(toep);
 }
 
 static void
 tls_clr_quiesce(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 	t4_clear_rx_quiesce(toep);
 }
 
 /*
  * Calculate the TLS data expansion size
  */
 static int
 tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only,
     unsigned short *pdus_per_ulp)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_scmd *scmd = &tls_ofld->scmd0;
 	int expn_size = 0, frag_count = 0, pad_per_pdu = 0,
 	    pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0;
 	int exp_per_pdu = 0;
 	int hdr_len = TLS_HEADER_LENGTH;
 
 	do {
 		max_frag_size = tls_ofld->k_ctx.frag_size;
 		if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) ==
 		   SCMD_CIPH_MODE_AES_GCM) {
 			frag_count = (data_len / max_frag_size);
 			exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
 				hdr_len;
 			expn_size =  frag_count * exp_per_pdu;
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if(!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				frag_count += 1;
 				expn_size += exp_per_pdu;
 			}
 			break;
 		} else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) !=
 			   SCMD_CIPH_MODE_NOP) {
 			/* Calculate the number of fragments we can make */
 			frag_count  = (data_len / max_frag_size);
 			if (frag_count > 0) {
 				pad_per_pdu = (((howmany((max_frag_size +
 						       tls_ofld->mac_length),
 						      CIPHER_BLOCK_SIZE)) *
 						CIPHER_BLOCK_SIZE) -
 					       (max_frag_size +
 						tls_ofld->mac_length));
 				if (!pad_per_pdu)
 					pad_per_pdu = CIPHER_BLOCK_SIZE;
 				exp_per_pdu = pad_per_pdu +
 				       	tls_ofld->mac_length +
 					hdr_len + CIPHER_BLOCK_SIZE;
 				expn_size = frag_count * exp_per_pdu;
 			}
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if (!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* Consider the last fragment */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				pad_last_pdu = (((howmany((last_frag_size +
 							tls_ofld->mac_length),
 						       CIPHER_BLOCK_SIZE)) *
 						 CIPHER_BLOCK_SIZE) -
 						(last_frag_size +
 						 tls_ofld->mac_length));
 				if (!pad_last_pdu)
 					pad_last_pdu = CIPHER_BLOCK_SIZE;
 				expn_size += (pad_last_pdu +
 					      tls_ofld->mac_length + hdr_len +
 					      CIPHER_BLOCK_SIZE);
 			}
 		}
 	} while (0);
 
 	return (expn_size);
 }
 
 /* Copy Key to WR */
 static void
 tls_copy_tx_key(struct toepcb *toep, void *dst)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct ulptx_sc_memrd *sc_memrd;
 	struct ulptx_idata *sc;
 
 	if (tls_ofld->k_ctx.tx_key_info_size <= 0)
 		return;
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR) {
 		sc = dst;
 		sc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		sc->len = htobe32(0);
 		sc_memrd = (struct ulptx_sc_memrd *)(sc + 1);
 		sc_memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4));
 		sc_memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5);
 	} else if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) {
 		memcpy(dst, &tls_ofld->k_ctx.tx,
 		    tls_ofld->k_ctx.tx_key_info_size);
 	}
 }
 
 /* TLS/DTLS content type  for CPL SFO */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
 {
 	/*
 	 * XXX: Shouldn't this map CONTENT_TYPE_APP_DATA to DATA and
 	 * default to "CUSTOM" for all other types including
 	 * heartbeat?
 	 */
 	switch (content_type) {
 	case CONTENT_TYPE_CCS:
 		return CPL_TX_TLS_SFO_TYPE_CCS;
 	case CONTENT_TYPE_ALERT:
 		return CPL_TX_TLS_SFO_TYPE_ALERT;
 	case CONTENT_TYPE_HANDSHAKE:
 		return CPL_TX_TLS_SFO_TYPE_HANDSHAKE;
 	case CONTENT_TYPE_HEARTBEAT:
 		return CPL_TX_TLS_SFO_TYPE_HEARTBEAT;
 	}
 	return CPL_TX_TLS_SFO_TYPE_DATA;
 }
 
 static unsigned char
 get_cipher_key_size(unsigned int ck_size)
 {
 	switch (ck_size) {
 	case AES_NOP: /* NOP */
 		return 15;
 	case AES_128: /* AES128 */
 		return CH_CK_SIZE_128;
 	case AES_192: /* AES192 */
 		return CH_CK_SIZE_192;
 	case AES_256: /* AES256 */
 		return CH_CK_SIZE_256;
 	default:
 		return CH_CK_SIZE_256;
 	}
 }
 
 static unsigned char
 get_mac_key_size(unsigned int mk_size)
 {
 	switch (mk_size) {
 	case SHA_NOP: /* NOP */
 		return CH_MK_SIZE_128;
 	case SHA_GHASH: /* GHASH */
 	case SHA_512: /* SHA512 */
 		return CH_MK_SIZE_512;
 	case SHA_224: /* SHA2-224 */
 		return CH_MK_SIZE_192;
 	case SHA_256: /* SHA2-256*/
 		return CH_MK_SIZE_256;
 	case SHA_384: /* SHA384 */
 		return CH_MK_SIZE_512;
 	case SHA1: /* SHA1 */
 	default:
 		return CH_MK_SIZE_160;
 	}
 }
 
 static unsigned int
 get_proto_ver(int proto_ver)
 {
 	switch (proto_ver) {
 	case TLS1_2_VERSION:
 		return TLS_1_2_VERSION;
 	case TLS1_1_VERSION:
 		return TLS_1_1_VERSION;
 	case DTLS1_2_VERSION:
 		return DTLS_1_2_VERSION;
 	default:
 		return TLS_VERSION_MAX;
 	}
 }
 
 static void
 tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL));
 		kwr->u.rxhdr.ivpresent_to_rxmk_size &=
 			~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1));
 		kwr->u.rxhdr.authmode_to_rxvalid &=
 			~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1));
 	} else {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL));
 	}
 }
 
 /* Rx key */
 static void
 prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 	int proto_ver = kctx->proto_ver;
 
 	kwr->u.rxhdr.flitcnt_hmacctrl =
 		((kctx->tx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;
 
 	kwr->u.rxhdr.protover_ciphmode =
 		V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
 		V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode);
 
 	kwr->u.rxhdr.authmode_to_rxvalid =
 		V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) |
 		V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) |
 		V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) |
 		V_TLS_KEYCTX_TX_WR_RXVALID(1);
 
 	kwr->u.rxhdr.ivpresent_to_rxmk_size =
 		V_TLS_KEYCTX_TX_WR_IVPRESENT(0) |
 		V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size));
 
 	tls_rxkey_flit1(kwr, kctx);
 
 	/* No key reversal for GCM */
 	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
 		t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key,
 				 (kctx->cipher_secret_size << 3));
 		memcpy(kwr->keys.edkey + kctx->cipher_secret_size,
 		       kctx->rx.key + kctx->cipher_secret_size,
 		       (IPAD_SIZE + OPAD_SIZE));
 	} else {
 		memcpy(kwr->keys.edkey, kctx->rx.key,
 		       (kctx->tx_key_info_size - SALT_SIZE));
 		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
 	}
 }
 
 /* Tx key */
 static void
 prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 
 	kwr->u.txhdr.ctxlen =
 		(kctx->tx_key_info_size >> 4);
 	kwr->u.txhdr.dualck_to_txvalid =
 		V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_TXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_TXMK_SIZE(get_mac_key_size(mk_size)) |
 		V_TLS_KEYCTX_TX_WR_TXVALID(1);
 
 	memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE);
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE);
 		kwr->u.txhdr.dualck_to_txvalid &=
 			~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1));
 	}
 	kwr->u.txhdr.dualck_to_txvalid = htons(kwr->u.txhdr.dualck_to_txvalid);
 }
 
 /* TLS Key memory management */
 static int
 get_new_keyid(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	vmem_addr_t addr;
 
 	if (vmem_alloc(sc->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (-1);
 
 	return (addr);
 }
 
 static void
 free_keyid(struct toepcb *toep, int keyid)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
 	vmem_free(sc->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 static void
 clear_tls_keyid(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	if (tls_ofld->rx_key_addr >= 0) {
 		free_keyid(toep, tls_ofld->rx_key_addr);
 		tls_ofld->rx_key_addr = -1;
 	}
 	if (tls_ofld->tx_key_addr >= 0) {
 		free_keyid(toep, tls_ofld->tx_key_addr);
 		tls_ofld->tx_key_addr = -1;
 	}
 }
 
 static int
 get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops)
 {
 	return (ops & KEY_WRITE_RX ? tls_ofld->rx_key_addr :
 		((ops & KEY_WRITE_TX) ? tls_ofld->tx_key_addr : -1));
 }
 
 static int
 get_tp_plen_max(struct tls_ofld_info *tls_ofld)
 {
 	int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448;
 
 	return (tls_ofld->k_ctx.frag_size <= 8192 ? plen : FC_TP_PLEN_MAX);
 }
 
 /* Send request to get the key-id */
 static int
 tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 	struct ofld_tx_sdesc *txsd;
 	int kwrlen, kctxlen, keyid, len;
 	struct wrqe *wr;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = roundup2(kwrlen + kctxlen, 16);
 
 	if (toep->txsd_avail == 0)
 		return (EAGAIN);
 
 	/* Dont initialize key for re-neg */
 	if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) {
 		if ((keyid = get_new_keyid(toep, k_ctx)) < 0) {
 			return (ENOSPC);
 		}
 	} else {
 		keyid = get_keyid(tls_ofld, k_ctx->l_p_key);
 	}
 
 	wr = alloc_wrqe(len, toep->ofld_txq);
 	if (wr == NULL) {
 		free_keyid(toep, keyid);
 		return (ENOMEM);
 	}
 	kwr = wrtod(wr);
 	memset(kwr, 0, kwrlen);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	kwr->protocol = get_proto_ver(k_ctx->proto_ver);
 	kwr->mfs = htons(k_ctx->frag_size);
 	kwr->reneg_to_write_rx = k_ctx->l_p_key;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((toep->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memset(kctx, 0, kctxlen);
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		tls_ofld->tx_key_addr = keyid;
 		prepare_txkey_wr(kctx, k_ctx);
 	} else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		tls_ofld->rx_key_addr = keyid;
 		prepare_rxkey_wr(kctx, k_ctx);
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = DIV_ROUND_UP(len, 16);
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /* Store a key received from SSL in DDR. */
 static int
 program_key_context(struct tcpcb *tp, struct toepcb *toep,
     struct tls_key_context *uk_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_key_context *k_ctx;
 	int error, key_offset;
 
 	if (tp->t_state != TCPS_ESTABLISHED) {
 		/*
 		 * XXX: Matches Linux driver, but not sure this is a
 		 * very appropriate error.
 		 */
 		return (ENOENT);
 	}
 
 	/* Stop timer on handshake completion */
 	tls_stop_handshake_timer(toep);
 
 	toep->flags &= ~TPF_FORCE_CREDITS;
 
 	CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid,
 	    G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" :
 	    "KEY_WRITE_TX", uk_ctx->proto_ver);
 
 	if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX &&
 	    toep->ulp_mode != ULP_MODE_TLS)
 		return (EOPNOTSUPP);
 
 	/* Don't copy the 'tx' and 'rx' fields. */
 	k_ctx = &tls_ofld->k_ctx;
 	memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key,
 	    sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key));
 
 	/* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */
 	if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) {
 		if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 			tls_ofld->rx_key_addr = -1;
 			t4_clear_rx_quiesce(toep);
 		} else {
 			tls_ofld->tx_key_addr = -1;
 		}
 		return (0);
 	}
 
 	if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		k_ctx->iv_size = 4;
 		k_ctx->mac_first = 0;
 		k_ctx->hmac_ctrl = 0;
 	} else {
 		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		k_ctx->mac_first = 1;
 	}
 
 	tls_ofld->scmd0.seqno_numivs =
 		(V_SCMD_SEQ_NO_CTRL(3) |
 		 V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) |
 		 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 		 V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) |
 		 V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) |
 		 V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) |
 		 V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) |
 		 V_SCMD_IV_SIZE(k_ctx->iv_size));
 
 	tls_ofld->scmd0.ivgen_hdrlen =
 		(V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) |
 		 V_SCMD_KEY_CTX_INLINE(0) |
 		 V_SCMD_TLS_FRAG_ENABLE(1));
 
 	tls_ofld->mac_length = k_ctx->mac_secret_size;
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		k_ctx->rx = uk_ctx->rx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->rx_key_addr = -1;
 	} else {
 		k_ctx->tx = uk_ctx->tx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->tx_key_addr = -1;
 	}
 
 	/* Flush pending data before new Tx key becomes active */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		struct sockbuf *sb;
 
 		/* XXX: This might not drain everything. */
 		t4_push_frames(sc, toep, 0);
 		sb = &toep->inp->inp_socket->so_snd;
 		SOCKBUF_LOCK(sb);
 
 		/* XXX: This asserts that everything has been pushed. */
 		MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL);
 		sb->sb_sndptr = NULL;
 		tls_ofld->sb_off = sbavail(sb);
 		SOCKBUF_UNLOCK(sb);
 		tls_ofld->tx_seq_no = 0;
 	}
 
 	if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) ||
 	    (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) {
 		error = tls_program_key_id(toep, k_ctx);
 		if (error) {
 			/* XXX: Only clear quiesce for KEY_WRITE_RX? */
 			t4_clear_rx_quiesce(toep);
 			return (error);
 		}
 	}
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = tls_ofld->rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	} else {
 		unsigned short pdus_per_ulp;
 
 		if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 			tls_ofld->tx_key_addr = 1;
 
 		tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld);
 		tls_ofld->expn_per_ulp = tls_expansion_size(toep,
 				tls_ofld->fcplenmax, 1, &pdus_per_ulp);
 		tls_ofld->pdus_per_ulp = pdus_per_ulp;
 		tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp *
 			((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) +
 			 tls_ofld->k_ctx.frag_size);
 	}
 
 	return (0);
 }
 
 /*
  * In some cases a client connection can hang without sending the
  * ServerHelloDone message from the NIC to the host.  Send a dummy
  * RX_DATA_ACK with RX_MODULATE to unstick the connection.
  */
 static void
 tls_send_handshake_ack(void *arg)
 {
 	struct toepcb *toep = arg;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/*
 	 * XXX: Does not have the t4_get_tcb() checks to refine the
 	 * workaround.
 	 */
 	callout_schedule(&tls_ofld->handshake_timer, TLS_SRV_HELLO_RD_TM * hz);
 
 	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
 	send_rx_modulate(sc, toep);
 }
 
 static void
 tls_start_handshake_timer(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_reset(&tls_ofld->handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz,
 	    tls_send_handshake_ack, toep);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 void
 tls_stop_handshake_timer(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_stop(&tls_ofld->handshake_timer);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 int
 t4_ctloutput_tls(struct socket *so, struct sockopt *sopt)
 {
 	struct tls_key_context uk_ctx;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	int error, optval;
 
 	error = 0;
 	if (sopt->sopt_dir == SOPT_SET &&
 	    sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) {
 		error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx),
 		    sizeof(uk_ctx));
 		if (error)
 			return (error);
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_SET_TLS_CONTEXT:
 			error = program_key_context(tp, toep, &uk_ctx);
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_TLS_TOM:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM",
 				    __func__, toep->tid);
 				tls_clr_ofld_mode(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_QUIES:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES",
 				    __func__, toep->tid);
 				tls_clr_quiesce(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_GET_TLS_TOM:
 			/*
 			 * TLS TX is permitted on any TOE socket, but
 			 * TLS RX requires a TLS ULP mode.
 			 */
 			optval = TLS_TOM_NONE;
 			if (can_tls_offload(td_adapter(toep->td))) {
 				switch (toep->ulp_mode) {
 				case ULP_MODE_NONE:
 				case ULP_MODE_TCPDDP:
 					optval = TLS_TOM_TXONLY;
 					break;
 				case ULP_MODE_TLS:
 					optval = TLS_TOM_BOTH;
 					break;
 				}
 			}
 			CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d",
 			    __func__, toep->tid, optval);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 void
 tls_init_toep(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	tls_ofld->key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
 	tls_ofld->rx_key_addr = -1;
 	tls_ofld->tx_key_addr = -1;
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		callout_init_mtx(&tls_ofld->handshake_timer,
 		    &tls_handshake_lock, 0);
 }
 
 void
 tls_establish(struct toepcb *toep)
 {
 
 	/*
 	 * Enable PDU extraction.
 	 *
 	 * XXX: Supposedly this should be done by the firmware when
 	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
 	 * in practice this seems to be required.
 	 */
 	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 
 	toep->flags |= TPF_FORCE_CREDITS;
 
 	tls_start_handshake_timer(toep);
 }
 
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		tls_stop_handshake_timer(toep);
 	clear_tls_keyid(toep);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define	MIN_OFLD_TLSTX_CREDITS(toep)					\
 	(howmany(sizeof(struct fw_tlstx_data_wr) +			\
 	    sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) +		\
 	    CIPHER_BLOCK_SIZE + 1, 16))
 
 static inline u_int
 max_imm_tls_space(int tx_credits)
 {
 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
 	int space;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		space = (n * EQ_ESIZE);
 	else
 		space = tx_credits * 16;
 	return (space);
 }
 
 static int
 count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp)
 {
 	int max_nsegs_1mbuf, n, nsegs;
 
 	while (skip >= m->m_len) {
 		skip -= m->m_len;
 		m = m->m_next;
 	}
 
 	nsegs = 0;
 	max_nsegs_1mbuf = 0;
 	while (len > 0) {
 		n = sglist_count(mtod(m, char *) + skip, m->m_len - skip);
 		if (n > max_nsegs_1mbuf)
 			max_nsegs_1mbuf = n;
 		nsegs += n;
 		len -= m->m_len - skip;
 		skip = 0;
 		m = m->m_next;
 	}
 	*max_nsegs_1mbufp = max_nsegs_1mbuf;
 	return (nsegs);
 }
 
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
     unsigned int immdlen, unsigned int plen, unsigned int expn,
     unsigned int pdus, uint8_t credits, int shove, int imm_ivs)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	unsigned int len = plen + expn;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) |
 	    V_FW_TLSTX_DATA_WR_COMPL(1) |
 	    V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
 	    V_FW_TLSTX_DATA_WR_LEN16(credits));
 	txwr->plen = htobe32(len);
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
 	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
 	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(pdus) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) |
 	    V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->k_ctx.tx_key_info_size >> 4));
 	txwr->mfs = htobe16(tls_ofld->k_ctx.frag_size);
 	txwr->adjustedplen_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen));
 	txwr->expinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp));
 	txwr->pdusinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp));
 }
 
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
     struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	int data_type, seglen;
 
 	if (plen < tls_ofld->k_ctx.frag_size)
 		seglen = plen;
 	else
 		seglen = tls_ofld->k_ctx.frag_size;
 	data_type = tls_content_type(tls_hdr->type);
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen));
 	cpl->pld_len = htobe32(plen);
 	if (data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT)
 		cpl->type_protover = htobe32(
 		    V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
 	cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(pdus));
 	cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen);
 	cpl->scmd1 = htobe64(tls_ofld->tx_seq_no);
 	tls_ofld->tx_seq_no += pdus;
 }
 
 /*
  * Similar to write_tx_sgl() except that it accepts an optional
  * trailer buffer for IVs.
  */
 static void
 write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen,
     void *iv_buffer, int iv_len, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	for (m = start; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	i = -1;
 	for (m = start; plen > 0; m = m->m_next) {
 		rc = sglist_append(&sg, mtod(m, char *) + skip,
 		    m->m_len - skip);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 		plen -= m->m_len - skip;
 		skip = 0;
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (iv_buffer != NULL) {
 		rc = sglist_append(&sg, iv_buffer, iv_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p",
 	    __func__, nsegs, start, iv_buffer));
 }
 
 /*
  * Similar to t4_push_frames() but handles TLS sockets when TLS offload
  * is enabled.  Rather than transmitting bulk data, the socket buffer
  * contains TLS records.  The work request requires a full TLS record,
  * so batch mbufs up until a full TLS record is seen.  This requires
  * reading the TLS header out of the start of each record to determine
  * its length.
  */
 void
 t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr thdr;
 	struct mbuf *sndptr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len;
 	u_int expn_size, iv_len, pdus, sndptroff;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, /* compl,*/ sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool imm_ivs, imm_payload;
 	void *iv_buffer, *iv_dst, *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		space = max_imm_tls_space(tx_credits);
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) + key_size(toep);
 		if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) {
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 			    "%s: tid %d tx_credits %d min_wr %d space %d",
 			    __func__, toep->tid, tx_credits, wr_len +
 			    CIPHER_BLOCK_SIZE + 1, space);
 #endif
 			return;
 		}
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			MPASS(tls_ofld->sb_off >= drop);
 			tls_ofld->sb_off -= drop;
 			drop = 0;
 		}
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (sbavail(sb) == tls_ofld->sb_off &&
 		    toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) {
 			/*
 			 * A full TLS header is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the header so just discard the partial
 			 * header and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    toep->flags & TPF_SEND_FIN ? "" : " SEND_FIN");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Read the header of the next TLS record. */
 		sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff);
 		MPASS(!IS_AIOTX_MBUF(sndptr));
 		m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr);
 		tls_size = htons(thdr.length);
 		plen = TLS_HEADER_LENGTH + tls_size;
 		pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size);
 		iv_len = pdus * CIPHER_BLOCK_SIZE;
 
 		if (sbavail(sb) < tls_ofld->sb_off + plen) {
 			/*
 			 * The full TLS record is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the record so just discard the partial
 			 * record and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR6(KTR_CXGBE,
 			    "%s: tid %d sbavail %d sb_off %d plen %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    plen, toep->flags & TPF_SEND_FIN ? "" :
 			    " SEND_FIN");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Shove if there is no additional data pending. */
 		shove = (sbavail(sb) == tls_ofld->sb_off + plen) &&
 		    !(tp->t_flags & TF_MORETOCOME);
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/* Determine whether to use immediate vs SGL. */
 		imm_payload = false;
 		imm_ivs = false;
 		if (wr_len + iv_len <= space) {
 			imm_ivs = true;
 			wr_len += iv_len;
 			if (wr_len + tls_size <= space) {
 				wr_len += tls_size;
 				imm_payload = true;
 			}
 		}
 
 		/* Allocate space for IVs if needed. */
 		if (!imm_ivs) {
 			iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT);
 			if (iv_buffer == NULL) {
 				/*
 				 * XXX: How to restart this?
 				 */
 				if (sowwakeup)
 					sowwakeup_locked(so);
 				else
 					SOCKBUF_UNLOCK(sb);
 				SOCKBUF_UNLOCK_ASSERT(sb);
 				CTR3(KTR_CXGBE,
 			    "%s: tid %d failed to alloc IV space len %d",
 				    __func__, toep->tid, iv_len);
 				return;
 			}
 		} else
 			iv_buffer = NULL;
 
 		/* Determine size of SGL. */
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		if (!imm_payload) {
 			nsegs = count_mbuf_segs(sndptr, sndptroff +
 			    TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf);
 			if (!imm_ivs) {
 				int n = sglist_count(iv_buffer, iv_len);
 				nsegs += n;
 				if (n > max_nsegs_1mbuf)
 					max_nsegs_1mbuf = n;
 			}
 
 			/* Account for SGL in work request length. */
 			wr_len += sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 		}
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d",
 		    __func__, toep->tid, thdr.type, tls_size, pdus);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
 		expn_size = tls_expansion_size(toep, tls_size, 0, NULL);
 		write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0,
 		    tls_size, expn_size, pdus, credits, shove, imm_ivs ? 1 : 0);
 		write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus);
 		tls_copy_tx_key(toep, cpl + 1);
 
 		/* Generate random IVs */
 		buf = (char *)(cpl + 1) + key_size(toep);
 		if (imm_ivs) {
 			MPASS(iv_buffer == NULL);
 			iv_dst = buf;
 			buf = (char *)iv_dst + iv_len;
 		} else
 			iv_dst = iv_buffer;
 		arc4rand(iv_dst, iv_len, 0);
 
 		if (imm_payload) {
 			m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH,
 			    tls_size, buf);
 		} else {
 			write_tlstx_sgl(buf, sndptr,
 			    sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer,
 			    iv_len, nsegs, max_nsegs_1mbuf);
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		sbsndptr(sb, tls_ofld->sb_off, plen, &sndptroff);
 		tls_ofld->sb_off += plen;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd->iv_buffer = iv_buffer;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		atomic_add_long(&toep->vi->pi->tx_tls_records, 1);
 		atomic_add_long(&toep->vi->pi->tx_tls_octets, plen);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 
 /*
  * For TLS data we place received mbufs received via CPL_TLS_DATA into
  * an mbufq in the TLS offload state.  When CPL_RX_TLS_CMP is
  * received, the completed PDUs are placed into the socket receive
  * buffer.
  *
  * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs.
  */
 static int
 do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_tls_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	int len;
 
 	/* XXX: Should this match do_rx_data instead? */
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_octets, len);
 
 	KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Save TCP sequence number. */
 	m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq);
 
 	if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) {
 #ifdef INVARIANTS
 		panic("Failed to queue TLS data packet");
 #else
 		printf("%s: Failed to queue TLS data packet\n", __func__);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 #endif
 	}
 
 	tp = intotcpcb(inp);
 	tp->t_rcvtime = ticks;
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
 	    be32toh(cpl->seq));
 #endif
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *);
 	struct tlsrx_hdr_pkt *tls_hdr_pkt;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
-	int len, pdu_length, pdu_overhead, sb_length;
+	int len, pdu_length, rx_credits;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_records, 1);
 
 	KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length));
 
 	tp = intotcpcb(inp);
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u",
 	    __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt);
 #endif
 
 	tp->rcv_nxt += pdu_length;
 	if (tp->rcv_wnd < pdu_length) {
 		toep->tls.rcv_over += pdu_length - tp->rcv_wnd;
 		tp->rcv_wnd = 0;
 	} else
 		tp->rcv_wnd -= pdu_length;
 
 	/* XXX: Not sure what to do about urgent data. */
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
 	 * additional fields.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
 	tls_hdr_pkt = mtod(m, void *);
 
 	/*
 	 * Only the TLS header is sent to OpenSSL, so report errors by
 	 * altering the record type.
 	 */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0)
 		tls_hdr_pkt->type = CONTENT_TYPE_ERROR;
 
 	/* Trim this CPL's mbuf to only include the TLS header. */
 	KASSERT(m->m_len == len && m->m_next == NULL,
 	    ("%s: CPL spans multiple mbufs", __func__));
 	m->m_len = TLS_HEADER_LENGTH;
 	m->m_pkthdr.len = TLS_HEADER_LENGTH;
 
 	tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq);
 	if (tls_data != NULL) {
 		KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq,
 		    ("%s: sequence mismatch", __func__));
 
 		/*
 		 * Update the TLS header length to be the length of
 		 * the payload data.
 		 */
 		tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len);
 
 		m->m_next = tls_data;
 		m->m_pkthdr.len += tls_data->m_len;
 	}
 
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/*
-	 * Not all of the bytes on the wire are included in the socket
-	 * buffer (e.g. the MAC of the TLS record).  However, those
-	 * bytes are included in the TCP sequence space.  To handle
-	 * this, compute the delta for this TLS record in
-	 * 'pdu_overhead' and treat those bytes as having already been
-	 * "read" by the application for the purposes of expanding the
-	 * window.  The meat of the TLS record passed to the
-	 * application ('sb_length') will still not be counted as
-	 * "read" until userland actually reads the bytes.
-	 *
-	 * XXX: Some of the calculations below are probably still not
-	 * really correct.
+	 * Not all of the bytes on the wire are included in the socket buffer
+	 * (e.g. the MAC of the TLS record).  However, those bytes are included
+	 * in the TCP sequence space.
 	 */
-	sb_length = m->m_pkthdr.len;
-	pdu_overhead = pdu_length - sb_length;
-	toep->rx_credits += pdu_overhead;
-	tp->rcv_wnd += pdu_overhead;
-	tp->rcv_adv += pdu_overhead;
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
-	    sb_length > (sbspace(sb) / 8 * 7)) {
+	    m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
-	toep->sb_cc = sbused(sb);
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
-	    __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
+	    __func__, tid, pdu_overhead, rx_credits, tp->rcv_wnd);
 #endif
-	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-		int credits;
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	}
 
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 t4_tls_mod_load(void)
 {
 
 	mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF);
 	t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp);
 }
 
 void
 t4_tls_mod_unload(void)
 {
 
 	t4_register_cpl_handler(CPL_TLS_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL);
 	mtx_destroy(&tls_handshake_lock);
 }
 #endif	/* TCP_OFFLOAD */
Index: stable/11/sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_tom.h	(revision 348703)
+++ stable/11/sys/dev/cxgbe/tom/t4_tom.h	(revision 348704)
@@ -1,413 +1,411 @@
 /*-
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_TOM_H__
 #define __T4_TOM_H__
 #include <sys/vmem.h>
 #include "tom/t4_tls.h"
 
 #define LISTEN_HASH_SIZE 32
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 #define	DDP_RSVD_WIN (16 * 1024U)
 #define	SB_DDP_INDICATE	SB_IN_TOE	/* soreceive must respond to indicate */
 
 #define USE_DDP_RX_FLOW_CONTROL
 
 #define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
 #define PPOD_SIZE	(PPOD_SZ(1))
 
 /* TOE PCB flags */
 enum {
 	TPF_ATTACHED	   = (1 << 0),	/* a tcpcb refers to this toepcb */
 	TPF_FLOWC_WR_SENT  = (1 << 1),	/* firmware flow context WR sent */
 	TPF_TX_DATA_SENT   = (1 << 2),	/* some data sent */
 	TPF_TX_SUSPENDED   = (1 << 3),	/* tx suspended for lack of resources */
 	TPF_SEND_FIN	   = (1 << 4),	/* send FIN after all pending data */
 	TPF_FIN_SENT	   = (1 << 5),	/* FIN has been sent */
 	TPF_ABORT_SHUTDOWN = (1 << 6),	/* connection abort is in progress */
 	TPF_CPL_PENDING    = (1 << 7),	/* haven't received the last CPL */
 	TPF_SYNQE	   = (1 << 8),	/* synq_entry, not really a toepcb */
 	TPF_SYNQE_EXPANDED = (1 << 9),	/* toepcb ready, tid context updated */
 	TPF_FORCE_CREDITS  = (1 << 10), /* always send credits */
 };
 
 enum {
 	DDP_OK		= (1 << 0),	/* OK to turn on DDP */
 	DDP_SC_REQ	= (1 << 1),	/* state change (on/off) requested */
 	DDP_ON		= (1 << 2),	/* DDP is turned on */
 	DDP_BUF0_ACTIVE	= (1 << 3),	/* buffer 0 in use (not invalidated) */
 	DDP_BUF1_ACTIVE	= (1 << 4),	/* buffer 1 in use (not invalidated) */
 	DDP_TASK_ACTIVE = (1 << 5),	/* requeue task is queued / running */
 	DDP_DEAD	= (1 << 6),	/* toepcb is shutting down */
 };
 
 struct sockopt;
 struct offload_settings;
 
 struct ofld_tx_sdesc {
 	uint32_t plen;		/* payload length */
 	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
 	void *iv_buffer;	/* optional buffer holding IVs for TLS */
 };
 
 struct ppod_region {
 	u_int pr_start;
 	u_int pr_len;
 	u_int pr_page_shift[4];
 	uint32_t pr_tag_mask;		/* hardware tagmask for this region. */
 	uint32_t pr_invalid_bit;	/* OR with this to invalidate tag. */
 	uint32_t pr_alias_mask;		/* AND with tag to get alias bits. */
 	u_int pr_alias_shift;		/* shift this much for first alias bit. */
 	vmem_t *pr_arena;
 };
 
 struct ppod_reservation {
 	struct ppod_region *prsv_pr;
 	uint32_t prsv_tag;		/* Full tag: pgsz, alias, tag, color */
 	u_int prsv_nppods;
 };
 
 struct pageset {
 	TAILQ_ENTRY(pageset) link;
 	vm_page_t *pages;
 	int npages;
 	int flags;
 	int offset;		/* offset in first page */
 	int len;
 	struct ppod_reservation prsv;
 	struct vmspace *vm;
 	vm_offset_t start;
 	u_int vm_timestamp;
 };
 
 TAILQ_HEAD(pagesetq, pageset);
 
 #define	PS_WIRED		0x0001	/* Pages wired rather than held. */
 #define	PS_PPODS_WRITTEN	0x0002	/* Page pods written to the card. */
 
 #define	EXT_FLAG_AIOTX		EXT_FLAG_VENDOR1
 
 #define	IS_AIOTX_MBUF(m)						\
 	((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX)
 
 struct ddp_buffer {
 	struct pageset *ps;
 
 	struct kaiocb *job;
 	int cancel_pending;
 };
 
 struct ddp_pcb {
 	u_int flags;
 	struct ddp_buffer db[2];
 	TAILQ_HEAD(, pageset) cached_pagesets;
 	TAILQ_HEAD(, kaiocb) aiojobq;
 	u_int waiting_count;
 	u_int active_count;
 	u_int cached_count;
 	int active_id;	/* the currently active DDP buffer */
 	struct task requeue_task;
 	struct kaiocb *queueing;
 	struct mtx lock;
 };
 
 struct aiotx_buffer {
 	struct pageset ps;
 	struct kaiocb *job;
 	int refcount;
 };
 
 struct toepcb {
 	TAILQ_ENTRY(toepcb) link; /* toep_list */
 	u_int flags;		/* miscellaneous flags */
 	int refcount;
 	struct tom_data *td;
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct vnet *vnet;
 	struct vi_info *vi;	/* virtual interface */
 	struct sge_wrq *ofld_txq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ctrlq;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 	int tid;		/* Connection identifier */
 	int tc_idx;		/* traffic class that this tid is bound to */
 
 	/* tx credit handling */
 	u_int tx_total;		/* total tx WR credits (in 16B units) */
 	u_int tx_credits;	/* tx WR credits (in 16B units) available */
 	u_int tx_nocompl;	/* tx WR credits since last compl request */
 	u_int plen_nocompl;	/* payload since last compl request */
 
-	/* rx credit handling */
-	u_int sb_cc;		/* last noted value of so_rcv->sb_cc */
-	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
+	int opt0_rcv_bufsize;	/* XXX: save full opt0/opt2 for later? */
 
 	u_int ulp_mode;	/* ULP mode */
 	void *ulpcb;
 	void *ulpcb2;
 	struct mbufq ulp_pduq;	/* PDUs waiting to be sent out. */
 	struct mbufq ulp_pdu_reclaimq;
 
 	struct ddp_pcb ddp;
 	struct tls_ofld_info tls;
 
 	TAILQ_HEAD(, kaiocb) aiotx_jobq;
 	struct task aiotx_task;
 	bool aiotx_task_active;
 
 	/* Tx software descriptor */
 	uint8_t txsd_total;
 	uint8_t txsd_pidx;
 	uint8_t txsd_cidx;
 	uint8_t txsd_avail;
 	struct ofld_tx_sdesc txsd[];
 };
 
 #define	DDP_LOCK(toep)		mtx_lock(&(toep)->ddp.lock)
 #define	DDP_UNLOCK(toep)	mtx_unlock(&(toep)->ddp.lock)
 #define	DDP_ASSERT_LOCKED(toep)	mtx_assert(&(toep)->ddp.lock, MA_OWNED)
 
 struct flowc_tx_params {
 	uint32_t snd_nxt;
 	uint32_t rcv_nxt;
 	unsigned int snd_space;
 	unsigned int mss;
 };
 
 /*
  * Compressed state for embryonic connections for a listener.
  */
 struct synq_entry {
 	struct listen_ctx *lctx;	/* backpointer to listen ctx */
 	struct mbuf *syn;
 	int flags;			/* same as toepcb's tp_flags */
 	volatile int ok_to_respond;
 	volatile u_int refcnt;
 	int tid;
 	uint32_t iss;
 	uint32_t irs;
 	uint32_t ts;
 	uint16_t txqid;
 	uint16_t rxqid;
 	uint16_t l2e_idx;
 	uint16_t ulp_mode;
 	uint16_t rcv_bufsize;
 	__be16 tcp_opt; /* from cpl_pass_establish */
 	struct toepcb *toep;
 };
 
 /* listen_ctx flags */
 #define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
 
 struct listen_ctx {
 	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
 	volatile int refcount;
 	int stid;
 	struct stid_region stid_region;
 	int flags;
 	struct inpcb *inp;		/* listening socket's inp */
 	struct vnet *vnet;
 	struct sge_wrq *ctrlq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct clip_entry *ce;
 };
 
 struct tom_data {
 	struct toedev tod;
 
 	/* toepcb's associated with this TOE device */
 	struct mtx toep_list_lock;
 	TAILQ_HEAD(, toepcb) toep_list;
 
 	struct mtx lctx_hash_lock;
 	LIST_HEAD(, listen_ctx) *listen_hash;
 	u_long listen_mask;
 	int lctx_count;		/* # of lctx in the hash table */
 
 	struct ppod_region pr;
 
 	/* WRs that will not be sent to the chip because L2 resolution failed */
 	struct mtx unsent_wr_lock;
 	STAILQ_HEAD(, wrqe) unsent_wr_list;
 	struct task reclaim_wr_resources;
 };
 
 static inline struct tom_data *
 tod_td(struct toedev *tod)
 {
 
 	return (__containerof(tod, struct tom_data, tod));
 }
 
 static inline struct adapter *
 td_adapter(struct tom_data *td)
 {
 
 	return (td->tod.tod_softc);
 }
 
 static inline void
 set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_per.eight[0] = ulp_submode;
 }
 
 static inline uint8_t
 mbuf_ulp_submode(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_per.eight[0]);
 }
 
 /* t4_tom.c */
 struct toepcb *alloc_toepcb(struct vi_info *, int, int, int);
 struct toepcb *hold_toepcb(struct toepcb *);
 void free_toepcb(struct toepcb *);
 void offload_socket(struct socket *, struct toepcb *);
 void undo_offload_socket(struct socket *);
 void final_cpl_received(struct toepcb *);
 void insert_tid(struct adapter *, int, void *, int);
 void *lookup_tid(struct adapter *, int);
 void update_tid(struct adapter *, int, void *);
 void remove_tid(struct adapter *, int, int);
 int find_best_mtu_idx(struct adapter *, struct in_conninfo *,
     struct offload_settings *);
 u_long select_rcv_wnd(struct socket *);
 int select_rcv_wscale(void);
 uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *,
     int, int, int, int, struct offload_settings *);
 uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
 int select_ulp_mode(struct socket *, struct adapter *,
     struct offload_settings *);
 void set_ulp_mode(struct toepcb *, int);
 int negative_advice(int);
 
 /* t4_connect.c */
 void t4_init_connect_cpl_handlers(void);
 void t4_uninit_connect_cpl_handlers(void);
 int t4_connect(struct toedev *, struct socket *, struct rtentry *,
     struct sockaddr *);
 void act_open_failure_cleanup(struct adapter *, u_int, u_int);
 
 /* t4_listen.c */
 void t4_init_listen_cpl_handlers(void);
 void t4_uninit_listen_cpl_handlers(void);
 int t4_listen_start(struct toedev *, struct tcpcb *);
 int t4_listen_stop(struct toedev *, struct tcpcb *);
 void t4_syncache_added(struct toedev *, void *);
 void t4_syncache_removed(struct toedev *, void *);
 int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
 int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 void t4_offload_socket(struct toedev *, void *, struct socket *);
 void synack_failure_cleanup(struct adapter *, int);
 
 /* t4_cpl_io.c */
 void aiotx_init_toep(struct toepcb *);
 int t4_aio_queue_aiotx(struct socket *, struct kaiocb *);
 void t4_init_cpl_io_handlers(void);
 void t4_uninit_cpl_io_handlers(void);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
 void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
 void send_reset(struct adapter *, struct toepcb *, uint32_t);
 int send_rx_credits(struct adapter *, struct toepcb *, int);
 void send_rx_modulate(struct adapter *, struct toepcb *);
 void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
 int t4_close_conn(struct adapter *, struct toepcb *);
 void t4_rcvd(struct toedev *, struct tcpcb *);
 void t4_rcvd_locked(struct toedev *, struct tcpcb *);
 int t4_tod_output(struct toedev *, struct tcpcb *);
 int t4_send_fin(struct toedev *, struct tcpcb *);
 int t4_send_rst(struct toedev *, struct tcpcb *);
 void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *,
     uint16_t, uint64_t, uint64_t, int, int);
 void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop);
 void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop);
 
 /* t4_ddp.c */
 int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
     const char *);
 void t4_free_ppod_region(struct ppod_region *);
 int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *);
 int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int,
     struct ppod_reservation *);
 int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
     struct pageset *);
 int t4_write_page_pods_for_buf(struct adapter *, struct sge_wrq *, int tid,
     struct ppod_reservation *, vm_offset_t, int);
 void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
     struct mbuf **, struct mbuf **, int *);
 int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
 void t4_ddp_mod_load(void);
 void t4_ddp_mod_unload(void);
 void ddp_assert_empty(struct toepcb *);
 void ddp_init_toep(struct toepcb *);
 void ddp_uninit_toep(struct toepcb *);
 void ddp_queue_toep(struct toepcb *);
 void release_ddp_resources(struct toepcb *toep);
 void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t);
 void handle_ddp_indicate(struct toepcb *);
 void insert_ddp_data(struct toepcb *, uint32_t);
 const struct offload_settings *lookup_offload_policy(struct adapter *, int,
     struct mbuf *, uint16_t, struct inpcb *);
 
 /* t4_tls.c */
 bool can_tls_offload(struct adapter *);
 int t4_ctloutput_tls(struct socket *, struct sockopt *);
 void t4_push_tls_records(struct adapter *, struct toepcb *, int);
 void t4_tls_mod_load(void);
 void t4_tls_mod_unload(void);
 void tls_establish(struct toepcb *);
 void tls_init_toep(struct toepcb *);
 int tls_rx_key(struct toepcb *);
 void tls_stop_handshake_timer(struct toepcb *);
 int tls_tx_key(struct toepcb *);
 void tls_uninit_toep(struct toepcb *);
 
 #endif
Index: stable/11
===================================================================
--- stable/11	(revision 348703)
+++ stable/11	(revision 348704)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r348491