Index: head/sys/dev/cxgbe/cxgbei/cxgbei.c
===================================================================
--- head/sys/dev/cxgbe/cxgbei/cxgbei.c	(revision 305165)
+++ head/sys/dev/cxgbe/cxgbei/cxgbei.c	(revision 305166)
@@ -1,1151 +1,1150 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Chelsio T5xx iSCSI driver
  *
  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 
 #ifdef TCP_OFFLOAD
 #include <sys/errno.h>
 #include <sys/kthread.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/toecore.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_fsm.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <dev/iscsi/iscsi_ioctl.h>
 #include <dev/iscsi/iscsi.h>
 #include <cam/ctl/ctl_frontend_iscsi.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 #include "cxgbei_ulp2_ddp.h"
 
 /* Number of usable worker threads; set by start_worker_threads(). */
 static int worker_thread_count;
 /* Per-worker state array, allocated by start_worker_threads(). */
 static struct cxgbei_worker_thread_softc *cwt_softc;
 /* Process handle handed to kproc_kthread_add() for the worker threads. */
 static struct proc *cxgbei_proc;
 
 /* XXXNP some header instead. */
 struct icl_pdu *icl_cxgbei_new_pdu(int);
 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
 void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
 
 /*
  * Direct Data Placement -
  * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
  * final destination host-memory buffers based on the Initiator Task Tag (ITT)
  * in Data-In or Target Task Tag (TTT) in Data-Out PDUs.
  * The host memory address is programmed into h/w in the format of pagepod
  * entries.
  * The location of the pagepod entry is encoded into ddp tag which is used as
  * the base for ITT/TTT.
  */
 
 /*
  * functions to program the pagepod in h/w
  */
 /*
  * Fill in one pagepod: copy the header into place, then load
  * PPOD_PAGES + 1 address slots from the gather list starting at entry
  * pidx.  Slots that fall past the end of the gather list are zeroed.
  */
 static inline void
 ppod_set(struct pagepod *ppod,
 	struct cxgbei_ulp2_pagepod_hdr *hdr,
 	struct cxgbei_ulp2_gather_list *gl,
 	unsigned int pidx)
 {
 	int slot = 0;
 
 	memcpy(ppod, hdr, sizeof(*hdr));
 
 	while (slot <= PPOD_PAGES) {
 		if (pidx < gl->nelem)
 			ppod->addr[slot] =
 			    cpu_to_be64(gl->dma_sg[pidx].phys_addr);
 		else
 			ppod->addr[slot] = 0ULL;
 		slot++;
 		pidx++;
 	}
 }
 
 /* Reset every byte of a pagepod to zero. */
 static inline void
 ppod_clear(struct pagepod *ppod)
 {
 
 	memset(ppod, 0, sizeof(struct pagepod));
 }
 
 /*
  * Fill in the header of a ULP_TX memory-write work request and the
  * immediate-data sub-header that directly follows it.
  *
  * sc      - adapter; only used to pick the T4 vs. T5 flavour of the command.
  * tid     - connection id, stored as the flow id of the request.
  * req     - start of the work request; the ulptx_idata is at req + 1.
  * wr_len  - total length of the work request in bytes.
  * dlen    - length in bytes of the immediate data (the pagepods).
  * pm_addr - destination address of the write in adapter memory.
  */
 static inline void
 ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
 		unsigned int wr_len, unsigned int dlen,
 		unsigned int pm_addr)
 {
 	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);
 
 	INIT_ULPTX_WR(req, wr_len, 0, 0);
 	req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 				V_ULP_MEMIO_ORDER(is_t4(sc)) |
 				V_T5_ULP_MEMIO_IMM(is_t5(sc)));
 	/* Data length and target address are both expressed in 32-byte units. */
 	req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
 	/* Length in 16-byte units, not counting the leading req->wr field. */
 	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
 				| V_FW_WR_FLOWID(tid));
 	req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));
 
 	/* The payload is carried inline as immediate data of dlen bytes. */
 	idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htonl(dlen);
 }
 
-#define PPOD_SIZE		sizeof(struct pagepod)
 /* Most pagepods sent in a single immediate-data work request. */
 #define ULPMEM_IDATA_MAX_NPPODS 1	/* 256/PPOD_SIZE */
 /* NOTE(review): not referenced in the visible part of this file. */
 #define PCIE_MEMWIN_MAX_NPPODS 16	/* 1024/PPOD_SIZE */
 
 static int
 ppod_write_idata(struct cxgbei_data *ci,
 			struct cxgbei_ulp2_pagepod_hdr *hdr,
 			unsigned int idx, unsigned int npods,
 			struct cxgbei_ulp2_gather_list *gl,
 			unsigned int gl_pidx, struct toepcb *toep)
 {
 	u_int dlen = PPOD_SIZE * npods;
 	u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
 	u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
 	    sizeof(struct ulptx_idata) + dlen, 16);
 	struct ulp_mem_io *req;
 	struct ulptx_idata *idata;
 	struct pagepod *ppod;
 	u_int i;
 	struct wrqe *wr;
 	struct adapter *sc = toep->vi->pi->adapter;
 
 	wr = alloc_wrqe(wr_len, toep->ctrlq);
 	if (wr == NULL) {
 		CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
 		return (ENOMEM);
 	}
 
 	req = wrtod(wr);
 	memset(req, 0, wr_len);
 	ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
 	idata = (struct ulptx_idata *)(req + 1);
 
 	ppod = (struct pagepod *)(idata + 1);
 	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
 		if (!hdr) /* clear the pagepod */
 			ppod_clear(ppod);
 		else /* set the pagepod */
 			ppod_set(ppod, hdr, gl, gl_pidx);
 	}
 
 	t4_wrq_tx(sc, wr);
 	return 0;
 }
 
 /*
  * Program npods pagepods, starting at pagepod index idx, with the page
  * addresses described by gl.  The pagepods are sent to the adapter as
  * immediate data, at most ULPMEM_IDATA_MAX_NPPODS per work request.
  *
  * ci    - per-adapter iSCSI state.
  * iccp  - the connection (struct icl_cxgbei_conn *) whose toepcb is used.
  * hdr   - pagepod header copied into every pagepod.
  * gl    - gather list; its tid, port_id and egress_dev are filled in here.
  * reply - unused.
  *
  * Returns 0 on success (including when npods is 0), otherwise the error
  * from the first ppod_write_idata() call that failed.
  */
 int
 t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
     struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
     struct cxgbei_ulp2_gather_list *gl, int reply)
 {
 	struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
 	struct toepcb *toep = icc->toep;
 	/* Must start out as 0: the loop body never runs when npods is 0. */
 	int err = 0;
 	unsigned int pidx = 0, w_npods = 0, cnt;
 
 	/*
 	 * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
 	 * the order would not be guaranteed, so we will stick with IMMD
 	 */
 	gl->tid = toep->tid;
 	gl->port_id = toep->vi->pi->port_id;
 	gl->egress_dev = (void *)toep->vi->ifp;
 
 	/*
 	 * send via immediate data
 	 *
 	 * Each pagepod written consumes PPOD_PAGES gather-list entries, so
 	 * the gather-list index advances by cnt * PPOD_PAGES per request.
 	 */
 	for (; w_npods < npods; idx += cnt, w_npods += cnt,
 		pidx += cnt * PPOD_PAGES) {
 		cnt = npods - w_npods;
 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
 			cnt = ULPMEM_IDATA_MAX_NPPODS;
 		err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
 		if (err) {
 			printf("%s: ppod_write_idata failed\n", __func__);
 			break;
 		}
 	}
 	return err;
 }
 
 /*
  * Zero npods pagepods in adapter memory, starting at pagepod index idx.
  * The work is split into requests of at most ULPMEM_IDATA_MAX_NPPODS
  * pagepods each.  There is no return value; a failed write is reported on
  * the console and the remaining pagepods are left untouched.
  *
  * gl is only passed through to ppod_write_idata(), which does not consult
  * it when clearing.  tag is unused.
  */
 void
 t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
     u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
 {
 	struct toepcb *toep = icc->toep;
 	u_int w_npods = 0;
 	u_int cnt;
 	int err;
 
 	for (; w_npods < npods; idx += cnt, w_npods += cnt) {
 		cnt = npods - w_npods;
 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
 			cnt = ULPMEM_IDATA_MAX_NPPODS;
 		/* A NULL header asks for the pagepods to be cleared. */
 		err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
 		if (err) {
 			printf("%s: ppod_write_idata failed\n", __func__);
 			break;
 		}
 	}
 }
 
 static int
 cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
 {
 	unsigned int data_len = csio->dxfer_len;
 	unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
 	unsigned int nsge;
 	unsigned char *sgaddr = csio->data_ptr;
 	unsigned int len = 0;
 
 	nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	sgl->sg_addr = sgaddr;
 	sgl->sg_offset = sgoffset;
 	if (data_len <  (PAGE_SIZE - sgoffset))
 		len = data_len;
 	else
 		len = PAGE_SIZE - sgoffset;
 
 	sgl->sg_length = len;
 
 	data_len -= len;
 	sgaddr += len;
 	sgl = sgl+1;
 
 	while (data_len > 0) {
 		sgl->sg_addr = sgaddr;
 		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
 		sgl->sg_length = len;
 	        sgaddr += len;
 		data_len -= len;
 		sgl = sgl + 1;
 	}
 
 	return nsge;
 }
 
 /*
  * Target-side counterpart of cxgbei_map_sg(): describe the kernel data
  * buffer(s) of a CTL I/O as page-sized pieces in the sgl array.
  *
  * The CTL buffer is either a list of kern_sg_entries ctl_sg_entry records
  * or, when kern_sg_entries is 0, one flat buffer of kern_data_len bytes
  * that is wrapped in a temporary single-entry list here.
  *
  * Returns the page count computed from the summed length of all CTL
  * entries plus the page offset of the first one.
  */
 static int
 cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
 {
 	unsigned int data_len, sgoffset, nsge;
 	unsigned char *sgaddr;
 	unsigned int len = 0, index = 0, ctl_sg_count, i;
 	struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
 
 	if (io->scsiio.kern_sg_entries > 0) {
 		ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
 		ctl_sg_count = io->scsiio.kern_sg_entries;
 	} else {
 		/* Flat buffer: present it as a one-entry list on the stack. */
 		ctl_sglist = &ctl_sg_entry;
 		ctl_sglist->addr = io->scsiio.kern_data_ptr;
 		ctl_sglist->len = io->scsiio.kern_data_len;
 		ctl_sg_count = 1;
 	}
 
 	/* First piece: from the start of entry 0 up to the end of its page. */
 	sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
 	sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
 	data_len = ctl_sglist[index].len;
 
 	if (data_len <  (PAGE_SIZE - sgoffset))
 		len = data_len;
 	else
 		len = PAGE_SIZE - sgoffset;
 
 	sgl->sg_length = len;
 
 	data_len -= len;
 	sgaddr += len;
 	sgl = sgl+1;
 
 	/* len is reused here as the running total of all entry lengths. */
 	len = 0;
 	for (i = 0;  i< ctl_sg_count; i++)
 		len += ctl_sglist[i].len;
 	nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT;
 	/*
 	 * NOTE(review): if entry 0 fits entirely within its first page,
 	 * data_len is already 0 here and the loop never runs, so entries
 	 * 1..ctl_sg_count-1 are not mapped even though nsge counts them.
 	 * Also, only the first piece has sg_offset set; later entries are
 	 * presumably expected to be page aligned -- confirm with callers.
 	 */
 	while (data_len > 0) {
 		sgl->sg_addr = sgaddr;
 		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
 		sgl->sg_length = len;
 		sgaddr += len;
 		data_len -= len;
 		sgl = sgl + 1;
 		if (data_len == 0) {
 			/* Current entry exhausted: stop, or move to the next. */
 			if (index == ctl_sg_count - 1)
 				break;
 			index++;
 			sgaddr = ctl_sglist[index].addr;
 			data_len = ctl_sglist[index].len;
 		}
 	}
 
 	return nsge;
 }
 
 /*
  * Build a gather list for the sgcnt entries in sgl and try to reserve a
  * DDP tag for it on this connection.
  *
  * Returns -EINVAL if no gather list could be built, otherwise whatever
  * cxgbei_ulp2_ddp_tag_reserve() returned.  The gather list is released
  * again when the reservation reports a non-zero result.
  */
 static int
 t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
     u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
 {
 	struct toepcb *toep = icc->toep;
 	struct cxgbei_ulp2_gather_list *gl;
 	int rc;
 
 	gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
 	if (gl == NULL)
 		return (-EINVAL);
 
 	rc = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid, &ci->tag_format,
 	    ddp_tag, gl, 0, 0);
 	if (rc != 0)
 		cxgbei_ulp2_ddp_release_gl(ci, gl);
 
 	return (rc);
 }
 
 static unsigned int
 cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
 			struct ccb_scsiio *scmd, unsigned int *itt)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	int xferlen = scmd->dxfer_len;
 	struct cxgbei_task_data *tdata = NULL;
 	struct cxgbei_sgl *sge = NULL;
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = td_adapter(toep->td);
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	int err = -1;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	tdata = (struct cxgbei_task_data *)*prv;
 	if (xferlen == 0 || tdata == NULL)
 		goto out;
 	if (xferlen < DDP_THRESHOLD)
 		goto out;
 
 	if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 		tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
 		if (tdata->nsge == 0) {
 			CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
 			return 0;
 		}
 		sge = tdata->sgl;
 
 		tdata->sc_ddp_tag = *itt;
 
 		CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
 				__func__, *itt, tdata->sc_ddp_tag);
 		if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
 							tdata->sc_ddp_tag)) {
 			err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
 			    sge, tdata->nsge, &tdata->sc_ddp_tag);
 		} else {
 			CTR3(KTR_CXGBE,
 				"%s: itt:0x%x sc_ddp_tag:0x%x not usable",
 				__func__, *itt, tdata->sc_ddp_tag);
 		}
 	}
 out:
 	if (err < 0)
 		tdata->sc_ddp_tag =
 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);
 
 	return tdata->sc_ddp_tag;
 }
 
 /*
  * Target side: pick the tag to use as the TTT of a data transfer.
  *
  * When the remaining transfer (kern_data_len - ext_data_filled) is at
  * least DDP_THRESHOLD bytes and per-task data (*prv) is available, the CTL
  * buffer is mapped and a DDP tag reservation is attempted.  In every other
  * case, and whenever the reservation returns a negative error, *ttt is
  * converted to a non-DDP tag instead.
  *
  * ic  - the connection.
  * prv - points at the per-task struct cxgbei_task_data pointer, which may
  *       be NULL.
  * io  - the CTL I/O being transferred.
  * ttt - the tag chosen by the caller; read but not modified here.
  *
  * Returns the tag to use, or 0 if the buffer could not be mapped.
  */
 static unsigned int
 cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
 				unsigned int *ttt)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = td_adapter(toep->td);
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct cxgbei_task_data *tdata = NULL;
 	int xferlen, err = -1;
 	struct cxgbei_sgl *sge = NULL;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
 	tdata = (struct cxgbei_task_data *)*prv;
 	if ((xferlen == 0) || (tdata == NULL))
 		goto out;
 	if (xferlen < DDP_THRESHOLD)
 		goto out;
 	tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
 	if (tdata->nsge == 0) {
 		CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
 		return 0;
 	}
 	sge = tdata->sgl;
 
 	tdata->sc_ddp_tag = *ttt;
 	if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
 		err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
 		    tdata->nsge, &tdata->sc_ddp_tag);
 	} else {
 		CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
 				__func__, tdata->sc_ddp_tag);
 	}
 out:
 	/*
 	 * Without per-task data there is nowhere to record the tag, so just
 	 * hand back the non-DDP form of the caller's tag.
 	 */
 	if (tdata == NULL)
 		return (cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt));
 
 	if (err < 0)
 		tdata->sc_ddp_tag =
 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
 	return tdata->sc_ddp_tag;
 }
 
 /*
  * Give a DDP tag back to the per-adapter iSCSI state of the adapter that
  * owns this connection.  Always returns 0.
  */
 static int
 t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
 {
 	struct adapter *sc = td_adapter(icc->toep->td);
 
 	cxgbei_ulp2_ddp_tag_release(sc->iscsi_ulp_softc, ddp_tag, icc);
 
 	return (0);
 }
 
 /*
  * Work out the largest PDU lengths the adapter supports in each direction.
  * Each limit starts from the corresponding TP PMM page-size register and is
  * then clamped by the fields read from A_TP_PARA_REG2 and A_TP_PARA_REG7.
  * The results are rounded down to a multiple of 512 and stored through
  * max_tx_pdu_len and max_rx_pdu_len.
  */
 static void
 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
     uint32_t *max_rx_pdu_len)
 {
 	uint32_t tx_len, rx_len, r, v;
 
 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 
 	/* NOTE(review): the tx limit is clamped by MAXRXDATA too -- confirm. */
 	r = t4_read_reg(sc, A_TP_PARA_REG2);
 	rx_len = min(rx_len, G_MAXRXDATA(r));
 	tx_len = min(tx_len, G_MAXRXDATA(r));
 
 	/* Both directions honour the smaller of the two PMMAXXFERLEN fields. */
 	r = t4_read_reg(sc, A_TP_PARA_REG7);
 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
 	rx_len = min(rx_len, v);
 	tx_len = min(tx_len, v);
 
 	/* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
 	tx_len = min(tx_len, 3 * 4096);
 
 	*max_tx_pdu_len = rounddown2(tx_len, 512);
 	*max_rx_pdu_len = rounddown2(rx_len, 512);
 }
 
 /*
  * Initialize the software state of the iSCSI ULP driver.
  *
  * Records the adapter's iSCSI memory region and PDU limits in ci, sizes the
  * pagepod index space, creates the DMA tag, allocates the colors and gl_map
  * arrays, sets up the tag format and finally calls t4_iscsi_init().
  *
  * ENXIO means firmware didn't set up something that it was supposed to.
  * Other non-zero returns are the bus_dma_tag_create() error or ENOMEM; in
  * all failure cases nothing allocated here is left behind.
  */
 static int
 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
 {
 	int nppods, bits, rc;
 	static const u_int pgsz_order[] = {0, 1, 2, 3};
 
 	MPASS(sc->vres.iscsi.size > 0);
 
 	ci->llimit = sc->vres.iscsi.start;
 	ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
 	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);
 
 	/* Number of pagepods that fit in the iSCSI memory region. */
 	nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
 	if (nppods <= 1024)
 		return (ENXIO);
 
 	/*
 	 * Cap the index width at IPPOD_IDX_MAX_SIZE bits, then trim nppods
 	 * to 2^(bits - 1) - 1.
 	 */
 	bits = fls(nppods);
 	if (bits > IPPOD_IDX_MAX_SIZE)
 		bits = IPPOD_IDX_MAX_SIZE;
 	nppods = (1 << (bits - 1)) - 1;
 
 	rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE,
 	    BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
 		    __func__, rc);
 		return (rc);
 	}
 
 	/* One color byte and one gather-list pointer per pagepod. */
 	ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
 	ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
 	    M_CXGBE, M_NOWAIT | M_ZERO);
 	if (ci->colors == NULL || ci->gl_map == NULL) {
 		/* Either allocation may have succeeded; free() accepts NULL. */
 		bus_dma_tag_destroy(ci->ulp_ddp_tag);
 		free(ci->colors, M_CXGBE);
 		free(ci->gl_map, M_CXGBE);
 		return (ENOMEM);
 	}
 
 	mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
 	ci->nppods = nppods;
 	ci->idx_last = nppods;
 	ci->idx_bits = bits;
 	ci->idx_mask = (1 << bits) - 1;
 	ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;
 
 	ci->tag_format.sw_bits = bits;
 	ci->tag_format.rsvd_bits = bits;
 	ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
 	ci->tag_format.rsvd_mask = ci->idx_mask;
 
 	t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);
 
 	/* rc is 0 here: every failure above has already returned. */
 	return (rc);
 }
 
 /*
  * CPL_ISCSI_HDR handler: the start of a new received PDU.
  *
  * Allocates a PDU, copies the iSCSI basic header segment that follows the
  * CPL header in the mbuf into it, records the sequence number from the CPL
  * and parks the PDU in toep->ulpcb2 for the handlers that complete it.
  * The mbuf is freed before returning.  Always returns 0.
  */
 static int
 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_pdu *ip;
 	struct icl_cxgbei_pdu *icp;
 
 	M_ASSERTPKTHDR(m);
 
 	/* M_NOWAIT: allocation failure is not handled beyond this macro. */
 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
 	if (ip == NULL)
 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
 	icp = ip_to_icp(ip);
 	/* The BHS sits immediately after the CPL header. */
 	bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
 	    iscsi_bhs));
 	icp->icp_seq = ntohl(cpl->seq);
 	icp->icp_flags = ICPF_RX_HDR;
 
 	/* This is the start of a new PDU.  There should be no old state. */
 	MPASS(toep->ulpcb2 == NULL);
 	toep->ulpcb2 = icp;
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
 	    __func__, tid, ntohs(cpl->len), m->m_len);
 #endif
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * CPL_ISCSI_DATA handler: the data segment of the PDU whose header was
  * stashed in toep->ulpcb2 by do_rx_iscsi_hdr().
  *
  * Strips the CPL header from the mbuf and attaches what is left to the PDU
  * as its data; the mbuf is kept by the PDU rather than freed here.
  * Always returns 0.
  */
 static int
 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 
 	M_ASSERTPKTHDR(m);
 
 	/* Must already have received the header (but not the data). */
 	MPASS(icp != NULL);
 	MPASS(icp->icp_flags == ICPF_RX_HDR);
 	MPASS(icp->ip.ip_data_mbuf == NULL);
 	MPASS(icp->ip.ip_data_len == 0);
 
 	/* Drop the CPL header so only the payload remains. */
 	m_adj(m, sizeof(*cpl));
 
 	icp->icp_flags |= ICPF_RX_FLBUF;
 	icp->ip.ip_data_mbuf = m;
 	icp->ip.ip_data_len = m->m_pkthdr.len;
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
 	    __func__, tid, ntohs(cpl->len), m->m_len);
 #endif
 
 	return (0);
 }
 
 /*
  * CPL_RX_ISCSI_DDP handler: the status message that completes the PDU being
  * assembled in toep->ulpcb2.
  *
  * Records padding/CRC errors reported by the hardware in the PDU flags,
  * advances the TCP receive state by the PDU length, returns rx credits and
  * then queues the PDU on the connection's rcvd_pdus list, waking the
  * connection's worker thread if the connection is not already queued.
  *
  * The PDU is freed instead if the inpcb has been dropped or is in
  * timewait.  If the connection has no ULP state or its receive side is
  * shut down, the TCP connection is dropped with ECONNRESET and the PDU is
  * freed.
  *
  * This CPL carries no mbuf (m is NULL on entry).  Always returns 0.
  */
 static int
 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 	struct icl_pdu *ip;
 	u_int pdu_len, val;
 
 	MPASS(m == NULL);
 
 	/* Must already be assembling a PDU. */
 	MPASS(icp != NULL);
 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
 	ip = &icp->ip;
 	icp->icp_flags |= ICPF_RX_STATUS;
 	val = ntohl(cpl->ddpvld);
 	if (val & F_DDP_PADDING_ERR)
 		icp->icp_flags |= ICPF_PAD_ERR;
 	if (val & F_DDP_HDRCRC_ERR)
 		icp->icp_flags |= ICPF_HCRC_ERR;
 	if (val & F_DDP_DATACRC_ERR)
 		icp->icp_flags |= ICPF_DCRC_ERR;
 	if (ip->ip_data_mbuf == NULL) {
 		/* XXXNP: what should ip->ip_data_len be, and why? */
 		icp->icp_flags |= ICPF_RX_DDP;
 	}
 	pdu_len = ntohs(cpl->len);	/* includes everything. */
 
 	INP_WLOCK(inp);
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, pdu_len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 #ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
 #endif
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(icp->icp_seq == tp->rcv_nxt);
 	MPASS(tp->rcv_wnd >= pdu_len);
 	tp->rcv_nxt += pdu_len;
 	tp->rcv_wnd -= pdu_len;
 	tp->t_rcvtime = ticks;
 
 	/* update rx credits */
 	toep->rx_credits += pdu_len;
 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
 
 	so = inp->inp_socket;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	icc = toep->ulpcb;
 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
 		    __func__, tid, pdu_len, icc, sb->sb_state);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * The inp lock is dropped and retaken so that the tcbinfo
 		 * lock can be acquired ahead of it for tcp_drop().
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 #ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
 #endif
 		return (0);
 	}
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ic = &icc->ic;
 	icl_cxgbei_new_pdu_set_conn(ip, ic);
 
 	MPASS(m == NULL); /* was unused, we'll use it now. */
 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
 	if (__predict_false(m != NULL)) {
 		int len = m_length(m, NULL);
 
 		/*
 		 * PDUs were received before the tid transitioned to ULP mode.
 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
 		 * the PDU in icp/ip.
 		 */
 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
 		    len);
 
 		/* XXXNP: needs to be rewritten. */
 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
 		    iscsi_bhs)) {
 			struct icl_cxgbei_pdu *icp0;
 			struct icl_pdu *ip0;
 
 			/*
 			 * Check the allocation before the new PDU is touched;
 			 * M_NOWAIT allocations can fail.
 			 */
 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
 			if (ip0 == NULL)
 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
 			icp0 = ip_to_icp(ip0);
 			icp0->icp_seq = 0; /* XXX */
 			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
 		}
 		m_freem(m);
 	}
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
 	    __func__, tid, pdu_len, icp->icp_flags);
 #endif
 
 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
 
 		/* Put the connection on its worker's queue and wake the worker. */
 		mtx_lock(&cwt->cwt_lock);
 		icc->rx_flags |= RXF_ACTIVE;
 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
 		if (cwt->cwt_state == CWT_SLEEPING) {
 			cwt->cwt_state = CWT_RUNNING;
 			cv_signal(&cwt->cwt_cv);
 		}
 		mtx_unlock(&cwt->cwt_lock);
 	}
 	SOCKBUF_UNLOCK(sb);
 	INP_WUNLOCK(inp);
 
 #ifdef INVARIANTS
 	toep->ulpcb2 = NULL;
 #endif
 
 	return (0);
 }
 
 /*
  * initiator
  *
  * Replace *itt with the tag chosen by cxgbei_task_reserve_itt(), stored in
  * network byte order.  A zero result leaves *itt exactly as supplied.
  */
 void
 cxgbei_conn_task_reserve_itt(void *conn, void **prv,
 				void *scmd, unsigned int *itt)
 {
 	const unsigned int tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
 
 	if (tag != 0)
 		*itt = htonl(tag);
 }
 
 /*
  * target
  *
  * Replace *ttt with the tag chosen by cxgbei_task_reserve_ttt(), stored in
  * network byte order.  A zero result leaves *ttt exactly as supplied.
  */
 void
 cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
 				void *scmd, unsigned int *ttt)
 {
 	const unsigned int tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
 
 	if (tag != 0)
 		*ttt = htonl(tag);
 }
 
 /*
  * Tear down the per-task offload state: release the task's tag if it is a
  * DDP tag, then wipe the task data so it can be reused.
  */
 void
 cxgbei_cleanup_task(void *conn, void *ofld_priv)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc((struct icl_conn *)conn);
 	struct cxgbei_task_data *tdata = ofld_priv;
 	struct cxgbei_data *ci = icc->sc->iscsi_ulp_softc;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	MPASS(tdata != NULL);
 
 	if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag)) {
 		(void) t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
 	}
 
 	bzero(tdata, sizeof(*tdata));
 }
 
 /*
  * ULD activate hook: set up iSCSI offload state for one adapter.
  *
  * Must be called within a synchronized operation on sc.  Returns 0 on
  * success (or if iSCSI is already active), ENOSYS if the adapter has no
  * iSCSI capability or no iSCSI memory region, ENOMEM if the softc cannot
  * be allocated, or the error from cxgbei_init().
  */
 static int
 cxgbei_activate(struct adapter *sc)
 {
 	struct cxgbei_data *ci;
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
 		    __func__, sc));
 		return (0);
 	}
 
 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
 		device_printf(sc->dev,
 		    "not iSCSI offload capable, or capability disabled.\n");
 		return (ENOSYS);
 	}
 
 	/* per-adapter softc for iSCSI */
 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (ci == NULL)
 		return (ENOMEM);
 
 	rc = cxgbei_init(sc, ci);
 	if (rc != 0) {
 		free(ci, M_CXGBE);
 		return (rc);
 	}
 
 	/* Publish the softc only once it is fully initialized. */
 	sc->iscsi_ulp_softc = ci;
 
 	return (0);
 }
 
 /*
  * ULD deactivate hook: clean up and free the adapter's iSCSI softc, if it
  * has one.  Must be called within a synchronized operation on sc.
  * Always returns 0.
  */
 static int
 cxgbei_deactivate(struct adapter *sc)
 {
 	struct cxgbei_data *ci;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	ci = sc->iscsi_ulp_softc;
 	if (ci == NULL)
 		return (0);
 
 	cxgbei_ddp_cleanup(ci);
 	free(ci, M_CXGBE);
 	sc->iscsi_ulp_softc = NULL;
 
 	return (0);
 }
 
 /*
  * t4_iterate() callback: turn on the iSCSI ULD for one adapter.  Nothing
  * is done if the synchronized operation cannot be started.
  */
 static void
 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
 {
 	int busy;
 
 	busy = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact");
 	if (busy != 0)
 		return;
 
 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
 	if (sc->offload_map != 0) {
 		if (!uld_active(sc, ULD_ISCSI))
 			(void) t4_activate_uld(sc, ULD_ISCSI);
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * t4_iterate() callback: turn off the iSCSI ULD for one adapter if it is
  * currently active.  Nothing is done if the synchronized operation cannot
  * be started.
  */
 static void
 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
 {
 	int busy;
 
 	busy = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea");
 	if (busy != 0)
 		return;
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		(void) t4_deactivate_uld(sc, ULD_ISCSI);
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /* ULD descriptor passed to t4_register_uld()/t4_unregister_uld(). */
 static struct uld_info cxgbei_uld_info = {
 	.uld_id = ULD_ISCSI,
 	.activate = cxgbei_activate,
 	.deactivate = cxgbei_deactivate,
 };
 
 /*
  * Body of a worker thread.  arg is the thread's own
  * cxgbei_worker_thread_softc.
  *
  * The thread takes connections off its rx_head queue one at a time and
  * hands each connection's received PDUs to ICL through ic_receive().  It
  * sleeps on cwt_cv when the queue is empty, and exits after acknowledging
  * a CWT_STOP request by moving to CWT_STOPPED.
  *
  * Locking: cwt_lock protects rx_head and cwt_state and is dropped while a
  * connection is being serviced.  The socket receive buffer lock protects
  * the connection's rcvd_pdus list and rx_flags; it is dropped around the
  * ic_receive() calls.
  */
 static void
 cwt_main(void *arg)
 {
 	struct cxgbei_worker_thread_softc *cwt = arg;
 	struct icl_cxgbei_conn *icc = NULL;
 	struct icl_conn *ic;
 	struct icl_pdu *ip;
 	struct sockbuf *sb;
 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
 
 	MPASS(cwt != NULL);
 
 	/* Tell start_worker_threads() that this thread is up. */
 	mtx_lock(&cwt->cwt_lock);
 	MPASS(cwt->cwt_state == 0);
 	cwt->cwt_state = CWT_RUNNING;
 	cv_signal(&cwt->cwt_cv);
 
 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
 		cwt->cwt_state = CWT_RUNNING;
 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
 			mtx_unlock(&cwt->cwt_lock);
 
 			ic = &icc->ic;
 			sb = &ic->ic_socket->so_rcv;
 
 			SOCKBUF_LOCK(sb);
 			MPASS(icc->rx_flags & RXF_ACTIVE);
 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
 				/*
 				 * Take the whole batch in one step so the
 				 * sockbuf lock is not held across ic_receive.
 				 */
 				MPASS(STAILQ_EMPTY(&rx_pdus));
 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
 				SOCKBUF_UNLOCK(sb);
 
 				/* Hand over PDUs to ICL. */
 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
 					ic->ic_receive(ip);
 				}
 
 				SOCKBUF_LOCK(sb);
 				MPASS(STAILQ_EMPTY(&rx_pdus));
 			}
 			MPASS(icc->rx_flags & RXF_ACTIVE);
 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 				/* Done with this connection for now. */
 				icc->rx_flags &= ~RXF_ACTIVE;
 			} else {
 				/*
 				 * More PDUs were received while we were busy
 				 * handing over the previous batch to ICL.
 				 * Re-add this connection to the end of the
 				 * queue.
 				 */
 				mtx_lock(&cwt->cwt_lock);
 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
 				    rx_link);
 				mtx_unlock(&cwt->cwt_lock);
 			}
 			SOCKBUF_UNLOCK(sb);
 
 			mtx_lock(&cwt->cwt_lock);
 		}
 
 		/* Inner loop doesn't check for CWT_STOP, do that first. */
 		if (__predict_false(cwt->cwt_state == CWT_STOP))
 			break;
 		cwt->cwt_state = CWT_SLEEPING;
 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 	}
 
 	/* Acknowledge the stop request to whoever is waiting on cwt_cv. */
 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
 	cwt->cwt_state = CWT_STOPPED;
 	cv_signal(&cwt->cwt_cv);
 	mtx_unlock(&cwt->cwt_lock);
 	kthread_exit();
 }
 
 /*
  * Allocate the worker state array and start one worker thread per CPU, up
  * to a maximum of 32.  Each thread is waited for until it reports that it
  * is running before the next one is created.
  *
  * Failing to start the first thread is fatal: everything is released and
  * the error from kproc_kthread_add() is returned.  Failing to start a later
  * thread is tolerated; worker_thread_count is reduced to the number of
  * threads that did start and 0 is returned.
  */
 static int
 start_worker_threads(void)
 {
 	int i, rc;
 	struct cxgbei_worker_thread_softc *cwt;
 
 	worker_thread_count = min(mp_ncpus, 32);
 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
 	    M_WAITOK | M_ZERO);
 
 	MPASS(cxgbei_proc == NULL);
 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
 		cv_init(&cwt->cwt_cv, "cwt cv");
 		TAILQ_INIT(&cwt->rx_head);
 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
 		    "cxgbei", "%d", i);
 		if (rc != 0) {
 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
 			    i + 1, worker_thread_count, rc);
 			mtx_destroy(&cwt->cwt_lock);
 			cv_destroy(&cwt->cwt_cv);
 			/*
 			 * Wipe the unused slot itself.  Passing &cwt here
 			 * would instead zero sizeof(*cwt) bytes of stack
 			 * starting at the local pointer variable.
 			 */
 			bzero(cwt, sizeof(*cwt));
 			if (i == 0) {
 				free(cwt_softc, M_CXGBE);
 				/* Don't leave a dangling pointer behind. */
 				cwt_softc = NULL;
 				worker_thread_count = 0;
 
 				return (rc);
 			}
 
 			/* Not fatal, carry on with fewer threads. */
 			worker_thread_count = i;
 			rc = 0;
 			break;
 		}
 
 		/* Wait for thread to start before moving on to the next one. */
 		mtx_lock(&cwt->cwt_lock);
 		while (cwt->cwt_state == 0)
 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 		mtx_unlock(&cwt->cwt_lock);
 	}
 
 	MPASS(cwt_softc != NULL);
 	MPASS(worker_thread_count > 0);
 	return (0);
 }
 
 static void
 stop_worker_threads(void)
 {
 	int i;
 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
 
 	MPASS(worker_thread_count >= 0);
 
 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
 		mtx_lock(&cwt->cwt_lock);
 		MPASS(cwt->cwt_state == CWT_RUNNING ||
 		    cwt->cwt_state == CWT_SLEEPING);
 		cwt->cwt_state = CWT_STOP;
 		cv_signal(&cwt->cwt_cv);
 		do {
 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
 		} while (cwt->cwt_state != CWT_STOPPED);
 		mtx_unlock(&cwt->cwt_lock);
 	}
 	free(cwt_softc, M_CXGBE);
 }
 
 /*
  * Select a worker thread for a connection.
  *
  * n is the number of worker threads per offload rx queue.  When n is at
  * least 1, a thread is picked at random from the group of n threads that
  * starts at port_id * n; otherwise a thread is picked at random from all
  * of them.  Returns the index of the chosen worker.
  *
  * NOTE(review): this relies on sc->sge.nofldrxq and worker_thread_count
  * being non-zero, and on port_id * n + (n - 1) staying below
  * worker_thread_count -- confirm that callers guarantee this.
  */
 u_int
 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
 {
 	struct adapter *sc = icc->sc;
 	struct toepcb *toep = icc->toep;
 	u_int i, n;
 
 	n = worker_thread_count / sc->sge.nofldrxq;
 	if (n > 0)
 		i = toep->vi->pi->port_id * n + arc4random() % n;
 	else
 		i = arc4random() % worker_thread_count;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
 
 	return (i);
 }
 
 /*
  * Module load: install the iSCSI CPL handlers, start the worker threads,
  * register the ULD and activate it on every adapter.
  *
  * Returns 0 on success.  On failure everything set up so far is undone,
  * including the CPL handlers, so that nothing keeps pointing at a module
  * that failed to load, and the error is returned.
  */
 static int
 cxgbei_mod_load(void)
 {
 	int rc;
 
 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
 
 	rc = start_worker_threads();
 	if (rc != 0)
 		goto fail;
 
 	rc = t4_register_uld(&cxgbei_uld_info);
 	if (rc != 0) {
 		stop_worker_threads();
 		goto fail;
 	}
 
 	t4_iterate(cxgbei_activate_all, NULL);
 
 	return (rc);
 
 fail:
 	/* Same teardown of the handlers as in cxgbei_mod_unload(). */
 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
 
 	return (rc);
 }
 
 /*
  * Module unload: deactivate the ULD on every adapter, unregister it, stop
  * the worker threads and remove the CPL handlers.
  *
  * Returns EBUSY, leaving the threads and handlers in place, if the ULD
  * cannot be unregistered; otherwise returns 0.
  */
 static int
 cxgbei_mod_unload(void)
 {
 
 	t4_iterate(cxgbei_deactivate_all, NULL);
 
 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
 		return (EBUSY);
 
 	stop_worker_threads();
 
 	/* A NULL handler removes the one installed at load time. */
 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
 
 	return (0);
 }
 #endif
 
 /*
  * Module event handler.
  *
  * With TCP_OFFLOAD: MOD_LOAD runs cxgbei_mod_load() and then, if that
  * succeeded, icl_cxgbei_mod_load(); MOD_UNLOAD runs the two unload
  * routines in the opposite order, stopping at the first failure.  Any
  * other event returns EINVAL.
  *
  * Without TCP_OFFLOAD every event prints a message and returns EOPNOTSUPP.
  *
  * NOTE(review): if icl_cxgbei_mod_load() fails, the work done by
  * cxgbei_mod_load() is not rolled back here.
  */
 static int
 cxgbei_modevent(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 
 #ifdef TCP_OFFLOAD
 	switch (cmd) {
 	case MOD_LOAD:
 		rc = cxgbei_mod_load();
 		if (rc == 0)
 			rc = icl_cxgbei_mod_load();
 		break;
 
 	case MOD_UNLOAD:
 		rc = icl_cxgbei_mod_unload();
 		if (rc == 0)
 			rc = cxgbei_mod_unload();
 		break;
 
 	default:
 		rc = EINVAL;
 	}
 #else
 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
 	rc = EOPNOTSUPP;
 #endif
 
 	return (rc);
 }
 
 /* Module description: name, event handler, and no private data. */
 static moduledata_t cxgbei_mod = {
 	"cxgbei",
 	cxgbei_modevent,
 	NULL,
 };
 
 MODULE_VERSION(cxgbei, 1);
 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
 /* Modules that must be present before this one can load. */
 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
Index: head/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 305165)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 305166)
@@ -1,1794 +1,1819 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
-#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
-#define PPOD_SIZE	(PPOD_SZ(1))
-
 /*
  * Page sets that free_pageset() has released but not yet freed.  They are
  * queued here under ddp_orphan_pagesets_lock and ddp_orphan_task is enqueued
  * on taskqueue_thread to finish the job (presumably by running
  * ddp_free_orphan_pagesets(); the TASK_INIT is not in this part of the file).
  */
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
-static int
-alloc_ppods(struct tom_data *td, int n, u_int *ppod_addr)
-{
-	vmem_addr_t v;
-	int rc;
 
-	MPASS(n > 0);
-
-	rc = vmem_alloc(td->ppod_arena, PPOD_SZ(n), M_NOWAIT | M_FIRSTFIT, &v);
-	*ppod_addr = (u_int)v;
-
-	return (rc);
-}
-
-static void
-free_ppods(struct tom_data *td, u_int ppod_addr, int n)
-{
-
-	MPASS(n > 0);
-
-	vmem_free(td->ppod_arena, (vmem_addr_t)ppod_addr, PPOD_SZ(n));
-}
-
-static inline int
-pages_to_nppods(int npages, int ddp_pgsz)
-{
-	int nsegs = npages * PAGE_SIZE / ddp_pgsz;
-
-	return (howmany(nsegs, PPOD_PAGES));
-}
-
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 /*
  * Release what a page set holds: its page pods (if any were allocated) and
  * its pages, which are unwired when PS_WIRED is set and unheld otherwise.
  * The page set itself is not freed here; it is appended to
  * ddp_orphan_pagesets and ddp_orphan_task is enqueued to deal with it.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
-	if (ps->nppods > 0)
-		free_ppods(td, ps->ppod_addr, ps->nppods);
+	if (ps->prsv.prsv_nppods > 0)
+		t4_free_page_pods(&ps->prsv);
 
 	if (ps->flags & PS_WIRED) {
 		for (i = 0; i < ps->npages; i++) {
 			p = ps->pages[i];
 			vm_page_lock(p);
 			vm_page_unwire(p, PQ_INACTIVE);
 			vm_page_unlock(p);
 		}
 	} else
 		vm_page_unhold_pages(ps->pages, ps->npages);
 	/* Hand the page set over to the orphan task for the final free. */
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 /*
  * Dispose of a page set that is no longer attached to a DDP buffer.  A
  * wired page set of a connection that is not marked DDP_DEAD goes to the
  * head of the connection's cache; anything else is freed.  The DDP lock
  * must be held.
  */
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 
 	if ((toep->ddp_flags & DDP_DEAD) || !(ps->flags & PS_WIRED)) {
 		free_pageset(toep->td, ps);
 		return;
 	}
 
 	KASSERT(toep->ddp_cached_count + toep->ddp_active_count <
 	    nitems(toep->db), ("too many wired pagesets"));
 	TAILQ_INSERT_HEAD(&toep->ddp_cached_pagesets, ps, link);
 	toep->ddp_cached_count++;
 }
 
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		if (!aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 /*
  * Set up the DDP state of a new toepcb: the DDP mutex, the queue of waiting
  * AIO jobs, the requeue task, and "no active buffer" (-1) as the active id.
  */
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	mtx_init(&toep->ddp_lock, "t4 ddp", NULL, MTX_DEF);
 	toep->ddp_active_id = -1;
 	TAILQ_INIT(&toep->ddp_aiojobq);
 	TASK_INIT(&toep->ddp_requeue_task, 0, aio_ddp_requeue_task, toep);
 }
 
 /*
  * Counterpart of ddp_init_toep(): destroy the toepcb's DDP mutex.
  */
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp_lock);
 }
 
 /*
  * Release all DDP resources of a connection: mark it DDP_DEAD, free both
  * DDP buffers, free every cached page set and complete all remaining jobs
  * via ddp_complete_all().  Takes and releases the DDP lock itself.
  */
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	/*
 	 * DDP_DEAD lives in ddp_flags (that is where recycle_pageset() looks
 	 * for it), not in the general toep->flags word.
 	 */
 	toep->ddp_flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->db); i++) {
 		free_ddp_buffer(toep->td, &toep->db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp_cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 /*
  * Debug-only check that no DDP state is left on the toepcb: the requeue
  * task is not marked active, neither buffer slot has a job or page set, and
  * both the cached page set list and the AIO job queue are empty.
  */
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp_flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->db); i++) {
 		MPASS(toep->db[i].job == NULL);
 		MPASS(toep->db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp_cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp_aiojobq));
 }
 #endif
 
 /*
  * Retire DDP buffer db_idx (db must be &toep->db[db_idx]): drop the active
  * count, move ddp_active_id on, detach the job and page set from the slot
  * (the page set is recycled) and clear the slot's DDP_BUFx_ACTIVE flag.
  * The job itself is not completed here; callers do that beforehand.
  */
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp_active_count--;
 	if (toep->ddp_active_id == db_idx) {
 		if (toep->ddp_active_count == 0) {
 			/* That was the last active buffer. */
 			KASSERT(toep->db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp_active_id = -1;
 		} else
 			/* The other buffer (index 0 <-> 1) becomes active. */
 			toep->ddp_active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp_active_id);
 #endif
 	} else {
 		KASSERT(toep->ddp_active_count != 0 &&
 		    toep->ddp_active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp_flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp_flags));
 	toep->ddp_flags &= ~db_flag;
 }
 
 /*
  * Account for n bytes that were placed by DDP before the connection fell
  * out of DDP mode.  rcv_nxt (and, without USE_DDP_RX_FLOW_CONTROL, the
  * receive window and rx credits) are adjusted by n, and then every active
  * DDP buffer is retired in order, each job being credited with as much of n
  * as still fits into its request.  All of n must be consumed by the active
  * buffers.  The inpcb write lock and the DDP lock must be held.
  *
  * XXX: handle_ddp_data code duplication
  */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_flag, db_idx;
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += n;
 #endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp_active_count > 0) {
 		MPASS(toep->ddp_active_id != -1);
 		db_idx = toep->ddp_active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp_flags & db_flag) != 0);
 		db = &toep->db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* This job's share: at most what its request has room for. */
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			/* No data at all: put the job back at the queue head. */
 			TAILQ_INSERT_HEAD(&toep->ddp_aiojobq, job, list);
 			toep->ddp_waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 /*
  * Write one SET_TCB_FIELD, wrapped as a ULP_TX_PKT with an immediate-data
  * sub-command, at ulpmc.  The request asks for no reply and names the
  * connection's offload rx queue.  When the command is not a multiple of 16
  * bytes long a NOOP sub-command header is appended as padding.
  *
  * Returns the address just past what was written, i.e. where the next
  * command goes.
  */
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *imm, *tail;
 	struct cpl_set_tcb_field_core *cpl;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	imm = (struct ulptx_idata *)(ulpmc + 1);
 	imm->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	imm->len = htobe32(sizeof(*cpl));
 
 	cpl = (struct cpl_set_tcb_field_core *)(imm + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	cpl->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	cpl->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
 	cpl->mask = htobe64(mask);
 	cpl->val = htobe64(val);
 
 	tail = (struct ulptx_idata *)(cpl + 1);
 	if ((LEN__SET_TCB_FIELD_ULP % 16) == 0)
 		return (tail);
 
 	/* Pad with a NOOP sub-command header. */
 	tail->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	tail->len = htobe32(0);
 	return (tail + 1);
 }
 
 /*
  * Write one RX_DATA_ACK with F_RX_MODULATE_RX set, wrapped as a ULP_TX_PKT
  * with an immediate-data sub-command, at ulpmc.  When the command is not a
  * multiple of 16 bytes long a NOOP sub-command header is appended as
  * padding.
  *
  * Returns the address just past what was written.
  */
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *imm, *tail;
 	struct cpl_rx_data_ack_core *cpl;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	imm = (struct ulptx_idata *)(ulpmc + 1);
 	imm->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	imm->len = htobe32(sizeof(*cpl));
 
 	cpl = (struct cpl_rx_data_ack_core *)(imm + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	cpl->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	tail = (struct ulptx_idata *)(cpl + 1);
 	if ((LEN__RX_DATA_ACK_ULP % 16) == 0)
 		return (tail);
 
 	/* Pad with a NOOP sub-command header. */
 	tail->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	tail->len = htobe32(0);
 	return (tail + 1);
 }
 
 /*
  * Build (but do not send) the work request that programs DDP buffer db_idx
  * of a connection: the buffer's tag taken from the page set, its offset and
  * length, and the DDP flags given by ddp_flags/ddp_flags_mask, followed by
  * an RX_DATA_ACK.
  *
  * Returns the work request, or NULL if it could not be allocated.  The sc
  * argument is not used in the body.
  */
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
-	    V_TCB_RX_DDP_BUF0_TAG(ps->tag));
+	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 /*
  * Common handler for a DDP completion report (used for both RX_DATA_DDP and
  * RX_DDP_COMPLETE).  The buffer named in the report must be the active one.
  * The amount of newly placed data is derived from rcv_nxt and len, the TCP
  * state and rx credits are updated, the receive buffer may be auto-sized,
  * and the job attached to the buffer is either completed or, if a cancel is
  * involved, only has its received count updated.  The buffer is then
  * retired, except when a cancel is pending on it, in which case that is left
  * to the TCB_RPL callback.
  *
  * Takes the inpcb write lock and the DDP lock itself.  Always returns 0.
  */
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp_active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp_active_id, toep->tid));
 	db = &toep->db[db_idx];
 	job = db->job;
 
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: DDP[%d] placed %d bytes (%#x)", __func__, db_idx,
 	    len, report);
 #endif
 
 	/* receive buffer autosize */
 	CURVNET_SET(so->so_vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop auto-sizing if the reservation fails. */
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %d)",
 		    __func__, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	/* Retire the buffer and kick the queue if more jobs are waiting. */
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp_waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * React to a DDP indicate for a connection that has no active DDP buffer:
  * if AIO jobs are waiting, queue the connection so they get processed.  The
  * DDP lock must be held.
  */
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp_active_count == 0);
 	MPASS((toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 
 	/*
 	 * With nothing waiting, the pending requests that triggered the
 	 * request for an indicate were cancelled.  Those cancels should have
 	 * already disabled DDP, so the indicate is simply ignored; the data
 	 * is going into the socket buffer anyway.
 	 */
 	if (toep->ddp_waiting_count != 0) {
 		CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 		    toep->tid, toep->ddp_waiting_count);
 		ddp_queue_toep(toep);
 	}
 }
 
 /*
  * Cookie values recognized by handle_ddp_tcb_rpl() in SET_TCB_FIELD replies
  * for W_TCB_RX_DDP_FLAGS.  The two values are consecutive (0x2 and 0x3) so
  * that the buffer index is the cookie minus DDP_BUF0_INVALIDATED.
  */
 enum {
 	DDP_BUF0_INVALIDATED = 0x2,
 	DDP_BUF1_INVALIDATED
 };
 
 /*
  * Handle a SET_TCB_FIELD reply that carries one of the DDP_BUFx_INVALIDATED
  * cookies, i.e. the invalidation of a DDP buffer whose job has a cancel
  * pending.  The job is cancelled if it received no data and completed with
  * the received byte count otherwise; then the buffer is retired and waiting
  * jobs, if any, are queued.
  *
  * Panics on a reply with an error status or with any other cookie.  Takes
  * the inpcb write lock and the DDP lock itself.
  */
 void
 handle_ddp_tcb_rpl(struct toepcb *toep, const struct cpl_set_tcb_rpl *cpl)
 {
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		/* Buffer index 0 or 1, recovered from the cookie. */
 		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp_waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 }
 
 /*
  * Wind down DDP when the connection is being closed.  The data placed since
  * the last update (the distance from tp->rcv_nxt to rcv_nxt) is accounted
  * for and distributed over the active DDP buffers in order; each buffer's
  * job is completed (or, if it is being cancelled, only updated) and the
  * buffer retired.  Finally all remaining jobs are completed through
  * ddp_complete_all().  The inpcb write lock and the DDP lock must be held.
  */
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_flag, db_idx;
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
 	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	while (toep->ddp_active_count > 0) {
 		MPASS(toep->ddp_active_id != -1);
 		db_idx = toep->ddp_active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp_flags & db_flag) != 0);
 		db = &toep->db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* This job's share: at most what its request has room for. */
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 extern cpl_handler_t t4_cpl_handler[];
 
 /*
  * CPL handler for CPL_RX_DATA_DDP.  Panics if the message reports any of
  * the DDP_ERR conditions.  Messages for iSCSI connections are passed on to
  * the registered CPL_RX_ISCSI_DDP handler; everything else goes to
  * handle_ddp_data().  Always returns 0.
  */
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	uint32_t ddpvld;
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	ddpvld = be32toh(cpl->ddpvld);
 	if (__predict_false(ddpvld & DDP_ERR))
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, ddpvld, tid, toep);
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI)
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 	else
 		handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq,
 		    be16toh(cpl->len));
 
 	return (0);
 }
 
 /*
  * CPL handler for CPL_RX_DDP_COMPLETE: look up the connection and pass the
  * report and rcv_nxt on to handle_ddp_data() with a length of zero.  Always
  * returns 0.
  */
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_rx_ddp_complete *cpl;
 	struct toepcb *toep;
 	unsigned int tid;
 
 	cpl = (const void *)(rss + 1);
 	tid = GET_TID(cpl);
 	toep = lookup_tid(iq->adapter, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 /*
  * Ask the hardware to turn DDP on for a connection.  The connection must be
  * DDP_OK with neither DDP_ON nor DDP_SC_REQ set; DDP_SC_REQ is set here and
  * two SET_TCB_FIELD requests are issued on the connection's control queue,
  * one for W_TCB_RX_DDP_FLAGS and one for W_TCB_T_FLAGS.  The DDP lock must
  * be held.
  */
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp_flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp_flags |= DDP_SC_REQ;
 	/*
 	 * NOTE(review): presumably (mask, value) pairs: the first request
 	 * clears DDP_OFF, INDICATE_OUT and the BUFx_VALID bits while setting
 	 * both BUFx_INDICATE bits, the second clears RCV_COALESCE_ENABLE.
 	 * Confirm against t4_set_tcb_field().
 	 */
 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
 	    toep->ofld_rxq->iq.abs_id);
 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
 }
 
 /*
  * Highest common factor of n1 and n2 by Euclid's algorithm.  If one of the
  * arguments is 0 the result is the other argument.
  */
 static int
 calculate_hcf(int n1, int n2)
 {
 	int small = n1, large = n2, rem;
 
 	/* Start with the smaller value as the divisor. */
 	if (n1 > n2) {
 		small = n2;
 		large = n1;
 	}
 
 	while (small != 0) {
 		rem = large % small;
 		large = small;
 		small = rem;
 	}
 
 	return (large);
 }
 
+/*
+ * Number of page pods needed to describe npages VM pages when the DDP page
+ * size is (1 << ddp_page_shift): the VM pages are converted to DDP pages by
+ * shifting and the result is rounded up to whole pods of PPOD_PAGES entries.
+ * The DDP page must not be smaller than a VM page.
+ */
+static inline int
+pages_to_nppods(int npages, int ddp_page_shift)
+{
+
+	MPASS(ddp_page_shift >= PAGE_SHIFT);
+
+	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
+}
+
+/*
+ * Reserve nppods page pods from the region's arena without sleeping and
+ * record the result in *prsv: the region, the pod count, and a tag made of
+ * the page size index and the arena address.
+ *
+ * Returns 0 on success and ENOMEM if the arena has no room; *prsv is not
+ * touched on failure.
+ */
 static int
-alloc_page_pods(struct tom_data *td, struct pageset *ps)
+alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
+    struct ppod_reservation *prsv)
 {
-	int i, hcf, seglen, idx, ppod, nppods;
-	u_int ppod_addr;
+	vmem_addr_t addr;       /* relative to start of region */
 
-	KASSERT(ps->nppods == 0, ("%s: page pods already allocated", __func__));
+	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
+	    &addr) != 0)
+		return (ENOMEM);
 
+	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
+	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
+	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
+
 	/*
+	 * The hardware tagmask includes an extra invalid bit but the arena was
+	 * seeded with valid values only.  An allocation out of this arena will
+	 * fit inside the tagmask but won't have the invalid bit set.
+	 */
+	MPASS((addr & pr->pr_tag_mask) == addr);
+	MPASS((addr & pr->pr_invalid_bit) == 0);
+
+	prsv->prsv_pr = pr;
+	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
+	prsv->prsv_nppods = nppods;
+
+	return (0);
+}
+
+/*
+ * Pick the DDP page size for a page set and reserve page pods for it in the
+ * given region.  The reservation ends up in ps->prsv, which must not hold
+ * page pods yet.
+ *
+ * Note the return convention: 1 on success, 0 if no page pods could be
+ * allocated.
+ */
+int
+t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
+{
+	int i, hcf, seglen, idx, nppods;
+	struct ppod_reservation *prsv = &ps->prsv;
+
+	KASSERT(prsv->prsv_nppods == 0,
+	    ("%s: page pods already allocated", __func__));
+
+	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < ps->npages; i++) {
 		seglen = PAGE_SIZE;
 		while (i < ps->npages - 1 &&
 		    ps->pages[i]->phys_addr + PAGE_SIZE ==
 		    ps->pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
-		if (hcf < td->ddp_pgsz[1]) {
+		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
-	if (hcf % td->ddp_pgsz[0] != 0) {
-		/* hmmm.  This could only happen when PAGE_SIZE < 4K */
-		KASSERT(PAGE_SIZE < 4096,
-		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
-		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
-		    __func__, PAGE_SIZE, hcf);
-		return (0);
-	}
-
-	for (idx = nitems(td->ddp_pgsz) - 1; idx > 0; idx--) {
-		if (hcf % td->ddp_pgsz[idx] == 0)
+#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
+	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
+	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
+		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
+#undef PR_PAGE_MASK
+
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
-	nppods = pages_to_nppods(ps->npages, td->ddp_pgsz[idx]);
-	if (alloc_ppods(td, nppods, &ppod_addr) != 0) {
-		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, npages %d, pgsz %d",
-		    __func__, nppods, ps->npages, td->ddp_pgsz[idx]);
+	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
+	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (0);
-	}
+	MPASS(prsv->prsv_nppods > 0);
 
-	ppod = (ppod_addr - td->ppod_start) / PPOD_SIZE;
-	ps->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
-	ps->ppod_addr = ppod_addr;
-	ps->nppods = nppods;
+	return (1);
+}
 
-	CTR5(KTR_CXGBE, "New page pods.  "
-	    "ps %p, ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d",
-	    ps, td->ddp_pgsz[idx], ppod, ps->npages, ps->nppods);
+/*
+ * Return the page pods of a reservation to the arena of the region they
+ * were allocated from and mark the reservation empty (prsv_nppods = 0).
+ * The reservation must currently hold page pods.
+ */
+void
+t4_free_page_pods(struct ppod_reservation *prsv)
+{
+	struct ppod_region *pr;
+	vmem_addr_t addr;
 
-	return (1);
+	MPASS(prsv != NULL);
+	MPASS(prsv->prsv_nppods != 0);
+
+	/* Look inside the reservation only after it has been checked. */
+	pr = prsv->prsv_pr;
+	addr = prsv->prsv_tag & pr->pr_tag_mask;
+	MPASS((addr & pr->pr_invalid_bit) == 0);
+
+	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
+	    pr->pr_arena, addr, prsv->prsv_nppods);
+
+	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
+	prsv->prsv_nppods = 0;
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
-static int
-write_page_pods(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
+/*
+ * Write the page pods reserved for a page set to the adapter.  The pods are
+ * sent as ULP_TX memory writes with immediate data on wrq, at most
+ * NUM_ULP_TX_SC_IMM_PPODS pods per work request; each pod carries the given
+ * tid, the reservation's tag, the page set's length and offset, and the
+ * physical addresses of the pages it covers.
+ *
+ * Returns 0 on success, in which case the page set is marked
+ * PS_PPODS_WRITTEN, or ENOMEM if a work request could not be allocated
+ * (work requests sent before that point are not undone).
+ */
+int
+t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
+    struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
-	struct tom_data *td = sc->tom_softc;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
+	struct ppod_reservation *prsv = &ps->prsv;
+	struct ppod_region *pr = prsv->prsv_pr;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
+	MPASS(prsv->prsv_nppods > 0);
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
-	ddp_pgsz = td->ddp_pgsz[G_PPOD_PGSZ(ps->tag)];
-	ppod_addr = ps->ppod_addr;
-	for (i = 0; i < ps->nppods; ppod_addr += chunk) {
+	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
+	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
+	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
-		n = min(ps->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
+		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
-		wr = alloc_wrqe(len, toep->ctrlq);
+		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
-			    V_PPOD_TID(toep->tid) | ps->tag);
+			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < ps->npages) {
 					ppod->addr[k] =
 					    htobe64(ps->pages[idx]->phys_addr);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
+				/*
+				 * NOTE(review): the disabled trace below still
+				 * uses toep->tid; toep is no longer a parameter
+				 * here, so it needs "tid" if ever enabled.
+				 */
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 /*
  * Wire every page of the pageset and drop the hold on each one, then
  * mark the pageset PS_WIRED.  The pageset must not already be wired
  * (asserted).
  */
 static void
 wire_pageset(struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	KASSERT(!(ps->flags & PS_WIRED), ("pageset already wired"));
 
 	for (i = 0; i < ps->npages; i++) {
 		p = ps->pages[i];
 		/* The wire and unhold are done under the page's own lock. */
 		vm_page_lock(p);
 		vm_page_wire(p);
 		vm_page_unhold(p);
 		vm_page_unlock(p);
 	}
 	ps->flags |= PS_WIRED;
 }
 
 /*
  * Prepare a pageset for DDP.  This wires the pageset and sets up page
  * pods.  Steps already recorded as done (PS_WIRED, a non-zero page pod
  * count in the reservation, PS_PPODS_WRITTEN) are skipped, so a pageset
  * can be passed in again.
  *
  * Returns 1 if the pageset is ready, or 0 if page pods could not be
  * allocated or written.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if (!(ps->flags & PS_WIRED))
 		wire_pageset(ps);
-	if (ps->nppods == 0 && !alloc_page_pods(td, ps)) {
+	if (ps->prsv.prsv_nppods == 0 &&
+	    !t4_alloc_page_pods_for_ps(&td->pr, ps)) {
 		return (0);
 	}
 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
-	    write_page_pods(sc, toep, ps) != 0) {
+	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
 		return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Initialize the page pod region 'pr' from the range 'r'.  'psz' holds
  * the four page size fields (HPZ0 through HPZ3); each is stored as a
  * page shift of 12 plus the field value.  'name' is the name given to
  * the vmem arena, which spans [0, pr_len) with a quantum of PPOD_SIZE.
  *
  * Returns 0 on success, ENXIO if the page shifts are not strictly
  * increasing or if either the tag mask or the alias mask comes out
  * empty, and ENOMEM if the arena cannot be created.
  */
-void
-t4_init_ddp(struct adapter *sc, struct tom_data *td)
+int
+t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
+    const char *name)
 {
 	int i;
-	uint32_t r;
 
-	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
-	td->ddp_pgsz[0] = 4096 << G_HPZ0(r);
-	td->ddp_pgsz[1] = 4096 << G_HPZ1(r);
-	td->ddp_pgsz[2] = 4096 << G_HPZ2(r);
-	td->ddp_pgsz[3] = 4096 << G_HPZ3(r);
+	MPASS(pr != NULL);
+	MPASS(r->size > 0);
 
-	/*
-	 * The SGL -> page pod algorithm requires the sizes to be in increasing
-	 * order.
-	 */
-	for (i = 1; i < nitems(td->ddp_pgsz); i++) {
-		if (td->ddp_pgsz[i] <= td->ddp_pgsz[i - 1])
-			return;
+	pr->pr_start = r->start;
+	pr->pr_len = r->size;
+	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
+	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
+	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
+	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
+
+	/* The SGL -> page pod algorithm requires the sizes to be in order. */
+	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
+		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
+			return (ENXIO);
 	}
 
-	td->ppod_start = sc->vres.ddp.start;
-	td->ppod_arena = vmem_create("DDP page pods", sc->vres.ddp.start,
-	    sc->vres.ddp.size, PPOD_SIZE, 512, M_FIRSTFIT | M_NOWAIT);
 	/*
 	 * The tag mask covers the low bits needed to express the range
 	 * size, limited to the tag field; the alias mask is whatever is
 	 * left of the tag field.
 	 *
 	 * NOTE(review): 1 << fls(r->size) is only well defined while
 	 * r->size is below 2^31 -- confirm the range can never be larger.
 	 */
+	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
+	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
+	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
+		return (ENXIO);
+	pr->pr_alias_shift = fls(pr->pr_tag_mask);
+	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
+
+	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
+	    M_FIRSTFIT | M_NOWAIT);
+	if (pr->pr_arena == NULL)
+		return (ENOMEM);
+
+	return (0);
 }
 
 /*
  * Tear down a page pod region: destroy its vmem arena if one was created
  * and zero the whole structure.
  */
 void
-t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
+t4_free_ppod_region(struct ppod_region *pr)
 {
 
-	if (td->ppod_arena != NULL) {
-		vmem_destroy(td->ppod_arena);
-		td->ppod_arena = NULL;
-	}
+	MPASS(pr != NULL);
+
+	if (pr->pr_arena)
+		vmem_destroy(pr->pr_arena);
+	bzero(pr, sizeof(*pr));
 }
 
 /*
  * Compare a pageset against the description of a request.  Returns 0 if
  * the pageset has the same page count, page offset and length, belongs to
  * the same vmspace and that vmspace's map timestamp has not changed since
  * the pageset recorded it; returns non-zero otherwise.
  *
  * The 'start' argument is not examined.
  */
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	if (ps->npages != npages || ps->offset != pgoff || ps->len != len)
 		return (1);
 
 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
 }
 
 /*
  * Obtain a pageset covering the user buffer of an AIO read job.
  *
  * A cached pageset that matches the request is reused if there is one.
  * Otherwise a new pageset is allocated and the backing pages are held;
  * if the toep already has as many pagesets as it has DDP buffers, the
  * cached pageset at the tail of the cache list is freed first.
  *
  * Requests whose page-aligned span exceeds MAX_DDP_BUFFER_SIZE are
  * truncated to a short read by reducing job->uaiocb.aio_nbytes.
  *
  * Must be called with the DDP lock held.  When a new pageset has to be
  * created the lock is dropped around the allocation and the page hold
  * and is reacquired before returning.
  *
  * Returns 0 with *pps set to the pageset, or EFAULT if the pages could
  * not be held.  A new pageset takes a reference on the vmspace.
  */
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 #endif
 		/*
 		 * The length must shrink whether or not tracing is compiled
 		 * in; otherwise the pageset length would exceed the pages
 		 * that are actually held below.
 		 */
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp_cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
 			toep->ddp_cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp_active_count + toep->ddp_cached_count <=
 	    nitems(toep->db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp_active_count + toep->ddp_cached_count ==
 	    nitems(toep->db)) {
 		KASSERT(toep->ddp_cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp_cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
 		toep->ddp_cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	DDP_UNLOCK(toep);
 
 	/* Create a new pageset. */
 	/* The page pointer array is allocated right behind the pageset. */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	ps->pages = (vm_page_t *)(ps + 1);
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
 	atomic_add_int(&vm->vm_refcnt, 1);
 	ps->vm = vm;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 /*
  * Empty the queue of waiting AIO jobs, completing each one with 'error'
  * via ddp_complete_one().  A job whose cancel function can no longer be
  * cleared is only removed from the queue and is not completed here.
  *
  * Must be called with the DDP lock held.
  */
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp_aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp_aiojobq);
 		TAILQ_REMOVE(&toep->ddp_aiojobq, job, list);
 		toep->ddp_waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 /*
  * Finish a job that is being cancelled: complete it as a short read if
  * it already received data, otherwise cancel it with aio_cancel().
  */
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_cancel(job);
 }
 
 /*
  * Called when the main loop wants to requeue a job to retry it later.
  * Deals with the race of the job being cancelled while it was being
  * examined.
  *
  * The job goes back to the head of the waiting queue with
  * t4_aio_cancel_queued as its cancel function.  If the toep is marked
  * DDP_DEAD or the cancel function cannot be set, the job is finished
  * through aio_ddp_cancel_one() instead.
  *
  * Must be called with the DDP lock held.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp_flags & DDP_DEAD) &&
 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		TAILQ_INSERT_HEAD(&toep->ddp_aiojobq, job, list);
 		toep->ddp_waiting_count++;
 	} else
 		aio_ddp_cancel_one(job);
 }
 
 /*
  * Main scheduling loop for queued AIO read jobs on a DDP connection.
  *
  * Starting from the head of the waiting queue, each job is either
  * completed (socket error, EOF, or its buffer was filled from data
  * already in the socket buffer), put back on the queue, or handed to a
  * free DDP buffer by sending an update-TCB work request.  The loop
  * restarts after each job and returns when nothing is waiting, every
  * DDP buffer is in use, the toep is dead, DDP still has to be enabled,
  * another thread is already queueing a buffer, or a resource shortage
  * forces a retry later.
  *
  * Must be called with the DDP lock held.  The lock is dropped
  * temporarily inside hold_aio() and while blocking on the inpcb lock.
  */
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp_flags & DDP_DEAD) {
 		MPASS(toep->ddp_waiting_count == 0);
 		MPASS(toep->ddp_active_count == 0);
 		return;
 	}
 
 	/* Nothing to do if no job is waiting or no DDP buffer is free. */
 	if (toep->ddp_waiting_count == 0 ||
 	    toep->ddp_active_count == nitems(toep->db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp_aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp_active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp_waiting_count--;
 		TAILQ_REMOVE(&toep->ddp_aiojobq, job, list);
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp_active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp_flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp_flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp_queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp_waiting_count--;
 	TAILQ_REMOVE(&toep->ddp_aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp_queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp_queueing = NULL;
 		goto restart;
 	}
 
 	/*
 	 * The DDP lock was released inside hold_aio(), so the socket
 	 * state is checked again before the job is used.
 	 */
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp_queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp_queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp_active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp_queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp_queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp_flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp_active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 		/* Note: this shadows the function-scope 'error'. */
 		int error;
 
 		/*
 		 * The mbuf data is the kernel-side buffer of the uio and
 		 * the pageset's pages are the destination of the copy.
 		 */
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		/* From here on 'copied' is the job's running total. */
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp_flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp_queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp_queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp_queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	/*
 	 * Build the DDP flag values to set and the mask of flag bits the
 	 * update is allowed to touch for the chosen buffer.
 	 */
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp_flags & buf_flag) == 0);
 	if ((toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp_active_id == -1);
 		MPASS(toep->ddp_active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp_queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	/*
 	 * Switch the job to the "active" cancel routine.  If that fails
 	 * the work request is discarded and the job is finished through
 	 * aio_ddp_cancel_one().
 	 */
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp_queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: scheduling %p for DDP[%d] (flags %#lx/%#lx)",
 	    __func__, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	db = &toep->db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp_queueing = NULL;
 	toep->ddp_flags |= buf_flag;
 	toep->ddp_active_count++;
 	if (toep->ddp_active_count == 1) {
 		MPASS(toep->ddp_active_id == -1);
 		toep->ddp_active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp_active_id);
 	}
 	goto restart;
 }
 
 /*
  * Schedule the toep's requeue task unless it is already marked active.
  * A reference is taken on the toep before the task is enqueued.
  *
  * Must be called with the DDP lock held.
  */
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (toep->ddp_flags & DDP_TASK_ACTIVE)
 		return;
 	toep->ddp_flags |= DDP_TASK_ACTIVE;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->ddp_requeue_task);
 }
 
 /*
  * Task handler: run the requeue loop for the toep passed as 'context'
  * under the DDP lock, clear DDP_TASK_ACTIVE and then drop one reference
  * on the toep.  'pending' is not used.
  */
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp_flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	free_toepcb(toep);
 }
 
 /*
  * Cancel routine for a job that has been handed to a DDP buffer.
  *
  * If the cancel function was already cleared, the job is finished right
  * away with aio_ddp_cancel_one().  Otherwise the DDP buffer holding the
  * job is located, a TCB update clearing that buffer's VALID flag is
  * sent, and the buffer is marked cancel_pending.  If no buffer holds
  * the job, nothing is sent.
  */
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	for (i = 0; i < nitems(toep->db); i++) {
 		if (toep->db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
 			t4_set_tcb_field(sc, toep->ctrlq, toep->tid,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
 			    i + DDP_BUF0_INVALIDATED,
 			    toep->ofld_rxq->iq.abs_id);
 			toep->db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 /*
  * Cancel routine for a job that is still on the waiting queue.
  *
  * Unless the cancel function was already cleared, the job is removed
  * from the queue; if that leaves the queue empty the requeue task is
  * scheduled.  In either case the job is then finished with
  * aio_ddp_cancel_one() after the DDP lock has been released.
  */
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp_aiojobq, job, list);
 		toep->ddp_waiting_count--;
 		if (toep->ddp_waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 /*
  * Queue an AIO job on the socket's DDP connection.
  *
  * Only LIO_READ jobs are accepted; anything else returns EOPNOTSUPP.
  * An accepted job is appended to the waiting queue with
  * t4_aio_cancel_queued as its cancel function, DDP_OK is set on the
  * toep and the requeue loop is run once before returning 0.
  */
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 
 	/* Ignore writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: Think about possibly returning errors for ENOTCONN,
 	 * etc.  Perhaps the caller would only queue the request
 	 * if it failed with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->ddp_aiojobq, job, list);
 	toep->ddp_waiting_count++;
 	toep->ddp_flags |= DDP_OK;
 
 	/*
 	 * Try to handle this request synchronously.  If this has
 	 * to block because the task is running, it will just bail
 	 * and let the task handle it instead.
 	 */
 	aio_ddp_requeue(toep);
 	DDP_UNLOCK(toep);
 	return (0);
 }
 
 /*
  * Module load hook for DDP: register the handlers for the
  * CPL_RX_DATA_DDP and CPL_RX_DDP_COMPLETE messages and set up the
  * orphan pageset list, its mutex and its task.  Always returns 0.
  */
 int
 t4_ddp_mod_load(void)
 {
 
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 	return (0);
 }
 
 /*
  * Module unload hook for DDP: wait for the orphan pageset task to
  * finish, destroy the orphan list mutex and unregister both CPL
  * handlers.  The orphan list is expected to be empty after the drain.
  */
 void
 t4_ddp_mod_unload(void)
 {
 
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.c	(revision 305165)
+++ head/sys/dev/cxgbe/tom/t4_tom.c	(revision 305166)
@@ -1,1245 +1,1249 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/scope6_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static struct protosw toe_protosw;
 static struct pr_usrreqs toe_usrreqs;
 
 static struct protosw toe6_protosw;
 static struct pr_usrreqs toe6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
 static int t4_tom_modevent(module_t, int, void *);
 
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
 
 static struct uld_info tom_uld_info = {
 	.uld_id = ULD_TOM,
 	.activate = t4_tom_activate,
 	.deactivate = t4_tom_deactivate,
 };
 
 static void queue_tid_release(struct adapter *, int);
 static void release_offload_resources(struct toepcb *);
 static int alloc_tid_tabs(struct tid_info *);
 static void free_tid_tabs(struct tid_info *);
 static int add_lip(struct adapter *, struct in6_addr *);
 static int delete_lip(struct adapter *, struct in6_addr *);
 static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
 static void init_clip_table(struct adapter *, struct tom_data *);
 static void update_clip(struct adapter *, void *);
 static void t4_clip_task(void *, int);
 static void update_clip_table(struct adapter *, struct tom_data *);
 static void destroy_clip_table(struct adapter *, struct tom_data *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
 
 static int in6_ifaddr_gen;
 static eventhandler_tag ifaddr_evhandler;
 static struct timeout_task clip_task;
 
 /*
  * Allocate and initialize a toepcb for the given virtual interface.
  *
  * A negative 'txqid' or 'rxqid' selects a random offload tx or rx queue
  * of the interface; otherwise the id must lie within the interface's
  * offload queue range (asserted).  'flags' is passed to malloc() in
  * addition to M_ZERO.
  *
  * Returns the new toepcb with a reference count of 1, or NULL if the
  * allocation failed.
  */
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct toepcb *toep;
 	int tx_credits, txsd_total, len;
 
 	/*
 	 * The firmware counts tx work request credits in units of 16 bytes
 	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
 	 * about tx credits if it wants to abort a connection.
 	 */
 	tx_credits = sc->params.ofldq_wr_cred;
 	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
 
 	/*
 	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
 	 * immediate payload, and firmware counts tx work request credits in
 	 * units of 16 byte.  Calculate the maximum work requests possible.
 	 */
 	txsd_total = tx_credits /
 	    howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
 
 	if (txqid < 0)
 		txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
 	KASSERT(txqid >= vi->first_ofld_txq &&
 	    txqid < vi->first_ofld_txq + vi->nofldtxq,
 	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
 		vi->first_ofld_txq, vi->nofldtxq));
 
 	if (rxqid < 0)
 		rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
 	KASSERT(rxqid >= vi->first_ofld_rxq &&
 	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
 	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
 		vi->first_ofld_rxq, vi->nofldrxq));
 
 	/* The tx descriptor array is sized at run time and ends the toepcb. */
 	len = offsetof(struct toepcb, txsd) +
 	    txsd_total * sizeof(struct ofld_tx_sdesc);
 
 	toep = malloc(len, M_CXGBE, M_ZERO | flags);
 	if (toep == NULL)
 		return (NULL);
 
 	refcount_init(&toep->refcount, 1);
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
 	toep->tx_total = tx_credits;
 	toep->tx_credits = tx_credits;
 	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
 	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 	toep->txsd_total = txsd_total;
 	toep->txsd_avail = txsd_total;
 	toep->txsd_pidx = 0;
 	toep->txsd_cidx = 0;
 	aiotx_init_toep(toep);
 	ddp_init_toep(toep);
 
 	return (toep);
 }
 
 /*
  * Take an additional reference on the toepcb and return it.
  */
 struct toepcb *
 hold_toepcb(struct toepcb *toep)
 {
 
 	refcount_acquire(&toep->refcount);
 	return (toep);
 }
 
 /*
  * Drop a reference on the toepcb.  Nothing else happens while
  * refcount_release() returns 0; once it returns non-zero the DDP state
  * is torn down and the toepcb is freed.  At that point the toepcb must
  * be neither attached to an inpcb nor have a CPL pending (asserted).
  */
 void
 free_toepcb(struct toepcb *toep)
 {
 
 	if (refcount_release(&toep->refcount) == 0)
 		return;
 
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: attached to an inpcb", __func__));
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: CPL pending", __func__));
 
 	ddp_uninit_toep(toep);
 	free(toep, M_CXGBE);
 }
 
 /*
  * Set up the socket for TCP offload.
  *
  * Marks both socket buffers SB_NOCOALESCE, switches the socket to the
  * TOE protosw (IPv6 or IPv4 variant), links the TCP PCB and the toepcb
  * to each other, takes an extra reference on the inpcb and adds the
  * toepcb to the tom_data's list.  The inpcb must be write-locked.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Update socket */
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	if (inp->inp_vflag & INP_IPV6)
 		so->so_proto = &toe6_protosw;
 	else
 		so->so_proto = &toe_protosw;
 	SOCKBUF_UNLOCK(sb);
 
 	/* Update TCP PCB */
 	tp->tod = &td->tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* Install an extra hold on inp */
 	toep->inp = inp;
 	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Add the TOE PCB to the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * This is _not_ the normal way to "unoffload" a socket.
  *
  * Reverses most of offload_socket(): clears SB_NOCOALESCE on both socket
  * buffers, detaches the toepcb from the TCP PCB, drops the extra inpcb
  * reference (panicking if that frees the inpcb), removes the toepcb from
  * the tom_data's list and releases a toepcb reference.  The socket's
  * so_proto is not changed here.  The inpcb must be write-locked.
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = toep->td;
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_ATTACHED;
 	if (in_pcbrele_wlocked(inp))
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * Release everything tied to a toepcb that is neither attached to an
  * inpcb nor waiting for a CPL (both asserted): the L2 table entry, the
  * tid (when it is non-negative), the CLIP entry, the toepcb's slot on
  * the tom_data's list and finally one reference on the toepcb itself.
  */
 static void
 release_offload_resources(struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
 	    __func__, toep, tid, toep->l2te, toep->ce);
 
 	/*
 	 * These queues should have been emptied at approximately the same time
 	 * that a normal connection's socket's so_snd would have been purged or
 	 * drained.  Do _not_ clean up here.
 	 */
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 #ifdef INVARIANTS
 	ddp_assert_empty(toep);
 #endif
 
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
 	if (tid >= 0) {
 		remove_tid(sc, tid);
 		release_tid(sc, tid, toep->ctrlq);
 	}
 
 	if (toep->ce)
 		release_lip(td, toep->ce);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * tod_pcb_detach handler.  The kernel is done with the TCP PCB, so unhook the
  * toepcb hanging off of it.  If no CPL is pending for the toepcb either, all
  * resources tied to it are released right away.
  *
  * Also gets called when an offloaded active open fails and the TOM wants the
  * kernel to take the TCP PCB back.
  */
 static void
 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 #if defined(KTR) || defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
 	/* In SYN_SENT the id is traced as an atid, otherwise as a tid. */
 	if (tp->t_state != TCPS_SYN_SENT) {
 		CTR6(KTR_CXGBE,
 		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
 		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
 		    inp->inp_flags);
 	} else {
 		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
 		    __func__, toep->tid, toep, toep->flags, inp,
 		    inp->inp_flags);
 	}
 #endif
 
 	/* Break the tcpcb <-> toepcb association. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->flags &= ~TPF_ATTACHED;
 
 	/* With a CPL still pending the toepcb has to stay around. */
 	if ((toep->flags & TPF_CPL_PENDING) == 0)
 		release_offload_resources(toep);
 }
 
 /*
  * setsockopt handler (tod_ctloutput).  SOPT_GET requests are ignored.  The
  * only option acted upon is TCP_NODELAY: the Nagle bit in the connection's
  * TCB flags is set to the inverse of TF_NODELAY in tp->t_flags.
  */
 static void
 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 	int nagle;
 
 	if (dir == SOPT_GET)
 		return;
 
 	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
 
 	if (name != TCP_NODELAY)
 		return;
 
 	nagle = (tp->t_flags & TF_NODELAY) ? 0 : 1;
 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
 	    V_TF_NAGLE(1), V_TF_NAGLE(nagle), 0, 0,
 	    toep->ofld_rxq->iq.abs_id);
 }
 
 /*
  * The TOE driver will not receive any more CPLs for the tid associated with
  * the toepcb; drop the toepcb's hold on the inpcb.
  *
  * Called with the inpcb write-locked.  The lock is not held on return: it is
  * dropped here unless in_pcbrele_wlocked() reports that the inpcb went away.
  */
 void
 final_cpl_received(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->inp;
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_CPL_PENDING;
 	mbufq_drain(&toep->ulp_pdu_reclaimq);
 
 	/* If the tcpcb already detached, nobody else will clean up. */
 	if ((toep->flags & TPF_ATTACHED) == 0)
 		release_offload_resources(toep);
 
 	if (in_pcbrele_wlocked(inp) == 0)
 		INP_WUNLOCK(inp);
 }
 
 /* Store ctx as the context for tid and count one more tid in use. */
 void
 insert_tid(struct adapter *sc, int tid, void *ctx)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 	atomic_add_int(&sc->tids.tids_in_use, 1);
 }
 
 /* Return the context stored for tid (no bounds check is done here). */
 void *
 lookup_tid(struct adapter *sc, int tid)
 {
 	void *ctx = sc->tids.tid_tab[tid];
 
 	return (ctx);
 }
 
 /* Replace the context stored for tid; the in-use count is not touched. */
 void
 update_tid(struct adapter *sc, int tid, void *ctx)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 }
 
 /* Clear the context stored for tid and count one fewer tid in use. */
 void
 remove_tid(struct adapter *sc, int tid)
 {
 
 	sc->tids.tid_tab[tid] = NULL;
 	atomic_subtract_int(&sc->tids.tids_in_use, 1);
 }
 
 /*
  * Send a CPL_TID_RELEASE for tid on ctrlq.  If no work request can be
  * allocated the release is handed to queue_tid_release() instead.
  */
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct cpl_tid_release *req;
 	struct wrqe *wr = alloc_wrqe(sizeof(*req), ctrlq);
 
 	if (wr != NULL) {
 		req = wrtod(wr);
 		INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
 		t4_wrq_tx(sc, wr);
 	} else
 		queue_tid_release(sc, tid);	/* defer */
 }
 
 /*
  * Fallback used by release_tid() when it cannot allocate a work request.
  * Deferred release is not implemented: sc and tid are not used and the body
  * is just the CXGBE_UNIMPLEMENTED() marker.
  */
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 /*
  * What mtu_idx to use, given a 4-tuple and/or an MSS cap.
  *
  * sc	adapter whose MTU table (sc->params.mtus) is searched.
  * inc	connection info; may be NULL if pmss > 0.
  * pmss	MSS cap; ignored unless > 0.
  *
  * The MSS comes from tcp_mssopt(inc) when inc is given (capped by pmss if
  * that is > 0), or is pmss itself otherwise.  Returns the index of the last
  * MTU table entry that does not exceed MSS + IP header + TCP header; index 0
  * is returned when even the second entry is too large.
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int i, mss, n;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	mss = inc ? tcp_mssopt(inc) : pmss;
 	if (pmss > 0 && mss > pmss)
 		mss = pmss;
 
 	/*
 	 * inc is allowed to be NULL (see the assertion above), so it must not
 	 * be dereferenced blindly.  Without it the address family is unknown
 	 * and the IPv4 header size is assumed.
 	 */
 	if (inc != NULL && (inc->inc_flags & INC_ISIPV6))
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 
 	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
 		continue;
 
 	return (i);
 }
 
 /*
  * Determine the receive window size for a socket: the space available in
  * so_rcv, clamped to [MIN_RCV_WND, MAX_RCV_WND].  The caller must hold the
  * so_rcv sockbuf lock.
  */
 u_long
 select_rcv_wnd(struct socket *so)
 {
 	long space;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/*
 	 * sbspace() returns a signed value.  Keep it signed for the range
 	 * checks so that a negative result (buffer over its limits) selects
 	 * the minimum window rather than wrapping around to a huge unsigned
 	 * number and selecting the maximum.  Comparing directly also avoids
 	 * squeezing the value through min(), which takes u_int arguments.
 	 */
 	space = sbspace(&so->so_rcv);
 	if (space < (long)MIN_RCV_WND)
 		return (MIN_RCV_WND);
 	if (space > (long)MAX_RCV_WND)
 		return (MAX_RCV_WND);
 
 	return (space);
 }
 
 /*
  * Pick the receive window scale: the smallest shift (at most
  * TCP_MAX_WINSHIFT) for which TCP_MAXWIN << shift covers sb_max, with sb_max
  * capped at MAX_RCV_WND.
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long space = sb_max;
 	int wscale;
 
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	for (wscale = 0; wscale < TCP_MAX_WINSHIFT; wscale++) {
 		if ((TCP_MAXWIN << wscale) >= space)
 			break;
 	}
 
 	return (wscale);
 }
 
 extern int always_keepalive;
 #define VIID_SMACIDX(v)	(((unsigned int)(v) & 0x7f) << 1)
 
 /*
  * Build the 64-bit opt0 value (returned big-endian) from the given
  * parameters.  so, e and vi are each optional: Nagle/keepalive come from the
  * socket, the L2T index from e, and SMAC selection and tx channel from vi.
  * The socket so could be a listening socket too.
  */
 uint64_t
 calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
     int mtu_idx, int rscale, int rx_credits, int ulp_mode)
 {
 	uint64_t opt0;
 
 	KASSERT(rx_credits <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsiz too high", __func__));
 
 	/* Fields that do not depend on any of the optional arguments. */
 	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
 	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
 
 	if (so != NULL) {
 		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
 		int nagle = (tp->t_flags & TF_NODELAY) == 0;
 		int keepalive = always_keepalive ||
 		    (so_options_get(so) & SO_KEEPALIVE) != 0;
 
 		opt0 |= V_NAGLE(nagle);
 		opt0 |= V_KEEP_ALIVE(keepalive);
 	}
 
 	if (e != NULL)
 		opt0 |= V_L2T_IDX(e->idx);
 
 	if (vi != NULL) {
 		opt0 |= V_SMAC_SEL(VIID_SMACIDX(vi->viid));
 		opt0 |= V_TX_CHAN(vi->pi->tx_chan);
 	}
 
 	return (htobe64(opt0));
 }
 
 /*
  * Build the filter tuple for a connection on vi that uses L2T entry e.  A
  * field is filled in only if its shift in the adapter's TP parameters is
  * non-negative.  T4 gets a 32-bit big-endian value, every other chip a
  * 64-bit big-endian V_FILTER_TUPLE().
  */
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct tp_params *tpp = &sc->params.tp;
 	uint16_t viid = vi->viid;
 	uint64_t ntuple = 0;
 
 	/*
 	 * Initialize each of the fields which we care about which are present
 	 * in the Compressed Filter Tuple.
 	 */
 	if (tpp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE) {
 		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) <<
 		    tpp->vlan_shift;
 	}
 
 	if (tpp->port_shift >= 0)
 		ntuple |= (uint64_t)e->lport << tpp->port_shift;
 
 	if (tpp->protocol_shift >= 0)
 		ntuple |= (uint64_t)IPPROTO_TCP << tpp->protocol_shift;
 
 	if (tpp->vnic_shift >= 0) {
 		uint32_t vf = G_FW_VIID_VIN(viid);
 		uint32_t pf = G_FW_VIID_PFN(viid);
 		uint32_t vld = G_FW_VIID_VIVLD(viid);
 
 		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) |
 		    V_FT_VNID_ID_VLD(vld)) << tpp->vnic_shift;
 	}
 
 	if (is_t4(sc))
 		return (htobe32((uint32_t)ntuple));
 
 	return (htobe64(V_FILTER_TUPLE(ntuple)));
 }
 
 /* Put the toepcb in TCPDDP ULP mode with ddp_flags reset to just DDP_OK. */
 void
 set_tcpddp_ulp_mode(struct toepcb *toep)
 {
 
 	toep->ddp_flags = DDP_OK;
 	toep->ulp_mode = ULP_MODE_TCPDDP;
 }
 
 /*
  * Return non-zero if status is one of the "negative advice" CPL error codes
  * (retransmit, persist or keepalive), zero otherwise.
  */
 int
 negative_advice(int status)
 {
 
 	switch (status) {
 	case CPL_ERR_RTX_NEG_ADVICE:
 	case CPL_ERR_PERSIST_NEG_ADVICE:
 	case CPL_ERR_KEEPALV_NEG_ADVICE:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Allocate the tid, atid and stid tables as one zeroed chunk and initialize
  * the related locks, free list and counters.  Returns 0, or ENOMEM if the
  * (non-sleeping) allocation fails; in that case no lock has been initialized.
  */
 static int
 alloc_tid_tabs(struct tid_info *t)
 {
 	size_t tid_bytes, atid_bytes, stid_bytes;
 	unsigned int i;
 
 	tid_bytes = t->ntids * sizeof(*t->tid_tab);
 	atid_bytes = t->natids * sizeof(*t->atid_tab);
 	stid_bytes = t->nstids * sizeof(*t->stid_tab);
 
 	t->tid_tab = malloc(tid_bytes + atid_bytes + stid_bytes, M_CXGBE,
 	    M_ZERO | M_NOWAIT);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 
 	/* The atid table sits right after the tid table. */
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	/* Chain every atid entry to its successor to form the free list. */
 	for (i = 0; i + 1 < t->natids; i++)
 		t->atid_tab[i].next = &t->atid_tab[i + 1];
 	t->atid_tab[t->natids - 1].next = NULL;
 
 	/* The stid table sits right after the atid table. */
 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 	t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids];
 	t->stids_in_use = 0;
 	TAILQ_INIT(&t->stids);
 	t->nstids_free_head = t->nstids;
 
 	atomic_store_rel_int(&t->tids_in_use, 0);
 
 	return (0);
 }
 
 /*
  * Free the memory allocated by alloc_tid_tabs() and destroy the atid/stid
  * locks if they were initialized.  No tid, atid or stid may still be in use.
  */
 static void
 free_tid_tabs(struct tid_info *t)
 {
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 	/* The message used to say "tids" although it reports stids. */
 	KASSERT(t->stids_in_use == 0,
 	    ("%s: %d stids still in use.", __func__, t->stids_in_use));
 
 	/* One allocation backs all three tables; see alloc_tid_tabs(). */
 	free(t->tid_tab, M_CXGBE);
 	t->tid_tab = NULL;
 
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	if (mtx_initialized(&t->stid_lock))
 		mtx_destroy(&t->stid_lock);
 }
 
 static int
 add_lip(struct adapter *sc, struct in6_addr *lip)
 {
         struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
         memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 static int
 delete_lip(struct adapter *sc, struct in6_addr *lip)
 {
 	struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
 	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_READ);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 /*
  * Find the CLIP table entry for lip.  Returns NULL if there is none.  The
  * caller must hold the CLIP table lock.
  */
 static struct clip_entry *
 search_lip(struct tom_data *td, struct in6_addr *lip)
 {
 	struct clip_entry *ce;
 
 	mtx_assert(&td->clip_table_lock, MA_OWNED);
 
 	/* ce is NULL if the loop runs off the end of the list. */
 	TAILQ_FOREACH(ce, &td->clip_table, link) {
 		if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 			break;
 	}
 
 	return (ce);
 }
 
 /*
  * Look up the CLIP entry for lip and, if it exists, take a reference on it.
  * Returns the entry, or NULL if lip is not in the CLIP table.
  */
 struct clip_entry *
 hold_lip(struct tom_data *td, struct in6_addr *lip)
 {
 	struct clip_entry *entry;
 
 	mtx_lock(&td->clip_table_lock);
 	if ((entry = search_lip(td, lip)) != NULL)
 		entry->refcount += 1;
 	mtx_unlock(&td->clip_table_lock);
 
 	return (entry);
 }
 
 /*
  * Drop a reference taken with hold_lip().  The entry stays in the CLIP table
  * even when its refcount reaches 0; nothing is freed here.
  */
 void
 release_lip(struct tom_data *td, struct clip_entry *ce)
 {
 
 	mtx_lock(&td->clip_table_lock);
 	/* Message fixed: it used to read "%p p not in CLIP table". */
 	KASSERT(search_lip(td, &ce->lip) == ce,
 	    ("%s: CLIP entry %p not in CLIP table.", __func__, ce));
 	KASSERT(ce->refcount > 0,
 	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
 	--ce->refcount;
 	mtx_unlock(&td->clip_table_lock);
 }
 
 /*
  * Set up an empty CLIP table and its lock, then populate it.  clip_gen starts
  * at -1 before update_clip_table() is called for the first time.
  */
 static void
 init_clip_table(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
 	td->clip_gen = -1;
 	TAILQ_INIT(&td->clip_table);
 
 	update_clip_table(sc, td);
 }
 
 /*
  * Per-adapter callback for t4_iterate(): refresh the adapter's CLIP table if
  * the TOM ULD is active on it.  Does nothing if the synchronized op cannot be
  * started.
  */
 static void
 update_clip(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc") != 0)
 		return;
 
 	if (uld_active(sc, ULD_TOM))
 		update_clip_table(sc, sc->tom_softc);
 
 	end_synchronized_op(sc, LOCK_HELD);
 }
 
 /*
  * Task handler for clip_task (see TIMEOUT_TASK_INIT in t4_tom_mod_load): runs
  * update_clip() via t4_iterate().  Neither arg nor count is used.
  */
 static void
 t4_clip_task(void *arg, int count)
 {
 
 	t4_iterate(update_clip, NULL);
 }
 
 /*
  * Bring the CLIP table in sync with the system's IPv6 addresses
  * (V_in6_ifaddrhead).  Nothing is done if the table already reflects the
  * current in6_ifaddr_gen.
  *
  * New addresses are added to the hardware and to the table.  Entries whose
  * address is gone are deleted if they are unreferenced, and kept otherwise.
  * Failures to add or delete an address are logged.  An address that could not
  * be added is not retried until in6_ifaddr_gen changes again, because the
  * generation is recorded regardless.
  */
 static void
 update_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	struct in6_addr *lip, tlip;
 	struct clip_head stale;
 	struct clip_entry *ce, *ce_temp;
 	int rc, gen = atomic_load_acq_int(&in6_ifaddr_gen);
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	mtx_lock(&td->clip_table_lock);
 
 	if (gen == td->clip_gen)
 		goto done;
 
 	/* Everything starts out stale; entries are moved back as found. */
 	TAILQ_INIT(&stale);
 	TAILQ_CONCAT(&stale, &td->clip_table, link);
 
 	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		lip = &ia->ia_addr.sin6_addr;
 
 		KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
 		    ("%s: mcast address in in6_ifaddr list", __func__));
 
 		if (IN6_IS_ADDR_LOOPBACK(lip))
 			continue;
 		if (IN6_IS_SCOPE_EMBED(lip)) {
 			/* Remove the embedded scope */
 			tlip = *lip;
 			lip = &tlip;
 			in6_clearscope(lip);
 		}
 		/*
 		 * XXX: how to weed out the link local address for the loopback
 		 * interface?  It's fe80::1 usually (always?).
 		 */
 
 		/*
 		 * If it's in the main list then we already know it's not stale.
 		 */
 		TAILQ_FOREACH(ce, &td->clip_table, link) {
 			if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 				goto next;
 		}
 
 		/*
 		 * If it's in the stale list we should move it to the main list.
 		 */
 		TAILQ_FOREACH(ce, &stale, link) {
 			if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
 				TAILQ_REMOVE(&stale, ce, link);
 				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
 				goto next;
 			}
 		}
 
 		/* A new IP6 address; add it to the CLIP table */
 		ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
 		if (ce == NULL) {
 			char ip[INET6_ADDRSTRLEN];
 
 			/*
 			 * An M_NOWAIT allocation can fail.  Treat that like
 			 * any other failure to add the address: log it and
 			 * move on instead of dereferencing NULL.
 			 */
 			inet_ntop(AF_INET6, lip, &ip[0], sizeof(ip));
 			log(LOG_ERR, "%s: could not add %s (%d)\n",
 			    __func__, ip, ENOMEM);
 			goto next;
 		}
 		memcpy(&ce->lip, lip, sizeof(ce->lip));
 		ce->refcount = 0;
 		rc = add_lip(sc, lip);
 		if (rc == 0)
 			TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
 		else {
 			char ip[INET6_ADDRSTRLEN];
 
 			inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip));
 			log(LOG_ERR, "%s: could not add %s (%d)\n",
 			    __func__, ip, rc);
 			free(ce, M_CXGBE);
 		}
 next:
 		continue;
 	}
 
 	/*
 	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
 	 * no longer referenced by the driver.
 	 */
 	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
 		if (ce->refcount == 0) {
 			rc = delete_lip(sc, &ce->lip);
 			if (rc == 0) {
 				TAILQ_REMOVE(&stale, ce, link);
 				free(ce, M_CXGBE);
 			} else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not delete %s (%d)\n",
 				    __func__, ip, rc);
 			}
 		}
 	}
 	/* The ones that are still referenced need to stay in the CLIP table */
 	TAILQ_CONCAT(&td->clip_table, &stale, link);
 
 	td->clip_gen = gen;
 done:
 	mtx_unlock(&td->clip_table_lock);
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 }
 
 /*
  * Delete every CLIP entry from the hardware and the table, then destroy the
  * table lock.  Does nothing if the lock was never initialized.  No entry may
  * still be referenced.  The result of delete_lip() is ignored.
  */
 static void
 destroy_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct clip_entry *entry, *next_entry;
 
 	if (!mtx_initialized(&td->clip_table_lock))
 		return;
 
 	mtx_lock(&td->clip_table_lock);
 	TAILQ_FOREACH_SAFE(entry, &td->clip_table, link, next_entry) {
 		KASSERT(entry->refcount == 0,
 		    ("%s: CLIP entry %p still in use (%d)", __func__,
 		    entry, entry->refcount));
 		TAILQ_REMOVE(&td->clip_table, entry, link);
 		delete_lip(sc, &entry->lip);
 		free(entry, M_CXGBE);
 	}
 	mtx_unlock(&td->clip_table_lock);
 	mtx_destroy(&td->clip_table_lock);
 }
 
 /*
  * Tear down a tom_data: page pod region, CLIP table, listen hash, locks, the
  * adapter's tid tables, and finally the tom_data itself.  Asserts that no
  * toepcb and no listen context is left.  Also used by t4_tom_activate() to
  * clean up after a failure part way through setup.
  */
 static void
 free_tom_data(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	KASSERT(TAILQ_EMPTY(&td->toep_list),
 	    ("%s: TOE PCB list is not empty.", __func__));
 	KASSERT(td->lctx_count == 0,
 	    ("%s: lctx hash table is not empty.", __func__));
 
-	t4_uninit_ddp(sc, td);
+	t4_free_ppod_region(&td->pr);
 	destroy_clip_table(sc, td);
 
 	/* NOTE(review): listen_mask != 0 is taken to mean the hash exists. */
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 
 	/* Each lock is destroyed only if it was initialized. */
 	if (mtx_initialized(&td->unsent_wr_lock))
 		mtx_destroy(&td->unsent_wr_lock);
 	if (mtx_initialized(&td->lctx_hash_lock))
 		mtx_destroy(&td->lctx_hash_lock);
 	if (mtx_initialized(&td->toep_list_lock))
 		mtx_destroy(&td->toep_list_lock);
 
 	free_tid_tabs(&sc->tids);
 	free(td, M_CXGBE);
 }
 
 static void
 reclaim_wr_resources(void *arg, int count)
 {
 	struct tom_data *td = arg;
 	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 	struct cpl_act_open_req *cpl;
 	u_int opcode, atid;
 	struct wrqe *wr;
 	struct adapter *sc;
 
 	mtx_lock(&td->unsent_wr_lock);
 	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 	mtx_unlock(&td->unsent_wr_lock);
 
 	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&twr_list, link);
 
 		cpl = wrtod(wr);
 		opcode = GET_OPCODE(cpl);
 
 		switch (opcode) {
 		case CPL_ACT_OPEN_REQ:
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			sc = td_adapter(td);
 
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		default:
 			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 			/* WR not freed here; go look at it with a debugger.  */
 		}
 	}
 }
 
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
  *
  * Activate TOM on an adapter: allocate and initialize the per-adapter
  * tom_data (lists, locks, listen hash, tid tables, page pod region, CLIP
  * table), fill in the toedev methods, hook every VI and offload rx queue up to
  * it, and register the toedev.  Returns 0 or an errno; on failure everything
  * set up so far is undone via free_tom_data().
  */
 static int
 t4_tom_activate(struct adapter *sc)
 {
 	struct tom_data *td;
 	struct toedev *tod;
 	struct vi_info *vi;
 	struct sge_ofld_rxq *ofld_rxq;
 	int i, j, rc, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* per-adapter softc for TOM */
 	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (td == NULL)
 		return (ENOMEM);
 
 	/* List of TOE PCBs and associated lock */
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 
 	/*
 	 * Listen context
 	 * NOTE(review): the result of hashinit_flags(HASH_NOWAIT) is not
 	 * checked here -- confirm a failed allocation is handled elsewhere.
 	 */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 	    &td->listen_mask, HASH_NOWAIT);
 
 	/* List of WRs for which L2 resolution failed */
 	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 	STAILQ_INIT(&td->unsent_wr_list);
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
 	rc = alloc_tid_tabs(&sc->tids);
 	if (rc != 0)
 		goto done;
 
-	/* DDP page pods and CPL handlers */
-	t4_init_ddp(sc, td);
+	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
+	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
+	if (rc != 0)
+		goto done;
+	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
+	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 
 	/* CLIP table for IPv6 offload */
 	init_clip_table(sc, td);
 
 	/* toedev ops */
 	tod = &td->tod;
 	init_toedev(tod);
 	tod->tod_softc = sc;
 	tod->tod_connect = t4_connect;
 	tod->tod_listen_start = t4_listen_start;
 	tod->tod_listen_stop = t4_listen_stop;
 	tod->tod_rcvd = t4_rcvd;
 	tod->tod_output = t4_tod_output;
 	tod->tod_send_rst = t4_send_rst;
 	tod->tod_send_fin = t4_send_fin;
 	tod->tod_pcb_detach = t4_pcb_detach;
 	tod->tod_l2_update = t4_l2_update;
 	tod->tod_syncache_added = t4_syncache_added;
 	tod->tod_syncache_removed = t4_syncache_removed;
 	tod->tod_syncache_respond = t4_syncache_respond;
 	tod->tod_offload_socket = t4_offload_socket;
 	tod->tod_ctloutput = t4_ctloutput;
 
 	/* Point every VI's ifnet at this toedev and set rx queue handlers. */
 	for_each_port(sc, i) {
 		for_each_vi(sc->port[i], v, vi) {
 			TOEDEV(vi->ifp) = &td->tod;
 			for_each_ofld_rxq(vi, j, ofld_rxq) {
 				ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl;
 				ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2;
 			}
 		}
 	}
 
 	sc->tom_softc = td;
 	register_toedev(sc->tom_softc);
 
 	/* Reached on success (rc == 0) as well as from the error paths. */
 done:
 	if (rc != 0)
 		free_tom_data(sc, td);
 	return (rc);
 }
 
 /*
  * Deactivate TOM on an adapter.  Returns EBUSY, leaving everything in place,
  * if a port still has TOE enabled, if iWARP or iSCSI is active, or if any
  * toepcb, listen context or unsent work request remains.  Otherwise the
  * toedev is unregistered and the tom_data is freed.
  */
 static int
 t4_tom_deactivate(struct adapter *sc)
 {
 	struct tom_data *td = sc->tom_softc;
 	int busy = 0;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
 	/* At least one port has IFCAP_TOE enabled. */
 	if (sc->offload_map != 0)
 		return (EBUSY);
 
 	/* Both iWARP and iSCSI rely on the TOE. */
 	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 		return (EBUSY);
 
 	mtx_lock(&td->toep_list_lock);
 	if (!TAILQ_EMPTY(&td->toep_list))
 		busy = 1;
 	mtx_unlock(&td->toep_list_lock);
 
 	mtx_lock(&td->lctx_hash_lock);
 	if (td->lctx_count > 0)
 		busy = 1;
 	mtx_unlock(&td->lctx_hash_lock);
 
 	/* Let a running reclaim task finish before looking at its list. */
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		busy = 1;
 	mtx_unlock(&td->unsent_wr_lock);
 
 	if (busy)
 		return (EBUSY);
 
 	unregister_toedev(sc->tom_softc);
 	free_tom_data(sc, td);
 	sc->tom_softc = NULL;
 
 	return (0);
 }
 
 /*
  * ifaddr_event handler (registered in t4_tom_mod_load).  Bumps in6_ifaddr_gen,
  * which update_clip_table() compares against each tom_data's clip_gen, and
  * schedules clip_task on taskqueue_thread with a timeout argument of -hz / 4.
  * Neither arg nor ifp is used.
  */
 static void
 t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
 {
 
 	atomic_add_rel_int(&in6_ifaddr_gen, 1);
 	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
 }
 
 static int
 t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	int error;
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		error = t4_aio_queue_ddp(so, job);
 		if (error != EOPNOTSUPP)
 			return (error);
 	}
 
 	return (t4_aio_queue_aiotx(so, job));
 }
 
 /*
  * Module load handler.  Installs the CPL handlers, loads the DDP support,
  * builds TOE variants of the TCP and TCP6 protosw/usrreqs (with
  * pru_aio_queue pointing at t4_aio_queue_tom), sets up the CLIP task and the
  * ifaddr event handler, and registers the TOM ULD.
  *
  * Returns 0 or an errno.  On failure whatever this function had set up via
  * t4_ddp_mod_load() and the later steps is undone.
  */
 static int
 t4_tom_mod_load(void)
 {
 	int rc;
 	struct protosw *tcp_protosw, *tcp6_protosw;
 
 	/* CPL handlers */
 	t4_init_connect_cpl_handlers();
 	t4_init_listen_cpl_handlers();
 	t4_init_cpl_io_handlers();
 
 	rc = t4_ddp_mod_load();
 	if (rc != 0)
 		return (rc);
 
 	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp_protosw == NULL) {
 		rc = ENOPROTOOPT;
 		goto fail;
 	}
 	bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
 	bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
 	toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe_protosw.pr_usrreqs = &toe_usrreqs;
 
 	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp6_protosw == NULL) {
 		rc = ENOPROTOOPT;
 		goto fail;
 	}
 	bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
 	bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
 	toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe6_protosw.pr_usrreqs = &toe6_usrreqs;
 
 	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
 	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
 	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
 
 	/* The unload handler undoes everything done above. */
 	rc = t4_register_uld(&tom_uld_info);
 	if (rc != 0)
 		t4_tom_mod_unload();
 
 	return (rc);
 
 fail:
 	/*
 	 * Undo t4_ddp_mod_load().  The event handler and the ULD have not
 	 * been registered at this point, so there is nothing else to undo.
 	 */
 	t4_ddp_mod_unload();
 	return (rc);
 }
 
 /*
  * Per-adapter callback for t4_iterate(): try to deactivate the TOM ULD on the
  * adapter.  Does nothing if the synchronized op cannot be started.
  */
 static void
 tom_uninit(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun") != 0)
 		return;
 
 	/* Try to free resources (works only if no port has IFCAP_TOE) */
 	if (uld_active(sc, ULD_TOM))
 		t4_deactivate_uld(sc, ULD_TOM);
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Module unload handler.  Tries to deactivate TOM on every adapter and to
  * unregister the ULD; returns EBUSY if the ULD is still busy.  Otherwise the
  * ifaddr event handler and CLIP task are torn down, the DDP support is
  * unloaded, and 0 is returned.
  */
 static int
 t4_tom_mod_unload(void)
 {
 	int rc;
 
 	t4_iterate(tom_uninit, NULL);
 
 	rc = t4_unregister_uld(&tom_uld_info);
 	if (rc == EBUSY)
 		return (EBUSY);
 
 	if (ifaddr_evhandler) {
 		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
 		taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
 	}
 
 	t4_ddp_mod_unload();
 
 	return (0);
 }
 #endif	/* TCP_OFFLOAD */
 
 /*
  * Module event handler.  With TCP_OFFLOAD, MOD_LOAD and MOD_UNLOAD are passed
  * on to the load/unload routines and every other event yields EINVAL.  Without
  * TCP_OFFLOAD a message is printed and EOPNOTSUPP is returned for any event.
  */
 static int
 t4_tom_modevent(module_t mod, int cmd, void *arg)
 {
 	int rc;
 
 #ifdef TCP_OFFLOAD
 	if (cmd == MOD_LOAD)
 		rc = t4_tom_mod_load();
 	else if (cmd == MOD_UNLOAD)
 		rc = t4_tom_mod_unload();
 	else
 		rc = EINVAL;
 #else
 	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 	rc = EOPNOTSUPP;
 #endif
 	return (rc);
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t t4_tom_moddata= {
 	"t4_tom",
 	t4_tom_modevent,
 	0
 };
 
 /* t4_tom is version 1 and depends on version 1 of toecore and t4nex. */
 MODULE_VERSION(t4_tom, 1);
 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
Index: head/sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h	(revision 305165)
+++ head/sys/dev/cxgbe/tom/t4_tom.h	(revision 305166)
@@ -1,366 +1,387 @@
 /*-
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_TOM_H__
 #define __T4_TOM_H__
 #include <sys/vmem.h>
 
 #define LISTEN_HASH_SIZE 32
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 #define	DDP_RSVD_WIN (16 * 1024U)
 #define	SB_DDP_INDICATE	SB_IN_TOE	/* soreceive must respond to indicate */
 
 #define USE_DDP_RX_FLOW_CONTROL
 
+#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))	/* bytes in n page pods */
+#define PPOD_SIZE	(PPOD_SZ(1))	/* bytes in a single page pod */
+
 /*
  * TOE PCB flags.  Bit values for toepcb.flags.  The TPF_SYNQE* values are for
  * a synq_entry, whose flags field uses the same namespace.
  */
 enum {
 	TPF_ATTACHED	   = (1 << 0),	/* a tcpcb refers to this toepcb */
 	TPF_FLOWC_WR_SENT  = (1 << 1),	/* firmware flow context WR sent */
 	TPF_TX_DATA_SENT   = (1 << 2),	/* some data sent */
 	TPF_TX_SUSPENDED   = (1 << 3),	/* tx suspended for lack of resources */
 	TPF_SEND_FIN	   = (1 << 4),	/* send FIN after all pending data */
 	TPF_FIN_SENT	   = (1 << 5),	/* FIN has been sent */
 	TPF_ABORT_SHUTDOWN = (1 << 6),	/* connection abort is in progress */
 	TPF_CPL_PENDING    = (1 << 7),	/* haven't received the last CPL */
 	TPF_SYNQE	   = (1 << 8),	/* synq_entry, not really a toepcb */
 	TPF_SYNQE_NEEDFREE = (1 << 9),	/* synq_entry was malloc'd separately */
 	TPF_SYNQE_TCPDDP   = (1 << 10),	/* ulp_mode TCPDDP in toepcb */
 	TPF_SYNQE_EXPANDED = (1 << 11),	/* toepcb ready, tid context updated */
 	TPF_SYNQE_HAS_L2TE = (1 << 12),	/* we've replied to PASS_ACCEPT_REQ */
 };
 
 /* Bit values for toepcb.ddp_flags. */
 enum {
 	DDP_OK		= (1 << 0),	/* OK to turn on DDP */
 	DDP_SC_REQ	= (1 << 1),	/* state change (on/off) requested */
 	DDP_ON		= (1 << 2),	/* DDP is turned on */
 	DDP_BUF0_ACTIVE	= (1 << 3),	/* buffer 0 in use (not invalidated) */
 	DDP_BUF1_ACTIVE	= (1 << 4),	/* buffer 1 in use (not invalidated) */
 	DDP_TASK_ACTIVE = (1 << 5),	/* requeue task is queued / running */
 	DDP_DEAD	= (1 << 6),	/* toepcb is shutting down */
 };
 
 /* Tx software descriptor; toepcb.txsd[] is an array of these. */
 struct ofld_tx_sdesc {
 	uint32_t plen;		/* payload length */
 	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
 };
 
+struct ppod_region {	/* a region of page pods; tom_data embeds one as 'pr' */
+	u_int pr_start;	/* NOTE(review): presumably where the region starts */
+	u_int pr_len;	/* NOTE(review): presumably the region's length */
+	u_int pr_page_shift[4];	/* NOTE(review): presumably page size shifts */
+	uint32_t pr_tag_mask;		/* hardware tagmask for this region. */
+	uint32_t pr_invalid_bit;	/* OR with this to invalidate tag. */
+	uint32_t pr_alias_mask;		/* AND with tag to get alias bits. */
+	u_int pr_alias_shift;		/* shift this much for first alias bit. */
+	vmem_t *pr_arena;	/* vmem arena for this region */
+};
+
+struct ppod_reservation {	/* struct pageset embeds one as 'prsv' */
+	struct ppod_region *prsv_pr;	/* NOTE(review): presumably the owning region */
+	uint32_t prsv_tag;		/* Full tag: pgsz, alias, tag, color */
+	u_int prsv_nppods;	/* NOTE(review): presumably # of page pods reserved */
+};
+
 /* A set of VM pages along with the page pod reservation that goes with it. */
 struct pageset {
 	TAILQ_ENTRY(pageset) link;
 	vm_page_t *pages;
 	int npages;
 	int flags;		/* NOTE(review): presumably PS_* bits -- confirm */
-	u_int ppod_addr;
-	int nppods;
-	uint32_t tag;	/* includes color, page pod addr, and DDP page size */
 	int offset;		/* offset in first page */
 	int len;
+	struct ppod_reservation prsv;	/* page pod reservation (region, tag, count) */
 	struct vmspace *vm;
 	u_int vm_timestamp;
 };
 
 /* List head type for pagesets linked through pageset.link. */
 TAILQ_HEAD(pagesetq, pageset);
 
 #define	PS_WIRED		0x0001	/* Pages wired rather than held. */
 #define	PS_PPODS_WRITTEN	0x0002	/* Page pods written to the card. */
 
 #define	EXT_FLAG_AIOTX		EXT_FLAG_VENDOR1
 
 /* One DDP buffer slot; a toepcb has two of these (toepcb.db[2]). */
 struct ddp_buffer {
 	struct pageset *ps;
 
 	struct kaiocb *job;
 	int cancel_pending;
 };
 
 /* An AIO job together with an embedded (not pointed-to) pageset. */
 struct aiotx_buffer {
 	struct pageset ps;
 	struct kaiocb *job;
 	int refcount;
 };
 
 /*
  * TOE PCB: the driver's per-connection state.  While attached, the tcpcb's
  * t_toe points at it.  It ends with a flexible array of tx software
  * descriptors, so it has to be allocated with room for txsd[].
  */
 struct toepcb {
 	TAILQ_ENTRY(toepcb) link; /* toep_list */
 	u_int flags;		/* miscellaneous flags */
 	int refcount;
 	struct tom_data *td;
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct vi_info *vi;	/* virtual interface */
 	struct sge_wrq *ofld_txq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ctrlq;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 	int tid;		/* Connection identifier */
 
 	/* tx credit handling */
 	u_int tx_total;		/* total tx WR credits (in 16B units) */
 	u_int tx_credits;	/* tx WR credits (in 16B units) available */
 	u_int tx_nocompl;	/* tx WR credits since last compl request */
 	u_int plen_nocompl;	/* payload since last compl request */
 
 	/* rx credit handling */
 	u_int sb_cc;		/* last noted value of so_rcv->sb_cc */
 	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
 
 	u_int ulp_mode;	/* ULP mode */
 	void *ulpcb;
 	void *ulpcb2;
 	struct mbufq ulp_pduq;	/* PDUs waiting to be sent out. */
 	struct mbufq ulp_pdu_reclaimq;
 
 	/* DDP state */
 	u_int ddp_flags;	/* DDP_* bits */
 	struct ddp_buffer db[2];
 	TAILQ_HEAD(, pageset) ddp_cached_pagesets;
 	TAILQ_HEAD(, kaiocb) ddp_aiojobq;
 	u_int ddp_waiting_count;
 	u_int ddp_active_count;
 	u_int ddp_cached_count;
 	int ddp_active_id;	/* the currently active DDP buffer */
 	struct task ddp_requeue_task;
 	struct kaiocb *ddp_queueing;
 	struct mtx ddp_lock;	/* taken via DDP_LOCK()/DDP_UNLOCK() */
 
 	TAILQ_HEAD(, kaiocb) aiotx_jobq;
 	struct task aiotx_task;
 	bool aiotx_task_active;
 
 	/* Tx software descriptor */
 	uint8_t txsd_total;
 	uint8_t txsd_pidx;
 	uint8_t txsd_cidx;
 	uint8_t txsd_avail;
 	struct ofld_tx_sdesc txsd[];
 };
 
 /* Lock, unlock, and assert ownership of a toepcb's ddp_lock mutex. */
 #define	DDP_LOCK(toep)		mtx_lock(&(toep)->ddp_lock)
 #define	DDP_UNLOCK(toep)	mtx_unlock(&(toep)->ddp_lock)
 #define	DDP_ASSERT_LOCKED(toep)	mtx_assert(&(toep)->ddp_lock, MA_OWNED)
 
 /*
  * NOTE(review): presumably the connection parameters that go into the
  * firmware flow context WR (cf. TPF_FLOWC_WR_SENT) -- confirm against the
  * code that fills this in.
  */
 struct flowc_tx_params {
 	uint32_t snd_nxt;
 	uint32_t rcv_nxt;
 	unsigned int snd_space;
 	unsigned int mss;
 };
 
 #define	DDP_RETRY_WAIT	5	/* seconds to wait before re-enabling DDP */
 #define	DDP_LOW_SCORE	1
 #define	DDP_HIGH_SCORE	3
 
 /*
  * Compressed state for embryonic connections for a listener.  Barely fits in
  * 64B, try not to grow it further.  Entries are linked on their listen_ctx's
  * synq, and the flags field takes the same TPF_* values as toepcb.flags.
  */
 struct synq_entry {
 	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
 	int flags;			/* same as toepcb's tp_flags */
 	int tid;
 	struct listen_ctx *lctx;	/* backpointer to listen ctx */
 	struct mbuf *syn;
 	uint32_t iss;
 	uint32_t ts;
 	volatile uintptr_t wr;
 	volatile u_int refcnt;
 	uint16_t l2e_idx;
 	uint16_t rcv_bufsize;
 };
 
 /* listen_ctx flags */
 #define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
 
 /*
  * Per-listener context.  Linked into tom_data's listen hash; carries the
  * queue of synq_entry structures for the listener.
  */
 struct listen_ctx {
 	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
 	volatile int refcount;
 	int stid;
 	struct stid_region stid_region;
 	int flags;			/* LCTX_* */
 	struct inpcb *inp;		/* listening socket's inp */
 	struct sge_wrq *ctrlq;
 	struct sge_ofld_rxq *ofld_rxq;
 	struct clip_entry *ce;
 	TAILQ_HEAD(, synq_entry) synq;
 };
 
 /* One local IPv6 address in the CLIP table (tom_data.clip_table). */
 struct clip_entry {
 	TAILQ_ENTRY(clip_entry) link;
 	struct in6_addr lip;	/* local IPv6 address */
 	u_int refcount;		/* taken by hold_lip(), dropped by release_lip() */
 };
 
 /* List head type for the CLIP table. */
 TAILQ_HEAD(clip_head, clip_entry);
 /* Per-adapter TOM state.  tod_td() maps the embedded toedev back to it. */
 struct tom_data {
 	struct toedev tod;
 
 	/* toepcb's associated with this TOE device */
 	struct mtx toep_list_lock;
 	TAILQ_HEAD(, toepcb) toep_list;
 
 	/* listen contexts, hashed */
 	struct mtx lctx_hash_lock;
 	LIST_HEAD(, listen_ctx) *listen_hash;
 	u_long listen_mask;
 	int lctx_count;		/* # of lctx in the hash table */
 
-	u_int ppod_start;
-	u_int ddp_pgsz[4];
-	vmem_t *ppod_arena;
+	struct ppod_region pr;	/* page pod region */
 
 	/* CLIP table (local IPv6 addresses) */
 	struct mtx clip_table_lock;
 	struct clip_head clip_table;
 	int clip_gen;		/* in6_ifaddr_gen the table was last synced to */
 
 	/* WRs that will not be sent to the chip because L2 resolution failed */
 	struct mtx unsent_wr_lock;
 	STAILQ_HEAD(, wrqe) unsent_wr_list;
 	struct task reclaim_wr_resources;
 };
 
 /* Map a toedev back to the tom_data that embeds it. */
 static inline struct tom_data *
 tod_td(struct toedev *tod)
 {
 	struct tom_data *td = __containerof(tod, struct tom_data, tod);
 
 	return (td);
 }
 
 static inline struct adapter *
 td_adapter(struct tom_data *td)
 {
 
 	return (td->tod.tod_softc);
 }
 
 static inline void
 set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_per.eight[0] = ulp_submode;
 }
 
 static inline uint8_t
 mbuf_ulp_submode(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_per.eight[0]);
 }
 
 /* t4_tom.c */
 struct toepcb *alloc_toepcb(struct vi_info *, int, int, int);
 struct toepcb *hold_toepcb(struct toepcb *);
 void free_toepcb(struct toepcb *);
 void offload_socket(struct socket *, struct toepcb *);
 void undo_offload_socket(struct socket *);
 void final_cpl_received(struct toepcb *);
 void insert_tid(struct adapter *, int, void *);
 void *lookup_tid(struct adapter *, int);
 void update_tid(struct adapter *, int, void *);
 void remove_tid(struct adapter *, int);
 void release_tid(struct adapter *, int, struct sge_wrq *);
 int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
 u_long select_rcv_wnd(struct socket *);
 int select_rcv_wscale(void);
 uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *,
     int, int, int, int);
 uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
 void set_tcpddp_ulp_mode(struct toepcb *);
 int negative_advice(int);
 struct clip_entry *hold_lip(struct tom_data *, struct in6_addr *);
 void release_lip(struct tom_data *, struct clip_entry *);
 
 /* t4_connect.c */
 void t4_init_connect_cpl_handlers(void);
 int t4_connect(struct toedev *, struct socket *, struct rtentry *,
     struct sockaddr *);
 void act_open_failure_cleanup(struct adapter *, u_int, u_int);
 
 /* t4_listen.c */
 void t4_init_listen_cpl_handlers(void);
 int t4_listen_start(struct toedev *, struct tcpcb *);
 int t4_listen_stop(struct toedev *, struct tcpcb *);
 void t4_syncache_added(struct toedev *, void *);
 void t4_syncache_removed(struct toedev *, void *);
 int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
 int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 void t4_offload_socket(struct toedev *, void *, struct socket *);
 
 /* t4_cpl_io.c */
 void aiotx_init_toep(struct toepcb *);
 int t4_aio_queue_aiotx(struct socket *, struct kaiocb *);
 void t4_init_cpl_io_handlers(void);
 void t4_uninit_cpl_io_handlers(void);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
 void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
 void send_reset(struct adapter *, struct toepcb *, uint32_t);
 void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
 void t4_rcvd(struct toedev *, struct tcpcb *);
 void t4_rcvd_locked(struct toedev *, struct tcpcb *);
 int t4_tod_output(struct toedev *, struct tcpcb *);
 int t4_send_fin(struct toedev *, struct tcpcb *);
 int t4_send_rst(struct toedev *, struct tcpcb *);
 void t4_set_tcb_field(struct adapter *, struct sge_wrq *, int, uint16_t,
     uint64_t, uint64_t, int, int, int);
 void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop);
 void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop);
 int do_set_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 
 /* t4_ddp.c */
-void t4_init_ddp(struct adapter *, struct tom_data *);
-void t4_uninit_ddp(struct adapter *, struct tom_data *);
+int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
+    const char *);
+void t4_free_ppod_region(struct ppod_region *);
+int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *);
+int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
+    struct pageset *);
+void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
     struct mbuf **, struct mbuf **, int *);
 int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
 int t4_ddp_mod_load(void);
 void t4_ddp_mod_unload(void);
 void ddp_assert_empty(struct toepcb *);
 void ddp_init_toep(struct toepcb *);
 void ddp_uninit_toep(struct toepcb *);
 void ddp_queue_toep(struct toepcb *);
 void release_ddp_resources(struct toepcb *toep);
 void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t);
 void handle_ddp_indicate(struct toepcb *);
 void handle_ddp_tcb_rpl(struct toepcb *, const struct cpl_set_tcb_rpl *);
 void insert_ddp_data(struct toepcb *, uint32_t);
 
 #endif