Index: projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.c =================================================================== --- projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 291231) +++ projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 291232) @@ -1,978 +1,1104 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Chelsio T5xx iSCSI driver * * Written by: Sreenivasa Honnur * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ #include "tom/t4_tom.h" #include "cxgbei.h" #include "cxgbei_ulp2_ddp.h" /* XXXNP some header instead. */ struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); +void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *); /* * Direct Data Placement - * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted * final destination host-memory buffers based on the Initiator Task Tag (ITT) * in Data-In or Target Task Tag (TTT) in Data-Out PDUs. * The host memory address is programmed into h/w in the format of pagepod * entries. * The location of the pagepod entry is encoded into ddp tag which is used as * the base for ITT/TTT. */ /* * functions to program the pagepod in h/w */ static void inline ppod_set(struct pagepod *ppod, struct cxgbei_ulp2_pagepod_hdr *hdr, struct cxgbei_ulp2_gather_list *gl, unsigned int pidx) { int i; memcpy(ppod, hdr, sizeof(*hdr)); for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) { ppod->addr[i] = pidx < gl->nelem ? 
cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL; } } static void inline ppod_clear(struct pagepod *ppod) { memset(ppod, 0, sizeof(*ppod)); } static inline void ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req, unsigned int wr_len, unsigned int dlen, unsigned int pm_addr) { struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1); INIT_ULPTX_WR(req, wr_len, 0, 0); req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | V_ULP_MEMIO_ORDER(is_t4(sc)) | V_T5_ULP_MEMIO_IMM(is_t5(sc))); req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5)); req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16) | V_FW_WR_FLOWID(tid)); req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5)); idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM)); idata->len = htonl(dlen); } #define PPOD_SIZE sizeof(struct pagepod) #define ULPMEM_IDATA_MAX_NPPODS 1 /* 256/PPOD_SIZE */ #define PCIE_MEMWIN_MAX_NPPODS 16 /* 1024/PPOD_SIZE */ static int ppod_write_idata(struct cxgbei_data *ci, struct cxgbei_ulp2_pagepod_hdr *hdr, unsigned int idx, unsigned int npods, struct cxgbei_ulp2_gather_list *gl, unsigned int gl_pidx, struct toepcb *toep) { u_int dlen = PPOD_SIZE * npods; u_int pm_addr = idx * PPOD_SIZE + ci->llimit; u_int wr_len = roundup(sizeof(struct ulp_mem_io) + sizeof(struct ulptx_idata) + dlen, 16); struct ulp_mem_io *req; struct ulptx_idata *idata; struct pagepod *ppod; u_int i; struct wrqe *wr; struct adapter *sc = toep->port->adapter; wr = alloc_wrqe(wr_len, toep->ctrlq); if (wr == NULL) { CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure"); return (ENOMEM); } req = wrtod(wr); memset(req, 0, wr_len); ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr); idata = (struct ulptx_idata *)(req + 1); ppod = (struct pagepod *)(idata + 1); for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) { if (!hdr) /* clear the pagepod */ ppod_clear(ppod); else /* set the pagepod */ ppod_set(ppod, hdr, gl, gl_pidx); } t4_wrq_tx(sc, wr); return 0; } int t4_ddp_set_map(struct cxgbei_data *ci, void *iccp, struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods, struct cxgbei_ulp2_gather_list *gl, int reply) { struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp; struct toepcb *toep = icc->toep; int err; unsigned int pidx = 0, w_npods = 0, cnt; /* * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE, * the order would not be garanteed, so we will stick with IMMD */ gl->tid = toep->tid; gl->port_id = toep->port->port_id; gl->egress_dev = (void *)toep->port->ifp; /* send via immediate data */ for (; w_npods < npods; idx += cnt, w_npods += cnt, pidx += PPOD_PAGES) { cnt = npods - w_npods; if (cnt > ULPMEM_IDATA_MAX_NPPODS) cnt = ULPMEM_IDATA_MAX_NPPODS; err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep); if (err) { printf("%s: ppod_write_idata failed\n", __func__); break; } } return err; } void t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl, u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc) { struct toepcb *toep = icc->toep; int err = -1; u_int pidx = 0; u_int w_npods = 0; u_int cnt; for (; w_npods < npods; idx += cnt, w_npods += cnt, pidx += PPOD_PAGES) { cnt = npods - w_npods; if (cnt > ULPMEM_IDATA_MAX_NPPODS) cnt = ULPMEM_IDATA_MAX_NPPODS; err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep); if (err) break; } } static int cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio) { unsigned int data_len = csio->dxfer_len; unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK; unsigned int nsge; unsigned char 
*sgaddr = csio->data_ptr; unsigned int len = 0; nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT; sgl->sg_addr = sgaddr; sgl->sg_offset = sgoffset; if (data_len < (PAGE_SIZE - sgoffset)) len = data_len; else len = PAGE_SIZE - sgoffset; sgl->sg_length = len; data_len -= len; sgaddr += len; sgl = sgl+1; while (data_len > 0) { sgl->sg_addr = sgaddr; len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE; sgl->sg_length = len; sgaddr += len; data_len -= len; sgl = sgl + 1; } return nsge; } static int cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io) { unsigned int data_len, sgoffset, nsge; unsigned char *sgaddr; unsigned int len = 0, index = 0, ctl_sg_count, i; struct ctl_sg_entry ctl_sg_entry, *ctl_sglist; if (io->scsiio.kern_sg_entries > 0) { ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr; ctl_sg_count = io->scsiio.kern_sg_entries; } else { ctl_sglist = &ctl_sg_entry; ctl_sglist->addr = io->scsiio.kern_data_ptr; ctl_sglist->len = io->scsiio.kern_data_len; ctl_sg_count = 1; } sgaddr = sgl->sg_addr = ctl_sglist[index].addr; sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK; data_len = ctl_sglist[index].len; if (data_len < (PAGE_SIZE - sgoffset)) len = data_len; else len = PAGE_SIZE - sgoffset; sgl->sg_length = len; data_len -= len; sgaddr += len; sgl = sgl+1; len = 0; for (i = 0; i< ctl_sg_count; i++) len += ctl_sglist[i].len; nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT; while (data_len > 0) { sgl->sg_addr = sgaddr; len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE; sgl->sg_length = len; sgaddr += len; data_len -= len; sgl = sgl + 1; if (data_len == 0) { if (index == ctl_sg_count - 1) break; index++; sgaddr = ctl_sglist[index].addr; data_len = ctl_sglist[index].len; } } return nsge; } static int t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc, u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag) { struct cxgbei_ulp2_gather_list *gl; int err = -EINVAL; struct toepcb *toep = icc->toep; gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0); if (gl) { err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid, &ci->tag_format, ddp_tag, gl, 0, 0); if (err) { cxgbei_ulp2_ddp_release_gl(ci, gl); } } return err; } static unsigned int cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv, struct ccb_scsiio *scmd, unsigned int *itt) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); int xferlen = scmd->dxfer_len; struct cxgbei_task_data *tdata = NULL; struct cxgbei_sgl *sge = NULL; struct toepcb *toep = icc->toep; struct adapter *sc = td_adapter(toep->td); struct cxgbei_data *ci = sc->iscsi_softc; int err = -1; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); tdata = (struct cxgbei_task_data *)*prv; if (xferlen == 0 || tdata == NULL) goto out; if (xferlen < DDP_THRESHOLD) goto out; if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd); if (tdata->nsge == 0) { CTR1(KTR_CXGBE, "%s: map_sg failed", __func__); return 0; } sge = tdata->sgl; tdata->sc_ddp_tag = *itt; CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x", __func__, *itt, tdata->sc_ddp_tag); if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) { err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len, sge, tdata->nsge, &tdata->sc_ddp_tag); } else { CTR3(KTR_CXGBE, "%s: itt:0x%x sc_ddp_tag:0x%x not usable", __func__, *itt, tdata->sc_ddp_tag); } } out: if (err < 0) tdata->sc_ddp_tag = cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt); return tdata->sc_ddp_tag; } static unsigned int 
cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io, unsigned int *ttt) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct toepcb *toep = icc->toep; struct adapter *sc = td_adapter(toep->td); struct cxgbei_data *ci = sc->iscsi_softc; struct cxgbei_task_data *tdata = NULL; int xferlen, err = -1; struct cxgbei_sgl *sge = NULL; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled); tdata = (struct cxgbei_task_data *)*prv; if ((xferlen == 0) || (tdata == NULL)) goto out; if (xferlen < DDP_THRESHOLD) goto out; tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io); if (tdata->nsge == 0) { CTR1(KTR_CXGBE, "%s: map_sg failed", __func__); return 0; } sge = tdata->sgl; tdata->sc_ddp_tag = *ttt; if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) { err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge, tdata->nsge, &tdata->sc_ddp_tag); } else { CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable", __func__, tdata->sc_ddp_tag); } out: if (err < 0) tdata->sc_ddp_tag = cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt); return tdata->sc_ddp_tag; } static int t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag) { struct toepcb *toep = icc->toep; struct adapter *sc = td_adapter(toep->td); struct cxgbei_data *ci = sc->iscsi_softc; cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc); return (0); } static int cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci) { int nppods, bits, max_sz, rc; static const u_int pgsz_order[] = {0, 1, 2, 3}; MPASS(sc->vres.iscsi.size > 0); ci->llimit = sc->vres.iscsi.start; ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1; max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2)); nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT; if (nppods <= 1024) return (ENXIO); bits = fls(nppods); if (bits > IPPOD_IDX_MAX_SIZE) bits = IPPOD_IDX_MAX_SIZE; nppods = (1 << (bits - 1)) - 1; rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag); if (rc != 0) { device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n", __func__, rc); return (rc); } ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO); ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *), M_CXGBE, M_NOWAIT | M_ZERO); if (ci->colors == NULL || ci->gl_map == NULL) { bus_dma_tag_destroy(ci->ulp_ddp_tag); free(ci->colors, M_CXGBE); free(ci->gl_map, M_CXGBE); return (ENOMEM); } mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK); ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE); ci->nppods = nppods; ci->idx_last = nppods; ci->idx_bits = bits; ci->idx_mask = (1 << bits) - 1; ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1; ci->tag_format.sw_bits = bits; ci->tag_format.rsvd_bits = bits; ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT; ci->tag_format.rsvd_mask = ci->idx_mask; t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order); return (rc); } static int do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); - struct icl_cxgbei_conn *icc = toep->ulpcb; struct icl_pdu *ip; struct icl_cxgbei_pdu *icp; - MPASS(icc != NULL); - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); M_ASSERTPKTHDR(m); ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) 
CXGBE_UNIMPLEMENTED("PDU allocation failure"); - icl_cxgbei_new_pdu_set_conn(ip, &icc->ic); icp = ip_to_icp(ip); bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct iscsi_bhs)); + icp->pdu_seq = ntohl(cpl->seq); icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD; /* This is the start of a new PDU. There should be no old state. */ - MPASS(icc->icp == NULL); - icc->icp = icp; - icc->pdu_seq = ntohl(cpl->seq); + MPASS(toep->ulpcb2 == NULL); + toep->ulpcb2 = icp; #if 0 CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u", __func__, tid, ntohs(cpl->len), m->m_len); #endif m_freem(m); return (0); } static int do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); - struct icl_cxgbei_conn *icc = toep->ulpcb; - struct icl_cxgbei_pdu *icp = icc->icp; + struct icl_cxgbei_pdu *icp = toep->ulpcb2; - MPASS(icc != NULL); - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); M_ASSERTPKTHDR(m); /* Must already have received the header (but not the data). */ MPASS(icp != NULL); MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD); MPASS(icp->ip.ip_data_mbuf == NULL); MPASS(icp->ip.ip_data_len == 0); m_adj(m, sizeof(*cpl)); icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD; icp->ip.ip_data_mbuf = m; icp->ip.ip_data_len = m->m_pkthdr.len; /* XXXNP: round up to 4? */ #if 0 CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u", __func__, tid, ntohs(cpl->len), m->m_len); #endif return (0); } static int do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; + struct socket *so; + struct sockbuf *sb; struct tcpcb *tp; - struct icl_cxgbei_conn *icc = toep->ulpcb; - struct icl_conn *ic = &icc->ic; - struct icl_cxgbei_pdu *icp = icc->icp; + struct icl_cxgbei_conn *icc; + struct icl_conn *ic; + struct icl_cxgbei_pdu *icp = toep->ulpcb2; + struct icl_pdu *ip; u_int pdu_len, val; - MPASS(icc != NULL); - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); MPASS(m == NULL); /* Must already be assembling a PDU. */ MPASS(icp != NULL); MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD); /* Data is optional. */ - + ip = &icp->ip; icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD; - + val = ntohl(cpl->ddpvld); + if (val & F_DDP_PADDING_ERR) + icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR; + if (val & F_DDP_HDRCRC_ERR) + icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR; + if (val & F_DDP_DATACRC_ERR) + icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR; + if (ip->ip_data_mbuf == NULL) { + /* XXXNP: what should ip->ip_data_len be, and why? */ + icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED; + } pdu_len = ntohs(cpl->len); /* includes everything. */ INP_WLOCK(inp); - /* XXXNP: check inp for dropped etc., and toep for abort in progress. 
*/ + if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { + CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, pdu_len, inp->inp_flags); + INP_WUNLOCK(inp); + icl_cxgbei_conn_pdu_free(NULL, ip); +#ifdef INVARIANTS + toep->ulpcb2 = NULL; +#endif + return (0); + } tp = intotcpcb(inp); - MPASS(icc->pdu_seq == tp->rcv_nxt); + MPASS(icp->pdu_seq == tp->rcv_nxt); MPASS(tp->rcv_wnd >= pdu_len); tp->rcv_nxt += pdu_len; tp->rcv_wnd -= pdu_len; tp->t_rcvtime = ticks; /* update rx credits */ toep->rx_credits += pdu_len; t4_rcvd(&toep->td->tod, tp); /* XXX: sc->tom_softc.tod */ - INP_WUNLOCK(inp); - val = ntohl(cpl->ddpvld); - if (val & F_DDP_PADDING_ERR) - icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR; - if (val & F_DDP_HDRCRC_ERR) - icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR; - if (val & F_DDP_DATACRC_ERR) - icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR; - if (icp->ip.ip_data_mbuf == NULL) - icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED; + so = inp->inp_socket; + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + icc = toep->ulpcb; + if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) { + CTR5(KTR_CXGBE, + "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x", + __func__, tid, pdu_len, icc, sb->sb_state); + SOCKBUF_UNLOCK(sb); + INP_WUNLOCK(inp); + + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = tcp_drop(tp, ECONNRESET); + if (tp) + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + + icl_cxgbei_conn_pdu_free(NULL, ip); +#ifdef INVARIANTS + toep->ulpcb2 = NULL; +#endif + return (0); + } + MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); + ic = &icc->ic; + icl_cxgbei_new_pdu_set_conn(ip, ic); + + MPASS(m == NULL); /* was unused, we'll use it now. */ + m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */ + if (__predict_false(m != NULL)) { + int len = m_length(m, NULL); + + /* + * PDUs were received before the tid transitioned to ULP mode. + * Convert them to icl_cxgbei_pdus and send them to ICL before + * the PDU in icp/ip. + */ + CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid, + len); + + /* XXXNP: needs to be rewritten. 
*/ + if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct + iscsi_bhs)) { + struct icl_cxgbei_pdu *icp0; + struct icl_pdu *ip0; + + ip0 = icl_cxgbei_new_pdu(M_NOWAIT); + icl_cxgbei_new_pdu_set_conn(ip0, ic); + if (ip0 == NULL) + CXGBE_UNIMPLEMENTED("PDU allocation failure"); + icp0 = ip_to_icp(ip0); + icp0->pdu_seq = 0; /* XXX */ + icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD | + SBUF_ULP_FLAG_STATUS_RCVD; + m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs); + STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next); + } + m_freem(m); + } + #if 0 CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x", __func__, tid, pdu_len, icp->pdu_flags); #endif - icc->icp = NULL; - ic->ic_receive(&icp->ip); + STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); + if ((icc->rx_flags & RXF_ACTIVE) == 0) { + struct cxgbei_worker_thread_softc *cwt = icc->cwt; + mtx_lock(&cwt->cwt_lock); + icc->rx_flags |= RXF_ACTIVE; + TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link); + if (cwt->cwt_state == CWT_SLEEPING) { + cwt->cwt_state = CWT_RUNNING; + cv_signal(&cwt->cwt_cv); + } + mtx_unlock(&cwt->cwt_lock); + } + SOCKBUF_UNLOCK(sb); + INP_WUNLOCK(inp); + +#ifdef INVARIANTS + toep->ulpcb2 = NULL; +#endif + return (0); } static void t4_register_cpl_handler_with_tom(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_ISCSI_HDR, do_rx_iscsi_hdr); t4_register_cpl_handler(sc, CPL_ISCSI_DATA, do_rx_iscsi_data); t4_register_cpl_handler(sc, CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp); } static void t4_unregister_cpl_handler_with_tom(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_ISCSI_HDR, NULL); t4_register_cpl_handler(sc, CPL_ISCSI_DATA, NULL); t4_register_cpl_handler(sc, CPL_RX_ISCSI_DDP, NULL); } /* initiator */ void cxgbei_conn_task_reserve_itt(void *conn, void **prv, void *scmd, unsigned int *itt) { unsigned int tag; tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt); if (tag) *itt = htonl(tag); return; } /* target */ void cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv, void *scmd, unsigned int *ttt) { unsigned int tag; tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt); if (tag) *ttt = htonl(tag); return; } void cxgbei_cleanup_task(void *conn, void *ofld_priv) { struct icl_conn *ic = (struct icl_conn *)conn; struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct cxgbei_task_data *tdata = ofld_priv; struct adapter *sc = icc->sc; struct cxgbei_data *ci = sc->iscsi_softc; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); MPASS(tdata != NULL); if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag)) t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag); memset(tdata, 0, sizeof(*tdata)); } static int cxgbei_activate(struct adapter *sc) { struct cxgbei_data *ci; int rc; ASSERT_SYNCHRONIZED_OP(sc); if (uld_active(sc, ULD_ISCSI)) { KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p", __func__, sc)); return (0); } if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) { device_printf(sc->dev, "not iSCSI offload capable, or capability disabled.\n"); return (ENOSYS); } /* per-adapter softc for iSCSI */ ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT); if (ci == NULL) return (ENOMEM); rc = cxgbei_ddp_init(sc, ci); if (rc != 0) { free(ci, M_CXGBE); return (rc); } t4_register_cpl_handler_with_tom(sc); sc->iscsi_softc = ci; return (0); } static int cxgbei_deactivate(struct adapter *sc) { ASSERT_SYNCHRONIZED_OP(sc); if (sc->iscsi_softc != NULL) { cxgbei_ddp_cleanup(sc->iscsi_softc); t4_unregister_cpl_handler_with_tom(sc); free(sc->iscsi_softc, M_CXGBE); sc->iscsi_softc = NULL; } return (0); } static 
void cxgbei_activate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0) return; /* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */ if (sc->offload_map && !uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static void cxgbei_deactivate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0) return; if (uld_active(sc, ULD_ISCSI)) (void) t4_deactivate_uld(sc, ULD_ISCSI); end_synchronized_op(sc, 0); } static struct uld_info cxgbei_uld_info = { .uld_id = ULD_ISCSI, .activate = cxgbei_activate, .deactivate = cxgbei_deactivate, }; -enum { - CWT_RUNNING = 1, - CWT_STOP = 2, - CWT_STOPPED = 3, -}; - -struct cxgbei_worker_thread_softc { - struct mtx cwt_lock; - struct cv cwt_cv; - volatile int cwt_state; -} __aligned(CACHE_LINE_SIZE); - -int worker_thread_count; +static int worker_thread_count; static struct cxgbei_worker_thread_softc *cwt_softc; static struct proc *cxgbei_proc; static void cwt_main(void *arg) { struct cxgbei_worker_thread_softc *cwt = arg; + struct icl_cxgbei_conn *icc = NULL; + struct icl_conn *ic; + struct icl_pdu *ip; + struct sockbuf *sb; + STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus); MPASS(cwt != NULL); mtx_lock(&cwt->cwt_lock); MPASS(cwt->cwt_state == 0); cwt->cwt_state = CWT_RUNNING; cv_signal(&cwt->cwt_cv); - for (;;) { - cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); - if (cwt->cwt_state == CWT_STOP) + + while (__predict_true(cwt->cwt_state != CWT_STOP)) { + cwt->cwt_state = CWT_RUNNING; + while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) { + TAILQ_REMOVE(&cwt->rx_head, icc, rx_link); + mtx_unlock(&cwt->cwt_lock); + + ic = &icc->ic; + sb = &ic->ic_socket->so_rcv; + + SOCKBUF_LOCK(sb); + MPASS(icc->rx_flags & RXF_ACTIVE); + if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) { + MPASS(STAILQ_EMPTY(&rx_pdus)); + STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu); + SOCKBUF_UNLOCK(sb); + + /* Hand over PDUs to ICL. */ + while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) { + STAILQ_REMOVE_HEAD(&rx_pdus, ip_next); + ic->ic_receive(ip); + } + + SOCKBUF_LOCK(sb); + MPASS(STAILQ_EMPTY(&rx_pdus)); + } + MPASS(icc->rx_flags & RXF_ACTIVE); + if (STAILQ_EMPTY(&icc->rcvd_pdus) || + __predict_false(sb->sb_state & SBS_CANTRCVMORE)) { + icc->rx_flags &= ~RXF_ACTIVE; + } else { + /* + * More PDUs were received while we were busy + * handing over the previous batch to ICL. + * Re-add this connection to the end of the + * queue. + */ + mtx_lock(&cwt->cwt_lock); + TAILQ_INSERT_TAIL(&cwt->rx_head, icc, + rx_link); + mtx_unlock(&cwt->cwt_lock); + } + SOCKBUF_UNLOCK(sb); + + mtx_lock(&cwt->cwt_lock); + } + + /* Inner loop doesn't check for CWT_STOP, do that first. 
*/ + if (__predict_false(cwt->cwt_state == CWT_STOP)) break; + cwt->cwt_state = CWT_SLEEPING; + cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); } + MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL); mtx_assert(&cwt->cwt_lock, MA_OWNED); cwt->cwt_state = CWT_STOPPED; cv_signal(&cwt->cwt_cv); mtx_unlock(&cwt->cwt_lock); kthread_exit(); } static int start_worker_threads(void) { int i, rc; struct cxgbei_worker_thread_softc *cwt; worker_thread_count = min(mp_ncpus, 32); cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE, M_WAITOK | M_ZERO); MPASS(cxgbei_proc == NULL); for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF); cv_init(&cwt->cwt_cv, "cwt cv"); + TAILQ_INIT(&cwt->rx_head); rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0, "cxgbei", "%d", i); if (rc != 0) { printf("cxgbei: failed to start thread #%d/%d (%d)\n", i + 1, worker_thread_count, rc); mtx_destroy(&cwt->cwt_lock); cv_destroy(&cwt->cwt_cv); bzero(&cwt, sizeof(*cwt)); if (i == 0) { free(cwt_softc, M_CXGBE); worker_thread_count = 0; return (rc); } /* Not fatal, carry on with fewer threads. */ worker_thread_count = i; rc = 0; break; } /* Wait for thread to start before moving on to the next one. */ mtx_lock(&cwt->cwt_lock); - while (cwt->cwt_state != CWT_RUNNING) + while (cwt->cwt_state == 0) cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); mtx_unlock(&cwt->cwt_lock); } MPASS(cwt_softc != NULL); MPASS(worker_thread_count > 0); return (0); } static void stop_worker_threads(void) { int i; struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0]; MPASS(worker_thread_count >= 0); for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) { mtx_lock(&cwt->cwt_lock); - MPASS(cwt->cwt_state == CWT_RUNNING); + MPASS(cwt->cwt_state == CWT_RUNNING || + cwt->cwt_state == CWT_SLEEPING); cwt->cwt_state = CWT_STOP; cv_signal(&cwt->cwt_cv); do { cv_wait(&cwt->cwt_cv, &cwt->cwt_lock); } while (cwt->cwt_state != CWT_STOPPED); mtx_unlock(&cwt->cwt_lock); } free(cwt_softc, M_CXGBE); } static int cxgbei_mod_load(void) { int rc; rc = start_worker_threads(); if (rc != 0) return (rc); rc = t4_register_uld(&cxgbei_uld_info); if (rc != 0) { stop_worker_threads(); return (rc); } t4_iterate(cxgbei_activate_all, NULL); return (rc); } static int cxgbei_mod_unload(void) { t4_iterate(cxgbei_deactivate_all, NULL); if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY) return (EBUSY); stop_worker_threads(); return (0); } static int cxgbei_modevent(module_t mod, int cmd, void *arg) { int rc = 0; switch (cmd) { case MOD_LOAD: rc = cxgbei_mod_load(); break; case MOD_UNLOAD: rc = cxgbei_mod_unload(); break; default: rc = EINVAL; } return (rc); } static moduledata_t cxgbei_mod = { "cxgbei", cxgbei_modevent, NULL, }; MODULE_VERSION(cxgbei, 1); DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1); MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1); MODULE_DEPEND(cxgbei, icl, 1, 1, 1); Index: projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.h =================================================================== --- projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.h (revision 291231) +++ projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/cxgbei.h (revision 291232) @@ -1,142 +1,163 @@ /*- * Copyright (c) 2012, 2015 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef __CXGBEI_OFLD_H__ #define __CXGBEI_OFLD_H__ #include +enum { + CWT_SLEEPING = 1, + CWT_RUNNING = 2, + CWT_STOP = 3, + CWT_STOPPED = 4, +}; + +struct cxgbei_worker_thread_softc { + struct mtx cwt_lock; + struct cv cwt_cv; + volatile int cwt_state; + + TAILQ_HEAD(, icl_cxgbei_conn) rx_head; +} __aligned(CACHE_LINE_SIZE); + #define CXGBEI_CONN_SIGNATURE 0x56788765 +enum { + RXF_ACTIVE = 1 << 0, /* In the worker thread's queue */ +}; + struct icl_cxgbei_conn { struct icl_conn ic; /* cxgbei specific stuff goes here. */ uint32_t icc_signature; int ulp_submode; struct adapter *sc; struct toepcb *toep; - /* PDU currently being assembled. */ - /* XXXNP: maybe just use ic->ic_receive_pdu instead? */ - struct icl_cxgbei_pdu *icp; - uint32_t pdu_seq; /* For debug only */ + /* Receive related. */ + u_int rx_flags; /* protected by so_rcv lock */ + STAILQ_HEAD(, icl_pdu) rcvd_pdus; /* protected by so_rcv lock */ + TAILQ_ENTRY(icl_cxgbei_conn) rx_link; /* protected by cwt lock */ + struct cxgbei_worker_thread_softc *cwt; }; static inline struct icl_cxgbei_conn * ic_to_icc(struct icl_conn *ic) { return (__containerof(ic, struct icl_cxgbei_conn, ic)); } #define CXGBEI_PDU_SIGNATURE 0x12344321 struct icl_cxgbei_pdu { struct icl_pdu ip; /* cxgbei specific stuff goes here. 
*/ uint32_t icp_signature; + uint32_t pdu_seq; /* For debug only */ u_int pdu_flags; }; static inline struct icl_cxgbei_pdu * ip_to_icp(struct icl_pdu *ip) { return (__containerof(ip, struct icl_cxgbei_pdu, ip)); } struct cxgbei_sgl { int sg_flag; void *sg_addr; void *sg_dma_addr; size_t sg_offset; size_t sg_length; }; #define cxgbei_scsi_for_each_sg(_sgl, _sgel, _n, _i) \ for (_i = 0, _sgel = (cxgbei_sgl*) (_sgl); _i < _n; _i++, \ _sgel++) #define sg_dma_addr(_sgel) _sgel->sg_dma_addr #define sg_virt(_sgel) _sgel->sg_addr #define sg_len(_sgel) _sgel->sg_length #define sg_off(_sgel) _sgel->sg_offset #define sg_next(_sgel) _sgel + 1 #define SBUF_ULP_FLAG_HDR_RCVD 0x1 #define SBUF_ULP_FLAG_DATA_RCVD 0x2 #define SBUF_ULP_FLAG_STATUS_RCVD 0x4 #define SBUF_ULP_FLAG_HCRC_ERROR 0x10 #define SBUF_ULP_FLAG_DCRC_ERROR 0x20 #define SBUF_ULP_FLAG_PAD_ERROR 0x40 #define SBUF_ULP_FLAG_DATA_DDPED 0x80 /* private data for each scsi task */ struct cxgbei_task_data { struct cxgbei_sgl sgl[256]; u_int nsge; u_int sc_ddp_tag; }; struct cxgbei_ulp2_tag_format { u_char sw_bits; u_char rsvd_bits; u_char rsvd_shift; u_char filler[1]; uint32_t rsvd_mask; }; struct cxgbei_data { u_int max_txsz; u_int max_rxsz; u_int llimit; u_int ulimit; u_int nppods; u_int idx_last; u_char idx_bits; uint32_t idx_mask; uint32_t rsvd_tag_mask; struct mtx map_lock; bus_dma_tag_t ulp_ddp_tag; unsigned char *colors; struct cxgbei_ulp2_gather_list **gl_map; struct cxgbei_ulp2_tag_format tag_format; }; void cxgbei_conn_task_reserve_itt(void *, void **, void *, unsigned int *); void cxgbei_conn_transfer_reserve_ttt(void *, void **, void *, unsigned int *); void cxgbei_cleanup_task(void *, void *); struct cxgbei_ulp2_pagepod_hdr; int t4_ddp_set_map(struct cxgbei_data *, void *, struct cxgbei_ulp2_pagepod_hdr *, u_int, u_int, struct cxgbei_ulp2_gather_list *, int); void t4_ddp_clear_map(struct cxgbei_data *, struct cxgbei_ulp2_gather_list *, u_int, u_int, u_int, struct icl_cxgbei_conn *); #endif Index: projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/icl_cxgbei.c =================================================================== --- projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/icl_cxgbei.c (revision 291231) +++ projects/cxl_iscsi/sys/dev/cxgbe/cxgbei/icl_cxgbei.c (revision 291232) @@ -1,820 +1,860 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2015 Chelsio Communications, Inc. * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * cxgbei implementation of iSCSI Common Layer kobj(9) interface. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "tom/t4_tom.h" #include "cxgbei.h" SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload"); static int coalesce = 1; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN, &coalesce, 0, "Try to coalesce PDUs before sending"); static int partial_receive_len = 128 * 1024; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN, &partial_receive_len, 0, "Minimum read size for partially received " "data segment"); static int sendspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN, &sendspace, 0, "Default send socket buffer size"); static int recvspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); static uma_zone_t icl_transfer_zone; static volatile u_int icl_cxgbei_ncons; #define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) #define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) #define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) #define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu; -static icl_conn_pdu_free_t icl_cxgbei_conn_pdu_free; +icl_conn_pdu_free_t icl_cxgbei_conn_pdu_free; static icl_conn_pdu_data_segment_length_t icl_cxgbei_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data; static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data; static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue; static icl_conn_handoff_t icl_cxgbei_conn_handoff; static icl_conn_free_t icl_cxgbei_conn_free; static icl_conn_close_t icl_cxgbei_conn_close; static icl_conn_task_setup_t icl_cxgbei_conn_task_setup; static icl_conn_task_done_t icl_cxgbei_conn_task_done; static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup; static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done; static kobj_method_t icl_cxgbei_methods[] = { KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, icl_cxgbei_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data), KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue), KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff), KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free), KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close), 
KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup), KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done), KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup), KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done), { 0, 0 } }; DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn)); /* * Subtract another 256 for AHS from MAX_DSL if AHS could be used. */ #define CXGBEI_MAX_PDU 16224 #define CXGBEI_MAX_DSL (CXGBEI_MAX_PDU - sizeof(struct iscsi_bhs) - 8) -static void +void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { #ifdef INVARIANTS struct icl_cxgbei_pdu *icp = ip_to_icp(ip); #endif MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); m_freem(ip->ip_ahs_mbuf); m_freem(ip->ip_data_mbuf); m_freem(ip->ip_bhs_mbuf); /* storage for icl_cxgbei_pdu itself */ #ifdef DIAGNOSTIC - if (ic != NULL) + if (__predict_true(ic != NULL)) refcount_release(&ic->ic_outstanding_pdus); #endif } struct icl_pdu * icl_cxgbei_new_pdu(int flags) { struct icl_cxgbei_pdu *icp; struct icl_pdu *ip; struct mbuf *m; uintptr_t a; m = m_gethdr(flags, MT_DATA); if (__predict_false(m == NULL)) return (NULL); a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu)); icp = (struct icl_cxgbei_pdu *)a; bzero(icp, sizeof(*icp)); icp->icp_signature = CXGBEI_PDU_SIGNATURE; ip = &icp->ip; ip->ip_bhs_mbuf = m; a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *)); ip->ip_bhs = (struct iscsi_bhs *)a; #ifdef INVARIANTS /* Everything must fit entirely in the mbuf. */ a = (uintptr_t)(ip->ip_bhs + 1); MPASS(a <= (uintptr_t)m + MSIZE); #endif bzero(ip->ip_bhs, sizeof(*ip->ip_bhs)); m->m_data = (void *)ip->ip_bhs; m->m_len = sizeof(struct iscsi_bhs); m->m_pkthdr.len = m->m_len; return (ip); } void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic) { ip->ip_conn = ic; #ifdef DIAGNOSTIC refcount_acquire(&ic->ic_outstanding_pdus); #endif } /* * Allocate icl_pdu with empty BHS to fill up by the caller. */ static struct icl_pdu * icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags) { struct icl_pdu *ip; ip = icl_cxgbei_new_pdu(flags); if (__predict_false(ip == NULL)) return (NULL); icl_cxgbei_new_pdu_set_conn(ip, ic); return (ip); } static size_t icl_pdu_data_segment_length(const struct icl_pdu *request) { uint32_t len = 0; len += request->ip_bhs->bhs_data_segment_len[0]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[1]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[2]; return (len); } size_t icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, const struct icl_pdu *request) { return (icl_pdu_data_segment_length(request)); } static uint32_t icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag) { return tag; } static struct mbuf * finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) { struct icl_pdu *ip = &icp->ip; uint8_t ulp_submode, padding; struct mbuf *m, *last; struct iscsi_bhs *bhs; /* * Fix up the data segment mbuf first. */ m = ip->ip_data_mbuf; ulp_submode = icc->ulp_submode; if (m) { last = m_last(m); /* * Round up the data segment to a 4B boundary. Pad with 0 if * necessary. There will definitely be room in the mbuf. */ padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len; if (padding) { bzero(mtod(last, uint8_t *) + last->m_len, padding); last->m_len += padding; } } else { MPASS(ip->ip_data_len == 0); ulp_submode &= ~ULP_CRC_DATA; padding = 0; } /* * Now the header mbuf that has the BHS. 
*/ m = ip->ip_bhs_mbuf; MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs)); MPASS(m->m_len == sizeof(struct iscsi_bhs)); bhs = ip->ip_bhs; bhs->bhs_data_segment_len[2] = ip->ip_data_len; bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8; bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16; /* "Convert" PDU to mbuf chain. Do not use icp/ip after this. */ m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding; m->m_next = ip->ip_data_mbuf; set_mbuf_ulp_submode(m, ulp_submode); #ifdef INVARIANTS bzero(icp, sizeof(*icp)); #endif #ifdef DIAGNOSTIC refcount_release(&icc->ic.ic_outstanding_pdus); #endif return (m); } int icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip, const void *addr, size_t len, int flags) { struct mbuf *m; #ifdef INVARIANTS struct icl_cxgbei_pdu *icp = ip_to_icp(ip); #endif MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len)); /* * XXXNP: add assertions here, after fixing the problems around * max_data_segment_length: * a) len should not cause the max_data_segment_length to be exceeded. * b) all data should fit in a single jumbo16. The hardware limit just * happens to be within jumbo16 so this is very convenient. */ m = ip->ip_data_mbuf; if (m == NULL) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES); if (__predict_false(m == NULL)) return (ENOMEM); ip->ip_data_mbuf = m; } if (__predict_true(m_append(m, len, addr) != 0)) { ip->ip_data_len += len; MPASS(ip->ip_data_len <= CXGBEI_MAX_DSL); return (0); } else { if (flags & M_WAITOK) { CXGBE_UNIMPLEMENTED("fail safe append"); } ip->ip_data_len = m_length(m, NULL); return (1); } } void icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, size_t off, void *addr, size_t len) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (icp->pdu_flags & SBUF_ULP_FLAG_DATA_DDPED) return; /* data is DDP'ed, no need to copy */ m_copydata(ip->ip_data_mbuf, off, len, addr); } void icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct socket *so = ic->ic_socket; struct toepcb *toep = icc->toep; struct inpcb *inp; struct mbuf *m; MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); /* The kernel doesn't generate PDUs with AHS. */ MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0); ICL_CONN_LOCK_ASSERT(ic); /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */ if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) { icl_cxgbei_conn_pdu_free(ic, ip); return; } m = finalize_pdu(icc, icp); M_ASSERTPKTHDR(m); MPASS((m->m_pkthdr.len & 3) == 0); MPASS(m->m_pkthdr.len + 8 <= CXGBEI_MAX_PDU); /* * Do not get inp from toep->inp as the toepcb might have detached * already. */ inp = sotoinpcb(so); INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || __predict_false((toep->flags & TPF_ATTACHED) == 0)) m_freem(m); else { mbufq_enqueue(&toep->ulp_pduq, m); t4_push_pdus(icc->sc, toep, 0); } INP_WUNLOCK(inp); } static struct icl_conn * icl_cxgbei_new_conn(const char *name, struct mtx *lock) { struct icl_cxgbei_conn *icc; struct icl_conn *ic; refcount_acquire(&icl_cxgbei_ncons); icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE, M_WAITOK | M_ZERO); icc->icc_signature = CXGBEI_CONN_SIGNATURE; + STAILQ_INIT(&icc->rcvd_pdus); ic = &icc->ic; ic->ic_lock = lock; /* XXXNP: review. 
Most of these icl_conn fields aren't really used */ STAILQ_INIT(&ic->ic_to_send); cv_init(&ic->ic_send_cv, "icl_cxgbei_tx"); cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx"); #ifdef DIAGNOSTIC refcount_init(&ic->ic_outstanding_pdus, 0); #endif ic->ic_max_data_segment_length = CXGBEI_MAX_DSL; ic->ic_name = name; ic->ic_offload = "cxgbei"; CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); return (ic); } void icl_cxgbei_conn_free(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); cv_destroy(&ic->ic_send_cv); cv_destroy(&ic->ic_receive_cv); kobj_delete((struct kobj *)icc, M_CXGBE); refcount_release(&icl_cxgbei_ncons); } static int icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so) { size_t minspace; struct sockopt opt; int error, one = 1; /* * For sendspace, this is required because the current code cannot * send a PDU in pieces; thus, the minimum buffer size is equal * to the maximum PDU size. "+4" is to account for possible padding. * * What we should actually do here is to use autoscaling, but set * some minimal buffer size to "minspace". I don't know a way to do * that, though. */ minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length + ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4; if (sendspace < minspace) sendspace = minspace; if (recvspace < minspace) recvspace = minspace; error = soreserve(so, sendspace, recvspace); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_rcv); /* * Disable Nagle. */ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } return (0); } /* * Request/response structure used to find out the adapter offloading a socket. */ struct find_ofld_adapter_rr { struct socket *so; struct adapter *sc; /* result */ }; static void find_offload_adapter(struct adapter *sc, void *arg) { struct find_ofld_adapter_rr *fa = arg; struct socket *so = fa->so; struct tom_data *td = sc->tom_softc; struct tcpcb *tp; struct inpcb *inp; /* Non-TCP were filtered out earlier. */ MPASS(so->so_proto->pr_protocol == IPPROTO_TCP); if (fa->sc != NULL) return; /* Found already. */ if (td == NULL) return; /* TOE not enabled on this adapter. */ inp = sotoinpcb(so); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { tp = intotcpcb(inp); if (tp->t_flags & TF_TOE && tp->tod == &td->tod) fa->sc = sc; /* Found. */ } INP_WUNLOCK(inp); } static void set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc) { uint64_t val = 0; if (hcrc) val |= ULP_CRC_HEADER; if (dcrc) val |= ULP_CRC_DATA; val <<= 4; val |= ULP_MODE_ISCSI; CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d", __func__, toep->tid, hcrc, dcrc); t4_set_tcb_field(sc, toep, 1, 0, 0xfff, val); } +/* XXXNP */ +extern struct cxgbei_worker_thread_softc *cwt_softc; + /* * XXXNP: Who is responsible for cleaning up the socket if this returns with an * error? Review all error paths. * * XXXNP: What happens to the socket's fd reference if the operation is * successful, and how does that affect the socket's life cycle? 
*/ int icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct find_ofld_adapter_rr fa; struct file *fp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; cap_rights_t rights; int error; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); /* * Steal the socket from userland. */ error = fget(curthread, fd, cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (EINVAL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (EINVAL); } ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); fdrop(fp, curthread); return (EBUSY); } ic->ic_disconnecting = false; ic->ic_socket = so; fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); ICL_CONN_UNLOCK(ic); /* Find the adapter offloading this socket. */ fa.sc = NULL; fa.so = so; t4_iterate(find_offload_adapter, &fa); if (fa.sc == NULL) return (EINVAL); icc->sc = fa.sc; error = icl_cxgbei_setsockopt(ic, so); if (error) return (error); inp = sotoinpcb(so); INP_WLOCK(inp); tp = intotcpcb(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) error = EBUSY; else { /* * socket could not have been "unoffloaded" if here. */ MPASS(tp->t_flags & TF_TOE); MPASS(tp->tod != NULL); MPASS(tp->t_toe != NULL); toep = tp->t_toe; icc->toep = toep; + icc->cwt = &cwt_softc[0]; /* XXXNP */ icc->ulp_submode = 0; if (ic->ic_header_crc32c) icc->ulp_submode |= ULP_CRC_HEADER; if (ic->ic_data_crc32c) icc->ulp_submode |= ULP_CRC_DATA; so->so_options |= SO_NO_DDP; toep->ulp_mode = ULP_MODE_ISCSI; toep->ulpcb = icc; set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c, ic->ic_data_crc32c); error = 0; } INP_WUNLOCK(inp); return (error); } void icl_cxgbei_conn_close(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); + struct icl_pdu *ip; struct socket *so; + struct sockbuf *sb; + struct inpcb *inp; struct toepcb *toep = icc->toep; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); - CTR3(KTR_CXGBE, "%s: tid %u, icc %p", __func__, toep->tid, icc); - ICL_CONN_LOCK(ic); so = ic->ic_socket; - if (so == NULL) { + if (ic->ic_disconnecting || so == NULL) { + CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p", + __func__, icc, ic->ic_disconnecting, so); ICL_CONN_UNLOCK(ic); return; } - ic->ic_socket = NULL; ic->ic_disconnecting = true; - mbufq_drain(&toep->ulp_pduq); - /* These are unused in this driver right now. */ MPASS(STAILQ_EMPTY(&ic->ic_to_send)); MPASS(ic->ic_receive_pdu == NULL); #ifdef DIAGNOSTIC KASSERT(ic->ic_outstanding_pdus == 0, ("destroying session with %d outstanding PDUs", ic->ic_outstanding_pdus)); #endif + ICL_CONN_UNLOCK(ic); + + CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1, + icc); + inp = sotoinpcb(so); + sb = &so->so_rcv; + INP_WLOCK(inp); + if (toep != NULL) { /* NULL if connection was never offloaded. 
	 */
+		toep->ulpcb = NULL;
+		mbufq_drain(&toep->ulp_pduq);
+		SOCKBUF_LOCK(sb);
+		if (icc->rx_flags & RXF_ACTIVE) {
+			volatile u_int *p = &icc->rx_flags;
+
+			SOCKBUF_UNLOCK(sb);
+			INP_WUNLOCK(inp);
+
+			while (*p & RXF_ACTIVE)
+				pause("conclo", 1);
+
+			INP_WLOCK(inp);
+			SOCKBUF_LOCK(sb);
+		}
+
+		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
+			ip = STAILQ_FIRST(&icc->rcvd_pdus);
+			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
+			icl_cxgbei_conn_pdu_free(ic, ip);
+		}
+		SOCKBUF_UNLOCK(sb);
+	}
+	INP_WUNLOCK(inp);
+
+	ICL_CONN_LOCK(ic);
+	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
    uint32_t *task_tagp, void **prvp)
{
	void *prv;

	*task_tagp = icl_conn_build_tasktag(ic, *task_tagp);
	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);
	*prvp = prv;
	cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp);
	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv)
{
	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
    uint32_t *transfer_tag, void **prvp)
{
	void *prv;

	*transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag);
	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);
	*prvp = prv;
	cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag);
	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv)
{
	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

static int
icl_cxgbei_limits(size_t *limitp)
{
	*limitp = CXGBEI_MAX_DSL;
	return (0);
}

static int
icl_cxgbei_load(void)
{
	int error;

	icl_transfer_zone = uma_zcreate("icl_transfer", 16 * 1024,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	refcount_init(&icl_cxgbei_ncons, 0);

	error = icl_register("cxgbei", 100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);
	KASSERT(error == 0, ("failed to register"));

	return (error);
}

static int
icl_cxgbei_unload(void)
{
	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei");
	uma_zdestroy(icl_transfer_zone);

	return (0);
}

static int
icl_cxgbei_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (icl_cxgbei_load());
	case MOD_UNLOAD:
		return (icl_cxgbei_unload());
	default:
		return (EINVAL);
	}
}

moduledata_t icl_cxgbei_data = {
	"icl_cxgbei",
	icl_cxgbei_modevent,
	0
};

DECLARE_MODULE(icl_cxgbei, icl_cxgbei_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(icl_cxgbei, icl, 1, 1, 1);
MODULE_VERSION(icl_cxgbei, 1);

Index: projects/cxl_iscsi/sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- projects/cxl_iscsi/sys/dev/cxgbe/tom/t4_tom.h	(revision 291231)
+++ projects/cxl_iscsi/sys/dev/cxgbe/tom/t4_tom.h	(revision 291232)
@@ -1,310 +1,311 @@
/*-
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_TOM_H__
#define __T4_TOM_H__

#include

#define LISTEN_HASH_SIZE 32

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

#define DDP_RSVD_WIN (16 * 1024U)
#define SB_DDP_INDICATE	SB_IN_TOE	/* soreceive must respond to indicate */

#define USE_DDP_RX_FLOW_CONTROL

/* TOE PCB flags */
enum {
	TPF_ATTACHED       = (1 << 0),	/* a tcpcb refers to this toepcb */
	TPF_FLOWC_WR_SENT  = (1 << 1),	/* firmware flow context WR sent */
	TPF_TX_DATA_SENT   = (1 << 2),	/* some data sent */
	TPF_TX_SUSPENDED   = (1 << 3),	/* tx suspended for lack of resources */
	TPF_SEND_FIN       = (1 << 4),	/* send FIN after all pending data */
	TPF_FIN_SENT       = (1 << 5),	/* FIN has been sent */
	TPF_ABORT_SHUTDOWN = (1 << 6),	/* connection abort is in progress */
	TPF_CPL_PENDING    = (1 << 7),	/* haven't received the last CPL */
	TPF_SYNQE          = (1 << 8),	/* synq_entry, not really a toepcb */
	TPF_SYNQE_NEEDFREE = (1 << 9),	/* synq_entry was malloc'd separately */
	TPF_SYNQE_TCPDDP   = (1 << 10),	/* ulp_mode TCPDDP in toepcb */
	TPF_SYNQE_EXPANDED = (1 << 11),	/* toepcb ready, tid context updated */
	TPF_SYNQE_HAS_L2TE = (1 << 12),	/* we've replied to PASS_ACCEPT_REQ */
};

enum {
	DDP_OK          = (1 << 0),	/* OK to turn on DDP */
	DDP_SC_REQ      = (1 << 1),	/* state change (on/off) requested */
	DDP_ON          = (1 << 2),	/* DDP is turned on */
	DDP_BUF0_ACTIVE = (1 << 3),	/* buffer 0 in use (not invalidated) */
	DDP_BUF1_ACTIVE = (1 << 4),	/* buffer 1 in use (not invalidated) */
};

struct ofld_tx_sdesc {
	uint32_t plen;		/* payload length */
	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
};

struct ddp_buffer {
	uint32_t tag;	/* includes color, page pod addr, and DDP page size */
	u_int ppod_addr;
	int nppods;
	int offset;
	int len;
	int npages;
	vm_page_t *pages;
};

struct toepcb {
	TAILQ_ENTRY(toepcb) link;	/* toep_list */
	u_int flags;			/* miscellaneous flags */
	struct tom_data *td;
	struct inpcb *inp;		/* backpointer to host stack's PCB */
	struct port_info *port;		/* physical port */
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	struct sge_wrq *ctrlq;
	struct l2t_entry *l2te;		/* L2 table entry used by this connection */
	struct clip_entry *ce;		/* CLIP table entry used by this tid */
	int tid;			/* Connection identifier */

	/* tx credit handling */
	u_int tx_total;		/* total tx WR credits (in 16B units) */
	u_int tx_credits;	/* tx WR credits (in 16B units) available */
	u_int tx_nocompl;	/* tx WR credits since last compl request */
	u_int plen_nocompl;	/* payload since last compl request */

	/* rx credit handling */
	u_int sb_cc;		/* last noted value of so_rcv->sb_cc */
	int rx_credits;		/* rx credits (in bytes) to be returned to hw */

	u_int ulp_mode;		/* ULP mode */
	void *ulpcb;
+	void *ulpcb2;
	struct mbufq ulp_pduq;	/* PDUs waiting to be sent out. */
	struct mbufq ulp_pdu_reclaimq;

	u_int ddp_flags;
	struct ddp_buffer *db[2];
	time_t ddp_disabled;
	uint8_t ddp_score;

	/* Tx software descriptor */
	uint8_t txsd_total;
	uint8_t txsd_pidx;
	uint8_t txsd_cidx;
	uint8_t txsd_avail;
	struct ofld_tx_sdesc txsd[];
};

struct flowc_tx_params {
	uint32_t snd_nxt;
	uint32_t rcv_nxt;
	unsigned int snd_space;
	unsigned int mss;
};

#define DDP_RETRY_WAIT	5	/* seconds to wait before re-enabling DDP */
#define DDP_LOW_SCORE	1
#define DDP_HIGH_SCORE	3

/*
 * Compressed state for embryonic connections for a listener.  Barely fits in
 * 64B, try not to grow it further.
 */
struct synq_entry {
	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
	int flags;			/* same as toepcb's tp_flags */
	int tid;
	struct listen_ctx *lctx;	/* backpointer to listen ctx */
	struct mbuf *syn;
	uint32_t iss;
	uint32_t ts;
	volatile uintptr_t wr;
	volatile u_int refcnt;
	uint16_t l2e_idx;
	uint16_t rcv_bufsize;
};

/* listen_ctx flags */
#define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */

struct listen_ctx {
	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
	volatile int refcount;
	int stid;
	struct stid_region stid_region;
	int flags;
	struct inpcb *inp;		/* listening socket's inp */
	struct sge_wrq *ctrlq;
	struct sge_ofld_rxq *ofld_rxq;
	struct clip_entry *ce;
	TAILQ_HEAD(, synq_entry) synq;
};

struct clip_entry {
	TAILQ_ENTRY(clip_entry) link;
	struct in6_addr lip;	/* local IPv6 address */
	u_int refcount;
};

TAILQ_HEAD(clip_head, clip_entry);
struct tom_data {
	struct toedev tod;

	/* toepcb's associated with this TOE device */
	struct mtx toep_list_lock;
	TAILQ_HEAD(, toepcb) toep_list;

	struct mtx lctx_hash_lock;
	LIST_HEAD(, listen_ctx) *listen_hash;
	u_long listen_mask;
	int lctx_count;		/* # of lctx in the hash table */

	u_int ppod_start;
	vmem_t *ppod_arena;

	struct mtx clip_table_lock;
	struct clip_head clip_table;
	int clip_gen;

	/* WRs that will not be sent to the chip because L2 resolution failed */
	struct mtx unsent_wr_lock;
	STAILQ_HEAD(, wrqe) unsent_wr_list;
	struct task reclaim_wr_resources;
};

static inline struct tom_data *
tod_td(struct toedev *tod)
{
	return (__containerof(tod, struct tom_data, tod));
}

static inline struct adapter *
td_adapter(struct tom_data *td)
{
	return (td->tod.tod_softc);
}

static inline void
set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
{
	M_ASSERTPKTHDR(m);
	m->m_pkthdr.PH_per.eight[0] = ulp_submode;
}

static inline uint8_t
mbuf_ulp_submode(struct mbuf *m)
{
	M_ASSERTPKTHDR(m);
	return (m->m_pkthdr.PH_per.eight[0]);
}

/* t4_tom.c */
struct toepcb *alloc_toepcb(struct port_info *, int, int, int);
void free_toepcb(struct toepcb *);
void offload_socket(struct socket *, struct toepcb *);
void undo_offload_socket(struct socket *);
void final_cpl_received(struct toepcb *);
void insert_tid(struct adapter *, int, void *);
void *lookup_tid(struct adapter *, int);
void update_tid(struct adapter *, int, void *);
void remove_tid(struct adapter *, int);
void release_tid(struct adapter *, int, struct sge_wrq *);
int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
u_long select_rcv_wnd(struct socket *);
int select_rcv_wscale(void);
uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *,
    int, int, int, int);
uint64_t select_ntuple(struct port_info *, struct l2t_entry *);
void set_tcpddp_ulp_mode(struct toepcb *);
int negative_advice(int);
struct clip_entry *hold_lip(struct tom_data *, struct in6_addr *);
void release_lip(struct tom_data *, struct clip_entry *);

/* t4_connect.c */
void t4_init_connect_cpl_handlers(struct adapter *);
int t4_connect(struct toedev *, struct socket *, struct rtentry *,
    struct sockaddr *);
void act_open_failure_cleanup(struct adapter *, u_int, u_int);

/* t4_listen.c */
void t4_init_listen_cpl_handlers(struct adapter *);
int t4_listen_start(struct toedev *, struct tcpcb *);
int t4_listen_stop(struct toedev *, struct tcpcb *);
void t4_syncache_added(struct toedev *, void *);
void t4_syncache_removed(struct toedev *, void *);
int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
void t4_offload_socket(struct toedev *, void *, struct socket *);

/* t4_cpl_io.c */
void t4_init_cpl_io_handlers(struct adapter *);
void t4_uninit_cpl_io_handlers(struct adapter *);
void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
void send_reset(struct adapter *, struct toepcb *, uint32_t);
void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
void t4_rcvd(struct toedev *, struct tcpcb *);
int t4_tod_output(struct toedev *, struct tcpcb *);
int t4_send_fin(struct toedev *, struct tcpcb *);
int t4_send_rst(struct toedev *, struct tcpcb *);
void t4_set_tcb_field(struct adapter *, struct toepcb *, int, uint16_t,
    uint64_t, uint64_t);
void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop);
void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop);

/* t4_ddp.c */
void t4_init_ddp(struct adapter *, struct tom_data *);
void t4_uninit_ddp(struct adapter *, struct tom_data *);
int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
    struct mbuf **, struct mbuf **, int *);
void enable_ddp(struct adapter *, struct toepcb *toep);
void release_ddp_resources(struct toepcb *toep);
void handle_ddp_close(struct toepcb *, struct tcpcb *, struct sockbuf *,
    uint32_t);
void insert_ddp_data(struct toepcb *, uint32_t);

#endif